|
发表于 2023-10-5 17:25:35
|
显示全部楼层
// Variant of the 256x256 -> 512-bit multiply that does not use the ADX
// instructions (adcx/adox); it spends a few more registers instead. The
// original post reports that its performance is close.
//
// Syntax: GNU as, AT&T operand order.  Target: x86-64 with BMI2 (mulx).
// ABI:    System V AMD64.
//
// Note on mulx (AT&T): `mulxq src, lo, hi` computes rdx * src, writes the low
// 64 bits to `lo` and the high 64 bits to `hi`, and leaves the flags alone.
// That is what allows the add/adc carry chains below to be interleaved with
// the multiplies and with plain mov stores.

        .section .text
        .globl  main

//-----------------------------------------------------------------------
// int main(void)
//
// Benchmark driver: runs the inlined multiply 2^32 times on a 128-byte
// stack frame laid out as
//     0(%rsp) ..  63(%rsp)   dst  (8 limbs, written)
//    64(%rsp) ..  95(%rsp)   B    (4 limbs, read)
//    96(%rsp) .. 127(%rsp)   A    (4 limbs, read)
// The operands are NOT initialised here; the loop multiplies whatever the
// stack happens to contain, so it measures throughput only.
//
// Register roles inside the loop:
//   rdi = dst, rsi = A, rcx = B
//   r8-r11  = the four result limbs currently being accumulated
//   r12-r14 = high halves of the first three products of the current row
//   rax     = scratch (low half of a product), rdx = implicit mulx operand
//   ebx     = iteration counter
//
// Out:   eax = 0
// Saves: rbx, r12, r13, r14 (callee-saved under SysV, used by the loop)
//-----------------------------------------------------------------------
main:
        pushq   %rbx
        pushq   %r12
        pushq   %r13
        pushq   %r14
        addq    $-128, %rsp
        movq    %rsp, %rdi              // dst
        leaq    64(%rsp), %rcx          // B
        leaq    96(%rsp), %rsi          // A (kept inside this frame)
        xorl    %ebx, %ebx              // 0 - 1 wraps, so the loop runs 2^32 times
lp:
        // mul256 start

        // ---- row 0: B[0] * A -> dst[0], accumulator (r9, r10, r11, r8)
        movq    (%rcx), %rdx
        mulxq   (%rsi), %r8, %r12
        movq    %r8, (%rdi)             // dst[0] is final
        mulxq   8(%rsi), %r9, %r13
        addq    %r12, %r9
        mulxq   16(%rsi), %r10, %r14
        adcq    %r13, %r10
        mulxq   24(%rsi), %r11, %r8     // r8 is free again: reuse as top limb
        adcq    %r14, %r11
        adcq    $0, %r8

        // ---- row 1: B[1] * A -> dst[1], accumulator (r10, r11, r8, r9)
        movq    8(%rcx), %rdx
        mulxq   (%rsi), %rax, %r12
        addq    %rax, %r9
        movq    %r9, 8(%rdi)            // dst[1] is final
        mulxq   8(%rsi), %rax, %r13
        adcq    %rax, %r10
        mulxq   16(%rsi), %rax, %r14
        adcq    %rax, %r11
        mulxq   24(%rsi), %rax, %r9     // r9 was just stored: new top limb
        adcq    %rax, %r8
        adcq    $0, %r9
        addq    %r12, %r10              // second chain: fold in the high halves
        adcq    %r13, %r11
        adcq    %r14, %r8
        adcq    $0, %r9

        // ---- row 2: B[2] * A -> dst[2], accumulator (r11, r8, r9, r10)
        movq    16(%rcx), %rdx
        mulxq   (%rsi), %rax, %r12
        addq    %rax, %r10
        movq    %r10, 16(%rdi)          // dst[2] is final
        mulxq   8(%rsi), %rax, %r13
        adcq    %rax, %r11
        mulxq   16(%rsi), %rax, %r14
        adcq    %rax, %r8
        mulxq   24(%rsi), %rax, %r10    // high half goes to the freed r10; r9 is still live
        adcq    %rax, %r9
        adcq    $0, %r10
        addq    %r12, %r11
        adcq    %r13, %r8
        adcq    %r14, %r9
        adcq    $0, %r10

        // ---- row 3: B[3] * A -> dst[3..7]
        movq    24(%rcx), %rdx
        mulxq   (%rsi), %rax, %r12
        addq    %rax, %r11
        movq    %r11, 24(%rdi)          // dst[3] is final
        mulxq   8(%rsi), %rax, %r13
        adcq    %rax, %r8
        mulxq   16(%rsi), %rax, %r14
        adcq    %rax, %r9
        mulxq   24(%rsi), %rax, %r11    // high half goes to the freed r11; r9 is still live
        adcq    %rax, %r10
        adcq    $0, %r11
        addq    %r12, %r8
        movq    %r8, 32(%rdi)           // upper limbs go to dst, not to A
        adcq    %r13, %r9
        movq    %r9, 40(%rdi)
        adcq    %r14, %r10
        movq    %r10, 48(%rdi)
        adcq    $0, %r11
        movq    %r11, 56(%rdi)

        // mul256 end
        subl    $1, %ebx
        jnz     lp

        subq    $-128, %rsp
        popq    %r14
        popq    %r13
        popq    %r12
        popq    %rbx
        xorl    %eax, %eax              // return 0
        ret

// Standalone version of the same routine, kept commented out as in the
// original post.
//
//   mul256(dst, A, B)
//   In:    rdi = dst (8 limbs), rsi = A (4 limbs), rdx = B (4 limbs; the
//          pointer is moved to rcx because mulx needs rdx for the multiplier)
//   Regs:  r8-r11  = the four result limbs currently being accumulated
//          r12-r14 = high halves of the products of the current row
//          rax     = scratch, rdx = implicit mulx operand
//   Saves: r12-r14 are spilled to -8/-16/-24(%rsp), i.e. below the stack
//          pointer, so this relies on the SysV red zone and on the routine
//          making no calls.
//   Note:  dst[0] is stored before A and B have been fully read, so dst
//          must not overlap either operand.
/*
mul256:
        movq    %rdx, %rcx
        movq    (%rdx), %rdx
        movq    %r12, -8(%rsp)
        mulxq   (%rsi), %r8, %r12
        movq    %r8, (%rdi)
        movq    %r13, -16(%rsp)
        mulxq   8(%rsi), %r9, %r13
        addq    %r12, %r9
        movq    %r14, -24(%rsp)
        mulxq   16(%rsi), %r10, %r14
        adcq    %r13, %r10
        mulxq   24(%rsi), %r11, %r8
        adcq    %r14, %r11
        adcq    $0, %r8

        movq    8(%rcx), %rdx
        mulxq   (%rsi), %rax, %r12
        addq    %rax, %r9
        movq    %r9, 8(%rdi)
        mulxq   8(%rsi), %rax, %r13
        adcq    %rax, %r10
        mulxq   16(%rsi), %rax, %r14
        adcq    %rax, %r11
        mulxq   24(%rsi), %rax, %r9
        adcq    %rax, %r8
        adcq    $0, %r9
        addq    %r12, %r10
        adcq    %r13, %r11
        adcq    %r14, %r8
        adcq    $0, %r9

        movq    16(%rcx), %rdx
        mulxq   (%rsi), %rax, %r12
        addq    %rax, %r10
        movq    %r10, 16(%rdi)
        mulxq   8(%rsi), %rax, %r13
        adcq    %rax, %r11
        mulxq   16(%rsi), %rax, %r14
        adcq    %rax, %r8
        mulxq   24(%rsi), %rax, %r10
        adcq    %rax, %r9
        adcq    $0, %r10
        addq    %r12, %r11
        adcq    %r13, %r8
        adcq    %r14, %r9
        adcq    $0, %r10

        movq    24(%rcx), %rdx
        mulxq   (%rsi), %rax, %r12
        addq    %rax, %r11
        movq    %r11, 24(%rdi)
        mulxq   8(%rsi), %rax, %r13
        adcq    %rax, %r8
        mulxq   16(%rsi), %rax, %r14
        adcq    %rax, %r9
        mulxq   24(%rsi), %rax, %r11
        adcq    %rax, %r10
        adcq    $0, %r11
        addq    %r12, %r8
        movq    %r8, 32(%rdi)
        adcq    %r13, %r9
        movq    %r9, 40(%rdi)
        adcq    %r14, %r10
        movq    %r10, 48(%rdi)
        adcq    $0, %r11
        movq    %r11, 56(%rdi)

        movq    -8(%rsp), %r12
        movq    -16(%rsp), %r13
        movq    -24(%rsp), %r14
        ret
*/
复制代码

 Performance counter stats for './a.out':

         25,636.00 msec task-clock                #    0.999 CPUs utilized
             4,351      context-switches          #  169.722 /sec
               283      cpu-migrations            #   11.039 /sec
                47      page-faults               #    1.833 /sec
   100,564,298,132      cycles                    #    3.923 GHz                      (83.32%)
       304,933,814      stalled-cycles-frontend   #    0.30% frontend cycles idle     (83.33%)
    77,948,700,897      stalled-cycles-backend    #   77.51% backend cycles idle      (83.34%)
   262,154,531,908      instructions              #    2.61  insn per cycle
                                                  #    0.30  stalled cycles per insn  (83.33%)
     4,334,044,774      branches                  #  169.061 M/sec                    (83.34%)
         1,346,833      branch-misses             #    0.03% of all branches          (83.34%)

      25.669427577 seconds time elapsed

      25.625846000 seconds user
       0.011995000 seconds sys
复制代码- Timeline view: 0123456789 0123Index 0123456789 0123456789 [0,0] DeER . . . . . . . movq %rdx, %rcx[0,1] DeeeeeER . . . . . . movq (%rdx), %rdx[0,2] DeE----R . . . . . . movq %r12, -8(%rsp)[0,3] .DeeeeeeeeeER . . . . . mulxq (%rsi), %r8, %r12[0,4] .D========eER . . . . . movq %r8, (%rdi)[0,5] .D=========eER . . . . . movq %r13, -16(%rsp)[0,6] . DeeeeeeeeeER . . . . . mulxq 8(%rsi), %r9, %r13[0,7] . D========eER . . . . . addq %r12, %r9[0,8] . D=========eER. . . . . movq %r14, -24(%rsp)[0,9] . DeeeeeeeeeER. . . . . mulxq 16(%rsi), %r10, %r14[0,10] . D========eER. . . . . adcq %r13, %r10[0,11] . DeeeeeeeeeER . . . . mulxq 24(%rsi), %r11, %r8[0,12] . D========eER . . . . adcq %r14, %r11[0,13] . D=========eER . . . . adcq $0, %r8[0,14] . DeeeeeE----R . . . . movq 8(%rcx), %rdx[0,15] . DeeeeeeeeeER . . . . mulxq (%rsi), %rax, %r12[0,16] . D========eER . . . . addq %rax, %r9[0,17] . .D========eER . . . . movq %r9, 8(%rdi)[0,18] . .DeeeeeeeeeER . . . . mulxq 8(%rsi), %rax, %r13[0,19] . .D========eER . . . . adcq %rax, %r10[0,20] . . DeeeeeeeeeER . . . . mulxq 16(%rsi), %rax, %r14[0,21] . . D========eER . . . . adcq %rax, %r11[0,22] . . DeeeeeeeeeER. . . . mulxq 24(%rsi), %rax, %r9[0,23] . . D========eER. . . . adcq %rax, %r8[0,24] . . D=========eER . . . adcq $0, %r9[0,25] . . D======eE--R . . . addq %r12, %r10[0,26] . . D=======eE-R . . . adcq %r13, %r11[0,27] . . D========eER . . . adcq %r14, %r8[0,28] . . D=========eER . . . adcq $0, %r9[0,29] . . DeeeeeE-----R . . . movq 16(%rcx), %rdx[0,30] . . DeeeeeeeeeER . . . mulxq (%rsi), %rax, %r12[0,31] . . D========eER . . . addq %rax, %r10[0,32] . . D=========eER . . . movq %r10, 16(%rdi)[0,33] . . .DeeeeeeeeeER . . . mulxq 8(%rsi), %rax, %r13[0,34] . . .D========eER . . . adcq %rax, %r11[0,35] . . . DeeeeeeeeeER . . . mulxq 16(%rsi), %rax, %r14[0,36] . . . D========eER . . . adcq %rax, %r8[0,37] . . . DeeeeeeeeeER. . . mulxq 24(%rsi), %rax, %r9[0,38] . . . D=========eER . . adcq %rax, %r9[0,39] . . . D==========eER . . 
adcq $0, %r10[0,40] . . . D======eE---R . . addq %r12, %r11[0,41] . . . D=======eE--R . . adcq %r13, %r8[0,42] . . . D=========eER . . adcq %r14, %r9[0,43] . . . D==========eER . . adcq $0, %r10[0,44] . . . DeeeeeE------R . . movq 24(%rcx), %rdx[0,45] . . . DeeeeeeeeeE-R . . mulxq (%rsi), %rax, %r12[0,46] . . . D========eE-R . . addq %rax, %r11[0,47] . . . D=========eER . . movq %r11, 24(%rdi)[0,48] . . . .DeeeeeeeeeER . . mulxq 8(%rsi), %rax, %r13[0,49] . . . .D========eER . . adcq %rax, %r8[0,50] . . . . DeeeeeeeeeER . . mulxq 16(%rsi), %rax, %r14[0,51] . . . . D========eER . . adcq %rax, %r9[0,52] . . . . DeeeeeeeeeER. . mulxq 24(%rsi), %rax, %r9[0,53] . . . . D========eER. . adcq %rax, %r10[0,54] . . . . D=========eER . adcq $0, %r10[0,55] . . . . D======eE--R . addq %r12, %r8[0,56] . . . . D=======eE-R . movq %r8, 32(%rsi)[0,57] . . . . D========eER . adcq %r13, %r9[0,58] . . . . D=========eER . movq %r9, 40(%rsi)[0,59] . . . . D=========eER . adcq %r14, %r10[0,60] . . . . D==========eER. movq %r10, 48(%rsi)[0,61] . . . . D=========eER. adcq $0, %r11[0,62] . . . . D==========eER movq %r11, 56(%rsi)[0,63] . . . . DeeeeeE------R movq -8(%rsp), %r12[0,64] . . . . DeeeeeE------R movq -16(%rsp), %r13[0,65] . . . . D=eeeeeE-----R movq -24(%rsp), %r14[0,66] . . . . .DeeeeeeeE---R retq
复制代码 补充内容 (2023-5-5 09:32):
程序修復後測試了正確性及效率 https://pastebin.com/nu1xu5jE |
|