diff --git a/mpn/x86_64w/core2/lshift.asm b/mpn/x86_64w/core2/lshift.asm index 513b6b70..7c317baa 100644 --- a/mpn/x86_64w/core2/lshift.asm +++ b/mpn/x86_64w/core2/lshift.asm @@ -20,9 +20,9 @@ ; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, ; Boston, MA 02110-1301, USA. ; -; mpn_lshift(mp_ptr rdi, mp_ptr rsi, mp_size_t rdx, mp_limb_t rcx) +; mpn_lshift(mp_ptr rdi, mp_ptr rsi, mp_size_t rdx, mp_limb_t rcx) ; rcx rdx r8d r9 -; rax=carry +; rax=carry %include "..\yasm_mac.inc" @@ -33,78 +33,78 @@ FRAME_PROC mpn_lshift, 0, reg_save_list ; odd and even n seem to have different runtimes - movsxd rbx, r8d - lea rsi, [rdx+24] - lea rdi, [rcx+24] - mov rcx, r9 - - mov rdx, [rsi+rbx*8-32] - xor rax, rax - shld rax, rdx, cl - sub rbx, 5 - js L_skiplp - xalign 16 + movsxd rbx, r8d + lea rsi, [rdx+24] + lea rdi, [rcx+24] + mov rcx, r9 + + mov rdx, [rsi+rbx*8-32] + xor rax, rax + shld rax, rdx, cl + sub rbx, 5 + js L_skiplp + xalign 16 L_lp: - mov r8, [rsi+rbx*8] - mov r11, [rsi+rbx*8-24] - mov r9, [rsi+rbx*8-8] - shld rdx, r8, cl - mov [rdi+rbx*8+8], rdx - mov rdx, r11 - mov r10, [rsi+rbx*8-16] - shld r8, r9, cl - shld r9, r10, cl - mov [rdi+rbx*8], r8 - mov [rdi+rbx*8-8], r9 - shld r10, r11, cl - sub rbx, 4 - mov [rdi+rbx*8+16], r10 - jns L_lp + mov r8, [rsi+rbx*8] + mov r11, [rsi+rbx*8-24] + mov r9, [rsi+rbx*8-8] + shld rdx, r8, cl + mov [rdi+rbx*8+8], rdx + mov rdx, r11 + mov r10, [rsi+rbx*8-16] + shld r8, r9, cl + shld r9, r10, cl + mov [rdi+rbx*8], r8 + mov [rdi+rbx*8-8], r9 + shld r10, r11, cl + sub rbx, 4 + mov [rdi+rbx*8+16], r10 + jns L_lp L_skiplp: - cmp rbx, -2 - ja L_case3 - je L_case2 - jp L_case1 + cmp rbx, -2 + ja L_case3 + je L_case2 + jp L_case1 ; ALIGN(16) L_case0: - shl rdx, cl - mov [rdi+rbx*8+8], rdx - jmp L_xit + shl rdx, cl + mov [rdi+rbx*8+8], rdx + jmp L_xit - xalign 16 + xalign 16 L_case3: - mov r8, [rsi+rbx*8] - mov r9, [rsi+rbx*8-8] - shld rdx, r8, cl - mov [rdi+rbx*8+8], rdx - mov r10, [rsi+rbx*8-16] - shld r8, r9, cl - shld r9, r10, cl - mov [rdi+rbx*8], r8 - mov [rdi+rbx*8-8], r9 - shl r10, cl - mov [rdi+rbx*8-16], r10 - jmp L_xit + mov r8, [rsi+rbx*8] + mov r9, [rsi+rbx*8-8] + shld rdx, r8, cl + mov [rdi+rbx*8+8], rdx + mov r10, [rsi+rbx*8-16] + shld r8, r9, cl + shld r9, r10, cl + mov [rdi+rbx*8], r8 + mov [rdi+rbx*8-8], r9 + shl r10, cl + mov [rdi+rbx*8-16], r10 + jmp L_xit - xalign 16 + xalign 16 L_case2: - mov r8, [rsi+rbx*8] - mov r9, [rsi+rbx*8-8] - shld rdx, r8, cl - mov [rdi+rbx*8+8], rdx - shld r8, r9, cl - shl r9, cl - mov [rdi+rbx*8], r8 - mov [rdi+rbx*8-8], r9 - jmp L_xit - - xalign 16 + mov r8, [rsi+rbx*8] + mov r9, [rsi+rbx*8-8] + shld rdx, r8, cl + mov [rdi+rbx*8+8], rdx + shld r8, r9, cl + shl r9, cl + mov [rdi+rbx*8], r8 + mov [rdi+rbx*8-8], r9 + jmp L_xit + + xalign 16 L_case1: - mov r8, [rsi+rbx*8] - shld rdx, r8, cl - mov [rdi+rbx*8+8], rdx - shl r8, cl - mov [rdi+rbx*8], r8 + mov r8, [rsi+rbx*8] + shld rdx, r8, cl + mov [rdi+rbx*8+8], rdx + shl r8, cl + mov [rdi+rbx*8], r8 L_xit: END_PROC reg_save_list