diff --git a/mpn/x86_64w/skylake/avx/add_n.asm b/mpn/x86_64w/skylake/avx/add_n.asm
index 0f7895de..77d155fc 100644
--- a/mpn/x86_64w/skylake/avx/add_n.asm
+++ b/mpn/x86_64w/skylake/avx/add_n.asm
@@ -37,6 +37,16 @@
     %define SizeB R9B ; check if this fits to code alignment!
     %define Count R11
+    align 32
+
+    LEAF_PROC mpn_add_nc
+    mov CarryD, [rsp+40] ; entry with carry-in: the carry is read from the stack; presumably the 5th argument under the Microsoft x64 convention (8-byte return address plus 32-byte shadow space), TODO confirm
+    mov Count, Size
+    shr Count, 3 ; Count = Size >> 3
+    inc Count ; Count = (Size >> 3) + 1, the same setup mpn_add_n performs
+    vpor YMM0, YMM0, YMM0 ; see comment in main loop below; NOTE(review): mpn_add_n no longer executes this vpor before One, confirm that is intended
+    jmp One ; continue in the body shared with mpn_add_n
+
     align 32
     LEAF_PROC mpn_add_n
@@ -44,18 +54,16 @@
     mov Count, Size
     shr Count, 3
     inc Count
-    vpor YMM0, YMM0, YMM0 ; see comment in main loop below
     ; unrolling the loop from small to high gives better timings
     ; when considering all sizes 1-100 limb
-    .One:
-
+    One: ; non-local label so that mpn_add_nc can jump here; local labels referenced below (e.g. .Two) now attach to One
     test SizeB, 1
     je .Two
-
-    mov Limb0, [byte Op1] ; I am using implicit code alignment through-
-    add Limb0, [byte Op2] ; out the following to get all branch targets
+    shr CarryB, 1 ; CF = bit 0 of the carry byte, consumed by the adc below
+    mov Limb0, [Op1] ; I am using implicit code alignment through-
+    adc Limb0, [Op2] ; out the following to get all branch targets
     mov [Op3], Limb0 ; on 16 byte alignments - check this if non-
     setc CarryB ; Linux register allocation is used!
diff --git a/mpn/x86_64w/skylake/avx/sub_n.asm b/mpn/x86_64w/skylake/avx/sub_n.asm
index 6220ccf7..a863f579 100644
--- a/mpn/x86_64w/skylake/avx/sub_n.asm
+++ b/mpn/x86_64w/skylake/avx/sub_n.asm
@@ -38,24 +38,30 @@
     %define Count R11
     align 32
+    LEAF_PROC mpn_sub_nc
+    mov BorrowD, [rsp+40] ; entry with borrow-in: the borrow is read from the stack; presumably the 5th argument under the Microsoft x64 convention (8-byte return address plus 32-byte shadow space), TODO confirm
+    mov Count, Size
+    shr Count, 3 ; Count = Size >> 3
+    inc Count ; Count = (Size >> 3) + 1, the same setup mpn_sub_n performs
+    vpor YMM0, YMM0, YMM0 ; see comment in main loop below; NOTE(review): mpn_sub_n no longer executes this vpor before One, confirm that is intended
+    jmp One ; continue in the body shared with mpn_sub_n
+
+    align 32
     LEAF_PROC mpn_sub_n
     xor BorrowD, BorrowD
     mov Count, Size
     shr Count, 3
     inc Count
-    vpor YMM0, YMM0, YMM0 ; see comment in main loop below
     ; unrolling the loop from small to high gives better timings
     ; when considering all sizes 1-100 limb
-    .One:
-
+    One: ; non-local label so that mpn_sub_nc can jump here; local labels referenced below (e.g. .Two) now attach to One
     test SizeB, 1
     je .Two
-
-    mov Limb0, [byte Op1] ; I am using implicit code alignment through-
-    sub Limb0, [byte Op2] ; out the following to get all branch targets
+    shr BorrowB, 1 ; CF = bit 0 of the borrow byte (zeroed by the xor on the mpn_sub_n path), consumed by the sbb below
+    mov Limb0, [Op1] ; I am using implicit code alignment through-
+    sbb Limb0, [Op2] ; out the following to get all branch targets
     mov [Op3], Limb0 ; on 16 byte alignments - check this if non-
     setc BorrowB ; Linux register allocation is used!