extend add_n/sub_n assembler code for Skylake AVX to provide add_nc/sub_nc

This commit is contained in:
Brian Gladman 2017-02-09 12:26:38 +00:00
parent bd83f26a70
commit b961c90963
2 changed files with 26 additions and 12 deletions

View File

@ -37,6 +37,16 @@
%define SizeB R9B ; check if this fits to code alignment!
%define Count R11
; mpn_add_nc: alternative entry point that reads a caller-supplied value
; into CarryD, sets up Count, and then joins the code at label One.
align 32
LEAF_PROC mpn_add_nc
mov CarryD, [rsp+40] ; 32-bit load from the stack slot at rsp+40 - presumably the fifth (carry-in) argument under the Windows x64 ABI; TODO confirm
mov Count, Size
shr Count, 3 ; Count = Size >> 3
inc Count ; Count = (Size >> 3) + 1
vpor YMM0, YMM0, YMM0 ; see comment in main loop below
jmp One ; One is referenced as a non-local label (no leading dot) from this procedure
align 32
LEAF_PROC mpn_add_n
@ -44,18 +54,16 @@
mov Count, Size
shr Count, 3
inc Count
vpor YMM0, YMM0, YMM0 ; see comment in main loop below
; unrolling the loop from small to large gives better timings
; when considering all sizes from 1 to 100 limbs
.One:
One:
test SizeB, 1
je .Two
mov Limb0, [byte Op1] ; I am using implicit code alignment through-
add Limb0, [byte Op2] ; out the following to get all branch targets
shr CarryB, 1
mov Limb0, [Op1] ; I am using implicit code alignment through-
adc Limb0, [Op2] ; out the following to get all branch targets
mov [Op3], Limb0 ; on 16 byte alignments - check this if non-
setc CarryB ; Linux register allocation is used!

View File

@ -38,24 +38,30 @@
%define Count R11
; mpn_sub_nc: alternative entry point that reads a caller-supplied value
; into BorrowD, sets up Count, and then joins the code at label One.
align 32
LEAF_PROC mpn_sub_nc
mov BorrowD, [rsp+40] ; 32-bit load from the stack slot at rsp+40 - presumably the fifth (borrow-in) argument under the Windows x64 ABI; TODO confirm
mov Count, Size
shr Count, 3 ; Count = Size >> 3
inc Count ; Count = (Size >> 3) + 1
vpor YMM0, YMM0, YMM0 ; see comment in main loop below
jmp One ; One is referenced as a non-local label (no leading dot) from this procedure
align 32
LEAF_PROC mpn_sub_n
xor BorrowD, BorrowD
mov Count, Size
shr Count, 3
inc Count
vpor YMM0, YMM0, YMM0 ; see comment in main loop below
; unrolling the loop from small to large gives better timings
; when considering all sizes from 1 to 100 limbs
.One:
One:
test SizeB, 1
je .Two
mov Limb0, [byte Op1] ; I am using implicit code alignment through-
sub Limb0, [byte Op2] ; out the following to get all branch targets
shr BorrowB, 1
mov Limb0, [Op1] ; I am using implicit code alignment through-
sbb Limb0, [Op2] ; out the following to get all branch targets
mov [Op3], Limb0 ; on 16 byte alignments - check this if non-
setc BorrowB ; Linux register allocation is used!