dnl *********************************************************************
dnl Intel64 mpn_addmul_1 -- Multiply a limb vector with a limb and
dnl add the result to a second limb vector.
dnl
dnl Copyright 2006 Jason Worth Martin <jason.worth.martin@gmail.com>
dnl
dnl This code is distributed under the terms of the GNU Lesser
dnl General Public License.
dnl
dnl This code is part of a patch for the GNU MP Library (GMP).
dnl However, this code is not maintained by the GMP group, so please
dnl do not bother them with questions/bugs/etc. related to this code.
dnl If you find a bug in this code, please contact Jason Worth Martin
dnl at jason.worth.martin@gmail.com
dnl
dnl This patch is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 2.1 of the License, or (at
dnl your option) any later version.
dnl
dnl This patch is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl License for more details.
dnl
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library; see the file LICENSE.TXT.  If not, write
dnl to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
dnl Boston, MA 02110-1301, USA.
dnl
dnl *********************************************************************
dnl
dnl
dnl CREDITS
dnl
dnl This code is based largely on Pierrick Gaudry's excellent assembly
dnl support for the AMD64 architecture.  (Note that Intel64 and AMD64,
dnl while using the same instruction set, have very different
dnl microarchitectures.  So, this code performs very poorly on AMD64
dnl machines even though it is near-optimal on Intel64.)
dnl
dnl Roger Golliver works for Intel and provided insightful improvements,
dnl particularly in using the "lea" instruction to perform additions
dnl and register-to-register moves.
dnl
dnl Eric Bainville has a brilliant exposition of optimizing arithmetic for
dnl AMD64 (http://www.bealto.it).  I adapted many of the ideas he
dnl describes to Intel64.
dnl
dnl Agner Fog is a demigod in the x86 world.  If you are reading assembly
dnl code files and you haven't heard of Agner Fog, then take a minute to
dnl look over his software optimization manuals (http://www.agner.org/).
dnl They are superb.
dnl

dnl With a 4-way unroll (UNROLL_EXPONENT = 2) the code has
dnl
dnl                      cycles/limb
dnl     Hammer:              4.8
dnl     Woodcrest:           4.6
dnl
dnl With increased unrolling, it appears to converge to 4 cycles/limb
dnl on Intel Core 2 machines.  I believe that this is optimal; however,
dnl it requires such absurd unrolling that it becomes unusable for all
dnl but the largest inputs.  A 4-way unroll seems like a good balance
dnl to me because then commonly used input sizes (e.g. 1024-bit public
dnl keys) still benefit from the speedup.

dnl
dnl This is just a check to see whether we are in my code-testing
dnl sandbox or actually in the GMP source tree.
dnl
ifdef(`__JWM_Test_Code__',`
include(`./config.m4')
define(`MPN_PREFIX',`jwm_mpn_')',`
include(`../config.m4')')

dnl *********************************************************************
dnl *********************************************************************
dnl
dnl Here are the important m4 parameters for the code
dnl
dnl BpL is Bytes per Limb (8, since this is 64-bit code)
dnl
dnl UNROLL_EXPONENT is the power of 2 by which we unroll the code.
dnl                 Possible values are 1,2,...,8.  A reasonable
dnl                 value is probably 2.  If really large inputs
dnl                 are expected, then 4 is probably good.  Larger
dnl                 values are really only useful for flashy
dnl                 benchmarks and testing asymptotic behavior.
dnl
dnl THRESHOLD is the minimum number of limbs needed before we bother
dnl           using the complicated loop.  A reasonable value is
dnl           2*UNROLL_SIZE + 6.
dnl
dnl *********************************************************************
dnl *********************************************************************
define(`BpL',`8')
define(`UNROLL_EXPONENT',`2')
define(`UNROLL_SIZE',eval(2**UNROLL_EXPONENT))
define(`UNROLL_MASK',eval(UNROLL_SIZE-1))
define(`THRESHOLD',eval(2*UNROLL_SIZE+6))
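dnl
dnl As a worked example (not extra configuration): with the default
dnl UNROLL_EXPONENT = 2 above, the derived constants are
dnl
dnl     UNROLL_SIZE = 2**2    = 4     limbs handled per unrolled pass
dnl     UNROLL_MASK = 4 - 1   = 3     reduces an index mod UNROLL_SIZE
dnl     THRESHOLD   = 2*4 + 6 = 14    below this the small loop is used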

dnl Here is a convenient macro for addressing
dnl memory.  Entries of the form
dnl
dnl     ADDR(ptr,index,displacement)
dnl
dnl get converted to
dnl
dnl     displacement*BpL(ptr,index,BpL)
dnl
define(`ADDR',`eval(`$3'*BpL)($1,$2,BpL)')
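dnl
dnl For instance, with BpL = 8 the call ADDR(rp,index,-2) expands to
dnl the AT&T-syntax operand
dnl
dnl     -16(rp,index,8)
dnl
dnl i.e. the limb two positions below rp[index].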


C Register  Usage
C --------  -----
C rax       low word from mul
C rbx*
C rcx       s2limb
C rdx       high word from mul
C rsi       s1p
C rdi       rp
C rbp*      Base Pointer
C rsp*      Stack Pointer
C r8        A_x
C r9        A_y
C r10       A_z
C r11       B_x
C r12*      B_y
C r13*      B_z
C r14*      temp
C r15*      index
C
C * indicates that the register must be
C   preserved for the caller.
define(`s2limb',`%rcx')
define(`s1p',`%rsi')
define(`rp',`%rdi')
define(`A_x',`%r8')
define(`A_y',`%r9')
define(`A_z',`%r10')
define(`B_x',`%r11')
define(`B_y',`%r12')
define(`B_z',`%r13')
define(`temp',`%r14')
define(`index',`%r15')

C INPUT PARAMETERS
C rp      rdi
C s1p     rsi
C n       rdx
C s2limb  rcx
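dnl
dnl As a reference for what the assembly below computes, here is a C
dnl sketch of mpn_addmul_1 (GMP's real prototype uses mp_ptr, mp_srcptr,
dnl mp_size_t and mp_limb_t; the limb_t typedef and __int128 usage here
dnl are just for a self-contained illustration):
dnl
dnl     typedef unsigned long long limb_t;            /* 64-bit limb */
dnl
dnl     limb_t addmul_1 (limb_t *rp, const limb_t *s1p,
dnl                      long n, limb_t s2limb)
dnl     {
dnl       limb_t carry = 0;
dnl       for (long i = 0; i < n; i++)
dnl         {
dnl           unsigned __int128 t = (unsigned __int128) s1p[i] * s2limb
dnl                                 + rp[i] + carry;
dnl           rp[i] = (limb_t) t;                     /* low 64 bits  */
dnl           carry = (limb_t) (t >> 64);             /* high 64 bits */
dnl         }
dnl       return carry;                               /* carry-out limb */
dnl     }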

ASM_START()
PROLOGUE(mpn_addmul_1)
	C Compare the limb count with the
	C threshold value.  If the limb
	C count is small, we just use the
	C small loop; otherwise we jump to
	C the more complicated loop.
	cmp	$THRESHOLD, %rdx
	jge	L_mpn_addmul_1_main_loop_prep
	movq	%rdx, %r11
	leaq	(%rsi,%rdx,8), %rsi	C point s1p past its last limb
	leaq	(%rdi,%rdx,8), %rdi	C point rp past its last limb
	negq	%r11			C loop counter runs from -n up to 0
	xorq	%r8, %r8		C carry limb, initially 0
	xorq	%r10, %r10		C permanent zero
	jmp	L_mpn_addmul_1_small_loop
	ALIGN(16)
L_mpn_addmul_1_small_loop:
	movq	(%rsi,%r11,8), %rax
	mulq	%rcx			C rdx:rax = s1p[i] * s2limb
	addq	(%rdi,%r11,8), %rax	C add in rp[i]
	adcq	%r10, %rdx
	addq	%r8, %rax		C add in the carry limb
	movq	%r10, %r8		C mov leaves CF intact
	movq	%rax, (%rdi,%r11,8)
	adcq	%rdx, %r8		C new carry limb
	incq	%r11
	jne	L_mpn_addmul_1_small_loop

	movq	%r8, %rax		C return the carry-out limb
	ret

L_mpn_addmul_1_main_loop_prep:
	pushq	%r15
	pushq	%r14
	pushq	%r13
	pushq	%r12
	C If n is even, we need to do three
	C pre-multiplies; if n is odd, we
	C only need to do two.
	movq	%rdx, temp
	movq	$0, index
	movq	$0, A_x
	movq	$0, A_y
	andq	$1, %rdx
	jnz	L_mpn_addmul_1_odd_n

	C Case n is even
	movq	ADDR(s1p,index,0), %rax
	mulq	s2limb
	addq	%rax, ADDR(rp,index,0)
	adcq	%rdx, A_x
	addq	$1, index
	C At this point
	C   temp  = n (even)
	C   index = 1

L_mpn_addmul_1_odd_n:
	C Now
	C   temp  = n
	C   index = 1 if n even
	C         = 0 if n odd
	C
	movq	ADDR(s1p,index,0), %rax
	mulq	s2limb
	movq	ADDR(rp,index,0), A_z
	addq	%rax, A_x
	adcq	%rdx, A_y

	movq	ADDR(s1p,index,1), %rax
	mulq	s2limb
	movq	ADDR(rp,index,1), B_z
	movq	%rax, B_x
	movq	%rdx, B_y
	movq	ADDR(s1p,index,2), %rax

	addq	$3, index
	leaq	ADDR(s1p,temp,-1), s1p
	leaq	ADDR(rp,temp,-1), rp
	negq	temp
	addq	temp, index
	C At this point:
	C   s1p   = address of last s1limb
	C   rp    = address of last rplimb
	C   temp  = -n
	C   index = 4 - n if n even
	C         = 3 - n if n odd
	C
	C So, index is a (negative) even
	C number.
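	C
	C For example, with THRESHOLD = 14: if n = 14 (even), three
	C limbs were multiplied above and index = 4 - 14 = -10; if
	C n = 15 (odd), two limbs were multiplied and
	C index = 3 - 15 = -12.  Either way index is even, so the
	C two-limb pipeline stages below stay aligned.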
	C
	C *****************************************
	C ATTENTION:
	C
	C From here on, I will use array
	C indexing notation in the comments
	C because it is convenient.  So, I
	C will pretend that index is positive,
	C because then a comment like
	C      B_z = rp[index-1]
	C is easier to read.
	C However, keep in mind that index is
	C actually a negative number indexing
	C back from the end of the array.
	C This is a common trick to remove one
	C compare operation from the main loop.
	C *****************************************

	C
	C Now we enter a spin-up loop that
	C will make sure that index is a
	C multiple of UNROLL_SIZE before
	C going to our main unrolled loop.
	movq	index, temp
	negq	temp
	andq	$UNROLL_MASK, temp
	jz	L_mpn_addmul_1_main_loop
	shrq	$1, temp		C each spin-up stage handles 2 limbs
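	C For example, with UNROLL_SIZE = 4: if index = -10, then
	C (-index) & UNROLL_MASK = 10 & 3 = 2, so temp = 2/2 = 1 and
	C the spin-up loop runs once, advancing index by 2 to -8,
	C a multiple of 4.  If index = -12, the masked value is 0
	C and we jump straight to the main loop.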
L_mpn_addmul_1_main_loop_spin_up:
	C At this point we should have:
	C
	C   A_x = low_mul[index-2] + carry_in
	C   A_y = high_mul[index-2] + CF
	C   A_z = rp[index-2]
	C
	C   B_x = low_mul[index-1]
	C   B_y = high_mul[index-1]
	C   B_z = rp[index-1]
	C
	C   rax = s1p[index]
	mulq	s2limb
	addq	A_x, A_z
	leaq	(%rax), A_x		C lea: reg-to-reg move, leaves CF intact
	movq	ADDR(s1p,index,1), %rax
	adcq	A_y, B_x
	movq	A_z, ADDR(rp,index,-2)
	movq	ADDR(rp,index,0), A_z
	adcq	$0, B_y
	leaq	(%rdx), A_y
	C At this point we should have:
	C
	C   B_x = low_mul[index-1] + carry_in
	C   B_y = high_mul[index-1] + CF
	C   B_z = rp[index-1]
	C
	C   A_x = low_mul[index]
	C   A_y = high_mul[index]
	C   A_z = rp[index]
	C
	C   rax = s1p[index+1]
	mulq	s2limb
	addq	B_x, B_z
	leaq	(%rax), B_x
	movq	ADDR(s1p,index,2), %rax
	adcq	B_y, A_x
	movq	B_z, ADDR(rp,index,-1)
	movq	ADDR(rp,index,1), B_z
	adcq	$0, A_y
	leaq	(%rdx), B_y

	addq	$2, index
	subq	$1, temp
	jnz	L_mpn_addmul_1_main_loop_spin_up

	jmp	L_mpn_addmul_1_main_loop
	ALIGN(16)
L_mpn_addmul_1_main_loop:
	C The code here is really the same
	C logic as the spin-up loop.  It's
	C just been unrolled.
forloop(`unroll_index', 0, eval(UNROLL_SIZE/2 - 1),`
	mulq	s2limb
	addq	A_x, A_z
	leaq	(%rax), A_x
	movq	ADDR(s1p,index,eval(2*unroll_index+1)), %rax
	adcq	A_y, B_x
	movq	A_z, ADDR(rp,index,eval(2*unroll_index-2))
	movq	ADDR(rp,index,eval(2*unroll_index)), A_z
	adcq	$0, B_y
	leaq	(%rdx), A_y

	mulq	s2limb
	addq	B_x, B_z
	leaq	(%rax), B_x
	movq	ADDR(s1p,index,eval(2*unroll_index+2)), %rax
	adcq	B_y, A_x
	movq	B_z, ADDR(rp,index,eval(2*unroll_index-1))
	movq	ADDR(rp,index,eval(2*unroll_index+1)), B_z
	adcq	$0, A_y
	leaq	(%rdx), B_y
')

	addq	$UNROLL_SIZE, index
	jnz	L_mpn_addmul_1_main_loop
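
	C With the default UNROLL_EXPONENT = 2, the forloop above emits
	C its body for unroll_index = 0 and 1, so each trip through the
	C main loop retires 4 limbs with a single taken branch.  The
	C first store of the unroll_index = 0 copy goes to rp[index-2],
	C and the displacements advance by 2 in each successive copy.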

L_mpn_addmul_1_finish:
	C At this point we should have:
	C
	C   index = n-1
	C
	C   A_x = low_mul[index-2] + carry_in
	C   A_y = high_mul[index-2] + CF
	C   A_z = rp[index-2]
	C
	C   B_x = low_mul[index-1]
	C   B_y = high_mul[index-1]
	C   B_z = rp[index-1]
	C
	C   rax = s1p[index]
	mulq	s2limb
	addq	A_x, A_z
	movq	%rax, A_x
	movq	A_z, ADDR(rp,index,-2)
	movq	ADDR(rp,index,0), A_z
	adcq	A_y, B_x
	adcq	$0, B_y
	movq	%rdx, A_y
	C At this point we should have:
	C
	C   index = n-1
	C
	C   B_x = low_mul[index-1] + carry_in
	C   B_y = high_mul[index-1] + CF
	C   B_z = rp[index-1]
	C
	C   A_x = low_mul[index]
	C   A_y = high_mul[index]
	C   A_z = rp[index]
	addq	B_x, B_z
	movq	B_z, ADDR(rp,index,-1)
	adcq	B_y, A_x
	adcq	$0, A_y
	C At this point we should have:
	C
	C   index = n-1
	C
	C   A_x = low_mul[index] + carry_in
	C   A_y = high_mul[index] + CF
	C   A_z = rp[index]
	C
	addq	A_x, A_z
	movq	A_z, ADDR(rp,index,0)
	adcq	$0, A_y

	movq	A_y, %rax		C return the carry-out limb
	popq	%r12			C restore callee-saved registers
	popq	%r13
	popq	%r14
	popq	%r15
	ret
EPILOGUE()