dnl *********************************************************************
dnl Intel64 mpn_addmul_1 -- Multiply a limb vector with a limb and
dnl add the result to a second limb vector.
dnl
dnl Copyright 2006 Jason Worth Martin
dnl
dnl This code is distributed under the terms of the GNU Lesser
dnl General Public License.
dnl
dnl This code is part of a patch for the GNU MP Library (GMP).
dnl However, this code is not maintained by the GMP group, so please
dnl do not bother them with questions/bugs/etc. related to this code.
dnl If you find a bug in this code, please contact Jason Worth Martin
dnl at jason.worth.martin@gmail.com
dnl
dnl This patch is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 2.1 of the License, or (at
dnl your option) any later version.
dnl
dnl This patch is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl License for more details.
dnl
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library; see the file LICENSE.TXT.  If not, write
dnl to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
dnl Boston, MA 02110-1301, USA.
dnl
dnl *********************************************************************
dnl
dnl
dnl CREDITS
dnl
dnl This code is based largely on Pierrick Gaudry's excellent assembly
dnl support for the AMD64 architecture.  (Note that Intel64 and AMD64,
dnl while using the same instruction set, have very different
dnl microarchitectures.  So, this code performs very poorly on AMD64
dnl machines even though it is near-optimal on Intel64.)
dnl
dnl Roger Golliver works for Intel and provided insightful improvements,
dnl particularly in using the "lea" instruction to perform additions
dnl and register-to-register moves.
dnl
dnl Eric Bainville has a brilliant exposition of optimizing arithmetic for
dnl AMD64 (http://www.bealto.it).  I adapted many of the ideas he
dnl describes to Intel64.
dnl
dnl Agner Fog is a demigod in the x86 world.  If you are reading assembly
dnl code files and you haven't heard of Agner Fog, then take a minute to
dnl look over his software optimization manuals (http://www.agner.org/).
dnl They are superb.
dnl
dnl With a 4-way unroll (UNROLL_EXPONENT = 2) the code has
dnl
dnl                  cycles/limb
dnl     Hammer:          4.8
dnl     Woodcrest:       4.6
dnl
dnl With increased unrolling, it appears to converge to 4 cycles/limb
dnl on Intel Core 2 machines.  I believe that this is optimal; however,
dnl it requires such absurd unrolling that it becomes unusable for all
dnl but the largest inputs.  A 4-way unroll seems like a good balance
dnl to me because then commonly used input sizes (e.g. 1024-bit public
dnl keys) still benefit from the speedup.
dnl
dnl This is just a check to see if we are in my code testing sandbox
dnl or if we are actually in the GMP source tree.
dnl
ifdef(`__JWM_Test_Code__',`
include(`./config.m4')
define(`MPN_PREFIX',`jwm_mpn_')',`
include(`../config.m4')')
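dnl
dnl For reference, here is a minimal C sketch of the operation this file
dnl implements (an illustration only, not GMP's actual C code; the name
dnl ref_mpn_addmul_1 is hypothetical, and the sketch assumes gmp.h's
dnl mp_limb_t/mp_ptr/mp_srcptr/mp_size_t types plus a compiler that
dnl provides unsigned __int128):
dnl
dnl     #include <gmp.h>
dnl
dnl     mp_limb_t
dnl     ref_mpn_addmul_1 (mp_ptr rp, mp_srcptr s1p, mp_size_t n, mp_limb_t s2limb)
dnl     {
dnl       mp_limb_t carry = 0;
dnl       for (mp_size_t i = 0; i < n; i++)
dnl         {
dnl           /* 64x64 -> 128 bit multiply, then add rp[i] and the carry. */
dnl           unsigned __int128 t = (unsigned __int128) s1p[i] * s2limb
dnl                                 + rp[i] + carry;
dnl           rp[i] = (mp_limb_t) t;
dnl           carry = (mp_limb_t) (t >> 64);
dnl         }
dnl       return carry;  /* high limb that does not fit in rp[] */
dnl     }
dnl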
dnl *********************************************************************
dnl *********************************************************************
dnl
dnl Here are the important m4 parameters for the code
dnl
dnl BpL is Bytes per Limb (8 since this is 64-bit code)
dnl
dnl UNROLL_EXPONENT is the base-2 logarithm of the unroll factor
dnl                 (the code is unrolled 2^UNROLL_EXPONENT ways).
dnl                 Possible values are 1,2,...,8.  A reasonable
dnl                 value is probably 2.  If really large inputs
dnl                 are expected, then 4 is probably good.  Larger
dnl                 values are really only useful for flashy
dnl                 benchmarks and testing asymptotic behavior.
dnl
dnl THRESHOLD is the minimum number of limbs needed before we bother
dnl           using the complicated loop.  A reasonable value is
dnl           2*UNROLL_SIZE + 6
dnl
dnl *********************************************************************
dnl *********************************************************************
define(`BpL',`8')
define(`UNROLL_EXPONENT',`2')
define(`UNROLL_SIZE',eval(2**UNROLL_EXPONENT))
define(`UNROLL_MASK',eval(UNROLL_SIZE-1))
define(`THRESHOLD',eval(2*UNROLL_SIZE+6))

dnl Here is a convenient macro for addressing
dnl memory.  Entries of the form
dnl
dnl      ADDR(ptr,index,displacement)
dnl
dnl get converted to
dnl
dnl      displacement*BpL(ptr,index,BpL)
dnl
define(`ADDR',`eval(`$3'*BpL)($1,$2,BpL)')

C Register   Usage
C --------   -----
C rax        low word from mul
C rbx*
C rcx        s2limb
C rdx        high word from mul
C rsi        s1p
C rdi        rp
C rbp*       Base Pointer
C rsp*       Stack Pointer
C r8         A_x
C r9         A_y
C r10        A_z
C r11        B_x
C r12*       B_y
C r13*       B_z
C r14*       temp
C r15*       index
C
C  * indicates that the register must be
C    preserved for the caller.
define(`s2limb',`%rcx')
define(`s1p',`%rsi')
define(`rp',`%rdi')
define(`A_x',`%r8')
define(`A_y',`%r9')
define(`A_z',`%r10')
define(`B_x',`%r11')
define(`B_y',`%r12')
define(`B_z',`%r13')
define(`temp',`%r14')
define(`index',`%r15')

C INPUT PARAMETERS
C rp      rdi
C s1p     rsi
C n       rdx
C s2limb  rcx

ASM_START()
PROLOGUE(mpn_addmul_1)
        C Compare the limb count
        C with the threshold value.
        C If the limb count is small
        C we just use the small loop,
        C otherwise we jump to the
        C more complicated loop.
        cmp     $THRESHOLD, %rdx
        jge     L_mpn_addmul_1_main_loop_prep
        movq    %rdx, %r11
        leaq    (%rsi,%rdx,8), %rsi
        leaq    (%rdi,%rdx,8), %rdi
        negq    %r11
        xorq    %r8, %r8
        xorq    %r10, %r10
        jmp     L_mpn_addmul_1_small_loop
        ALIGN(16)
L_mpn_addmul_1_small_loop:
        movq    (%rsi,%r11,8), %rax
        mulq    %rcx
        addq    (%rdi,%r11,8), %rax
        adcq    %r10, %rdx
        addq    %r8, %rax
        movq    %r10, %r8
        movq    %rax, (%rdi,%r11,8)
        adcq    %rdx, %r8
        incq    %r11
        jne     L_mpn_addmul_1_small_loop
        movq    %r8, %rax
        ret

L_mpn_addmul_1_main_loop_prep:
        pushq   %r15
        pushq   %r14
        pushq   %r13
        pushq   %r12
        C If n is even, we need to do three
        C pre-multiplies; if n is odd, we only
        C need to do two.
        movq    %rdx, temp
        movq    $0, index
        movq    $0, A_x
        movq    $0, A_y
        andq    $1, %rdx
        jnz     L_mpn_addmul_1_odd_n
        C Case n is even
        movq    ADDR(s1p,index,0), %rax
        mulq    s2limb
        addq    %rax, ADDR(rp,index,0)
        adcq    %rdx, A_x
        addq    $1, index
        C At this point
        C temp  = n (even)
        C index = 1
L_mpn_addmul_1_odd_n:
        C Now
        C temp  = n
        C index = 1 if n even
        C       = 0 if n odd
        C
        movq    ADDR(s1p,index,0), %rax
        mulq    s2limb
        movq    ADDR(rp,index,0), A_z
        addq    %rax, A_x
        adcq    %rdx, A_y
        movq    ADDR(s1p,index,1), %rax
        mulq    s2limb
        movq    ADDR(rp,index,1), B_z
        movq    %rax, B_x
        movq    %rdx, B_y
        movq    ADDR(s1p,index,2), %rax
        addq    $3, index
        leaq    ADDR(s1p,temp,-1), s1p
        leaq    ADDR(rp,temp,-1), rp
        negq    temp
        addq    temp, index
        C At this point:
        C s1p   = address of last s1limb
        C rp    = address of last rplimb
        C temp  = -n
        C index = 4 - n if n even
        C       = 3 - n if n odd
        C
        C So, index is a (negative) even
        C number.
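        C
        C (Worked example, added for illustration: with UNROLL_SIZE = 4,
        C n = 16 runs the even-n pre-multiply, so index = 4 - 16 = -12,
        C already a multiple of UNROLL_SIZE; n = 17 skips it, giving
        C index = 3 - 17 = -14, and the spin-up loop below then advances
        C index by 2 once to reach -12.)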
        C
        C *****************************************
        C ATTENTION:
        C
        C From here on, I will use array
        C indexing notation in the comments
        C because it is convenient.  So, I
        C will pretend that index is positive,
        C because then a comment like
        C      B_z = rp[index-1]
        C is easier to read.
        C However, keep in mind that index is
        C actually a negative number indexing
        C back from the end of the array.
        C This is a common trick to remove one
        C compare operation from the main loop.
        C *****************************************
        C
        C Now we enter a spin-up loop that
        C will make sure that index is
        C a multiple of UNROLL_SIZE before
        C going to our main unrolled loop.
        movq    index, temp
        negq    temp
        andq    $UNROLL_MASK, temp
        jz      L_mpn_addmul_1_main_loop
        shrq    $1, temp
L_mpn_addmul_1_main_loop_spin_up:
        C At this point we should have:
        C
        C A_x = low_mul[index-2] + carry_in
        C A_y = high_mul[index-2] + CF
        C A_z = rp[index-2]
        C
        C B_x = low_mul[index-1]
        C B_y = high_mul[index-1]
        C B_z = rp[index-1]
        C
        C rax = s1p[index]
        mulq    s2limb
        addq    A_x, A_z
        leaq    (%rax), A_x
        movq    ADDR(s1p,index,1), %rax
        adcq    A_y, B_x
        movq    A_z, ADDR(rp,index,-2)
        movq    ADDR(rp,index,0), A_z
        adcq    $0, B_y
        leaq    (%rdx), A_y
        C At this point we should have:
        C
        C B_x = low_mul[index-1] + carry_in
        C B_y = high_mul[index-1] + CF
        C B_z = rp[index-1]
        C
        C A_x = low_mul[index]
        C A_y = high_mul[index]
        C A_z = rp[index]
        C
        C rax = s1p[index+1]
        mulq    s2limb
        addq    B_x, B_z
        leaq    (%rax), B_x
        movq    ADDR(s1p,index,2), %rax
        adcq    B_y, A_x
        movq    B_z, ADDR(rp,index,-1)
        movq    ADDR(rp,index,1), B_z
        adcq    $0, A_y
        leaq    (%rdx), B_y
        addq    $2, index
        subq    $1, temp
        jnz     L_mpn_addmul_1_main_loop_spin_up
        jmp     L_mpn_addmul_1_main_loop
        ALIGN(16)
L_mpn_addmul_1_main_loop:
        C The code here is really the same
        C logic as the spin-up loop.  It's
        C just been unrolled.
forloop(`unroll_index', 0, eval(UNROLL_SIZE/2 - 1),`
        mulq    s2limb
        addq    A_x, A_z
        leaq    (%rax), A_x
        movq    ADDR(s1p,index,eval(2*unroll_index+1)), %rax
        adcq    A_y, B_x
        movq    A_z, ADDR(rp,index,eval(2*unroll_index-2))
        movq    ADDR(rp,index,eval(2*unroll_index)), A_z
        adcq    $0, B_y
        leaq    (%rdx), A_y

        mulq    s2limb
        addq    B_x, B_z
        leaq    (%rax), B_x
        movq    ADDR(s1p,index,eval(2*unroll_index+2)), %rax
        adcq    B_y, A_x
        movq    B_z, ADDR(rp,index,eval(2*unroll_index-1))
        movq    ADDR(rp,index,eval(2*unroll_index+1)), B_z
        adcq    $0, A_y
        leaq    (%rdx), B_y
')
        addq    $UNROLL_SIZE, index
        jnz     L_mpn_addmul_1_main_loop

L_mpn_addmul_1_finish:
        C At this point we should have:
        C
        C index = n-1
        C
        C A_x = low_mul[index-2] + carry_in
        C A_y = high_mul[index-2] + CF
        C A_z = rp[index-2]
        C
        C B_x = low_mul[index-1]
        C B_y = high_mul[index-1]
        C B_z = rp[index-1]
        C
        C rax = s1p[index]
        mulq    s2limb
        addq    A_x, A_z
        movq    %rax, A_x
        movq    A_z, ADDR(rp,index,-2)
        movq    ADDR(rp,index,0), A_z
        adcq    A_y, B_x
        adcq    $0, B_y
        movq    %rdx, A_y
        C At this point we should have:
        C
        C index = n-1
        C
        C B_x = low_mul[index-1] + carry_in
        C B_y = high_mul[index-1] + CF
        C B_z = rp[index-1]
        C
        C A_x = low_mul[index]
        C A_y = high_mul[index]
        C A_z = rp[index]
        addq    B_x, B_z
        movq    B_z, ADDR(rp,index,-1)
        adcq    B_y, A_x
        adcq    $0, A_y
        C At this point we should have:
        C
        C index = n-1
        C
        C A_x = low_mul[index] + carry_in
        C A_y = high_mul[index] + CF
        C A_z = rp[index]
        C
        addq    A_x, A_z
        movq    A_z, ADDR(rp,index,0)
        adcq    $0, A_y
        movq    A_y, %rax
        popq    %r12
        popq    %r13
        popq    %r14
        popq    %r15
        ret
EPILOGUE()
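dnl
dnl Usage sketch (added for illustration; it assumes only GMP's
dnl documented mpn interface, nothing specific to this file):
dnl
dnl     /* {rp, n} += {s1p, n} * s2limb.  The returned value is the
dnl        carry limb that does not fit in rp[] (left in %rax above). */
dnl     mp_limb_t carry = mpn_addmul_1 (rp, s1p, n, s2limb);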