; **************************************************************************
; Intel64 mpn_submul_1 -- Multiply a limb vector with a limb and
; subtract the result from a second limb vector.
;
; Copyright (C) 2006 Jason Worth Martin
;
; This program is free software; you can redistribute it and/or modify
; it under the terms of the GNU Lesser General Public License as published
; by the Free Software Foundation; either version 2 of the License, or
; (at your option) any later version.
;
; This program is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; GNU General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public License
; along with this program; if not, write to the Free Software Foundation,
; Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
;
; **************************************************************************
;
;
; CREDITS
;
; This code is based largely on Pierrick Gaudry's excellent assembly
; support for the AMD64 architecture.  (Note that Intel64 and AMD64,
; while using the same instruction set, have very different
; microarchitectures.  So, this code performs very poorly on AMD64
; machines even though it is near-optimal on Intel64.)
;
; Roger Golliver works for Intel and provided insightful improvements,
; particularly in using the "lea" instruction to perform additions
; and register-to-register moves.
;
; Eric Bainville has a brilliant exposition of optimizing arithmetic for
; AMD64 (http://www.bealto.it).  I adapted many of the ideas he
; describes to Intel64.
;
; Agner Fog is a demigod in the x86 world.  If you are reading assembly
; code files and you haven't heard of Agner Fog, then take a minute to
; look over his software optimization manuals (http://www.agner.org/).
; They are superb.
;
; *********************************************************************
; With a 4-way unroll the code has
;
;                cycles/limb
;   Hammer:          4.6
;   Woodcrest:       4.6
;
; With increased unrolling, it appears to converge to 4 cycles/limb
; on Intel Core 2 machines.  I believe that this is optimal; however,
; it requires such absurd unrolling that it becomes unusable for all
; but the largest inputs.  A 4-way unroll seems like a good balance
; to me because then commonly used input sizes (e.g. 1024-bit public
; keys) still benefit from the speed-up.
;
; This is just a check to see if we are in my code testing sandbox
; or if we are actually in the GMP source tree.
;
%include 'yasm_mac.inc'
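; *********************************************************************
; For reference, the operation this routine performs corresponds to the
; following C sketch.  This is an illustrative reference only, not part
; of the original source: the names ref_submul_1 and limb_t are made up
; for the sketch, and it assumes a compiler providing unsigned __int128
; (e.g. GCC or Clang on x86-64) for the 64x64 -> 128 bit multiply.
;
;   typedef unsigned long long limb_t;            /* one 64-bit limb      */
;
;   limb_t ref_submul_1(limb_t *rp, const limb_t *s1p,
;                       long n, limb_t s2limb)
;   {
;       limb_t carry = 0;
;       for (long i = 0; i < n; i++) {
;           unsigned __int128 p = (unsigned __int128)s1p[i] * s2limb;
;           limb_t lo = (limb_t)p + carry;        /* low product + carry  */
;           limb_t hi = (limb_t)(p >> 64) + (lo < carry);
;           limb_t r  = rp[i] - lo;
;           hi += (rp[i] < lo);                   /* borrow from the sub  */
;           rp[i] = r;
;           carry = hi;
;       }
;       return carry;               /* the value this code leaves in rax  */
;   }
; *********************************************************************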
; *********************************************************************
; *********************************************************************
;
; Here are the important macro parameters for the code:
;
; BpL is Bytes per Limb (8, since this is 64-bit code).
;
; UNROLL_SIZE is the power of 2 by which we unroll the code.
;             Possible values are 2, 4, 8, 16, ..., 256.  A reasonable
;             value is probably 4.  If really large inputs
;             are expected, then 16 is probably good.  Larger
;             values are really only useful for flashy
;             benchmarks and testing asymptotic behavior.
;
; THRESHOLD is the minimum number of limbs needed before we bother
;           using the complicated loop.  A reasonable value is
;           2*UNROLL_SIZE + 6.
;
; *********************************************************************
; *********************************************************************
%define BpL         8
%define UNROLL_SIZE 4
%define UNROLL_MASK UNROLL_SIZE-1
%define THRESHOLD   2*UNROLL_SIZE+6

; Here is a convenient macro for addressing
; memory.  Entries of the form
;
;   ADDR(ptr,index,displacement)
;
; get converted to
;
;   [displacement*BpL + ptr + index*BpL]
;
%define ADDR(a,b,c) [c*BpL+a+b*BpL]

; Register   Usage
; --------   -----
; rax        low word from mul
; rbx*
; rcx        s2limb
; rdx        high word from mul
; rsi        s1p
; rdi        rp
; rbp*       Base Pointer
; rsp*       Stack Pointer
; r8         A_x
; r9         A_y
; r10        A_z
; r11        B_x
; r12*       B_y
; r13*       B_z
; r14*       temp
; r15*       index
;
; * indicates that the register must be
;   preserved for the caller.
%define s2limb  rcx
%define s1p     rsi
%define rp      rdi
%define A_x     r8
%define A_y     r9
%define A_z     r10
%define B_x     r11
%define B_y     r12
%define B_z     r13
%define temp    r14
%define index   r15

; INPUT PARAMETERS
; rp        rdi
; s1p       rsi
; n         rdx
; s2limb    rcx

BITS 64

GLOBAL_FUNC mpn_submul_1
; Compare the limb count
; with the threshold value.
; If the limb count is small
; we just use the small loop,
; otherwise we jump to the
; more complicated loop.
    cmp     rdx, THRESHOLD
    jge     L_mpn_submul_1_main_loop_prep
    mov     r11, rdx
    lea     rsi, [rsi+rdx*8]
    lea     rdi, [rdi+rdx*8]
    neg     r11
    xor     r8, r8
    xor     r10, r10
    jmp     L_mpn_submul_1_small_loop
    align   16
L_mpn_submul_1_small_loop:
    mov     rax, [rsi+r11*8]
    mul     rcx
    add     rax, r8
    adc     rdx, r10
    sub     [rdi+r11*8], rax
    mov     r8, r10
    adc     r8, rdx
    inc     r11
    jne     L_mpn_submul_1_small_loop
    mov     rax, r8
    ret
L_mpn_submul_1_main_loop_prep:
    push    r15
    push    r14
    push    r13
    push    r12
; If n is even, we need to do three
; pre-multiplies; if n is odd, we only
; need to do two.
    mov     temp, rdx
    mov     index, 0
    mov     A_x, 0
    mov     A_y, 0
    and     rdx, 1
    jnz     L_mpn_submul_1_odd_n
; Case n is even
    mov     rax, ADDR(s1p,index,0)
    mul     s2limb
    sub     ADDR(rp,index,0), rax
    adc     A_x, rdx
    add     index, 1
; At this point
;   temp  = n (even)
;   index = 1
L_mpn_submul_1_odd_n:
; Now
;   temp  = n
;   index = 1 if n even
;         = 0 if n odd
;
    mov     rax, ADDR(s1p,index,0)
    mul     s2limb
    mov     A_z, ADDR(rp,index,0)
    add     A_x, rax
    adc     A_y, rdx
    mov     rax, ADDR(s1p,index,1)
    mul     s2limb
    mov     B_z, ADDR(rp,index,1)
    mov     B_x, rax
    mov     B_y, rdx
    mov     rax, ADDR(s1p,index,2)
    add     index, 3
    lea     s1p, ADDR(s1p,temp,-1)
    lea     rp, ADDR(rp,temp,-1)
    neg     temp
    add     index, temp
; At this point:
;   s1p   = address of last s1limb
;   rp    = address of last rplimb
;   temp  = -n
;   index = 4 - n if n even
;         = 3 - n if n odd
;
; So, index is a (negative) even
; number.
;
; *****************************************
; ATTENTION:
;
; From here on, I will use array
; indexing notation in the comments
; because it is convenient.  So, I
; will pretend that index is positive
; because then a comment like
;     B_z = rp[index-1]
; is easier to read.
; However, keep in mind that index is
; actually a negative number indexing
; back from the end of the array.
; This is a common trick to remove one
; compare operation from the main loop.
; *****************************************
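;
; Illustration (not part of the original source): the small loop above
; already uses this idiom with r11.  In C-like terms, where use() is
; just a placeholder for the per-limb work:
;
;   /* ptr points one past the end of the array; i counts up from -n. */
;   /* The loop condition reuses the flags set by the increment, so   */
;   /* no separate "cmp" is needed ("inc r11 / jne" in the code).     */
;   long i = -n;
;   do {
;       use(ptr[i]);              /* ptr[i] is element n + i          */
;       i++;
;   } while (i != 0);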
;
; Now we enter a spin-up loop that
; will make sure that the index is
; a multiple of UNROLL_SIZE before
; going to our main unrolled loop.
    mov     temp, index
    neg     temp
    and     temp, UNROLL_MASK
    jz      L_mpn_submul_1_main_loop
    shr     temp, 1
L_mpn_submul_1_main_loop_spin_up:
; At this point we should have:
;
;   A_x = low_mul[index-2] + carry_in
;   A_y = high_mul[index-2] + CF
;   A_z = rp[index-2]
;
;   B_x = low_mul[index-1]
;   B_y = high_mul[index-1]
;   B_z = rp[index-1]
;
;   rax = s1p[index]
    mul     s2limb
    sub     A_z, A_x
    lea     A_x, [rax]
    mov     rax, ADDR(s1p,index,1)
    adc     B_x, A_y
    mov     ADDR(rp,index,-2), A_z
    mov     A_z, ADDR(rp,index,0)
    adc     B_y, 0
    lea     A_y, [rdx]
; At this point we should have:
;
;   B_x = low_mul[index-1] + carry_in
;   B_y = high_mul[index-1] + CF
;   B_z = rp[index-1]
;
;   A_x = low_mul[index]
;   A_y = high_mul[index]
;   A_z = rp[index]
;
;   rax = s1p[index+1]
    mul     s2limb
    sub     B_z, B_x
    lea     B_x, [rax]
    mov     rax, ADDR(s1p,index,2)
    adc     A_x, B_y
    mov     ADDR(rp,index,-1), B_z
    mov     B_z, ADDR(rp,index,1)
    adc     A_y, 0
    lea     B_y, [rdx]
    add     index, 2
    sub     temp, 1
    jnz     L_mpn_submul_1_main_loop_spin_up
    jmp     L_mpn_submul_1_main_loop
    align   16
L_mpn_submul_1_main_loop:
; The code here is really the same
; logic as the spin-up loop.  It's
; just been unrolled.
%assign unroll_index 0
%rep UNROLL_SIZE/2
    mul     s2limb
    sub     A_z, A_x
    lea     A_x, [rax]
    mov     rax, ADDR(s1p,index,(2*unroll_index+1))
    adc     B_x, A_y
    mov     ADDR(rp,index,(2*unroll_index-2)), A_z
    mov     A_z, ADDR(rp,index,(2*unroll_index))
    adc     B_y, 0
    lea     A_y, [rdx]

    mul     s2limb
    sub     B_z, B_x
    lea     B_x, [rax]
    mov     rax, ADDR(s1p,index,(2*unroll_index+2))
    adc     A_x, B_y
    mov     ADDR(rp,index,(2*unroll_index-1)), B_z
    mov     B_z, ADDR(rp,index,(2*unroll_index+1))
    adc     A_y, 0
    lea     B_y, [rdx]
%assign unroll_index unroll_index+1
%endrep
    add     index, UNROLL_SIZE
    jnz     L_mpn_submul_1_main_loop
L_mpn_submul_1_finish:
; At this point we should have:
;
;   index = n-1
;
;   A_x = low_mul[index-2] + carry_in
;   A_y = high_mul[index-2] + CF
;   A_z = rp[index-2]
;
;   B_x = low_mul[index-1]
;   B_y = high_mul[index-1]
;   B_z = rp[index-1]
;
;   rax = s1p[index]
    mul     s2limb
    sub     A_z, A_x
    mov     A_x, rax
    mov     ADDR(rp,index,-2), A_z
    mov     A_z, ADDR(rp,index,0)
    adc     B_x, A_y
    adc     B_y, 0
    mov     A_y, rdx
; At this point we should have:
;
;   index = n-1
;
;   B_x = low_mul[index-1] + carry_in
;   B_y = high_mul[index-1] + CF
;   B_z = rp[index-1]
;
;   A_x = low_mul[index]
;   A_y = high_mul[index]
;   A_z = rp[index]
    sub     B_z, B_x
    mov     ADDR(rp,index,-1), B_z
    adc     A_x, B_y
    adc     A_y, 0
; At this point we should have:
;
;   index = n-1
;
;   A_x = low_mul[index] + carry_in
;   A_y = high_mul[index] + CF
;   A_z = rp[index]
;
    sub     A_z, A_x
    mov     ADDR(rp,index,0), A_z
    adc     A_y, 0
    mov     rax, A_y
    pop     r12
    pop     r13
    pop     r14
    pop     r15
    ret
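
; *********************************************************************
; Usage note (illustrative only, not part of the original source).
; Seen from C, the routine behaves like the ref_submul_1 sketch near
; the top of this file.  For example, with
;
;   limb_t rp[2]  = { 5, 7 };      /* the two-limb value 7*2^64 + 5   */
;   limb_t s1p[2] = { 3, 0 };      /* the value 3                     */
;   limb_t ret    = mpn_submul_1(rp, s1p, 2, 2);
;
; the call subtracts 3*2 = 6 from the two-limb number, leaving
; rp = { 0xFFFFFFFFFFFFFFFF, 6 } and ret = 0, where ret is the most
; significant limb of the product plus the borrow out of the
; subtraction -- the value this code returns in rax.
; *********************************************************************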