;; *********************************************************************
;; Intel64 mpn_submul_1 -- Multiply a limb vector with a limb and
;; subtract the result from a second limb vector.
;;
;; Copyright (C) 2006 Jason Worth Martin
;;
;; This program is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 2 of the License, or
;; (at your option) any later version.
;;
;; This program is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with this program; if not, write to the Free Software
;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
;; MA 02110-1301 USA.
;;
;; *********************************************************************
;;
;;
;; CREDITS
;;
;; This code is based largely on Pierrick Gaudry's excellent assembly
;; support for the AMD64 architecture.  (Note that Intel64 and AMD64,
;; while using the same instruction set, have very different
;; microarchitectures.  So, this code performs very poorly on AMD64
;; machines even though it is near-optimal on Intel64.)
;;
;; Roger Golliver works for Intel and provided insightful improvements,
;; particularly in using the "lea" instruction to perform additions
;; and register-to-register moves.
;;
;; Eric Bainville has a brilliant exposition of optimizing arithmetic
;; for AMD64 (http://www.bealto.com).  I adapted many of the ideas he
;; describes to Intel64.
;;
;; Agner Fog is a demigod in the x86 world.  If you are reading assembly
;; code files and you haven't heard of Agner Fog, then take a minute to
;; look over his software optimization manuals (http://www.agner.org/).
;; They are superb.
;;
;; *********************************************************************
;;
;; With a 4-way unroll the code runs at:
;;
;;                  cycles/limb
;;     Hammer:          4.6
;;     Woodcrest:       4.6
;;
;; With increased unrolling, it appears to converge to 4 cycles/limb
;; on Intel Core 2 machines.  I believe that this is optimal; however,
;; it requires such absurd unrolling that it becomes unusable for all
;; but the largest inputs.  A 4-way unroll seems like a good balance
;; to me because then commonly used input sizes (e.g. 1024-bit public
;; keys) still benefit from the speed-up.
;;
;; This is just a check to see if we are in my code testing sandbox
;; or if we are actually in the GMP source tree.
;;
%ifdef __JWM_Test_Code__
%include 'yasm_mac.inc'
%else
%include '../yasm_mac.inc'
%endif

;; *********************************************************************
;; *********************************************************************
;;
;; Here are the important macro parameters for the code:
;;
;; BpL is Bytes per Limb (8 since this is 64-bit code).
;;
;; UNROLL_SIZE is the power of 2 by which we unroll the code.
;;             Possible values are 2,4,8,16,...,256.  A reasonable
;;             value is probably 4.  If really large inputs
;;             are expected, then 16 is probably good.  Larger
;;             values are really only useful for flashy
;;             benchmarks and testing asymptotic behavior.
;;
;; THRESHOLD is the minimum number of limbs needed before we bother
;;           using the complicated loop.  A reasonable value is
;;           2*UNROLL_SIZE + 6.
;;
;; *********************************************************************
;; *********************************************************************
%define BpL         8
%define UNROLL_SIZE 4
%define UNROLL_MASK UNROLL_SIZE-1
%define THRESHOLD   2*UNROLL_SIZE+6

;; Here is a convenient macro for addressing
;; memory.  Entries of the form
;;
;;      ADDR(ptr,index,displacement)
;;
;; get converted to
;;
;;      [displacement*BpL + ptr + index*BpL]
;;
%define ADDR(a,b,c) [c*BpL+a+b*BpL]
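;;
;; For example, ADDR(rp,index,2) expands to
;;
;;      [2*BpL + rp + index*BpL]
;;
;; which addresses the limb two positions past rp[index]
;; (with BpL = 8).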

;; Register  Usage
;; --------  -----
;; rax       low word from mul
;; rbx*
;; rcx       s2limb
;; rdx       high word from mul
;; rsi       s1p
;; rdi       rp
;; rbp*      Base Pointer
;; rsp*      Stack Pointer
;; r8        A_x
;; r9        A_y
;; r10       A_z
;; r11       B_x
;; r12*      B_y
;; r13*      B_z
;; r14*      temp
;; r15*      index
;;
;; * indicates that the register must be
;;   preserved for the caller.
%define s2limb rcx
%define s1p    rsi
%define rp     rdi
%define A_x    r8
%define A_y    r9
%define A_z    r10
%define B_x    r11
%define B_y    r12
%define B_z    r13
%define temp   r14
%define index  r15

;; INPUT PARAMETERS
;; rp      rdi
;; s1p     rsi
;; n       rdx
;; s2limb  rcx
BITS 64

GLOBAL_FUNC mpn_submul_1
;; Compare the limb count with the threshold
;; value.  If the limb count is small we just
;; use the small loop, otherwise we jump to
;; the more complicated loop.
        cmp     rdx, THRESHOLD
        jge     L_mpn_submul_1_main_loop_prep
        mov     r11, rdx
        lea     rsi, [rsi+rdx*8]
        lea     rdi, [rdi+rdx*8]
        neg     r11
        xor     r8, r8
        xor     r10, r10
        jmp     L_mpn_submul_1_small_loop
align 16
L_mpn_submul_1_small_loop:
        mov     rax, [rsi+r11*8]
        mul     rcx
        add     rax, r8
        adc     rdx, r10
        sub     [rdi+r11*8], rax
        mov     r8, r10
        adc     r8, rdx
        inc     r11
        jne     L_mpn_submul_1_small_loop
        mov     rax, r8
        ret

L_mpn_submul_1_main_loop_prep:
        push    r15
        push    r14
        push    r13
        push    r12
;; If n is even, we need to do three
;; pre-multiplies; if n is odd, we only
;; need to do two.
        mov     temp, rdx
        mov     index, 0
        mov     A_x, 0
        mov     A_y, 0
        and     rdx, 1
        jnz     L_mpn_submul_1_odd_n
;; Case n is even
        mov     rax, ADDR(s1p,index,0)
        mul     s2limb
        sub     ADDR(rp,index,0), rax
        adc     A_x, rdx
        add     index, 1
;; At this point
;;      temp  = n (even)
;;      index = 1
L_mpn_submul_1_odd_n:
;; Now
;;      temp  = n
;;      index = 1 if n even
;;            = 0 if n odd
;;
        mov     rax, ADDR(s1p,index,0)
        mul     s2limb
        mov     A_z, ADDR(rp,index,0)
        add     A_x, rax
        adc     A_y, rdx
        mov     rax, ADDR(s1p,index,1)
        mul     s2limb
        mov     B_z, ADDR(rp,index,1)
        mov     B_x, rax
        mov     B_y, rdx
        mov     rax, ADDR(s1p,index,2)
        add     index, 3
        lea     s1p, ADDR(s1p,temp,-1)
        lea     rp, ADDR(rp,temp,-1)
        neg     temp
        add     index, temp
;; At this point:
;;      s1p   = address of last s1limb
;;      rp    = address of last rplimb
;;      temp  = -n
;;      index = 4 - n if n even
;;            = 3 - n if n odd
;;
;; So, index is a (negative) even number.
;;
;; *****************************************
;; ATTENTION:
;;
;; From here on, I will use array indexing
;; notation in the comments because it is
;; convenient.  So, I will pretend that
;; index is positive, because then a
;; comment like
;;      B_z = rp[index-1]
;; is easier to read.
;; However, keep in mind that index is
;; actually a negative number indexing
;; back from the end of the array.
;; This is a common trick to remove one
;; compare operation from the main loop.
;; *****************************************
;;
;; Now we enter a spin-up loop that will
;; make sure that index is a multiple of
;; UNROLL_SIZE before going to our main
;; unrolled loop.
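;;
;; For example, with UNROLL_SIZE = 4 and n = 9 (odd), we arrive here
;; with index = 3 - 9 = -6, so temp = 6 & UNROLL_MASK = 2 and the
;; spin-up loop runs temp/2 = 1 time.  That one pass handles two limbs
;; and leaves index = -4, a multiple of UNROLL_SIZE, as required.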
        mov     temp, index
        neg     temp
        and     temp, UNROLL_MASK
        jz      L_mpn_submul_1_main_loop
        shr     temp, 1
L_mpn_submul_1_main_loop_spin_up:
;; At this point we should have:
;;
;;      A_x = low_mul[index-2] + carry_in
;;      A_y = high_mul[index-2] + CF
;;      A_z = rp[index-2]
;;
;;      B_x = low_mul[index-1]
;;      B_y = high_mul[index-1]
;;      B_z = rp[index-1]
;;
;;      rax = s1p[index]
        mul     s2limb
        sub     A_z, A_x
        lea     A_x, [rax]
        mov     rax, ADDR(s1p,index,1)
        adc     B_x, A_y
        mov     ADDR(rp,index,-2), A_z
        mov     A_z, ADDR(rp,index,0)
        adc     B_y, 0
        lea     A_y, [rdx]
;; At this point we should have:
;;
;;      B_x = low_mul[index-1] + carry_in
;;      B_y = high_mul[index-1] + CF
;;      B_z = rp[index-1]
;;
;;      A_x = low_mul[index]
;;      A_y = high_mul[index]
;;      A_z = rp[index]
;;
;;      rax = s1p[index+1]
        mul     s2limb
        sub     B_z, B_x
        lea     B_x, [rax]
        mov     rax, ADDR(s1p,index,2)
        adc     A_x, B_y
        mov     ADDR(rp,index,-1), B_z
        mov     B_z, ADDR(rp,index,1)
        adc     A_y, 0
        lea     B_y, [rdx]
        add     index, 2
        sub     temp, 1
        jnz     L_mpn_submul_1_main_loop_spin_up
        jmp     L_mpn_submul_1_main_loop
align 16
L_mpn_submul_1_main_loop:
;; The code here is really the same logic as
;; the spin-up loop.  It's just been unrolled.
%assign unroll_index 0
%rep UNROLL_SIZE/2
        mul     s2limb
        sub     A_z, A_x
        lea     A_x, [rax]
        mov     rax, ADDR(s1p,index,(2*unroll_index+1))
        adc     B_x, A_y
        mov     ADDR(rp,index,(2*unroll_index-2)), A_z
        mov     A_z, ADDR(rp,index,(2*unroll_index))
        adc     B_y, 0
        lea     A_y, [rdx]

        mul     s2limb
        sub     B_z, B_x
        lea     B_x, [rax]
        mov     rax, ADDR(s1p,index,(2*unroll_index+2))
        adc     A_x, B_y
        mov     ADDR(rp,index,(2*unroll_index-1)), B_z
        mov     B_z, ADDR(rp,index,(2*unroll_index+1))
        adc     A_y, 0
        lea     B_y, [rdx]
%assign unroll_index unroll_index+1
%endrep
        add     index, UNROLL_SIZE
        jnz     L_mpn_submul_1_main_loop

L_mpn_submul_1_finish:
;; At this point we should have:
;;
;;      index = n-1
;;
;;      A_x = low_mul[index-2] + carry_in
;;      A_y = high_mul[index-2] + CF
;;      A_z = rp[index-2]
;;
;;      B_x = low_mul[index-1]
;;      B_y = high_mul[index-1]
;;      B_z = rp[index-1]
;;
;;      rax = s1p[index]
        mul     s2limb
        sub     A_z, A_x
        mov     A_x, rax
        mov     ADDR(rp,index,-2), A_z
        mov     A_z, ADDR(rp,index,0)
        adc     B_x, A_y
        adc     B_y, 0
        mov     A_y, rdx
;; At this point we should have:
;;
;;      index = n-1
;;
;;      B_x = low_mul[index-1] + carry_in
;;      B_y = high_mul[index-1] + CF
;;      B_z = rp[index-1]
;;
;;      A_x = low_mul[index]
;;      A_y = high_mul[index]
;;      A_z = rp[index]
        sub     B_z, B_x
        mov     ADDR(rp,index,-1), B_z
        adc     A_x, B_y
        adc     A_y, 0
;; At this point we should have:
;;
;;      index = n-1
;;
;;      A_x = low_mul[index] + carry_in
;;      A_y = high_mul[index] + CF
;;      A_z = rp[index]
;;
        sub     A_z, A_x
        mov     ADDR(rp,index,0), A_z
        adc     A_y, 0
        mov     rax, A_y
        pop     r12
        pop     r13
        pop     r14
        pop     r15
        ret
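
;; *********************************************************************
;; For reference, here is a rough C sketch of what this routine
;; computes: {rp,n} -= {s1p,n} * s2limb, returning the borrow out
;; (the high limb of the product plus propagated borrows).  It is
;; illustrative only and not part of the build; the name ref_submul_1,
;; the use of uint64_t from <stdint.h>, and the reliance on a compiler
;; providing unsigned __int128 are my own choices here, not anything
;; the surrounding sources mandate.
;;
;;      uint64_t ref_submul_1(uint64_t *rp, const uint64_t *s1p,
;;                            long n, uint64_t s2limb)
;;      {
;;          uint64_t carry = 0;
;;          for (long i = 0; i < n; i++) {
;;              /* full 128-bit product of one limb by s2limb */
;;              unsigned __int128 p = (unsigned __int128)s1p[i] * s2limb;
;;              uint64_t lo = (uint64_t)p + carry;
;;              uint64_t hi = (uint64_t)(p >> 64) + (lo < carry);
;;              uint64_t r  = rp[i];
;;              rp[i] = r - lo;
;;              carry = hi + (r < lo);   /* borrow out of the sub */
;;          }
;;          return carry;
;;      }
;; *********************************************************************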