; (listing metadata from the source viewer: 399 lines, 9.5 KiB)
; **************************************************************************
; Intel64 mpn_addmul_1 -- Multiply a limb vector with a limb and
; add the result to a second limb vector.
; Copyright (C) 2006 Jason Worth Martin <jason.worth.martin@gmail.com>
; This program is free software; you can redistribute it and/or modify
; it under the terms of the GNU Lesser General Public License as published
; by the Free Software Foundation; either version 2 of the License, or
; (at your option) any later version.
; This program is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; GNU General Public License for more details.
; You should have received a copy of the GNU Lesser General Public License
; along with this program; if not, write to the Free Software Foundation,
; Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
; **************************************************************************
; This code is based largely on Pierrick Gaudry's excellent assembly
; support for the AMD64 architecture. (Note that Intel64 and AMD64,
; while using the same instruction set, have very different
; microarchitectures. So, this code performs very poorly on AMD64
; machines even though it is near-optimal on Intel64.)
; Roger Golliver works for Intel and provided insightful improvements
; particularly in using the "lea" instruction to perform additions
; and register-to-register moves.
; Eric Bainville has a brilliant exposition of optimizing arithmetic for
; AMD64 (http://www.bealto.it). I adapted many of the ideas he
; describes to Intel64.
; Agner Fog is a demigod in the x86 world. If you are reading assembly
; code files and you haven't heard of Agner Fog, then take a minute to
; look over his software optimization manuals (http://www.agner.org/).
; They are superb.
; *********************************************************************
; With a 4-way unroll the code has
; cycles/limb
; Hammer: 4.8
; Woodcrest: 4.6
; With increased unrolling, it appears to converge to 4 cycles/limb
; on Intel Core 2 machines. I believe that this is optimal, however
; it requires such absurd unrolling that it becomes unusable for all
; but the largest inputs. A 4-way unroll seems like a good balance
; to me because then commonly used input sizes (e.g. 1024bit Public
; keys) still benefit from the speed up.
; This is just a check to see if we are in my code testing sandbox
; or if we are actually in the GMP source tree
%include 'yasm_mac.inc'
; *********************************************************************
; *********************************************************************
; Here are the important macro parameters for the code
; BpL is Bytes per Limb (8 since this is 64bit code)
; UNROLL_SIZE is a power of 2 for which we unroll the code.
; possible values are 2,4,8,16,..., 256. A reasonable
; value is probably 4. If really large inputs
; are expected, then 16 is probably good. Larger
; values are really only useful for flashy
; benchmarks and testing asymptotic behavior.
; THRESHOLD is the minimum number of limbs needed before we bother
; using the complicated loop. A reasonable value is
; 2*UNROLL_SIZE + 6.
; *********************************************************************
; *********************************************************************
; Tuning parameters.
;   BpL          bytes per limb (64-bit limbs).
;   UNROLL_SIZE  limbs handled per pass of the unrolled main loop;
;                must be a power of 2 and at least 2.
;   UNROLL_MASK  mask that extracts (count mod UNROLL_SIZE); used by the
;                spin-up code to align the index before the main loop.
;   THRESHOLD    minimum limb count for which the unrolled path is used.
;                It must be at least UNROLL_SIZE+4, otherwise the main
;                loop could be entered with index == 0 and its
;                "add index,UNROLL_SIZE / jnz" exit test would never fire.
%define BpL 8
%define UNROLL_SIZE 4
%define UNROLL_MASK (UNROLL_SIZE-1)
%define THRESHOLD (2*UNROLL_SIZE+6)
; ADDR(ptr,index,displacement) is a convenience macro for addressing
; limb arrays.  It expands to the memory operand
;     [displacement*BpL + ptr + index*BpL]
; so both index and displacement are counted in limbs, not bytes.
; The arguments are substituted textually: a compound displacement
; such as 2*unroll_index+1 has to be parenthesised at the call site
; (as the unrolled main loop does) so that the whole expression is
; scaled by BpL.
%define ADDR(a,b,c) [c*BpL+a+b*BpL]
; Register Usage
; -------- -----
;   rax     low word from mul (also the return value)
;   rbx*    (not referenced by the code in this file section)
;   rcx     s2limb
;   rdx     high word from mul (holds n on entry)
;   rsi     s1p
;   rdi     rp
;   rbp*    Base Pointer
;   rsp*    Stack Pointer
;   r8      A_x   (running carry in the small loop)
;   r9      A_y
;   r10     A_z   (constant zero in the small loop)
;   r11     B_x   (negative limb index in the small loop)
;   r12*    B_y
;   r13*    B_z
;   r14*    temp
;   r15*    index
;
;   * indicates that the register must be
;     preserved for the caller.
;
; The A_* and B_* triples are two alternating pipeline slots used by
; the unrolled path: _x holds the low product word, _y the high
; product word and _z the destination limb loaded from rp.
%define s2limb rcx
%define s1p rsi
%define rp rdi
%define A_x r8
%define A_y r9
%define A_z r10
%define B_x r11
%define B_y r12
%define B_z r13
%define temp r14
%define index r15
;-----------------------------------------------------------------------
; mpn_addmul_1 -- multiply the n-limb vector at s1p by s2limb and add
;                 the product into the n-limb vector at rp.
;
; In:    rdi = rp      destination limb vector (read and written)
;        rsi = s1p     source limb vector
;        rdx = n       limb count; assumed >= 1 (the small loop runs its
;                      body once before testing the count)
;        rcx = s2limb  multiplier limb
; Out:   rax = carry limb out of the most significant position
; Clobb: rdx, rsi, rdi, r8, r9, r10, r11, flags
;        (r12-r15 are used by the unrolled path but saved and restored)
; Stack: 32 bytes (four pushes) on the unrolled path, none otherwise.
;
; Two code paths:
;   n <  THRESHOLD : simple one-limb-per-iteration loop.
;   n >= THRESHOLD : software-pipelined loop, unrolled UNROLL_SIZE ways.
;-----------------------------------------------------------------------

; Fallback definitions so this routine assembles even when the tuning
; section of the file does not provide them.  THRESHOLD must be at
; least UNROLL_SIZE+4: the main loop has to be entered with index a
; non-zero multiple of UNROLL_SIZE or its exit test never fires.
%ifndef UNROLL_MASK
%define UNROLL_MASK (UNROLL_SIZE-1)
%endif
%ifndef THRESHOLD
%define THRESHOLD (2*UNROLL_SIZE+6)
%endif

GLOBAL_FUNC mpn_addmul_1
        ; Compare the limb count with the threshold value.
        ; If the limb count is small we just use the small loop,
        ; otherwise we jump to the more complicated loop.
        cmp     rdx,THRESHOLD
        jge     L_mpn_addmul_1_main_loop_prep

        ; ---------------- small loop ----------------
        ; Point rsi/rdi one past the end of each vector and count a
        ; negative index (r11) up towards zero, so the loop needs no
        ; separate compare.
        mov     r11,rdx
        lea     rsi,[rsi+rdx*8]
        lea     rdi,[rdi+rdx*8]
        neg     r11
        xor     r8, r8                  ; r8  = carry limb (starts at 0)
        xor     r10, r10                ; r10 = constant 0
        jmp     L_mpn_addmul_1_small_loop

        align 16
L_mpn_addmul_1_small_loop:
        mov     rax,[rsi+r11*8]
        mul     rcx                     ; rdx:rax = s1p[i] * s2limb
        add     rax,[rdi+r11*8]         ; + rp[i]
        adc     rdx,r10                 ; fold that carry into the high word
        add     rax,r8                  ; + carry limb from previous step
        mov     r8,r10                  ; r8 = 0 (mov leaves CF intact)
        mov     [rdi+r11*8],rax
        adc     r8,rdx                  ; new carry limb = high word + CF
        inc     r11
        jne     L_mpn_addmul_1_small_loop
        mov     rax,r8                  ; return the final carry limb
        ret

        ; ---------------- unrolled path ----------------
L_mpn_addmul_1_main_loop_prep:
        push    r15
        push    r14
        push    r13
        push    r12
        ; If n is even, we need to do three
        ; pre-multiplies, if n is odd we only
        ; need to do two.
        mov     temp,rdx
        mov     index,0
        mov     A_x,0
        mov     A_y,0
        and     rdx,1
        jnz     L_mpn_addmul_1_odd_n

        ; Case n is even: handle limb 0 completely so that an odd
        ; number of limbs remains for the pipelined code.
        mov     rax,ADDR(s1p,index,0)
        mul     s2limb
        add     ADDR(rp,index,0),rax
        adc     A_x,rdx                 ; A_x = high word + CF (carry limb)
        add     index,1
        ; At this point
        ;   temp  = n (even)
        ;   index = 1

L_mpn_addmul_1_odd_n:
        ; Now
        ;   temp  = n
        ;   index = 1 if n even
        ;         = 0 if n odd
        ; Prime the two pipeline slots A and B.
        mov     rax,ADDR(s1p,index,0)
        mul     s2limb
        mov     A_z,ADDR(rp,index,0)
        add     A_x,rax
        adc     A_y,rdx
        mov     rax,ADDR(s1p,index,1)
        mul     s2limb
        mov     B_z,ADDR(rp,index,1)
        mov     B_x,rax
        mov     B_y,rdx
        mov     rax,ADDR(s1p,index,2)
        add     index,3
        lea     s1p,ADDR(s1p,temp,-1)
        lea     rp,ADDR(rp,temp,-1)
        neg     temp
        add     index,temp
        ; At this point:
        ;   s1p   = address of last s1limb
        ;   rp    = address of last rplimb
        ;   temp  = -n
        ;   index = 4 - n if n even
        ;         = 3 - n if n odd
        ; So, index is a (negative) even
        ; number.
        ; *****************************************
        ; From here on, I will use array
        ; indexing notation in the comments
        ; because it is convenient.  So, I
        ; will pretend that index is positive
        ; because then a comment like
        ;      B_z = rp[index-1]
        ; is easier to read.
        ; However, keep in mind that index is
        ; actually a negative number indexing
        ; back from the end of the array.
        ; This is a common trick to remove one
        ; compare operation from the main loop.
        ; *****************************************

        ; Now we enter a spin-up loop that
        ; will make sure that the index is
        ; a multiple of UNROLL_SIZE before
        ; going to our main unrolled loop.
        mov     temp,index
        neg     temp
        and     temp,UNROLL_MASK
        jz      L_mpn_addmul_1_main_loop
        shr     temp,1                  ; each spin-up pass handles 2 limbs

L_mpn_addmul_1_main_loop_spin_up:
        ; At this point we should have:
        ;   A_x = low_mul[index-2] + carry_in
        ;   A_y = high_mul[index-2] + CF
        ;   A_z = rp[index-2]
        ;   B_x = low_mul[index-1]
        ;   B_y = high_mul[index-1]
        ;   B_z = rp[index-1]
        ;   rax = s1p[index]
        mul     s2limb
        add     A_z,A_x
        lea     A_x,[rax]               ; register move that leaves CF alone
        mov     rax,ADDR(s1p,index,1)
        adc     B_x,A_y
        mov     ADDR(rp,index,-2),A_z
        mov     A_z,ADDR(rp,index,0)
        adc     B_y,0
        lea     A_y,[rdx]
        ; At this point we should have:
        ;   B_x = low_mul[index-1] + carry_in
        ;   B_y = high_mul[index-1] + CF
        ;   B_z = rp[index-1]
        ;   A_x = low_mul[index]
        ;   A_y = high_mul[index]
        ;   A_z = rp[index]
        ;   rax = s1p[index+1]
        mul     s2limb
        add     B_z,B_x
        lea     B_x,[rax]
        mov     rax,ADDR(s1p,index,2)
        adc     A_x,B_y
        mov     ADDR(rp,index,-1),B_z
        mov     B_z,ADDR(rp,index,1)
        adc     A_y,0
        lea     B_y,[rdx]
        add     index,2
        sub     temp,1
        jnz     L_mpn_addmul_1_main_loop_spin_up
        jmp     L_mpn_addmul_1_main_loop

        align 16
L_mpn_addmul_1_main_loop:
        ; The code here is really the same
        ; logic as the spin-up loop.  It's
        ; just been unrolled: each repetition
        ; handles two limbs, so UNROLL_SIZE/2
        ; repetitions handle UNROLL_SIZE limbs.
%assign unroll_index 0
%rep UNROLL_SIZE/2
        mul     s2limb
        add     A_z,A_x
        lea     A_x,[rax]
        mov     rax,ADDR(s1p,index,(2*unroll_index+1))
        adc     B_x,A_y
        mov     ADDR(rp,index,(2*unroll_index-2)),A_z
        mov     A_z,ADDR(rp,index,(2*unroll_index))
        adc     B_y,0
        lea     A_y,[rdx]
        mul     s2limb
        add     B_z,B_x
        lea     B_x,[rax]
        mov     rax,ADDR(s1p,index,(2*unroll_index+2))
        adc     A_x,B_y
        mov     ADDR(rp,index,(2*unroll_index-1)),B_z
        mov     B_z,ADDR(rp,index,(2*unroll_index+1))
        adc     A_y,0
        lea     B_y,[rdx]
%assign unroll_index unroll_index+1
%endrep
        add     index,UNROLL_SIZE
        jnz     L_mpn_addmul_1_main_loop

        ; ---------------- drain the pipeline ----------------
        ; At this point we should have:
        ;   index = n-1
        ;   A_x = low_mul[index-2] + carry_in
        ;   A_y = high_mul[index-2] + CF
        ;   A_z = rp[index-2]
        ;   B_x = low_mul[index-1]
        ;   B_y = high_mul[index-1]
        ;   B_z = rp[index-1]
        ;   rax = s1p[index]
        mul     s2limb
        add     A_z,A_x
        mov     A_x,rax
        mov     ADDR(rp,index,-2),A_z
        mov     A_z,ADDR(rp,index,0)
        adc     B_x,A_y
        adc     B_y,0
        mov     A_y,rdx
        ; At this point we should have:
        ;   index = n-1
        ;   B_x = low_mul[index-1] + carry_in
        ;   B_y = high_mul[index-1] + CF
        ;   B_z = rp[index-1]
        ;   A_x = low_mul[index]
        ;   A_y = high_mul[index]
        ;   A_z = rp[index]
        add     B_z,B_x
        mov     ADDR(rp,index,-1),B_z
        adc     A_x,B_y
        adc     A_y,0
        ; At this point we should have:
        ;   index = n-1
        ;   A_x = low_mul[index] + carry_in
        ;   A_y = high_mul[index] + CF
        ;   A_z = rp[index]
        add     A_z,A_x
        mov     ADDR(rp,index,0),A_z
        adc     A_y,0
        mov     rax,A_y                 ; return the final carry limb

        ; Restore callee-saved registers in reverse push order.
        pop     r12
        pop     r13
        pop     r14
        pop     r15
        ret