; Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
;
; This file is part of the GNU MP Library.
;
; The GNU MP Library is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public License as
; published by the Free Software Foundation; either version 2.1 of the
; License, or (at your option) any later version.
;
; The GNU MP Library is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public
; License along with the GNU MP Library; see the file COPYING.LIB.  If
; not, write to the Free Software Foundation, Inc., 59 Temple Place -
; Suite 330, Boston, MA 02111-1307, USA.
;
; Adapted for AMD64 by Brian Gladman using the Microsoft VC++ v8 64-bit
; compiler and the YASM assembler.

; AMD64 mpn_mul_basecase -- multiply two mpn numbers.
;
; Calling interface:
;
; void __gmpn_mul_basecase(
;     mp_ptr rp,        rcx
;     mp_srcptr xp,     rdx
;     mp_size_t xn,     r8
;     mp_srcptr yp,     r9
;     mp_size_t yn      [rsp+0x28] as a *** 32-bit *** word
; )
;
; Multiply xp[xn] by yp[yn] and write the result to rp[xn+yn], with
; xn >= yn on entry.
;
; This is an SEH frame function with a leaf prologue.

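; Under the Microsoft x64 calling convention the first four integer
; arguments arrive in rcx, rdx, r8 and r9; the fifth argument (yn) sits
; on the stack at [rsp+0x28] on entry (8-byte return address plus the
; 32-byte register shadow space), and only its low 32 bits are
; significant.
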
%define _SEH_

%define UNROLL_LOG2         4
%define UNROLL_COUNT        (1 << UNROLL_LOG2)
%define UNROLL_MASK         (UNROLL_COUNT - 1)
%define UNROLL_BYTES        (8 * UNROLL_COUNT)
%define UNROLL_THRESHOLD    5

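; With UNROLL_LOG2 = 4 the main inner loop is unrolled 16-fold
; (UNROLL_COUNT), covering UNROLL_BYTES = 128 bytes of x[] per pass;
; operands of fewer than UNROLL_THRESHOLD limbs use the simple loop.
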
bits 64
section .text

%define v_par   rsp + 16
%define v_adj   rsp + 8
%define v_xlo   rsp
%define v_len   24

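; Local stack frame (v_len = 24 bytes): x[0] is cached at [rsp] (v_xlo),
; the unrolled-loop pointer adjustment at [rsp+8] (v_adj) and the partial
; first-pass limb count at [rsp+16] (v_par).
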
%define r_ptr   rcx
%define x_ptr   r11
%define x_len   r8
%define y_ptr   r9
%define y_len   r10

%define v_ctr   r8              ; x_len reused
%define v_jmp   r11             ; x_ptr reused

    global  __gmpn_mul_basecase

%ifdef DLL
    export  __gmpn_mul_basecase
%endif

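; For reference, the routine below implements schoolbook multiplication
; (O(xn*yn) limb products). A minimal C sketch of the same algorithm,
; assuming a compiler with unsigned __int128 (illustrative code, not
; GMP's own):
;
;   #include <stdint.h>
;   #include <stddef.h>
;
;   void mul_basecase(uint64_t *rp, const uint64_t *xp, ptrdiff_t xn,
;                     const uint64_t *yp, ptrdiff_t yn)
;   {
;       uint64_t cy = 0;                      /* first row: r = x * y[0] */
;       for (ptrdiff_t i = 0; i < xn; ++i) {
;           unsigned __int128 p = (unsigned __int128)xp[i] * yp[0] + cy;
;           rp[i] = (uint64_t)p;
;           cy    = (uint64_t)(p >> 64);
;       }
;       rp[xn] = cy;
;       for (ptrdiff_t j = 1; j < yn; ++j) {  /* add in the later rows  */
;           cy = 0;
;           for (ptrdiff_t i = 0; i < xn; ++i) {
;               unsigned __int128 p = (unsigned __int128)xp[i] * yp[j]
;                                   + rp[i + j] + cy;
;               rp[i + j] = (uint64_t)p;
;               cy        = (uint64_t)(p >> 64);
;           }
;           rp[xn + j] = cy;
;       }
;   }
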
__gmpn_mul_basecase:
        movsxd  x_len,r8d           ; sign extend xn (passed as 32 bits)
        mov     rax,[y_ptr]         ; y[0]
        cmp     x_len,2
        ja      mul_m_by_n          ; xn > 2
        je      mul_2_by_n          ; xn == 2
        mul     qword [rdx]         ; 1 x 1: rdx:rax = x[0] * y[0]
        mov     [r_ptr],rax
        mov     [r_ptr+8],rdx
        ret

mul_2_by_n:
        movsxd  r10,dword[rsp+0x28] ; load yn as a 32-bit integer
        mov     x_ptr,rdx
        dec     qword y_len
        jnz     mul_2_by_2
        mov     r8,rax              ; y[0] -> r8 (was x_len)
        mov     rax,[x_ptr]
        mul     r8                  ; x[0] * y[0]
        mov     [r_ptr],rax
        mov     rax,[x_ptr+8]
        mov     r9,rdx              ; carry -> r9 (was y_ptr)
        mul     r8                  ; x[1] * y[0]
        add     r9,rax
        mov     [r_ptr+8],r9
        adc     rdx,y_len           ; y_len (r10) is zero here
        mov     [r_ptr+16],rdx
        ret

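; Note that the 'adc rdx,y_len' above adds a register that is known to
; be zero at that point; it exists only to fold the carry flag into the
; top limb.
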
mul_2_by_2:                         ; r8 (x_len) and r10 (y_len) free
        mov     r10,[x_ptr]         ; x[0]
        mul     r10                 ; y[0] * x[0]
        mov     [r_ptr],rax
        mov     r8,rdx              ; cry = { 0, r8 }
        mov     rax,[y_ptr+8]       ; y[1]
        mul     r10                 ; y[1] * x[0]
        add     r8,rax
        adc     rdx,byte 0
        mov     r10,[x_ptr+8]       ; x[1] - r11 (x_ptr) now free
        mov     r11,rdx             ; cry = { r11, r8 }
        mov     rax,[y_ptr]         ; y[0]
        mul     r10                 ; y[0] * x[1]
        add     r8,rax
        adc     r11,rdx
        mov     [r_ptr+8],r8
        mov     r8,dword 0
        adc     r8,r8               ; cry = { r8, r11 }
        mov     rax,[y_ptr+8]       ; y[1]
        mul     r10                 ; y[1] * x[1]
        add     rax,r11
        adc     rdx,r8
        mov     [r_ptr+16],rax
        mov     [r_ptr+24],rdx
        ret

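; The 2 x 2 case above forms all four partial products y[j] * x[i] and
; accumulates them into r[0..3] with explicit carry propagation,
; avoiding loop overhead entirely.
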
; Do the first pass, multiplying x[] by y[0]; this row can simply be
; stored (not added) into r[].

mul_m_by_n:
        mov     r10d,dword[rsp+0x28]    ; load yn as a 32-bit integer
%ifdef _SEH_
        PROC_FRAME  fmul_m_by_n
        push_reg    rbx
        push_reg    rsi
        push_reg    rdi
        push_reg    rbp
        push_reg    r12
        alloc_stack 4*8                 ; align to 16 byte boundary
        END_PROLOGUE                    ; locals at [rsp], [rsp+8] & [rsp+16]
%else
        push    rbx
        push    rsi
        push    rdi
        push    rbp
        push    r12
        sub     rsp,v_len               ; space for locals (v_len bytes)
%endif
        mov     x_ptr,rdx
        mov     r12,x_len
        mov     rbp,rax             ; y[0] -> rbp
        xor     rbx,rbx             ; for carry
        lea     rsi,[x_ptr+r12*8]   ; past end of x[]
        lea     rdi,[r_ptr+r12*8]   ; past end of r[]
        neg     r12                 ; index runs from -xn up to 0
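
; The loop below walks x[] and r[] from the end using a negative index,
; so 'inc r12' provides the loop-termination test for free via 'jnz'.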
.0:     mov     rax,[rsi+r12*8]     ; x[n]
        mul     rbp                 ; x[n] * y[0]
        add     rax,rbx             ; add carry from previous round
        mov     [rdi+r12*8],rax     ; store r[n]
        mov     rbx,dword 0         ; propagate carry
        adc     rbx,rdx
        inc     r12                 ; next iteration
        jnz     .0
        mov     [rdi],rbx           ; store final carry as the top digit
        mov     rdx,y_len           ; done if y_len is 1
        dec     rdx
        jnz     .1                  ; more to do
        jmp     L_exit

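; Simple row-by-row loop for operands below the unroll threshold: for
; each remaining limb y[n], multiply the whole of x[] by it and add the
; row into r[], moving up one limb per row.
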
.1:     cmp     x_len,UNROLL_THRESHOLD  ; unroll if many loops
        jae     L_unroll
        lea     y_ptr,[y_ptr+rdx*8+8]   ; past end of y[]
        neg     x_len                   ; negative counter for x[n]
        neg     rdx                     ; negative counter for y[n]
        mov     rax,[rsi+x_len*8]       ; x[0] -> rax
        mov     y_len,rdx               ; now -(y_len - 1)
        inc     x_len                   ; negative counter from x[1]
        xor     rbx,rbx                 ; for carry
        mov     rcx,x_len               ; now -(x_len - 1) -> rcx (was r_ptr)
        mov     rbp,[y_ptr+rdx*8]       ; y[n] -> rbp
        jmp     .3
.2:     mov     rcx,x_len               ; restore x[] counter
        xor     rbx,rbx                 ; clear carry
        add     rdi,8                   ; advance end of r[] pointer
        mov     rbp,[y_ptr+y_len*8]     ; y[n] -> rbp
        mov     rax,[rsi+rcx*8-8]       ; x[m] -> rax
.3:     mul     rbp                     ; x[m] * y[n]
        add     rbx,rax                 ; add carry
        adc     rdx,byte 0
        add     [rdi+rcx*8],rbx         ; add into r[]
        mov     rax,[rsi+rcx*8]         ; next x[m] -> rax
        adc     rdx,byte 0              ; add carry to rdx
        inc     rcx                     ; go to next limb of x[]
        mov     rbx,rdx                 ; move carry into rbx
        jnz     .3
        mul     rbp                     ; do last limb
        add     rbx,rax                 ; propagate carry
        adc     rdx,byte 0
        add     [rdi],rbx               ; add into r[]
        adc     rdx,byte 0              ; add in any carry
        inc     y_len
        mov     [rdi+8],rdx             ; move (not add) carry into r[]
        jnz     .2                      ; go to next limb of y[]
        jmp     L_exit

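; For larger operands the row loop is unrolled UNROLL_COUNT-fold. The
; first pass through the unrolled body is partial, entered via a
; computed jump, so that every later pass runs the full UNROLL_COUNT
; limbs.
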
L_unroll:
        mov     rdi,r_ptr
        mov     rcx,x_len
        mov     rsi,x_ptr
        mov     rbp,[y_ptr+8]       ; y[1] (the row for y[0] is already stored)
        lea     y_ptr,[y_ptr+rdx*8+8] ; past end of y[]
        neg     rdx
        mov     y_len,rdx           ; now -(y_len - 1)
        lea     rbx,[UNROLL_COUNT-2+rcx]
        dec     rcx
        mov     rax,[rsi]           ; x[0]
        and     rbx,-UNROLL_MASK-1  ; round down to a multiple of UNROLL_COUNT
        neg     rcx
        neg     rbx
        and     rcx,UNROLL_MASK     ; limbs handled by the partial first pass
        mov     [v_par],rcx
        mov     [v_adj],rbx
        mov     rdx,rcx
        shl     rcx,3
        lea     rcx,[rcx+rcx*2]     ; 24 code bytes per limb (see .4 below)
        lea     v_jmp,[rel .4]
        lea     v_jmp,[v_jmp+rcx]   ; computed entry into the unrolled loop
        neg     rdx
        mov     [v_xlo],rax         ; cache x[0]
        lea     rdi,[rdi+rdx*8+8]
        lea     rsi,[rsi+rdx*8+8]
        jmp     .3
.2:     mov     rbx,[v_adj]         ; reload unroll adjustment
        mov     rax,[v_xlo]         ; reload x[0]
        lea     rdi,[rdi+rbx*8+8]   ; step r[] pointer back for the next row
        lea     rsi,[rsi+rbx*8]     ; step x[] pointer back
        mov     rbp,[y_ptr+y_len*8] ; next y[n] -> rbp
.3:     mul     rbp                 ; x[0] * y[n]
        sar     rbx,UNROLL_LOG2     ; negative count of passes through .4
        mov     rcx,[v_par]
        mov     v_ctr,rbx
        test    cl,1                ; the low word of product + carry
        mov     rbx,dword 0         ; is in rcx on even rounds and in
        mov     rcx,dword 0         ; rbx on odd rounds - we must put
        cmovz   rcx,rax             ; the low word of the first product
        cmovnz  rbx,rax             ; in the right register here
        jmp     v_jmp               ; enter the unrolled loop part-way in
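
; The unrolled body below assembles to a fixed 24 bytes of code per limb
; (the [byte ...] displacements and 'dword 0' immediates force fixed
; encodings), which is what allows the computed jump above to enter it
; part-way through; 'mov reg,dword 0' is used instead of 'xor' so the
; carry flag is preserved.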
.4:

%define CHUNK_COUNT 2
%assign i 0
%rep    UNROLL_COUNT / CHUNK_COUNT
%define disp0 8 * i * CHUNK_COUNT

        mov     rax,[byte rsi+disp0]    ; next x[m]
        adc     rbx,rdx
        mul     rbp
        add     [byte rdi+disp0],rcx    ; add low word into r[]
        mov     rcx,dword 0             ; zero rcx without touching carry
        adc     rbx,rax
        mov     rax,[byte rsi+disp0+8]
        adc     rcx,rdx
        mul     rbp
        add     [byte rdi+disp0+8],rbx  ; add low word into r[]
        mov     rbx,dword 0             ; zero rbx without touching carry
        adc     rcx,rax

%assign i i + 1
%endrep

        inc     v_ctr                   ; next block of UNROLL_COUNT limbs
        lea     rsi,[UNROLL_BYTES+rsi]
        lea     rdi,[UNROLL_BYTES+rdi]
        jnz     .4

        adc     rdx,byte 0
        add     [rdi],rcx               ; add final low word into r[]
        adc     rdx,byte 0
        inc     y_len
        mov     [rdi+8],rdx             ; store the row's top carry
        jnz     .2                      ; next row of y[]
L_exit:
%ifdef _SEH_
        add     rsp,4*8
        pop     r12
        pop     rbp
        pop     rdi
        pop     rsi
        pop     rbx
        ret
ENDPROC_FRAME
%else
        add     rsp,v_len
        pop     r12
        pop     rbp
        pop     rdi
        pop     rsi
        pop     rbx
        ret
%endif

        end