; Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
;
; This file is part of the GNU MP Library.
;
; The GNU MP Library is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public License as
; published by the Free Software Foundation; either version 2.1 of the
; License, or (at your option) any later version.
;
; The GNU MP Library is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public
; License along with the GNU MP Library; see the file COPYING.LIB.  If
; not, write to the Free Software Foundation, Inc., 59 Temple Place -
; Suite 330, Boston, MA 02111-1307, USA.
;
; Adapted by Brian Gladman for AMD64 using the Microsoft VC++ v8 64-bit
; compiler and the YASM assembler.

; AMD64 mpn_mul_basecase -- multiply two mpn numbers.
;
; Calling interface:
;
; void __gmpn_mul_basecase(
;     mp_ptr rp,              rcx
;     mp_srcptr xp,           rdx
;     mp_size_t xn,           r8
;     mp_srcptr yp,           r9
;     mp_size_t yn            [rsp+0x28] as a *** 32-bit *** word
; )
;
; Multiply xp[xn] by yp[yn] and write the result to rp[xn+yn], with xn >= yn
; on entry.
;
; This is an SEH frame function with a leaf prologue.
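;
; For reference, the same entry point viewed from C (illustrative only; the
; actual declaration comes from GMP's own headers):
;
;   void __gmpn_mul_basecase(mp_ptr rp, mp_srcptr xp, mp_size_t xn,
;                            mp_srcptr yp, mp_size_t yn);
;
; rp must provide room for xn + yn limbs and must not overlap xp or yp.
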
%include "..\x86_64_asm.inc"

%define reg_save_list   rbx, rsi, rdi, rbp, r12

%define UNROLL_LOG2      4
%define UNROLL_COUNT    (1 << UNROLL_LOG2)
%define UNROLL_MASK     (UNROLL_COUNT - 1)
%define UNROLL_BYTES    (8 * UNROLL_COUNT)
%define UNROLL_THRESHOLD 5
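
; With UNROLL_LOG2 = 4 the settings above give UNROLL_COUNT = 16 limbs per
; pass of the unrolled inner loop (UNROLL_MASK = 15, UNROLL_BYTES = 128);
; the unrolled path is used once x_len reaches UNROLL_THRESHOLD limbs.
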
bits 64
section .text

%define v_par   16
%define v_adj    8
%define v_xlo    0
%define v_len   24
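
; Offsets of the scratch qwords in the stack space set up by the prologue.
; The unrolled path uses [rsp+v_xlo] to keep a copy of x[0], [rsp+v_adj] for
; the pointer adjustment re-applied on every pass over y[], and [rsp+v_par]
; for the partial count reloaded on every pass.
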
%define r_ptr   rcx
%define x_ptr   r11
%define x_len   r8
%define y_ptr   r9
%define y_len   r10

%define v_ctr   r8          ; x_len reused
%define v_jmp   r11         ; x_ptr reused

global __gmpn_mul_basecase

%ifdef DLL
    export  __gmpn_mul_basecase
%endif

__gmpn_mul_basecase:
    movsxd  x_len,r8d               ; sign extend 32-bit xn
    mov     rax,[y_ptr]             ; y[0]
    cmp     x_len,2
    ja      mul_m_by_n
    je      mul_2_by_n

; 1 x 1 case: a single widening multiply is enough
    mul     qword [rdx]
    mov     [r_ptr],rax
    mov     [r_ptr+8],rdx
    ret

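; xn == 2: yn (which cannot exceed xn) is either 1 or 2, so both cases are
; handled with straight line code.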
mul_2_by_n:
    movsxd  r10,dword [rsp+0x28]    ; load yn as a 32-bit integer
    mov     x_ptr,rdx
    dec     qword y_len             ; yn - 1: zero means the 2 x 1 case
    jnz     mul_2_by_2

; 2 x 1 case
    mov     r8,rax                  ; y[0] -> r8 (was x_len)
    mov     rax,[x_ptr]
    mul     r8                      ; x[0] * y[0]
    mov     [r_ptr],rax
    mov     rax,[x_ptr+8]
    mov     r9,rdx                  ; carry -> r9 (was y_ptr)
    mul     r8                      ; x[1] * y[0]
    add     r9,rax
    mov     [r_ptr+8],r9
    adc     rdx,y_len               ; r10 (y_len) is 0 here, so this only adds the carry
    mov     [r_ptr+16],rdx
    ret

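; 2 x 2 case: schoolbook product of two 2-limb operands,
;
;   r[0] = lo(x0*y0)
;   r[1] = hi(x0*y0) + lo(x0*y1) + lo(x1*y0)
;   r[2] = hi(x0*y1) + hi(x1*y0) + lo(x1*y1)   (+ carries from r[1])
;   r[3] = hi(x1*y1)                           (+ carries from r[2])
;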
mul_2_by_2:                         ; r8 (x_len) and r10 (y_len) free
    mov     r10,[x_ptr]             ; x[0]
    mul     r10                     ; y[0] * x[0]
    mov     [r_ptr],rax
    mov     r8,rdx                  ; cry = { 0, r8 }
    mov     rax,[y_ptr+8]           ; y[1]
    mul     r10                     ; y[1] * x[0]
    add     r8,rax
    adc     rdx,byte 0
    mov     r10,[x_ptr+8]           ; x[1] - r11 (x_ptr) now free
    mov     r11,rdx                 ; cry = { r11, r8 }
    mov     rax,[y_ptr]             ; y[0]
    mul     r10                     ; y[0] * x[1]
    add     r8,rax
    adc     r11,rdx
    mov     [r_ptr+8],r8
    mov     r8,dword 0
    adc     r8,r8                   ; cry = { r8, r11 }
    mov     rax,[y_ptr+8]           ; y[1]
    mul     r10                     ; x[1] * y[1]
    add     rax,r11
    adc     rdx,r8
    mov     [r_ptr+16],rax
    mov     [r_ptr+24],rdx
    ret

; Do the first pass, y[0] * x[], separately since its products can simply be
; stored - there is nothing in r[] to add to yet.
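;
; For orientation, the general m x n code below follows the usual basecase
; scheme, roughly (a hedged C sketch with 64-bit limbs and a 128-bit
; double_limb, not GMP's actual source):
;
;   carry = 0;                              /* r[] = x[] * y[0]       */
;   for (i = 0; i < xn; i++) {
;       p = (double_limb) x[i] * y[0] + carry;
;       r[i] = (limb) p;  carry = p >> 64;
;   }
;   r[xn] = carry;
;   for (j = 1; j < yn; j++) {              /* r[] += x[] * y[j]      */
;       carry = 0;
;       for (i = 0; i < xn; i++) {
;           p = (double_limb) x[i] * y[j] + r[i + j] + carry;
;           r[i + j] = (limb) p;  carry = p >> 64;
;       }
;       r[xn + j] = carry;
;   }
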
mul_m_by_n:
    mov     r10d,dword [rsp+0x28]   ; load yn as a 32-bit integer

    prologue fmul_m_by_n, reg_save_list, 3
    mov     x_ptr,rdx
    mov     r12,x_len
    mov     rbp,rax                 ; y[0] -> rbp
    xor     rbx,rbx                 ; for carry
    lea     rsi,[x_ptr+r12*8]       ; past end of x[]
    lea     rdi,[r_ptr+r12*8]       ; past end of r[]
    neg     r12
.0: mov     rax,[rsi+r12*8]         ; x[n]
    mul     rbp                     ; x[n] * y[0]
    add     rax,rbx                 ; add carry from previous round
    mov     [rdi+r12*8],rax         ; store r[n]
    mov     rbx,dword 0             ; propagate carry
    adc     rbx,rdx
    inc     r12                     ; next iteration
    jnz     .0
    mov     [rdi],rbx               ; store the carry as the final digit
    mov     rdx,y_len               ; done if y_len is 1
    dec     rdx
    jnz     .1                      ; more to do
    jmp     L_exit

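; Small operands (x_len < UNROLL_THRESHOLD): for each remaining limb of y[],
; multiply it by the whole of x[] and add the products into r[] one limb at
; a time.  rcx (for x[]) and y_len (for y[]) hold negative counts so both
; loops can simply count up towards zero.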
.1: cmp     x_len,UNROLL_THRESHOLD  ; unroll if many loops
    jae     L_unroll
    lea     y_ptr,[y_ptr+rdx*8+8]   ; pointer to end limb of y[]
    neg     x_len                   ; negative counter for x[n]
    neg     rdx                     ; negative counter for y[n]
    mov     rax,[rsi+x_len*8]       ; x[0] -> rax
    mov     y_len,rdx               ; now -(y_len - 1)
    inc     x_len                   ; negative counter for x[1]
    xor     rbx,rbx                 ; for carry
    mov     rcx,x_len               ; now -(x_len - 1) -> rcx (was r_ptr)
    mov     rbp,[y_ptr+rdx*8]       ; y[n] -> rbp
    jmp     .3
.2: mov     rcx,x_len               ; restore x[] counter
    xor     rbx,rbx                 ; clear carry
    add     rdi,8                   ; increase end of r[] pointer
    mov     rbp,[y_ptr+y_len*8]     ; y[n] -> rbp
    mov     rax,[rsi+rcx*8-8]       ; x[m] -> rax
.3: mul     rbp                     ; x[m] * y[n]
    add     rbx,rax                 ; add carry
    adc     rdx,byte 0
    add     [rdi+rcx*8],rbx         ; add into r[]
    mov     rax,[rsi+rcx*8]         ; next x[m] -> rax
    adc     rdx,byte 0              ; add carry to rdx
    inc     rcx                     ; go to next limb of x[]
    mov     rbx,rdx                 ; move carry into rbx
    jnz     .3                      ; loop while more limbs of x[]
    mul     rbp                     ; do last limb
    add     rbx,rax                 ; propagate carry
    adc     rdx,byte 0
    add     [rdi],rbx               ; add into r[]
    adc     rdx,byte 0              ; add in any carry
    inc     y_len
    mov     [rdi+8],rdx             ; move (not add) carry into r[]
    jnz     .2                      ; go to next limb of y[]
    jmp     L_exit

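; Large operands: the inner loop is unrolled UNROLL_COUNT (16) limbs at a
; time.  The setup below works out x_len modulo UNROLL_COUNT and turns it
; into a byte offset into the unrolled code, so the first, partial, group of
; limbs is handled by jumping into the middle of the unrolled loop rather
; than by a separate clean-up loop.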
L_unroll:
    mov     rdi,r_ptr
    mov     rcx,x_len
    mov     rsi,x_ptr
    mov     rbp,[y_ptr+8]           ; y[1]
    lea     y_ptr,[y_ptr+rdx*8+8]   ; past end of y[]
    neg     rdx
    mov     y_len,rdx               ; now -(y_len - 1)
    lea     rbx,[UNROLL_COUNT-2+rcx]
    dec     rcx
    mov     rax,[rsi]               ; x[0]
    and     rbx,-UNROLL_MASK-1      ; round down to a multiple of UNROLL_COUNT
    neg     rcx
    neg     rbx
    and     rcx,UNROLL_MASK
    mov     [rsp+v_par],rcx         ; partial count, reloaded on every pass
    mov     [rsp+v_adj],rbx         ; pointer adjustment, reloaded on every pass
    mov     rdx,rcx
    shl     rcx,3
    lea     rcx,[rcx+rcx*2]         ; rcx = rcx * 24
    lea     v_jmp,[rel .4]
    lea     v_jmp,[v_jmp+rcx]       ; entry point into the unrolled code
    neg     rdx
    mov     [rsp+v_xlo],rax         ; save x[0]
    lea     rdi,[rdi+rdx*8+8]
    lea     rsi,[rsi+rdx*8+8]
    jmp     .3
.2: mov     rbx,[rsp+v_adj]
    mov     rax,[rsp+v_xlo]         ; x[0]
    lea     rdi,[rdi+rbx*8+8]
    lea     rsi,[rsi+rbx*8]
    mov     rbp,[y_ptr+y_len*8]     ; next y[n]
.3: mul     rbp
    sar     rbx,UNROLL_LOG2
    mov     rcx,[rsp+v_par]
    mov     v_ctr,rbx               ; negative loop count for the unrolled blocks
    test    cl,1                    ; low word of product + carry
    mov     rbx,dword 0             ; is in rcx on even rounds and
    mov     rcx,dword 0             ; rbx on odd rounds - we must
    cmovz   rcx,rax                 ; put low word of first product
    cmovnz  rbx,rax                 ; in the right register here
    jmp     v_jmp
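
; Unrolled inner loop: each %rep step below handles CHUNK_COUNT (2) limbs,
; with the running carry word alternating between rcx and rbx so that no
; extra register moves are needed inside the loop.  The forced [byte ...]
; displacements keep each limb's code the same size, which is what allows
; the computed jump above to land at a limb boundary.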
.4:
%define CHUNK_COUNT 2
%assign i 0
%rep UNROLL_COUNT / CHUNK_COUNT
%define disp0 8 * i * CHUNK_COUNT

    mov     rax,[byte rsi+disp0]
    adc     rbx,rdx
    mul     rbp
    add     [byte rdi+disp0],rcx
    mov     rcx,dword 0
    adc     rbx,rax
    mov     rax,[byte rsi+disp0+8]
    adc     rcx,rdx
    mul     rbp
    add     [byte rdi+disp0+8],rbx
    mov     rbx,dword 0
    adc     rcx,rax

%assign i i + 1
%endrep

    inc     v_ctr
    lea     rsi,[UNROLL_BYTES+rsi]
    lea     rdi,[UNROLL_BYTES+rdi]
    jnz     .4

    adc     rdx,byte 0              ; fold the carry from the last adc
    add     [rdi],rcx               ; add the pending low word into r[]
    adc     rdx,byte 0
    inc     y_len
    mov     [rdi+8],rdx             ; store the new top limb
    jnz     .2                      ; more limbs of y[] to process
L_exit:
    epilogue reg_save_list, 3

    end