; mpir/mpn/x86_64/amd64/mul_basecase.as

; AMD K8 mpn_mul_basecase -- multiply two mpn numbers.
;
; This file is just an adaptation of similar file in the k7 directory
; made by P. Gaudry in April 2005.
;
; Here is the copyright of the original k7 version:
;
; Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
;
; Copyright 2005, 2006 Pierrick Gaudry
;
; Copyright 2008 William Hart
;
; This file is part of the MPIR Library.
;
; The MPIR Library is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public License as
; published by the Free Software Foundation; either version 2.1 of the
; License, or (at your option) any later version.
;
; The MPIR Library is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public
; License along with the MPIR Library; see the file COPYING.LIB. If
; not, write to the Free Software Foundation, Inc., 51 Franklin Street,
; Fifth Floor, Boston, MA 02110-1301, USA.
;
; void mpn_mul_basecase (mp_ptr wp,
;                        mp_srcptr xp, mp_size_t xsize,
;                        mp_srcptr yp, mp_size_t ysize);
;
; Calculate xp,xsize multiplied by yp,ysize, storing the result in
; wp,xsize+ysize.
;
; This routine is essentially the same as mpn/generic/mul_basecase.c, but
; it's faster because it does most of the mpn_addmul_1() startup
; calculations only once. The saving is 15-25% on typical sizes coming from
; the Karatsuba multiply code.
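;
; As a rough C-level sketch of what this routine computes (illustrative
; only; it mirrors the structure of mpn/generic/mul_basecase.c rather than
; reproducing it):
;
;   mp_limb_t cy = mpn_mul_1 (wp, xp, xsize, yp[0]);   /* first yp limb   */
;   wp[xsize] = cy;
;   for (mp_size_t i = 1; i < ysize; i++)              /* remaining limbs */
;     {
;       cy = mpn_addmul_1 (wp + i, xp, xsize, yp[i]);
;       wp[xsize + i] = cy;
;     }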
%include '../yasm_mac.inc'
%define UNROLL_LOG2 4
%define UNROLL_COUNT (1 << UNROLL_LOG2)
%define UNROLL_MASK (UNROLL_COUNT - 1)
%define UNROLL_BYTES (8 * UNROLL_COUNT)
%define UNROLL_THRESHOLD 5
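; With UNROLL_LOG2 = 4 these evaluate to UNROLL_COUNT = 16 limbs per pass of
; the unrolled addmul loop, UNROLL_MASK = 15 and UNROLL_BYTES = 128; operands
; with xsize below UNROLL_THRESHOLD = 5 take the simple addmul loop instead.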
%define ysize r8
%define yp rcx
%define xsize rdx
%define xp rsi
%define wp rdi
BITS 64
align 32
GLOBAL_FUNC mpn_mul_basecase
cmp xsize, 2
ja xsize_more_than_two
je two_by_something
; one limb by one limb
mov rax, [yp] ; rax <- y0
mul qword [xp] ; [ax:dx] <- y0*x0
mov [wp], rax
mov [wp+8], rdx
ret
;-----------------------------------------------------------------------------
two_by_something:
; xsize = 2, hence rdx is free for usage
dec ysize ; YSIZE--
mov r9, [yp] ; r9 <- y0
mov rax, [xp] ; rax <- x0
jnz two_by_two
; two limbs by one limb
mul r9 ; [ax:dx] <- x0*y0
mov [wp], rax ; w0 <- low_prod
mov rax, [xp+8] ; rax <- x1 (rsi is now free)
mov rsi, rdx ; rsi <- carry
mul r9 ; [ax:dx] <- x1*y0
add rsi, rax ; rsi <- ax+carry ( --> carry_flag)
mov [wp+8], rsi ; w1 <- rsi
adc rdx, 0 ; dx <- dx+carry
mov [wp+16], rdx ; w2 <- dx
ret
; -----------------------------------------------------------------------------
align 16
two_by_two:
; rax   x0                r8
; rbx   **untouched**     r9     y0
; rcx   yp                r10-11
; rdx
; rsi   xp
; rdi   wp
; rbp
mul r9 ; [ax:dx] <- x0*y0
mov r10, rdx ; r10 <- carry for w1
mov [wp], rax ; w0 <- ax
mov rax, [xp+8] ; ax <- x1
mul r9 ; [ax:dx] <- x1*y0
add r10, rax ; r10 <- r10 + ax for w1
adc rdx, 0 ; dx <- carry for w2
mov rcx, [yp+8] ; cx <- y1
mov [wp+8], r10 ; w1 <- r10
mov rax, [xp+8] ; ax <- x1
mov r10, rdx ; carry, for w2
mul rcx ; [ax:dx] <- x1*y1
add r10, rax ; r10 <- for w2
adc rdx, 0 ; for w3
mov rax, [xp] ; x0
mov rsi, rdx ; carry, for w3
mul rcx ; x0*y1
add [wp+8], rax ; w1 += ax
adc r10, rdx ; for w2
mov [wp+16], r10 ; w2 <- r10
adc rsi, 0
mov [wp+24], rsi ; w3 <- carry in rsi
ret
; -----------------------------------------------------------------------------
align 16
xsize_more_than_two:
; The first limb of yp is processed with a simple mpn_mul_1 style loop
; inline. Unrolling this doesn't seem worthwhile since it's only run once
; (whereas the addmul below is run ysize-1 times). A call to the real
; mpn_mul_1 would be slowed down by the call and parameter-passing overhead,
; and doesn't seem likely to be worthwhile for the typical 13-26 limb
; operands the Karatsuba code calls here with.
; rax                     r8     ysize
; rbx
; rcx   yp
; rdx   xsize
; rsi   xp
; rdi   wp
; rbp
%define YSIZE r8 ; already there
%define YP r9 ; init : rcx
%define XSIZE r10 ; init : rdx
%define XP r11 ; init : rsi
%define WP r12 ; init : rdi (r12 is callee-saved, so it must be saved)
; FRAME doesn't carry on from previous, no pushes yet here
%define SAVE_RBX [rsp+16]
%define SAVE_R12 [rsp+8]
%define SAVE_RBP [rsp]
sub rsp, 24
mov SAVE_RBX, rbx
mov SAVE_R12, r12
mov SAVE_RBP, rbp
mov YP, rcx
mov XSIZE, rdx
mov XP, rsi
mov WP, rdi
mov rbp, [YP]
mov rcx, XSIZE
xor rbx, rbx
lea rsi, [XP+XSIZE*8] ; xp end
lea rdi, [WP+XSIZE*8] ; wp end of mul1
neg rcx
mul1:
; rax scratch
; rbx carry
; rcx counter, negative
; rdx scratch
; rsi xp end
; rdi wp end of mul1
; rbp multiplier
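;
; Each iteration computes x[i]*y0 + carry: the low 64 bits are stored at
; wp[i], and the high 64 bits plus the carry-out of the add become the
; carry (rbx) for the next limb.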
mov rax, [rsi+rcx*8]
mul rbp
add rax, rbx
mov [rdi+rcx*8], rax
mov rbx, 0
adc rbx, rdx
inc rcx
jnz mul1
mov rdx, YSIZE
mov rcx, XSIZE
mov [rdi], rbx ; final carry
dec rdx
jnz ysize_more_than_one
mov rbx, SAVE_RBX
mov rbp, SAVE_RBP
mov r12, SAVE_R12
add rsp, 24
ret
ysize_more_than_one:
cmp rcx, UNROLL_THRESHOLD
mov rax, YP
jae unroll
; -----------------------------------------------------------------------------
; simple addmul looping
;
; rax yp
; rbx
; rcx xsize
; rdx ysize-1
; rsi xp end
; rdi wp end of mul1
; rbp
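;
; Each outer pass adds xp[0..xsize-1] times the next yp limb into
; wp[j..j+xsize-1] (one limb higher than the previous pass) and stores the
; resulting high limb at wp[j+xsize], i.e. [rdi+8] at the end of the pass.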
lea rbp, [rax+rdx*8+8] ; yp end
neg rcx
neg rdx
mov rax, [rsi+rcx*8] ; xp low limb
mov YSIZE, rdx ; -(ysize-1)
inc rcx
xor rbx, rbx ; initial carry
mov XSIZE, rcx ; -(xsize-1)
mov YP, rbp
mov rbp, [rbp+rdx*8] ; yp second lowest limb - multiplier
jmp simple_outer_entry
; TODO: check the offset here; should simple_outer_top be aligned?
simple_outer_top:
; rbp ysize counter, negative
mov rdx, YP
mov rcx, XSIZE ; -(xsize-1)
xor rbx, rbx ; carry
mov YSIZE, rbp
add rdi, 8 ; next position in wp
mov rbp, [rdx+rbp*8] ; yp limb - multiplier
mov rax, [rsi+rcx*8-8] ; xp low limb
simple_outer_entry:
simple_inner:
; rax xp limb
; rbx carry limb
; rcx loop counter (negative)
; rdx scratch
; rsi xp end
; rdi wp end
; rbp multiplier
mul rbp
add rbx, rax
adc rdx, 0
add [rdi+rcx*8], rbx
mov rax, [rsi+rcx*8]
adc rdx, 0
inc rcx
mov rbx, rdx
jnz simple_inner
mul rbp
mov rbp, YSIZE
add rbx, rax
adc rdx, 0
add [rdi], rbx
adc rdx, 0
inc rbp
mov [rdi+8], rdx
jnz simple_outer_top
mov rbx, SAVE_RBX
mov rbp, SAVE_RBP
mov r12, SAVE_R12
add rsp, 24
ret
; -----------------------------------------------------------------------------
;
; The unrolled loop is the same as in mpn_addmul_1(), see that code for some
; comments.
;
; VAR_ADJUST is the negative of the total number of limbs by which the lea
; instructions in the inner loop increment xp and wp. It is used to adjust
; xp and wp back, and is right-shifted to give the initial VAR_COUNTER at
; the top of the outer loop.
;
; VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT
; up to -1, inclusive.
;
; VAR_JMP in the k7 version held the computed jump into the unrolled loop;
; here the jump target is kept in the XP register (r11) instead.
;
; VAR_XP_LOW is the least significant limb of xp, which is needed at the
; start of the unrolled loop.
;
; YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
; inclusive.
;
; YP is offset appropriately so that the YSIZE counter can be
; added to give the location of the next limb of yp, which is the multiplier
; in the unrolled loop.
;
; The trick with VAR_ADJUST means it's only necessary to do one fetch in the
; outer loop to take care of xp, wp and the inner loop counter.
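;
; For example, with xsize = 13 (so each addmul pass covers xsize-1 = 12
; limbs) and UNROLL_COUNT = 16: VAR_ADJUST = -16, VAR_COUNTER starts at
; -16>>4 = -1 (one trip through the unrolled block per yp limb), and
; 16-12 = 4 limb slots are skipped by jumping 4*24 = 96 code bytes into the
; block, with rsi and rdi adjusted so the skipped slots line up.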
%define VAR_COUNTER [rsp+16]
%define VAR_ADJUST [rsp+8]
%define VAR_XP_LOW [rsp]
%define VAR_EXTRA_SPACE 24
unroll:
; rax yp
; rbx
; rcx xsize
; rdx ysize-1
; rsi xp end
; rdi wp end of mul1
; rbp
mov rsi, XP ; from here, XP not used
mov rbp, [rax+8] ; multiplier (yp second limb)
lea rax, [rax+rdx*8+8] ; yp adjust for ysize indexing
mov rdi, WP
mov YP, rax
neg rdx
; From here, only YP and YSIZE are used
; Hence r10, r11, r12 are free for use
mov YSIZE, rdx
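; The setup below computes (see the worked example above): the number of
; limb slots skipped at the top of the unrolled block, -(xsize-1) masked by
; UNROLL_MASK, kept in rcx and saved in r12 for the later parity test;
; VAR_ADJUST, minus xsize-1 rounded up to a multiple of UNROLL_COUNT; and
; the entry offset of 24 code bytes per skipped limb, split as rcx*16+r10*8.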
lea rbx, [rcx+UNROLL_COUNT-2] ; (xsize-1)+UNROLL_COUNT-1
dec rcx ; xsize-1
mov rax, [rsi] ; xp low limb
and rbx, -UNROLL_MASK-1
neg rcx
sub rsp, VAR_EXTRA_SPACE
neg rbx
and rcx, UNROLL_MASK
mov r12, rcx ; for later parity test
mov VAR_ADJUST, rbx
mov rdx, rcx
mov r10, rcx
shl rcx, 4
shl r10, 3
sar rbx, UNROLL_LOG2
; 24=16+8 code bytes per limb
%ifdef PIC
call pic_calc
unroll_here:
..@unroll_here1:
%else
lea rcx, [rcx + r10 + ..@unroll_entry1]
%endif
neg rdx
mov VAR_XP_LOW, rax
mov XP, rcx ; XP holds the computed jump target (VAR_JMP)
lea rdi, [rdi+rdx*8+8] ; wp and xp, adjust for unrolling,
lea rsi, [rsi+rdx*8+8] ; and start at second limb
jmp unroll_outer_entry
%ifdef PIC
pic_calc:
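; The call above pushed the address of unroll_here, so [rsp] holds that
; address here; adding the distance from unroll_here to unroll_entry plus
; the per-limb byte offset already in rcx and r10 gives the absolute entry
; point without needing an absolute relocation.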
lea rcx, [rcx+r10]
add rcx, ..@unroll_entry1 - ..@unroll_here1
add rcx, [rsp]
ret
%endif
; --------------------------------------------------------------------------
align 32
unroll_outer_top:
; rbp ysize counter, negative
mov rbx, VAR_ADJUST
mov rdx, YP
mov rax, VAR_XP_LOW
mov YSIZE, rbp ; store incremented ysize counter
lea rdi, [rdi+rbx*8+8]
lea rsi, [rsi+rbx*8]
sar rbx, UNROLL_LOG2
mov rbp, [rdx+rbp*8] ; yp next multiplier
mov rcx, XP
unroll_outer_entry:
mul rbp
mov rcx, r12
test cl, 1 ; and clear carry bit
mov VAR_COUNTER, rbx
mov rbx, 0
mov rcx, 0
cmovz rcx, rax ; rax into low carry, zero into high carry limb
cmovnz rbx, rax
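; The unrolled body alternates rcx and rbx as the pending low-carry register
; on successive limb slots, so the parity of the entry offset (saved in r12)
; selects which of the two receives the first product here.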
; In the k7 version the jump target had to be refetched from VAR_JMP on the
; stack because registers were tight; here it is simply kept in XP (r11).
jmp XP
; -----------------------------------------------------------------------------
align 32
unroll_top:
; rax xp limb
; rbx carry high
; rcx carry low
; rdx scratch
; rsi xp+8
; rdi wp
; rbp yp multiplier limb
;
; VAR_COUNTER loop counter, negative
;
; 24 bytes each limb
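; The byte displacement overrides below keep every limb at this fixed size,
; so the computed jump of 24 code bytes per skipped limb always lands on a
; limb boundary.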
unroll_entry:
..@unroll_entry1:
%define CHUNK_COUNT 2
%assign i 0
%rep UNROLL_COUNT / CHUNK_COUNT
%define disp0 8 * i * CHUNK_COUNT
adc rbx, rdx
mov rax, [byte rsi+disp0]
mul rbp
add [byte rdi+disp0], rcx
mov rcx, 0
adc rbx, rax
adc rcx, rdx
mov rax, [byte rsi+disp0+8]
mul rbp
add [byte rdi+disp0+8], rbx
mov rbx, 0
adc rcx, rax
%assign i i + 1
%endrep
inc qword VAR_COUNTER
lea rsi, [rsi+UNROLL_BYTES]
lea rdi, [rdi+UNROLL_BYTES]
jnz unroll_top
; rax
; rbx zero
; rcx low
; rdx high
; rsi
; rdi wp, pointing at second last limb
; rbp
;
; carry flag to be added to high
mov rbp, YSIZE
adc rdx, 0
add [rdi], rcx
adc rdx, 0
inc rbp
mov [rdi+8], rdx
jnz unroll_outer_top
add rsp, 24 ; drop VAR_EXTRA_SPACE (the unroll variables) first...
mov rbp, SAVE_RBP
mov r12, SAVE_R12
mov rbx, SAVE_RBX
add rsp, 24 ; ...then the register save area
ret