dnl mpir/mpn/x86_64/amd64/mul_basecase.asm

dnl AMD K8 mpn_mul_basecase -- multiply two mpn numbers.
dnl This file is just an adaptation of similar file in the k7 directory.
dnl Adapted by P. Gaudry in April 2005.
dnl Here is the copyright of the original k7 version:
dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or
dnl modify it under the terms of the GNU Lesser General Public License as
dnl published by the Free Software Foundation; either version 2.1 of the
dnl License, or (at your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful,
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
dnl Lesser General Public License for more details.
dnl
dnl You should have received a copy of the GNU Lesser General Public
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
dnl Suite 330, Boston, MA 02111-1307, USA.
include(`../config.m4')
deflit(UNROLL_COUNT, 16)
C void mpn_mul_basecase (mp_ptr wp,
C                        mp_srcptr xp, mp_size_t xsize,
C                        mp_srcptr yp, mp_size_t ysize);
C
C Calculate xp,xsize multiplied by yp,ysize, storing the result in
C wp,xsize+ysize.
C
C This routine is essentially the same as mpn/generic/mul_basecase.c, but
C it's faster because it does most of the mpn_addmul_1() startup
C calculations only once. The saving is 15-25% on typical sizes coming from
C the Karatsuba multiply code.
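C
C For reference, the structure is the schoolbook scheme of
C mpn/generic/mul_basecase.c: one mul_1 pass for the first limb of yp,
C then one addmul_1 pass per remaining limb.  As a minimal C sketch
C (using the real mpn_mul_1/mpn_addmul_1 entry points):
C
C	wp[xsize] = mpn_mul_1 (wp, xp, xsize, yp[0]);
C	for (i = 1; i < ysize; i++)
C	  wp[xsize + i] = mpn_addmul_1 (wp + i, xp, xsize, yp[i]);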
ifdef(`PIC',`
deflit(UNROLL_THRESHOLD, 5)
',`
deflit(UNROLL_THRESHOLD, 5)
')
define(param_ysize,%r8)
define(param_yp, %rcx)
define(param_xsize,%rdx)
define(param_xp, %rsi)
define(param_wp, %rdi)
TEXT
ALIGN(32)
PROLOGUE(mpn_mul_basecase)
deflit(`FRAME',0)
cmpq $2, param_xsize
ja L(xsize_more_than_two)
je L(two_by_something)
C one limb by one limb
movq (param_yp), %rax C rax <- y0
mulq (param_xp) C [ax:dx] <- y0*x0
movq %rax, (param_wp)
movq %rdx, 8(param_wp)
ret
C -----------------------------------------------------------------------------
L(two_by_something):
C xsize = 2, hence rdx is free for use
deflit(`FRAME',0)
decq param_ysize C YSIZE--
movq (param_yp), %r9 C r9 <- y0
movq (param_xp), %rax C rax <- x0
jnz L(two_by_two)
C two limbs by one limb
mulq %r9 C [ax:dx] <- x0*y0
movq %rax, (param_wp) C w0 <- low_prod
movq 8(param_xp), %rax C rax <- x1 (rsi is now free)
movq %rdx, %rsi C rsi <- carry
mulq %r9 C [ax:dx] <- x1*y0
addq %rax, %rsi C rsi <- ax+carry ( --> carry_flag)
movq %rsi, 8(param_wp) C w1 <- rsi
adcq $0, %rdx C dx <- dx+carry
movq %rdx, 16(param_wp) C w2 <- dx
ret
C -----------------------------------------------------------------------------
ALIGN(16)
L(two_by_two):
C rax   x0              r8
C rbx   **untouched**   r9      y0
C rcx   yp              r10-11
C rdx
C rsi   xp
C rdi   wp
C rbp
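C
C With B = 2^64, the 2x2 product below is formed schoolbook fashion as
C
C	w = x0*y0 + (x1*y0 + x0*y1)*B + x1*y1*B^2
C
C with the partial products issued in the order x0*y0, x1*y0, x1*y1, x0*y1.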
mulq %r9 C [ax:dx] <- x0*y0
movq %rdx, %r10 C r10 <- carry for w1
movq %rax, (param_wp) C w0 <- ax
movq 8(param_xp), %rax C ax <- x1
mulq %r9 C [ax:dx] <- x1*y0
addq %rax, %r10 C r10 <- r10 + ax for w1
adcq $0, %rdx C dx <- carry for w2
movq 8(param_yp), %rcx C cx <- y1
movq %r10, 8(param_wp) C w1 <- r10
movq 8(param_xp), %rax C ax <- x1
movq %rdx, %r10 C carry, for w2
mulq %rcx C [ax:dx] <- x1*y1
addq %rax, %r10 C r10 <- for w2
adcq $0, %rdx C for w3
movq (param_xp), %rax C x0
movq %rdx, %rsi C carry, for w3
mulq %rcx C x0*y1
addq %rax, 8(param_wp) C w1 += ax
adcq %rdx, %r10 C for w2
movq %r10, 16(param_wp) C w2 <- r10
adcq $0, %rsi
movq %rsi, 24(param_wp) C w3 <- carry in rsi
ret
C -----------------------------------------------------------------------------
ALIGN(16)
L(xsize_more_than_two):
C The first limb of yp is processed with a simple inline mpn_mul_1 style
C loop. Unrolling this doesn't seem worthwhile since it's only run once
C (whereas the addmul below runs ysize-1 times). A call to the actual
C mpn_mul_1 would be slowed down by the call overhead and by parameter
C pushing and popping, and doesn't seem likely to pay off for the typical
C 13-26 limb operands the Karatsuba code calls here with.
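C
C In C terms, the L(mul1) loop below is just the following sketch, with y0
C the multiplier yp[0] held in %rbp (assuming a compiler that provides
C unsigned __int128 for the 64x64->128 bit product):
C
C	carry = 0;
C	for (i = 0; i < xsize; i++)
C	  {
C	    unsigned __int128 t = (unsigned __int128) xp[i] * y0 + carry;
C	    wp[i] = (mp_limb_t) t;
C	    carry = (mp_limb_t) (t >> 64);
C	  }
C	wp[xsize] = carry;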
C rax           r8      ysize
C rbx
C rcx   yp
C rdx   xsize
C rsi   xp
C rdi   wp
C rbp
define(PARAM_YSIZE,%r8) C already there
define(PARAM_YP, %r9) C init : %rcx
define(PARAM_XSIZE,%r10) C init : %rdx
define(PARAM_XP, %r11) C init : %rsi
define(PARAM_WP, %r12) C init : %rdi (r12 is callee-saved, so it must be saved)
dnl FRAME doesn't carry over from the code above; no pushes have been done yet
defframe(`SAVE_RBX',-8)
defframe(`SAVE_R12',-16)
defframe(`SAVE_RBP',-24)
deflit(`FRAME',0)
subq $24, %rsp
deflit(`FRAME',24)
movq %rbx, SAVE_RBX
movq %r12, SAVE_R12
movq %rbp, SAVE_RBP
movq %rcx, PARAM_YP
movq %rdx, PARAM_XSIZE
movq %rsi, PARAM_XP
movq %rdi, PARAM_WP
movq (PARAM_YP), %rbp
movq PARAM_XSIZE, %rcx
xorq %rbx, %rbx
leaq (PARAM_XP,PARAM_XSIZE,8), %rsi C xp end
leaq (PARAM_WP,PARAM_XSIZE,8), %rdi C wp end of mul1
negq %rcx
L(mul1):
C rax scratch
C rbx carry
C rcx counter, negative
C rdx scratch
C rsi xp end
C rdi wp end of mul1
C rbp multiplier
movq (%rsi,%rcx,8), %rax
mulq %rbp
addq %rbx, %rax
movq %rax, (%rdi,%rcx,8)
movq $0, %rbx
adcq %rdx, %rbx
incq %rcx
jnz L(mul1)
movq PARAM_YSIZE, %rdx
movq PARAM_XSIZE, %rcx
movq %rbx, (%rdi) C final carry
decq %rdx
jnz L(ysize_more_than_one)
movq SAVE_RBX, %rbx
movq SAVE_RBP, %rbp
movq SAVE_R12, %r12
addq $FRAME, %rsp
ret
L(ysize_more_than_one):
cmpq $UNROLL_THRESHOLD, %rcx
movq PARAM_YP, %rax
jae L(unroll)
C -----------------------------------------------------------------------------
C simple addmul looping
C
C rax yp
C rbx
C rcx xsize
C rdx ysize-1
C rsi xp end
C rdi wp end of mul1
C rbp
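C
C Each trip around L(simple_outer_top) is a plain addmul_1 over xp; as a
C C sketch, with m standing for the current yp limb held in %rbp (same
C unsigned __int128 assumption as in the mul1 sketch above):
C
C	carry = 0;
C	for (i = 0; i < xsize; i++)
C	  {
C	    unsigned __int128 t = (unsigned __int128) xp[i] * m + wp[i] + carry;
C	    wp[i] = (mp_limb_t) t;
C	    carry = (mp_limb_t) (t >> 64);
C	  }
C	wp[xsize] = carry;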
leaq 8(%rax,%rdx,8), %rbp C yp end
negq %rcx
negq %rdx
movq (%rsi,%rcx,8), %rax C xp low limb
movq %rdx, PARAM_YSIZE C -(ysize-1)
incq %rcx
xorq %rbx, %rbx C initial carry
movq %rcx, PARAM_XSIZE C -(xsize-1)
movq %rbp, PARAM_YP
movq (%rbp,%rdx,8), %rbp C yp second lowest limb - multiplier
jmp L(simple_outer_entry)
C TODO: check the code offset of this label; an ALIGN may be wanted
L(simple_outer_top):
C rbp ysize counter, negative
movq PARAM_YP, %rdx
movq PARAM_XSIZE, %rcx C -(xsize-1)
xorq %rbx, %rbx C carry
movq %rbp, PARAM_YSIZE
addq $8, %rdi C next position in wp
movq (%rdx,%rbp,8), %rbp C yp limb - multiplier
movq -8(%rsi,%rcx,8), %rax C xp low limb
L(simple_outer_entry):
L(simple_inner):
C rax xp limb
C rbx carry limb
C rcx loop counter (negative)
C rdx scratch
C rsi xp end
C rdi wp end
C rbp multiplier
mulq %rbp
addq %rax, %rbx
adcq $0, %rdx
addq %rbx, (%rdi,%rcx,8)
movq (%rsi,%rcx,8), %rax
adcq $0, %rdx
incq %rcx
movq %rdx, %rbx
jnz L(simple_inner)
mulq %rbp
movq PARAM_YSIZE, %rbp
addq %rax, %rbx
adcq $0, %rdx
addq %rbx, (%rdi)
adcq $0, %rdx
incq %rbp
movq %rdx, 8(%rdi)
jnz L(simple_outer_top)
movq SAVE_RBX, %rbx
movq SAVE_RBP, %rbp
movq SAVE_R12, %r12
addq $FRAME, %rsp
ret
C -----------------------------------------------------------------------------
C
C The unrolled loop is the same as in mpn_addmul_1(), see that code for some
C comments.
C
C VAR_ADJUST is the negative of how many limbs the leaqs in the inner loop
C increment xp and wp by. It is used to adjust xp and wp back, and is
C right-shifted to give an initial VAR_COUNTER at the top of the outer loop.
C
C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT
C up to -1, inclusive.
C
C VAR_JMP is the computed jump into the unrolled loop.
C
C VAR_XP_LOW is the least significant limb of xp, which is needed at the
C start of the unrolled loop.
C
C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
C inclusive.
C
C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
C added to give the location of the next limb of yp, which is the multiplier
C in the unrolled loop.
C
C The trick with VAR_ADJUST means it's only necessary to do one fetch in the
C outer loop to take care of xp, wp and the inner loop counter.
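C
C A worked example of the bookkeeping: with UNROLL_COUNT=16 and xsize=21,
C each outer pass handles xsize-1 = 20 limbs beyond the low one.
C VAR_ADJUST becomes -32 (20 rounded up to a multiple of 16, negated), so
C VAR_COUNTER starts at -32>>4 = -2; the computed jump skips
C (-20 mod 16) = 12 limbs' worth of code at 24 bytes per limb, so the
C first trip through the unrolled body does 4 limbs and the second the
C full 16, giving 4+16 = 20 as required.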
defframe(VAR_COUNTER, -32)
defframe(VAR_ADJUST, -40)
defframe(VAR_XP_LOW, -48)
deflit(VAR_EXTRA_SPACE, 24)
L(unroll):
C rax yp
C rbx
C rcx xsize
C rdx ysize-1
C rsi xp end
C rdi wp end of mul1
C rbp
movq PARAM_XP, %rsi C from here, PARAM_XP not used
movq 8(%rax), %rbp C multiplier (yp second limb)
leaq 8(%rax,%rdx,8), %rax C yp adjust for ysize indexing
movq PARAM_WP, %rdi
movq %rax, PARAM_YP
negq %rdx
C From here, only PARAM_YP and PARAM_YSIZE are used
C Hence r10, r11, r12 are free for use
movq %rdx, PARAM_YSIZE
leaq UNROLL_COUNT-2(%rcx), %rbx C (xsize-1)+UNROLL_COUNT-1
decq %rcx C xsize-1
movq (%rsi), %rax C xp low limb
andq $-UNROLL_MASK-1, %rbx
negq %rcx
subq $VAR_EXTRA_SPACE, %rsp
deflit(`FRAME',24+VAR_EXTRA_SPACE)
negq %rbx
andq $UNROLL_MASK, %rcx
movq %rcx, %r12 C for later parity test
movq %rbx, VAR_ADJUST
movq %rcx, %rdx
movq %rcx, %r10
shlq $4, %rcx
shlq $3, %r10
sarq $UNROLL_LOG2, %rbx
C 24 = 16+8 code bytes per limb, hence the shlq $4 and shlq $3 above
ifdef(`PIC',`
callq L(pic_calc)
L(unroll_here):
',`
leaq L(unroll_entry)(%rcx,%r10,1), %rcx
')
negq %rdx
movq %rax, VAR_XP_LOW
movq %rcx, PARAM_XP C PARAM_XP reused to hold VAR_JMP
leaq 8(%rdi,%rdx,8), %rdi C wp and xp, adjust for unrolling,
leaq 8(%rsi,%rdx,8), %rsi C and start at second limb
jmp L(unroll_outer_entry)
ifdef(`PIC',`
L(pic_calc):
C See mpn/x86/README about old gas bugs
leaq (%rcx,%r10,1), %rcx
addq $L(unroll_entry)-L(unroll_here), %rcx
addq (%rsp), %rcx
ret
')
C --------------------------------------------------------------------------
ALIGN(32)
L(unroll_outer_top):
C rbp ysize counter, negative
movq VAR_ADJUST, %rbx
movq PARAM_YP, %rdx
movq VAR_XP_LOW, %rax
movq %rbp, PARAM_YSIZE C store incremented ysize counter
leaq 8(%rdi,%rbx,8), %rdi
leaq (%rsi,%rbx,8), %rsi
sarq $UNROLL_LOG2, %rbx
movq (%rdx,%rbp,8), %rbp C yp next multiplier
movq PARAM_XP, %rcx
L(unroll_outer_entry):
mulq %rbp
movq %r12, %rcx
testb $1, %cl C and clear carry bit
movq %rbx, VAR_COUNTER
movq $0, %rbx
movq $0, %rcx
cmovz %rax, %rcx C rax into low carry, zero into high carry limb
cmovnz %rax, %rbx
C The k7 original had to refetch VAR_JMP from memory here because
C registers were tight; on x86_64 the jump target stays in PARAM_XP (%r11)
jmp *PARAM_XP
C -----------------------------------------------------------------------------
ALIGN(32)
L(unroll_top):
C rax xp limb
C rbx carry high
C rcx carry low
C rdx scratch
C rsi xp+8
C rdi wp
C rbp yp multiplier limb
C
C VAR_COUNTER loop counter, negative
C
C 24 bytes each limb
L(unroll_entry):
deflit(CHUNK_COUNT,2)
forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
deflit(`disp0', eval(i*CHUNK_COUNT*8 ifelse(UNROLL_BYTES,256,-128)))
deflit(`disp1', eval(disp0 + 8))
adcq %rdx, %rbx
Zdisp( movq, disp0,(%rsi), %rax)
mulq %rbp
Zdisp( addq, %rcx, disp0,(%rdi))
movq $0, %rcx
adcq %rax, %rbx
adcq %rdx, %rcx
movq disp1(%rsi), %rax
mulq %rbp
addq %rbx, disp1(%rdi)
movq $0, %rbx
adcq %rax, %rcx
')
incq VAR_COUNTER
leaq UNROLL_BYTES(%rsi), %rsi
leaq UNROLL_BYTES(%rdi), %rdi
jnz L(unroll_top)
C rax
C rbx zero
C rcx low
C rdx high
C rsi
C rdi wp, pointing at second last limb
C rbp
C
C carry flag to be added to high
deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
deflit(`disp1', eval(disp0-0 + 8))
movq PARAM_YSIZE, %rbp
adcq $0, %rdx
addq %rcx, disp0(%rdi)
adcq $0, %rdx
incq %rbp
movq %rdx, disp1(%rdi)
jnz L(unroll_outer_top)
movq SAVE_RBP, %rbp
movq SAVE_R12, %r12
movq SAVE_RBX, %rbx
addq $FRAME, %rsp
ret
EPILOGUE()