dnl  AMD K8 mpn_mul_basecase -- multiply two mpn numbers.

dnl  This file is just an adaptation of a similar file in the k7 directory.
dnl  Adapted by P. Gaudry in April 2005.
dnl  Here is the copyright of the original k7 version:

dnl  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 2.1 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public
dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
dnl  Suite 330, Boston, MA 02111-1307, USA.

include(`../config.m4')


deflit(UNROLL_COUNT, 16)


C void mpn_mul_basecase (mp_ptr wp,
C                        mp_srcptr xp, mp_size_t xsize,
C                        mp_srcptr yp, mp_size_t ysize);
C
C Calculate xp,xsize multiplied by yp,ysize, storing the result in
C wp,xsize+ysize.
C
C This routine is essentially the same as mpn/generic/mul_basecase.c, but
C it's faster because it does most of the mpn_addmul_1() startup
C calculations only once.  The saving is 15-25% on typical sizes coming from
C the Karatsuba multiply code.
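
C In C terms this is the classic basecase algorithm: one mul_1 pass for
C the low limb of yp, then ysize-1 addmul_1 passes.  A rough sketch (the
C generic version lives in mpn/generic/mul_basecase.c):
C
C	wp[xsize] = mpn_mul_1 (wp, xp, xsize, yp[0]);
C	for (i = 1; i < ysize; i++)
C	  wp[xsize + i] = mpn_addmul_1 (wp + i, xp, xsize, yp[i]);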

ifdef(`PIC',`
deflit(UNROLL_THRESHOLD, 5)
',`
deflit(UNROLL_THRESHOLD, 5)
')

define(param_ysize,%r8)
define(param_yp,   %rcx)
define(param_xsize,%rdx)
define(param_xp,   %rsi)
define(param_wp,   %rdi)

	TEXT
	ALIGN(32)
PROLOGUE(mpn_mul_basecase)
deflit(`FRAME',0)

	cmpq	$2, param_xsize
	ja	L(xsize_more_than_two)
	je	L(two_by_something)

C one limb by one limb

	movq	(param_yp), %rax	C rax <- y0
	mulq	(param_xp)		C [ax:dx] <- y0*x0

	movq	%rax, (param_wp)
	movq	%rdx, 8(param_wp)
	ret

C -----------------------------------------------------------------------------
L(two_by_something):
	C xsize = 2, hence rdx is free for usage
deflit(`FRAME',0)
	decq	param_ysize		C ysize--

	movq	(param_yp), %r9		C r9 <- y0
	movq	(param_xp), %rax	C rax <- x0
	jnz	L(two_by_two)


C two limbs by one limb

	mulq	%r9			C [ax:dx] <- x0*y0

	movq	%rax, (param_wp)	C w0 <- low_prod
	movq	8(param_xp), %rax	C rax <- x1 (rsi is now free)
	movq	%rdx, %rsi		C rsi <- carry

	mulq	%r9			C [ax:dx] <- x1*y0

	addq	%rax, %rsi		C rsi <- ax+carry ( --> carry_flag)
	movq	%rsi, 8(param_wp)	C w1 <- rsi

	adcq	$0, %rdx		C dx <- dx+carry
	movq	%rdx, 16(param_wp)	C w2 <- dx

	ret


C -----------------------------------------------------------------------------
	ALIGN(16)
L(two_by_two):
	C rax	x0		r8
	C rbx	**untouched**	r9	y0
	C rcx	yp		r10-11
	C rdx
	C rsi	xp
	C rdi	wp
	C rbp

	mulq	%r9			C [ax:dx] <- x0*y0

	movq	%rdx, %r10		C r10 <- carry for w1

	movq	%rax, (param_wp)	C w0 <- ax
	movq	8(param_xp), %rax	C ax <- x1

	mulq	%r9			C [ax:dx] <- x1*y0

	addq	%rax, %r10		C r10 <- r10 + ax for w1

	adcq	$0, %rdx		C dx <- carry for w2
	movq	8(param_yp), %rcx	C cx <- y1
	movq	%r10, 8(param_wp)	C w1 <- r10

	movq	8(param_xp), %rax	C ax <- x1
	movq	%rdx, %r10		C carry, for w2

	mulq	%rcx			C [ax:dx] <- x1*y1

	addq	%rax, %r10		C r10 <- for w2

	adcq	$0, %rdx		C for w3
	movq	(param_xp), %rax	C x0

	movq	%rdx, %rsi		C carry, for w3

	mulq	%rcx			C x0*y1

	addq	%rax, 8(param_wp)	C w1 += ax
	adcq	%rdx, %r10		C for w2
	movq	%r10, 16(param_wp)	C w2 <- r10

	adcq	$0, %rsi
	movq	%rsi, 24(param_wp)	C w3 <- carry in rsi

	ret

C -----------------------------------------------------------------------------
	ALIGN(16)
L(xsize_more_than_two):

C The first limb of yp is processed with a simple mpn_mul_1 style loop
C inline.  Unrolling this doesn't seem worthwhile since it's only run once
C (whereas the addmul below is run ysize-1 times).  A call to the actual
C mpn_mul_1 would be slowed down by the call overhead and by parameter
C pushing and popping, and doesn't seem likely to be worthwhile on the
C typical 13-26 limb operations the Karatsuba code calls here with.

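C For reference, the L(mul1) loop below computes, in C terms (a sketch
C using the umul_ppmm() primitive from longlong.h):
C
C	carry = 0;
C	for (i = 0; i < xsize; i++) {
C	  umul_ppmm (hi, lo, xp[i], yp[0]);	/* 128-bit product */
C	  lo += carry;
C	  hi += (lo < carry);			/* carry out of the add */
C	  wp[i] = lo;
C	  carry = hi;
C	}
C	wp[xsize] = carry;
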
	C rax			r8	ysize
	C rbx
	C rcx	yp
	C rdx	xsize
	C rsi	xp
	C rdi	wp
	C rbp


define(PARAM_YSIZE,%r8)		C already there
define(PARAM_YP,   %r9)		C init : %rcx
define(PARAM_XSIZE,%r10)	C init : %rdx
define(PARAM_XP,   %r11)	C init : %rsi
define(PARAM_WP,   %r12)	C init : %rdi   r12 should be saved!

dnl  FRAME doesn't carry on from previous, no pushes yet here
defframe(`SAVE_RBX',-8)
defframe(`SAVE_R12',-16)
defframe(`SAVE_RBP',-24)
deflit(`FRAME',0)

	subq	$24, %rsp
deflit(`FRAME',24)

	movq	%rbx, SAVE_RBX
	movq	%r12, SAVE_R12
	movq	%rbp, SAVE_RBP

	movq	%rcx, PARAM_YP
	movq	%rdx, PARAM_XSIZE
	movq	%rsi, PARAM_XP
	movq	%rdi, PARAM_WP

	movq	(PARAM_YP), %rbp	C multiplier, y0
	movq	PARAM_XSIZE, %rcx

	xorq	%rbx, %rbx		C initial carry
	leaq	(PARAM_XP,PARAM_XSIZE,8), %rsi	C xp end

	leaq	(PARAM_WP,PARAM_XSIZE,8), %rdi	C wp end of mul1
	negq	%rcx			C -xsize, loop counter

L(mul1):
	C rax	scratch
	C rbx	carry
	C rcx	counter, negative
	C rdx	scratch
	C rsi	xp end
	C rdi	wp end of mul1
	C rbp	multiplier

	movq	(%rsi,%rcx,8), %rax	C next xp limb

	mulq	%rbp			C [ax:dx] <- xp[i]*y0

	addq	%rbx, %rax		C add carry-in
	movq	%rax, (%rdi,%rcx,8)	C store low limb
	movq	$0, %rbx

	adcq	%rdx, %rbx		C carry-out for next iteration
	incq	%rcx
	jnz	L(mul1)

	movq	PARAM_YSIZE, %rdx
	movq	PARAM_XSIZE, %rcx

	movq	%rbx, (%rdi)		C final carry
	decq	%rdx

	jnz	L(ysize_more_than_one)


	movq	SAVE_RBX, %rbx
	movq	SAVE_RBP, %rbp
	movq	SAVE_R12, %r12
	addq	$FRAME, %rsp

	ret


L(ysize_more_than_one):
	cmpq	$UNROLL_THRESHOLD, %rcx
	movq	PARAM_YP, %rax

	jae	L(unroll)


C -----------------------------------------------------------------------------
C simple addmul looping
C
	C rax	yp
	C rbx
	C rcx	xsize
	C rdx	ysize-1
	C rsi	xp end
	C rdi	wp end of mul1
	C rbp

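C In C terms, each pass of the L(simple_outer)/L(simple_inner) loops
C below performs one addmul_1 step, roughly (a sketch, again using
C umul_ppmm() from longlong.h):
C
C	carry = 0;
C	for (i = 0; i < xsize; i++) {
C	  umul_ppmm (hi, lo, xp[i], y);
C	  lo += carry;
C	  hi += (lo < carry);
C	  wp[i] += lo;
C	  hi += (wp[i] < lo);	/* carry out of the memory add */
C	  carry = hi;
C	}
C	wp[xsize] = carry;
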
	leaq	8(%rax,%rdx,8), %rbp	C yp end
	negq	%rcx
	negq	%rdx

	movq	(%rsi,%rcx,8), %rax	C xp low limb
	movq	%rdx, PARAM_YSIZE	C -(ysize-1)
	incq	%rcx

	xorq	%rbx, %rbx		C initial carry
	movq	%rcx, PARAM_XSIZE	C -(xsize-1)
	movq	%rbp, PARAM_YP

	movq	(%rbp,%rdx,8), %rbp	C yp second lowest limb - multiplier
	jmp	L(simple_outer_entry)


C TODO: check the code offset here; an ALIGN may be worthwhile.
L(simple_outer_top):
	C rbp	ysize counter, negative

	movq	PARAM_YP, %rdx
	movq	PARAM_XSIZE, %rcx	C -(xsize-1)
	xorq	%rbx, %rbx		C carry

	movq	%rbp, PARAM_YSIZE
	addq	$8, %rdi		C next position in wp

	movq	(%rdx,%rbp,8), %rbp	C yp limb - multiplier
	movq	-8(%rsi,%rcx,8), %rax	C xp low limb


L(simple_outer_entry):

L(simple_inner):
	C rax	xp limb
	C rbx	carry limb
	C rcx	loop counter (negative)
	C rdx	scratch
	C rsi	xp end
	C rdi	wp end
	C rbp	multiplier

	mulq	%rbp			C [ax:dx] <- xp[i]*y

	addq	%rax, %rbx		C add carry-in to low product limb
	adcq	$0, %rdx

	addq	%rbx, (%rdi,%rcx,8)	C add into wp[i]
	movq	(%rsi,%rcx,8), %rax	C fetch next xp limb
	adcq	$0, %rdx

	incq	%rcx
	movq	%rdx, %rbx		C carry-out
	jnz	L(simple_inner)


	mulq	%rbp			C last limb: [ax:dx] <- xp[xsize-1]*y

	movq	PARAM_YSIZE, %rbp
	addq	%rax, %rbx

	adcq	$0, %rdx
	addq	%rbx, (%rdi)		C add into last wp limb of this pass

	adcq	$0, %rdx
	incq	%rbp			C ysize counter

	movq	%rdx, 8(%rdi)		C store high limb
	jnz	L(simple_outer_top)


	movq	SAVE_RBX, %rbx
	movq	SAVE_RBP, %rbp
	movq	SAVE_R12, %r12
	addq	$FRAME, %rsp

	ret


C -----------------------------------------------------------------------------
C
C The unrolled loop is the same as in mpn_addmul_1(), see that code for some
C comments.
C
C VAR_ADJUST is the negative of how many limbs the leaq instructions in the
C inner loop increment xp and wp.  This is used to adjust xp and wp back,
C and is right shifted to give an initial VAR_COUNTER at the top of the
C outer loop.
C
C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT
C up to -1, inclusive.
C
C VAR_JMP is the computed jump into the unrolled loop.
C
C VAR_XP_LOW is the least significant limb of xp, which is needed at the
C start of the unrolled loop.
C
C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
C inclusive.
C
C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
C added to give the location of the next limb of yp, which is the multiplier
C in the unrolled loop.
C
C The trick with VAR_ADJUST means it's only necessary to do one fetch in the
C outer loop to take care of xp, wp and the inner loop counter.

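C A worked example, with UNROLL_COUNT=16 (so UNROLL_MASK=15) and
C xsize=13: rbx = (xsize-1)+(UNROLL_COUNT-1) = 27 rounds down to 16,
C giving VAR_ADJUST = -16 and VAR_COUNTER = -16/16 = -1, ie. a single
C pass of the unrolled loop.  rcx = (-(xsize-1)) mod 16 = 4, so VAR_JMP
C enters the loop 4 limb slots (4*24 code bytes) past L(unroll_entry),
C leaving 16-4 = 12 = xsize-1 limbs to process.
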
defframe(VAR_COUNTER, -32)
defframe(VAR_ADJUST,  -40)
defframe(VAR_XP_LOW,  -48)
deflit(VAR_EXTRA_SPACE, 24)


L(unroll):
	C rax	yp
	C rbx
	C rcx	xsize
	C rdx	ysize-1
	C rsi	xp end
	C rdi	wp end of mul1
	C rbp

	movq	PARAM_XP, %rsi		C from here, PARAM_XP not used
	movq	8(%rax), %rbp		C multiplier (yp second limb)
	leaq	8(%rax,%rdx,8), %rax	C yp adjust for ysize indexing

	movq	PARAM_WP, %rdi
	movq	%rax, PARAM_YP
	negq	%rdx

	C From here, only PARAM_YP and PARAM_YSIZE are used.
	C Hence r10, r11, r12 are free for use.

	movq	%rdx, PARAM_YSIZE
	leaq	UNROLL_COUNT-2(%rcx), %rbx	C (xsize-1)+UNROLL_COUNT-1
	decq	%rcx			C xsize-1

	movq	(%rsi), %rax		C xp low limb
	andq	$-UNROLL_MASK-1, %rbx
	negq	%rcx

	subq	$VAR_EXTRA_SPACE, %rsp
deflit(`FRAME',24+VAR_EXTRA_SPACE)
	negq	%rbx
	andq	$UNROLL_MASK, %rcx
	movq	%rcx, %r12		C for later parity test

	movq	%rbx, VAR_ADJUST
	movq	%rcx, %rdx
	movq	%rcx, %r10
	shlq	$4, %rcx
	shlq	$3, %r10

	sarq	$UNROLL_LOG2, %rbx

	C 24=16+8 code bytes per limb
ifdef(`PIC',`
	callq	L(pic_calc)
L(unroll_here):
',`
	leaq	L(unroll_entry)(%rcx,%r10,1), %rcx
')
	negq	%rdx

	movq	%rax, VAR_XP_LOW
	movq	%rcx, PARAM_XP		C PARAM_XP used for VAR_JMP
	leaq	8(%rdi,%rdx,8), %rdi	C wp and xp, adjust for unrolling,
	leaq	8(%rsi,%rdx,8), %rsi	C and start at second limb
	jmp	L(unroll_outer_entry)


ifdef(`PIC',`
L(pic_calc):
	C See mpn/x86/README about old gas bugs
	leaq	(%rcx,%r10,1), %rcx
	addq	$L(unroll_entry)-L(unroll_here), %rcx
	addq	(%rsp), %rcx
	ret
')


C --------------------------------------------------------------------------
	ALIGN(32)
L(unroll_outer_top):
	C rbp	ysize counter, negative

	movq	VAR_ADJUST, %rbx
	movq	PARAM_YP, %rdx

	movq	VAR_XP_LOW, %rax
	movq	%rbp, PARAM_YSIZE	C store incremented ysize counter

	leaq	8(%rdi,%rbx,8), %rdi
	leaq	(%rsi,%rbx,8), %rsi
	sarq	$UNROLL_LOG2, %rbx

	movq	(%rdx,%rbp,8), %rbp	C yp next multiplier
	movq	PARAM_XP, %rcx

L(unroll_outer_entry):
	mulq	%rbp

	movq	%r12, %rcx		C entry offset, for its parity
	testb	$1, %cl			C and clear carry bit
	movq	%rbx, VAR_COUNTER
	movq	$0, %rbx

	movq	$0, %rcx
	cmovz	%rax, %rcx		C rax into low carry, zero into high carry limb
	cmovnz	%rax, %rbx		C (the unrolled loop alternates which of
					C rbx/rcx holds the low carry each limb,
					C so the first product goes into the slot
					C matching the entry point's parity)

	C Extra fetch of VAR_JMP is bad, but registers are tight.
	C TODO: we have more registers now!
	jmp	*PARAM_XP


C -----------------------------------------------------------------------------
	ALIGN(32)
L(unroll_top):
	C rax	xp limb
	C rbx	carry high
	C rcx	carry low
	C rdx	scratch
	C rsi	xp+8
	C rdi	wp
	C rbp	yp multiplier limb
	C
	C VAR_COUNTER loop counter, negative
	C
	C 24 bytes each limb

L(unroll_entry):

deflit(CHUNK_COUNT,2)
forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
	deflit(`disp0', eval(i*CHUNK_COUNT*8 ifelse(UNROLL_BYTES,256,-128)))
	deflit(`disp1', eval(disp0 + 8))

	adcq	%rdx, %rbx
Zdisp(	movq,	disp0,(%rsi), %rax)

	mulq	%rbp

Zdisp(	addq,	%rcx, disp0,(%rdi))
	movq	$0, %rcx

	adcq	%rax, %rbx


	adcq	%rdx, %rcx
	movq	disp1(%rsi), %rax

	mulq	%rbp

	addq	%rbx, disp1(%rdi)
	movq	$0, %rbx

	adcq	%rax, %rcx
')


	incq	VAR_COUNTER
	leaq	UNROLL_BYTES(%rsi), %rsi
	leaq	UNROLL_BYTES(%rdi), %rdi

	jnz	L(unroll_top)


	C rax
	C rbx	zero
	C rcx	low
	C rdx	high
	C rsi
	C rdi	wp, pointing at second last limb
	C rbp
	C
	C carry flag to be added to high

deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
deflit(`disp1', eval(disp0-0 + 8))

	movq	PARAM_YSIZE, %rbp
	adcq	$0, %rdx
	addq	%rcx, disp0(%rdi)

	adcq	$0, %rdx
	incq	%rbp

	movq	%rdx, disp1(%rdi)
	jnz	L(unroll_outer_top)


	movq	SAVE_RBP, %rbp
	movq	SAVE_R12, %r12
	movq	SAVE_RBX, %rbx
	addq	$FRAME, %rsp

	ret

EPILOGUE()