dnl AMD K8 mpn_mul_basecase -- multiply two mpn numbers. dnl This file is just an adaptation of similar file in the k7 directory. dnl Adapted by P. Gaudry in April 2005. dnl Here is the copyright of the original k7 version: dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc. dnl dnl This file is part of the GNU MP Library. dnl dnl The GNU MP Library is free software; you can redistribute it and/or dnl modify it under the terms of the GNU Lesser General Public License as dnl published by the Free Software Foundation; either version 2.1 of the dnl License, or (at your option) any later version. dnl dnl The GNU MP Library is distributed in the hope that it will be useful, dnl but WITHOUT ANY WARRANTY; without even the implied warranty of dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU dnl Lesser General Public License for more details. dnl dnl You should have received a copy of the GNU Lesser General Public dnl License along with the GNU MP Library; see the file COPYING.LIB. If dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - dnl Suite 330, Boston, MA 02111-1307, USA. include(`../config.m4') deflit(UNROLL_COUNT, 16) C void mpn_mul_basecase (mp_ptr wp, C mp_srcptr xp, mp_size_t xsize, C mp_srcptr yp, mp_size_t ysize); C C Calculate xp,xsize multiplied by yp,ysize, storing the result in C wp,xsize+ysize. C C This routine is essentially the same as mpn/generic/mul_basecase.c, but C it's faster because it does most of the mpn_addmul_1() startup C calculations only once. The saving is 15-25% on typical sizes coming from C the Karatsuba multiply code. ifdef(`PIC',` deflit(UNROLL_THRESHOLD, 5) ',` deflit(UNROLL_THRESHOLD, 5) ') define(param_ysize,%r8) define(param_yp, %rcx) define(param_xsize,%rdx) define(param_xp, %rsi) define(param_wp, %rdi) TEXT ALIGN(32) PROLOGUE(mpn_mul_basecase) deflit(`FRAME',0) cmpq $2, param_xsize ja L(xsize_more_than_two) je L(two_by_something) C one limb by one limb movq (param_yp), %rax C rax <- y0 mulq (param_xp) C [ax:dx] <- y0*x0 movq %rax, (param_wp) movq %rdx, 8(param_wp) ret C ----------------------------------------------------------------------------- L(two_by_something): C xsize = 2, hence rdx is free for usage deflit(`FRAME',0) decq param_ysize C YSIZE-- movq (param_yp), %r9 C r9 <- y0 movq (param_xp), %rax C rax <- x0 jnz L(two_by_two) C two limbs by one limb mulq %r9 C [ax:dx] <- x0*y0 movq %rax, (param_wp) C w0 <- low_prod movq 8(param_xp), %rax C rax <- x1 (rsi is now free) movq %rdx, %rsi C rsi <- carry mulq %r9 C [ax:dx] <- x1*y0 addq %rax, %rsi C rsi <- ax+carry ( --> carry_flag) movq %rsi, 8(param_wp) C w1 <- rsi adcq $0, %rdx C dx <- dx+carry movq %rdx, 16(param_wp) C w2 <- dx ret C ----------------------------------------------------------------------------- ALIGN(16) L(two_by_two): C rax x0 r8 C rbx **untouched** r9 y0 C rcx yp r10-11 C rdx C rsi xp C rdi wp C rbp mulq %r9 C [ax:dx] <- x0*y0 movq %rdx, %r10 C r10 <- carry for w1 movq %rax, (param_wp) C w0 <- ax movq 8(param_xp), %rax C ax <- x1 mulq %r9 C [ax:dx] <- x1*y0 addq %rax, %r10 C r10 <- r10 + ax for w1 adcq $0, %rdx C dx <- carry for w2 movq 8(param_yp), %rcx C cx <- y1 movq %r10, 8(param_wp) C w1 <- r10 movq 8(param_xp), %rax C ax <- x1 movq %rdx, %r10 C carry, for w2 mulq %rcx C [ax:dx] <- x1*y1 addq %rax, %r10 C r10 <- for w2 adcq $0, %rdx C for w3 movq (param_xp), %rax C x0 movq %rdx, %rsi C carry, for w3 mulq %rcx C x0*y1 addq %rax, 8(param_wp) C w1 += ax adcq %rdx, %r10 C for w2 movq %r10, 16(param_wp) C w2 <- r10 adcq $0, %rsi movq %rsi, 24(param_wp) C w3 <- carry in rsi ret C ----------------------------------------------------------------------------- ALIGN(16) L(xsize_more_than_two): C The first limb of yp is processed with a simple mpn_mul_1 style loop C inline. Unrolling this doesn't seem worthwhile since it's only run once C (whereas the addmul below is run ysize-1 many times). A call to the C actual mpn_mul_1 will be slowed down by the call and parameter pushing and C popping, and doesn't seem likely to be worthwhile on the typical 13-26 C limb operations the Karatsuba code calls here with. C rax r8 ysize C rbx C rcx yp C rdx xsize C rsi xp C rdi wp C rbp define(PARAM_YSIZE,%r8) C already there define(PARAM_YP, %r9) C init : %rcx define(PARAM_XSIZE,%r10) C init : %rdx define(PARAM_XP, %r11) C init : %rsi define(PARAM_WP, %r12) C init : %rdi r12 should be saved! dnl FRAME doesn't carry on from previous, no pushes yet here defframe(`SAVE_RBX',-8) defframe(`SAVE_R12',-16) defframe(`SAVE_RBP',-24) deflit(`FRAME',0) subq $24, %rsp deflit(`FRAME',24) movq %rbx, SAVE_RBX movq %r12, SAVE_R12 movq %rbp, SAVE_RBP movq %rcx, PARAM_YP movq %rdx, PARAM_XSIZE movq %rsi, PARAM_XP movq %rdi, PARAM_WP movq (PARAM_YP), %rbp movq PARAM_XSIZE, %rcx xorq %rbx, %rbx leaq (PARAM_XP,PARAM_XSIZE,8), %rsi C xp end leaq (PARAM_WP,PARAM_XSIZE,8), %rdi C wp end of mul1 negq %rcx L(mul1): C rax scratch C rbx carry C rcx counter, negative C rdx scratch C rsi xp end C rdi wp end of mul1 C rbp multiplier movq (%rsi,%rcx,8), %rax mulq %rbp addq %rbx, %rax movq %rax, (%rdi,%rcx,8) movq $0, %rbx adcq %rdx, %rbx incq %rcx jnz L(mul1) movq PARAM_YSIZE, %rdx movq PARAM_XSIZE, %rcx movq %rbx, (%rdi) C final carry decq %rdx jnz L(ysize_more_than_one) movq SAVE_RBX, %rbx movq SAVE_RBP, %rbp movq SAVE_R12, %r12 addq $FRAME, %rsp ret L(ysize_more_than_one): cmpq $UNROLL_THRESHOLD, %rcx movq PARAM_YP, %rax jae L(unroll) C ----------------------------------------------------------------------------- C simple addmul looping C C rax yp C rbx C rcx xsize C rdx ysize-1 C rsi xp end C rdi wp end of mul1 C rbp leaq 8(%rax,%rdx,8), %rbp C yp end negq %rcx negq %rdx movq (%rsi,%rcx,8), %rax C xp low limb movq %rdx, PARAM_YSIZE C -(ysize-1) incq %rcx xorq %rbx, %rbx C initial carry movq %rcx, PARAM_XSIZE C -(xsize-1) movq %rbp, PARAM_YP movq (%rbp,%rdx,8), %rbp C yp second lowest limb - multiplier jmp L(simple_outer_entry) C this is offset ???? Align ? L(simple_outer_top): C rbp ysize counter, negative movq PARAM_YP, %rdx movq PARAM_XSIZE, %rcx C -(xsize-1) xorq %rbx, %rbx C carry movq %rbp, PARAM_YSIZE addq $8, %rdi C next position in wp movq (%rdx,%rbp,8), %rbp C yp limb - multiplier movq -8(%rsi,%rcx,8), %rax C xp low limb L(simple_outer_entry): L(simple_inner): C rax xp limb C rbx carry limb C rcx loop counter (negative) C rdx scratch C rsi xp end C rdi wp end C rbp multiplier mulq %rbp addq %rax, %rbx adcq $0, %rdx addq %rbx, (%rdi,%rcx,8) movq (%rsi,%rcx,8), %rax adcq $0, %rdx incq %rcx movq %rdx, %rbx jnz L(simple_inner) mulq %rbp movq PARAM_YSIZE, %rbp addq %rax, %rbx adcq $0, %rdx addq %rbx, (%rdi) adcq $0, %rdx incq %rbp movq %rdx, 8(%rdi) jnz L(simple_outer_top) movq SAVE_RBX, %rbx movq SAVE_RBP, %rbp movq SAVE_R12, %r12 addq $FRAME, %rsp ret C ----------------------------------------------------------------------------- C C The unrolled loop is the same as in mpn_addmul_1(), see that code for some C comments. C C VAR_ADJUST is the negative of how many limbs the leals in the inner loop C increment xp and wp. This is used to adjust back xp and wp, and rshifted C to given an initial VAR_COUNTER at the top of the outer loop. C C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT C up to -1, inclusive. C C VAR_JMP is the computed jump into the unrolled loop. C C VAR_XP_LOW is the least significant limb of xp, which is needed at the C start of the unrolled loop. C C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1, C inclusive. C C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be C added to give the location of the next limb of yp, which is the multiplier C in the unrolled loop. C C The trick with VAR_ADJUST means it's only necessary to do one fetch in the C outer loop to take care of xp, wp and the inner loop counter. defframe(VAR_COUNTER, -32) defframe(VAR_ADJUST, -40) defframe(VAR_XP_LOW, -48) deflit(VAR_EXTRA_SPACE, 24) L(unroll): C rax yp C rbx C rcx xsize C rdx ysize-1 C rsi xp end C rdi wp end of mul1 C rbp movq PARAM_XP, %rsi C from here, PARAM_XP not used movq 8(%rax), %rbp C multiplier (yp second limb) leaq 8(%rax,%rdx,8), %rax C yp adjust for ysize indexing movq PARAM_WP, %rdi movq %rax, PARAM_YP negq %rdx C From here, only PARAM_YP and PARAM_YSIZE are used C Hence r10, r11, r12 are free for use movq %rdx, PARAM_YSIZE leaq UNROLL_COUNT-2(%rcx), %rbx C (xsize-1)+UNROLL_COUNT-1 decq %rcx C xsize-1 movq (%rsi), %rax C xp low limb andq $-UNROLL_MASK-1, %rbx negq %rcx subq $VAR_EXTRA_SPACE, %rsp deflit(`FRAME',24+VAR_EXTRA_SPACE) negq %rbx andq $UNROLL_MASK, %rcx movq %rcx, %r12 C for later parity test movq %rbx, VAR_ADJUST movq %rcx, %rdx movq %rcx, %r10 shlq $4, %rcx shlq $3, %r10 sarq $UNROLL_LOG2, %rbx C 24=16+8 code bytes per limb ifdef(`PIC',` callq L(pic_calc) L(unroll_here): ',` leaq L(unroll_entry) (%rcx,%r10,1), %rcx ') negq %rdx movq %rax, VAR_XP_LOW movq %rcx, PARAM_XP C PARAM_XP used for VAR_JUMP leaq 8(%rdi,%rdx,8), %rdi C wp and xp, adjust for unrolling, leaq 8(%rsi,%rdx,8), %rsi C and start at second limb jmp L(unroll_outer_entry) ifdef(`PIC',` L(pic_calc): C See mpn/x86/README about old gas bugs leaq (%rcx,%r10,1), %rcx addq $L(unroll_entry)-L(unroll_here), %rcx addq (%rsp), %rcx ret ') C -------------------------------------------------------------------------- ALIGN(32) L(unroll_outer_top): C ebp ysize counter, negative movq VAR_ADJUST, %rbx movq PARAM_YP, %rdx movq VAR_XP_LOW, %rax movq %rbp, PARAM_YSIZE C store incremented ysize counter leaq 8(%rdi,%rbx,8), %rdi leaq (%rsi,%rbx,8), %rsi sarq $UNROLL_LOG2, %rbx movq (%rdx,%rbp,8), %rbp C yp next multiplier movq PARAM_XP, %rcx L(unroll_outer_entry): mulq %rbp movq %r12, %rcx testb $1, %cl C and clear carry bit movq %rbx, VAR_COUNTER movq $0, %rbx movq $0, %rcx cmovz %rax, %rcx C eax into low carry, zero into high carry limb cmovnz %rax, %rbx C Extra fetch of VAR_JMP is bad, but registers are tight C TODO: we have more registers, now!!!! jmp *PARAM_XP C ----------------------------------------------------------------------------- ALIGN(32) L(unroll_top): C rax xp limb C rbx carry high C rcx carry low C rdx scratch C rsi xp+8 C rdi wp C rbp yp multiplier limb C C VAR_COUNTER loop counter, negative C C 24 bytes each limb L(unroll_entry): deflit(CHUNK_COUNT,2) forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, ` deflit(`disp0', eval(i*CHUNK_COUNT*8 ifelse(UNROLL_BYTES,256,-128))) deflit(`disp1', eval(disp0 + 8)) adcq %rdx, %rbx Zdisp( movq, disp0,(%rsi), %rax) mulq %rbp Zdisp( addq, %rcx, disp0,(%rdi)) movq $0, %rcx adcq %rax, %rbx adcq %rdx, %rcx movq disp1(%rsi), %rax mulq %rbp addq %rbx, disp1(%rdi) movq $0, %rbx adcq %rax, %rcx ') incq VAR_COUNTER leaq UNROLL_BYTES(%rsi), %rsi leaq UNROLL_BYTES(%rdi), %rdi jnz L(unroll_top) C rax C rbx zero C rcx low C rdx high C rsi C rdi wp, pointing at second last limb) C rbp C C carry flag to be added to high deflit(`disp0', ifelse(UNROLL_BYTES,256,-128)) deflit(`disp1', eval(disp0-0 + 8)) movq PARAM_YSIZE, %rbp adcq $0, %rdx addq %rcx, disp0(%rdi) adcq $0, %rdx incq %rbp movq %rdx, disp1(%rdi) jnz L(unroll_outer_top) movq SAVE_RBP, %rbp movq SAVE_R12, %r12 movq SAVE_RBX, %rbx addq $FRAME, %rsp ret EPILOGUE()