dnl  AMD K8 mpn_sqr_basecase -- square an mpn number.
dnl  This file is just an adaptation of a similar file in the k7 directory.
dnl  Adapted by P. Gaudry in April 2005.
dnl  Here is the copyright of the original k7 version:
dnl  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 2.1 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public
dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
dnl  Suite 330, Boston, MA 02111-1307, USA.

include(`../config.m4')


deflit(SQR_KARATSUBA_THRESHOLD_MAX, 34)

ifdef(`SQR_KARATSUBA_THRESHOLD_OVERRIDE',
`define(`SQR_KARATSUBA_THRESHOLD',SQR_KARATSUBA_THRESHOLD_OVERRIDE)')

m4_config_gmp_mparam(`SQR_KARATSUBA_THRESHOLD')
C deflit(UNROLL_COUNT, eval(SQR_KARATSUBA_THRESHOLD-3))
deflit(UNROLL_COUNT, 31)


C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C With a SQR_KARATSUBA_THRESHOLD around 50 this code is about 1500 bytes,
C which is quite a bit, but is considered good value since squares big
C enough to use most of the code will be spending quite a few cycles in it.


define(param_dst,  %rdi)
define(param_src,  %rsi)
define(param_size, %rdx)

define(PARAM_DST,  %r8)
define(PARAM_SRC,  %r9)
define(PARAM_SIZE, %r10)

	TEXT
	ALIGN(32)
PROLOGUE(mpn_sqr_basecase)

	movq	param_size, %rcx
	movq	param_src, %rax

	cmpq	$2, %rcx
	movq	param_dst, %rdx

	je	L(two_limbs)
	ja	L(three_or_more)


C------------------------------------------------------------------------------
C One limb only
	C rax	src
	C rcx	size
	C rdx	dst
	C
	C rsi	src
	C rdi	dst

	movq	(%rsi), %rax
	mulq	%rax
	movq	%rdx, 8(%rdi)
	movq	%rax, (%rdi)
	ret


C------------------------------------------------------------------------------
C
C Using the read/modify/write "add"s seems to be faster than saving and
C restoring registers.  Perhaps the loads for the first set hide under the
C mul latency and the second gets store to load forwarding.

	ALIGN(16)
L(two_limbs):
	C rax
	C rbx
	C rcx
	C rdx
	C rsi	src
	C rdi	dst
	C
	C r8	s0
	C r9	s1

	movq	(%rsi), %r8
	movq	8(%rsi), %r9
	movq	%r8, %rax

	mulq	%rax		C src[0]^2

	movq	%rax, (%rdi)	C dst[0]
	movq	%r9, %rax

	movq	%rdx, 8(%rdi)	C dst[1]

	mulq	%rax		C src[1]^2

	movq	%rax, 16(%rdi)	C dst[2]
	movq	%r8, %rax

	movq	%rdx, 24(%rdi)	C dst[3]

	mulq	%r9		C src[0]*src[1]

	C the cross product is added in twice, once here ...
	addq	%rax, 8(%rdi)
	adcq	%rdx, 16(%rdi)
	adcq	$0, 24(%rdi)
	ASSERT(nc)

	C ... and once more, giving the doubled term 2*src[0]*src[1]
	addq	%rax, 8(%rdi)
	adcq	%rdx, 16(%rdi)
	adcq	$0, 24(%rdi)
	ASSERT(nc)

	ret


C------------------------------------------------------------------------------
defframe(SAVE_RBX, -8)
defframe(SAVE_RBP, -16)
deflit(STACK_SPACE, 16)

L(three_or_more):
	subq	$STACK_SPACE, %rsp
	cmpq	$4, %rcx
	jae	L(four_or_more)
deflit(`FRAME',STACK_SPACE)


C------------------------------------------------------------------------------
C Three limbs
C
C Writing out the loads and stores separately at the end of this code comes
C out about 10 cycles faster than using adcqs to memory.
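C
C As an illustrative sketch only (not generated code, and add2 below is a
C made-up name, not a GMP routine), the three limb case amounts to
C
C	{ dst[1], dst[0] } = src[0]*src[0];
C	{ dst[3], dst[2] } = src[1]*src[1];
C	{ dst[5], dst[4] } = src[2]*src[2];
C	add2 (dst+1, src[0]*src[1]);	/* add 2*src[0]*src[1] at dst[1] */
C	add2 (dst+2, src[0]*src[2]);	/* add 2*src[0]*src[2] at dst[2] */
C	add2 (dst+3, src[1]*src[2]);	/* add 2*src[1]*src[2] at dst[3] */
C
C where add2 stands for adding a doubled two-limb product with carry
C propagation.  The code below forms the three cross products first, then
C doubles them and adds them into dst[1..5] with a single adc chain.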
	C rax	src
	C rcx	size
	C rdx	dst

	movq	%rbx, SAVE_RBX
	movq	%rax, %rbx	C src
	movq	(%rax), %rax

	movq	%rdx, %rcx	C dst

	mulq	%rax		C src[0] ^ 2

	movq	%rax, (%rcx)
	movq	8(%rbx), %rax

	movq	%rdx, 8(%rcx)

	mulq	%rax		C src[1] ^ 2

	movq	%rax, 16(%rcx)
	movq	16(%rbx), %rax

	movq	%rdx, 24(%rcx)

	mulq	%rax		C src[2] ^ 2

	movq	%rax, 32(%rcx)
	movq	(%rbx), %rax

	movq	%rdx, 40(%rcx)

	mulq	8(%rbx)		C src[0] * src[1]

	movq	%rax, %rsi
	movq	(%rbx), %rax

	movq	%rdx, %rdi

	mulq	16(%rbx)	C src[0] * src[2]

	addq	%rax, %rdi
	movq	%rbp, SAVE_RBP
	movq	$0, %rbp

	movq	8(%rbx), %rax
	adcq	%rdx, %rbp

	mulq	16(%rbx)	C src[1] * src[2]

	xorq	%rbx, %rbx
	addq	%rax, %rbp
	adcq	$0, %rdx

	C rax
	C rbx	zero, will be dst[5]
	C rcx	dst
	C rdx	dst[4]
	C rsi	dst[1]
	C rdi	dst[2]
	C rbp	dst[3]

	adcq	$0, %rdx
	addq	%rsi, %rsi

	adcq	%rdi, %rdi
	movq	8(%rcx), %rax

	adcq	%rbp, %rbp

	adcq	%rdx, %rdx

	adcq	$0, %rbx
	addq	%rax, %rsi
	movq	16(%rcx), %rax

	adcq	%rax, %rdi
	movq	24(%rcx), %rax
	movq	%rsi, 8(%rcx)

	adcq	%rax, %rbp
	movq	32(%rcx), %rax
	movq	%rdi, 16(%rcx)

	adcq	%rax, %rdx
	movq	40(%rcx), %rax
	movq	%rbp, 24(%rcx)

	adcq	%rbx, %rax
	ASSERT(nc)
	movq	SAVE_RBX, %rbx
	movq	SAVE_RBP, %rbp

	movq	%rdx, 32(%rcx)
	movq	%rax, 40(%rcx)

	addq	$FRAME, %rsp

	ret


C------------------------------------------------------------------------------
L(four_or_more):

C First multiply src[0]*src[1..size-1] and store at dst[1..size].
C Further products are added in rather than stored.

	C rax	src
	C rbx
	C rcx	size
	C rdx	dst
	C rsi
	C rdi
	C rbp

defframe(`VAR_COUNTER',-24)
defframe(`VAR_JMP',    -32)
deflit(EXTRA_STACK_SPACE, 16)

	movq	param_dst, PARAM_DST
	movq	param_src, PARAM_SRC
	movq	%rcx, PARAM_SIZE

	movq	%rbx, SAVE_RBX
	leaq	(%rdx,%rcx,8), %rdi	C &dst[size]

	movq	%rbp, SAVE_RBP
	leaq	(%rax,%rcx,8), %rsi	C &src[size]

	movq	(%rax), %rbp		C multiplier
	movq	$0, %rbx
	decq	%rcx
	negq	%rcx

	subq	$EXTRA_STACK_SPACE, %rsp
deflit(`FRAME', STACK_SPACE+EXTRA_STACK_SPACE)


L(mul_1):
	C rax	scratch
	C rbx	carry
	C rcx	counter
	C rdx	scratch
	C rsi	&src[size]
	C rdi	&dst[size]
	C rbp	multiplier

	movq	(%rsi,%rcx,8), %rax
	mulq	%rbp
	addq	%rbx, %rax
	movq	%rax, (%rdi,%rcx,8)
	movq	$0, %rbx
	adcq	%rdx, %rbx

	incq	%rcx
	jnz	L(mul_1)


C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
C
C The last two products, which are the bottom right corner of the product
C triangle, are left to the end.  These are src[size-3]*src[size-2,size-1]
C and src[size-2]*src[size-1].  If size is 4 then it's only these corner
C cases that need to be done.
C
C The unrolled code is the same as in mpn_addmul_1, see that routine for
C some comments.
C
C VAR_COUNTER is the outer loop, running from -size+4 to -1, inclusive.
C
C VAR_JMP is the computed jump into the unrolled code, stepped by one code
C chunk each outer loop.
C
C K7 does branch prediction on indirect jumps, which is bad since it's a
C different target each time.  There seems no way to avoid this.


dnl  This value is also hard coded in some shifts and adds.
deflit(CODE_BYTES_PER_LIMB, 25)

dnl  With the unmodified &src[size] and &dst[size] pointers, the
dnl  displacements in the unrolled code fit in a byte for UNROLL_COUNT
dnl  values up to 15, but above that an offset must be added to them.

deflit(OFFSET,
ifelse(eval(UNROLL_COUNT>15),1,
eval((UNROLL_COUNT-15)*8),
0))

dnl  Because the last chunk of code is generated differently, a label placed
dnl  at the end doesn't work.  Instead calculate the implied end using the
dnl  start and how many chunks of code there are.
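dnl
dnl  As a rough check on the arithmetic: with CODE_BYTES_PER_LIMB equal to
dnl  25, the shlq $4 and the shlq $3 plus addq further down build an offset
dnl  of 16*n + 9*n = 25*n = n*CODE_BYTES_PER_LIMB from the (negative)
dnl  counter n, so each extra limb steps the computed entry point back by
dnl  exactly one chunk of unrolled code from UNROLL_INNER_END.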
deflit(UNROLL_INNER_END,
`L(unroll_inner_start)+eval(UNROLL_COUNT*CODE_BYTES_PER_LIMB)')

	C rax
	C rbx	carry
	C rcx
	C rdx
	C rsi	&src[size]
	C rdi	&dst[size]
	C rbp

	movq	PARAM_SIZE, %rcx
	movq	%rbx, (%rdi)

	subq	$4, %rcx
	jz	L(corner)

	negq	%rcx
ifelse(OFFSET,0,,`subq	$OFFSET, %rdi')
ifelse(OFFSET,0,,`subq	$OFFSET, %rsi')

	movq	%rcx, %rdx
	shlq	$4, %rcx

	movq	%rdx, %r11
	shlq	$3, %r11
	addq	%rdx, %r11

C Changed from the k7 version:
ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	leaq	UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)(%rcx,%r11), %rcx
')

	C The calculated jump mustn't come out to before the start of the
	C code available.  This is the limit UNROLL_COUNT puts on the src
	C operand size, but checked here directly using the jump address.
	ASSERT(ae,
	`movq_text_address(L(unroll_inner_start), %rax)
	cmpq	%rax, %rcx')


C------------------------------------------------------------------------------
	ALIGN(16)
L(unroll_outer_top):
	C rax
	C rbx	high limb to store
	C rcx	VAR_JMP
	C rdx	VAR_COUNTER, limbs, negative
	C rsi	&src[size], constant
	C rdi	dst ptr, high of last addmul
	C rbp

	movq	-24+OFFSET(%rsi,%rdx,8), %rbp	C next multiplier
	movq	-16+OFFSET(%rsi,%rdx,8), %rax	C first of multiplicand

	movq	%rdx, VAR_COUNTER

	mulq	%rbp

define(cmovX,`ifelse(eval(UNROLL_COUNT%2),0,`cmovz $@',`cmovnz $@')')

	testb	$1, %cl

	movq	%rdx, %rbx	C high carry
	movq	%rcx, %rdx	C jump

	movq	%rax, %rcx	C low carry
	cmovX(	%rbx, %rcx)	C high carry reverse
	cmovX(	%rax, %rbx)	C low carry reverse

	leaq	CODE_BYTES_PER_LIMB(%rdx), %rax
	xorq	%rdx, %rdx
	leaq	8(%rdi), %rdi

	movq	%rax, VAR_JMP

	jmp	*%rax


ifdef(`PIC',`
L(pic_calc):
	addq	(%rsp), %rcx
	addq	$UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)-L(here), %rcx
	addq	%r11, %rcx
	ret
')


	C Must be an even address to preserve the significance of the low
	C bit of the jump address indicating which way around rcx/rbx should
	C start.
	ALIGN(2)

L(unroll_inner_start):
	C rax	next limb
	C rbx	carry high
	C rcx	carry low
	C rdx	scratch
	C rsi	src
	C rdi	dst
	C rbp	multiplier

forloop(`i', UNROLL_COUNT, 1, `
	deflit(`disp_src', eval(-i*8 + OFFSET))
	deflit(`disp_dst', eval(disp_src - 8))

	m4_assert(`disp_src>=-128 && disp_src<128')
	m4_assert(`disp_dst>=-128 && disp_dst<128')

ifelse(eval(i%2),0,`
	.byte	0x90
	adcq	%rdx, %rbx
Zdisp(	movq,	disp_src,(%rsi), %rax)
	mulq	%rbp
Zdisp(	addq,	%rcx, disp_dst,(%rdi))
	movq	$0, %rcx
	adcq	%rax, %rbx

',`
	dnl  this bit comes out last
	.byte	0x90
	adcq	%rdx, %rcx
Zdisp(	movq,	disp_src,(%rsi), %rax)
	mulq	%rbp
Zdisp(	addq,	%rbx, disp_dst,(%rdi))
ifelse(forloop_last,0,
`	movq	$0, %rbx')
	adcq	%rax, %rcx
')
')

	C rax	next limb
	C rbx	carry high
	C rcx	carry low
	C rdx	scratch
	C rsi	src
	C rdi	dst
	C rbp	multiplier

	adcq	$0, %rdx
	addq	%rcx, -8+OFFSET(%rdi)
	movq	VAR_JMP, %rcx

	adcq	$0, %rdx

	movq	%rdx, m4_empty_if_zero(OFFSET) (%rdi)
	movq	VAR_COUNTER, %rdx

	incq	%rdx
	jnz	L(unroll_outer_top)


ifelse(OFFSET,0,,`
	addq	$OFFSET, %rsi
	addq	$OFFSET, %rdi
')


C------------------------------------------------------------------------------
L(corner):
	C rsi	&src[size]
	C rdi	&dst[2*size-5]

	movq	-24(%rsi), %rbp
	movq	-16(%rsi), %rax
	movq	%rax, %rcx

	mulq	%rbp

	addq	%rax, -8(%rdi)
	movq	-8(%rsi), %rax

	adcq	$0, %rdx
	movq	%rdx, %rbx
	movq	%rax, %rsi

	mulq	%rbp

	addq	%rbx, %rax
	adcq	$0, %rdx

	addq	%rax, (%rdi)
	movq	%rsi, %rax

	adcq	$0, %rdx
	movq	%rdx, %rbx

	mulq	%rcx

	addq	%rbx, %rax
	movq	%rax, 8(%rdi)

	adcq	$0, %rdx

	movq	%rdx, 16(%rdi)


C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
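C
C As an illustrative sketch only (not generated code), the rest of the
C routine amounts to
C
C	dst[2*size-1] = one-bit left shift of dst[1..2*size-2];
C	for (i = 0; i < size; i++)
C	    add the two-limb square src[i]*src[i] into dst[2*i], dst[2*i+1],
C	    propagating the carry upwards;
C
C i.e. double the accumulated cross products, then add in the squares on
C the diagonal.  dst[0], untouched so far, simply receives the low limb of
C src[0]^2.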
L(lshift_start):
	movq	PARAM_SIZE, %rax

	movq	PARAM_DST, %rdi

	movq	%rax, %r11
	shlq	$1, %r11
	leaq	(%rdi,%r11,8), %rdi

	notq	%rax			C -size-1, preserve carry

	leaq	2(%rax), %rax		C -(size-1)
	movq	%rax, %r11
	shlq	$1, %r11

	xorq	%rcx, %rcx		C clear carry


L(lshift):
	C rax	counter, negative
	C rbx
	C rcx
	C rdx
	C rsi
	C rdi	dst, pointing just after last limb
	C rbp

	rclq	-8(%rdi,%r11,8)
	rclq	(%rdi,%r11,8)
	incq	%r11
	incq	%r11
	incq	%rax
	jnz	L(lshift)

	setc	%al

	movq	PARAM_SRC, %rsi
	movq	%rax, -8(%rdi)		C dst most significant limb

	movq	PARAM_SIZE, %rcx


C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
C src[size-1]^2.  dst[0] hasn't been set at all yet, and just gets the
C low limb of src[0]^2.

	movq	(%rsi), %rax		C src[0]
	mulq	%rax

	leaq	(%rsi,%rcx,8), %rsi	C src pointer just after last limb
	negq	%rcx
	movq	%rcx, %r11
	shlq	$1, %r11

	movq	%rax, (%rdi,%r11,8)	C dst[0]
	incq	%rcx
	incq	%r11
	incq	%r11


L(diag):
	C rax	scratch
	C rbx	scratch
	C rcx	counter, negative
	C rdx	carry
	C rsi	src just after last limb
	C rdi	dst just after last limb
	C rbp

	movq	%rdx, %rbx
	movq	(%rsi,%rcx,8), %rax
	mulq	%rax

	addq	%rbx, -8(%rdi,%r11,8)
	adcq	%rax, (%rdi,%r11,8)
	adcq	$0, %rdx

	incq	%rcx
	incq	%r11
	incq	%r11
	jnz	L(diag)


	movq	SAVE_RBX, %rbx
	addq	%rdx, -8(%rdi)		C dst most significant limb

	movq	SAVE_RBP, %rbp
	addq	$FRAME, %rsp

	ret

EPILOGUE()
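

C A minimal caller sketch (illustrative C only, not part of this file's
C output): mpn_sqr_basecase is an internal routine declared in gmp-impl.h
C rather than gmp.h, dst must have room for 2*size limbs and must not
C overlap src, and size must be at least 1 and small enough for the
C basecase limit above (SQR_KARATSUBA_THRESHOLD_MAX).
C
C	mp_limb_t src[3] = { 1, 2, 3 };
C	mp_limb_t dst[6];
C	mpn_sqr_basecase (dst, src, 3);	/* dst = the 6 limb square of src */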