dnl Intel Pentium-4 mpn_mod_1 -- mpn by limb remainder. dnl Copyright 2001, 2002, 2003 Free Software Foundation, Inc. dnl dnl This file is part of the GNU MP Library. dnl dnl The GNU MP Library is free software; you can redistribute it and/or dnl modify it under the terms of the GNU Lesser General Public License as dnl published by the Free Software Foundation; either version 2.1 of the dnl License, or (at your option) any later version. dnl dnl The GNU MP Library is distributed in the hope that it will be useful, dnl but WITHOUT ANY WARRANTY; without even the implied warranty of dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU dnl Lesser General Public License for more details. dnl dnl You should have received a copy of the GNU Lesser General Public dnl License along with the GNU MP Library; see the file COPYING.LIB. If dnl not, write to the Free Software Foundation, Inc., 51 Franklin Street, dnl Fifth Floor, Boston, MA 02110-1301, USA. include(`../config.m4') dnl P4: 31 cycles/limb. C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor); C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor, C mp_limb_t carry); C mp_limb_t mpn_preinv_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor, C mp_limb_t inverse); C C An idea was tried in the mul-by-inverse to process the last limb by a jump C back to the top of the loop skipping the -4(%esi) fetch. But that seemed C to produce slightly strange timings, like 9 and 10 limb operations about C the same speed. The jump would be successively taken and not-taken, which C in theory should predict ok, but perhaps isn't enjoyed by the chip. C Duplicating the loop for the last limb seems to be a couple of cycles C quicker too. C C Enhancements: C C The loop measures 31 cycles, but the dependent chain would suggest it C could be done with 30. Not sure where to start looking for the extra C cycle. dnl MUL_THRESHOLD is the size at which the multiply by inverse method is dnl used, rather than plain "divl"s. Minimum value 2. dnl dnl The inverse takes about 80-90 cycles to calculate, but after that the dnl multiply is 31 c/l versus division at about 58 c/l. deflit(MUL_THRESHOLD, 5) defframe(PARAM_INVERSE,16) dnl mpn_preinv_mod_1 defframe(PARAM_CARRY, 16) dnl mpn_mod_1c defframe(PARAM_DIVISOR,12) defframe(PARAM_SIZE, 8) defframe(PARAM_SRC, 4) dnl re-use parameter space define(SAVE_ESI,`PARAM_SIZE') define(SAVE_EBP,`PARAM_SRC') TEXT ALIGN(16) PROLOGUE(mpn_preinv_mod_1) deflit(`FRAME',0) movl PARAM_SIZE, %ecx movl %esi, SAVE_ESI movl $32, %eax movd %eax, %mm6 C l = 0, so 32-l = 32 movl PARAM_SRC, %esi movl %ebp, SAVE_EBP movd PARAM_DIVISOR, %mm5 pxor %mm7, %mm7 C l = 0 movd -4(%esi,%ecx,4), %mm0 C src high limb leal -8(%esi,%ecx,4), %esi C &src[size-2] movd PARAM_INVERSE, %mm4 subl $2, %ecx C size-2 psubq %mm5, %mm0 C high-divisor movq %mm0, %mm2 psrlq $32, %mm0 C -1 if underflow pand %mm5, %mm0 C divisor if underflow paddq %mm2, %mm0 C addback if underflow jz L(inverse_last) C if size==2 ja L(inverse_top) C if size>2 C if size==1 movl SAVE_ESI, %esi movd %mm0, %eax emms ret EPILOGUE() ALIGN(16) PROLOGUE(mpn_mod_1c) deflit(`FRAME',0) movl PARAM_SIZE, %ecx movl %esi, SAVE_ESI movl PARAM_SRC, %esi movl %ebp, SAVE_EBP movl PARAM_CARRY, %edx orl %ecx, %ecx jz L(divide_done) C result==carry if size==0 movl PARAM_DIVISOR, %ebp jmp L(start_1c) EPILOGUE() ALIGN(16) PROLOGUE(mpn_mod_1) deflit(`FRAME',0) movl PARAM_SIZE, %ecx movl %esi, SAVE_ESI movl PARAM_SRC, %esi movl %ebp, SAVE_EBP movl PARAM_DIVISOR, %ebp xorl %edx, %edx C result 0 if size==0 orl %ecx, %ecx jz L(divide_done) movl -4(%esi,%ecx,4), %eax C src high limb leal -1(%ecx), %edx cmpl %ebp, %eax C c if high n2 psrlq $32, %mm3 C high n - (q1+1)*d, 0 or -1 ASSERT(be,`C 0 or -1 movd %mm3, %eax addl $1, %eax cmpl $1, %eax') pand %mm5, %mm3 C mask & d paddd %mm3, %mm0 C addback if necessary subl $1, %ecx jnz L(inverse_top) C Least significant limb. C Same code as the loop, but there's no -4(%esi) limb to fetch. L(inverse_last): C eax C ebx C ecx C edx C esi &src[0] C C mm0 n2 C mm4 m C mm5 d C mm6 32-l C mm7 l movd (%esi), %mm1 C src[0] psllq %mm7, %mm1 C n10 movq %mm1, %mm2 C n10 movq %mm1, %mm3 C n10 psrad $31, %mm1 C -n1 pand %mm5, %mm1 C -n1 & d paddd %mm2, %mm1 C nadj = n10+(-n1&d), ignore overflow psrld $31, %mm2 C n1 paddd %mm0, %mm2 C n2+n1 punpckldq %mm0, %mm1 C n2:nadj pmuludq %mm4, %mm2 C m*(n2+n1) paddq %mm2, %mm1 C n2:nadj + m*(n2+n1) psrlq $32, %mm1 C q1 = high(n2:nadj + m*(n2+n1)) pmuludq %mm5, %mm1 C q1*d punpckldq %mm0, %mm3 C n psubq %mm5, %mm3 C n - d pxor %mm0, %mm0 psubq %mm1, %mm3 C n - (q1+1)*d por %mm3, %mm0 C remainder -> n2 psrlq $32, %mm3 C high n - (q1+1)*d, 0 or -1 ASSERT(be,`C 0 or -1 movd %mm3, %eax addl $1, %eax cmpl $1, %eax') movl SAVE_EBP, %ebp pand %mm5, %mm3 C mask & d movl SAVE_ESI, %esi paddd %mm3, %mm0 C addback if necessary psrld %mm7, %mm0 movd %mm0, %eax emms ret EPILOGUE()