dnl  Intel P6 mpn_mod_1 -- mpn by limb remainder.

dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 2.1 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public
dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
dnl  not, write to the Free Software Foundation, Inc., 51 Franklin Street,
dnl  Fifth Floor, Boston, MA 02110-1301, USA.

include(`../config.m4')


C P6: 21.5 cycles/limb


C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
C                       mp_limb_t carry);
C mp_limb_t mpn_preinv_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
C                             mp_limb_t inverse);
C
C The code here is in two parts, a simple divl loop and a mul-by-inverse.
C The divl is used by mod_1 and mod_1c for small sizes, until the savings in
C the mul-by-inverse can overcome the time to calculate an inverse.
C preinv_mod_1 goes straight to the mul-by-inverse.
C
C The mul-by-inverse normalizes the divisor (or for preinv_mod_1 it's
C already normalized).  The calculation done is r=a%(d*2^n) followed by a
C final (r*2^n)%(d*2^n), where a is the dividend, d the divisor, and n is
C the number of leading zero bits on d.  This means there are no bit shifts
C in the main loop, at the cost of an extra divide step at the end.
C
C The simple divl for mod_1 is able to skip one divide step if high<divisor.


        C ...   n2
        subl    %ebp, %eax

        cmovnc( %eax, %edi)             C n2-divisor if no underflow

        movl    $-1, %eax
        movl    $-1, %edx
        subl    %ebp, %edx              C (b-d)-1 so edx:eax = b*(b-d)-1

        leal    -8(%esi,%ebx,4), %ecx   C &src[size-2]

        divl    %ebp                    C floor (b*(b-d)-1) / d

L(preinv_entry):
        movl    %eax, VAR_INVERSE


C No special scheduling of loads is necessary in this loop, out of order
C execution hides the latencies already.
C
C The way q1+1 is generated in %ebx and d is moved to %eax for the multiply
C seems fastest.  The obvious change to generate q1+1 in %eax and then just
C multiply by %ebp (as per mpn/x86/pentium/mod_1.asm in fact) runs 1 cycle
C slower, for no obvious reason.
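C What one iteration of the loop below computes can be written as a rough
C C sketch (illustrative only; the function and variable names are not from
C this file; b denotes 2^32, d is the normalized divisor held in %ebp, m is
C VAR_INVERSE as produced by the divl above, n2 is the remainder carried in
C %edi and n10 is the src limb just loaded):
C
C	#include <stdint.h>
C
C	/* returns the next n2, ie. the remainder of n2*b+n10 modulo d */
C	uint32_t mod_1_step (uint32_t n2, uint32_t n10, uint32_t d, uint32_t m)
C	{
C	  uint32_t n1   = n10 >> 31;            /* high bit of n10 */
C	  uint32_t nadj = n10 + (n1 ? d : 0);   /* low 32 bits, as in the asm */
C	  uint32_t q1   = n2 + (uint32_t)
C	    (((uint64_t) m * (n2 + n1) + nadj) >> 32);
C
C	  if (q1 == 0xFFFFFFFF)        /* the L(q1_ff) special case, q1+1==0 */
C	    return n10 + d;            /* n - q*d, wrapping mod 2^32 */
C
C	  uint64_t n = ((uint64_t) n2 << 32) | n10;   /* edi:esi */
C	  uint64_t t = n - (uint64_t) (q1 + 1) * d;   /* may borrow */
C	  return (uint32_t) t + (d & (uint32_t) (t >> 32));  /* addback d */
C	}
C
C The last line corresponds to the andl/addl addback at the bottom of the
C loop: the high half of t is 0 or -1, so d is added back exactly when the
C subtraction underflows.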
        ALIGN(16)
L(inverse_top):
        C eax   n10 (then scratch)
        C ebx   scratch (nadj, q1)
        C ecx   src pointer, decrementing
        C edx   scratch
        C esi   n10
        C edi   n2
        C ebp   divisor

        movl    (%ecx), %eax            C next src limb
        movl    %eax, %esi

        sarl    $31, %eax               C -n1
        movl    %ebp, %ebx

        andl    %eax, %ebx              C -n1 & d
        negl    %eax                    C n1

        addl    %edi, %eax              C n2+n1

        mull    VAR_INVERSE             C m*(n2+n1)

        addl    %esi, %ebx              C nadj = n10 + (-n1 & d), ignoring overflow
        subl    $4, %ecx
C
        addl    %ebx, %eax              C m*(n2+n1) + nadj, low giving carry flag
        leal    1(%edi), %ebx           C n2+1
        movl    %ebp, %eax              C d

        adcl    %edx, %ebx              C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
        jz      L(q1_ff)

        mull    %ebx                    C (q1+1)*d
C
        subl    %eax, %esi              C low  n - (q1+1)*d
        sbbl    %edx, %edi              C high n - (q1+1)*d, 0 or -1
        andl    %ebp, %edi              C d if underflow

        addl    %esi, %edi              C remainder with addback if necessary

        cmpl    PARAM_SRC, %ecx
        jae     L(inverse_top)


C -----------------------------------------------------------------------------
L(inverse_loop_done):

        C %edi is the remainder modulo d*2^n and now must be reduced to
        C 0<=r<d.
        C ...

L(q1_ff):
        C ...   next n2

        cmpl    PARAM_SRC, %ecx
        jae     L(inverse_top)

        jmp     L(inverse_loop_done)

EPILOGUE()
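C The reduction at L(inverse_loop_done), from the remainder modulo d*2^n
C down to 0<=r<d, is the "extra divide step at the end" mentioned in the
C header comments.  As a rough C sketch of the arithmetic only (illustrative,
C names not from this file; r is the loop result, d the original divisor and
C n its count of leading zero bits, so d<<n is the normalized divisor):
C
C	uint64_t dn  = (uint64_t) d << n;          /* normalized divisor */
C	uint64_t t   = ((uint64_t) r << n) % dn;   /* (r*2^n) % (d*2^n)  */
C	uint32_t rem = (uint32_t) (t >> n);        /* == a % d           */
C
C The % here only shows the values involved; in the asm it's the final
C divide step referred to in the header.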