mpir/mpn/x86/applenopic/mod_1.asm

165 lines
3.8 KiB
NASM

dnl x86 mpn_mod_1 -- mpn by limb remainder.
dnl Copyright 1999, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or
dnl modify it under the terms of the GNU Lesser General Public License as
dnl published by the Free Software Foundation; either version 2.1 of the
dnl License, or (at your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful,
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
dnl Lesser General Public License for more details.
dnl
dnl You should have received a copy of the GNU Lesser General Public
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
dnl not, write to the Free Software Foundation, Inc., 51 Franklin Street,
dnl Fifth Floor, Boston, MA 02110-1301, USA.
include(`../config.m4')
C cycles/limb
C 486 42 approx, maybe
C P5 44
C P6 39
C K6 20
C K7 41
C P4 58
C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
C mp_limb_t carry);
C
C Essentially this code is the same as the division based part of
C mpn/generic/mod_1.c, but has the advantage that we get the desired divl
C instruction even when gcc is not being used (where longlong.h only has the
C rather slow generic C udiv_qrnnd().
C
C A test is done to see if the high limb is less the the divisor, and if so
C one less div is done. A div is between 20 and 40 cycles on the various
C x86s, so assuming high<divisor about half the time, then this test saves
C half that amount. The branch misprediction penalty on each chip is less
C than half a div.
C
C
C Notes for K6:
C
C Back-to-back div instructions take 20 cycles, the same as the loop here,
C so it seems there's nothing to gain by rearranging. Pairing the mov and
C loop instructions was found to gain nothing. Normally we use a loop
C instruction rather than decl/jnz, but it gains nothing here.
C
C A multiply-by-inverse is used in mpn/x86/k6/pre_mod_1.asm, but it saves
C only 2 c/l so currently we haven't bothered with the same for mpn_mod_1.
C If an inverse takes about 40 cycles for normalized or perhaps 60 for
C unnormalized (due to bsfl being slow on k6) then the threshold would be at
C least 20 or 30 limbs.
C
defframe(PARAM_CARRY, 16)
defframe(PARAM_DIVISOR,12)
defframe(PARAM_SIZE, 8)
defframe(PARAM_SRC, 4)
TEXT
ALIGN(16)
PROLOGUE(mpn_mod_1)
deflit(`FRAME',0)
movl PARAM_SIZE, %ecx
pushl %ebx FRAME_pushl()
movl PARAM_SRC, %ebx
pushl %esi FRAME_pushl()
orl %ecx, %ecx
jz L(done_zero)
movl PARAM_DIVISOR, %esi
movl -4(%ebx,%ecx,4), %eax C src high limb
cmpl %esi, %eax
sbbl %edx, %edx C -1 if high<divisor
addl %edx, %ecx C skip one division if high<divisor
jz L(done_eax)
andl %eax, %edx C carry if high<divisor
L(top):
C eax scratch (quotient)
C ebx src
C ecx counter
C edx carry (remainder)
C esi divisor
C edi
C ebp
movl -4(%ebx,%ecx,4), %eax
divl %esi
decl %ecx
jnz L(top)
movl %edx, %eax
L(done_eax):
popl %esi
popl %ebx
ret
EPILOGUE()
C This code located after mpn_mod_1, so the jump to L(top) here is
C back and hence will be predicted as taken. (size==0 is considered
C unlikely.)
ALIGN(16)
PROLOGUE(mpn_mod_1c)
deflit(`FRAME',0)
movl PARAM_SIZE, %ecx
pushl %ebx FRAME_pushl()
movl PARAM_SRC, %ebx
pushl %esi FRAME_pushl()
movl PARAM_DIVISOR, %esi
orl %ecx, %ecx
movl PARAM_CARRY, %edx
jnz L(top)
popl %esi
movl %edx, %eax
popl %ebx
ret
C This code is for mpn_mod_1, but is positioned here to save some
C space in the alignment padding.
C
L(done_zero):
popl %esi
xorl %eax, %eax
popl %ebx
ret
EPILOGUE()