a197a2d3eb
Removed directories for no longer supported architectures.
225 lines
5.2 KiB
NASM
225 lines
5.2 KiB
NASM
dnl x86 mpn_divrem_1 -- mpn by limb division extending to fractional quotient.
|
|
|
|
dnl Copyright 1999, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
|
|
dnl
|
|
dnl This file is part of the GNU MP Library.
|
|
dnl
|
|
dnl The GNU MP Library is free software; you can redistribute it and/or
|
|
dnl modify it under the terms of the GNU Lesser General Public License as
|
|
dnl published by the Free Software Foundation; either version 2.1 of the
|
|
dnl License, or (at your option) any later version.
|
|
dnl
|
|
dnl The GNU MP Library is distributed in the hope that it will be useful,
|
|
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
dnl Lesser General Public License for more details.
|
|
dnl
|
|
dnl You should have received a copy of the GNU Lesser General Public
|
|
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
|
|
dnl not, write to the Free Software Foundation, Inc., 51 Franklin Street,
|
|
dnl Fifth Floor, Boston, MA 02110-1301, USA.
|
|
|
|
include(`../config.m4')
|
|
|
|
|
|
C cycles/limb
|
|
C 486 approx 43 maybe
|
|
C P5 44
|
|
C P6 39
|
|
C P6MMX 39
|
|
C K6 22
|
|
C K7 42
|
|
C P4 58
|
|
|
|
|
|
C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
|
|
C mp_srcptr src, mp_size_t size, mp_limb_t divisor);
|
|
C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
|
|
C mp_srcptr src, mp_size_t size, mp_limb_t divisor,
|
|
C mp_limb_t carry);
|
|
C
|
|
C Divide src,size by divisor and store the quotient in dst+xsize,size.
|
|
C Extend the division to fractional quotient limbs in dst,xsize. Return the
|
|
C remainder. Either or both xsize and size can be 0.
|
|
C
|
|
C mpn_divrem_1c takes a carry parameter which is an initial high limb,
|
|
C effectively one extra limb at the top of src,size. Must have
|
|
C carry<divisor.
|
|
C
|
|
C
|
|
C Essentially the code is the same as the division based part of
|
|
C mpn/generic/divrem_1.c, but has the advantage that we get the desired divl
|
|
C instruction even when gcc is not being used (when longlong.h only has the
|
|
C rather slow generic C udiv_qrnnd().
|
|
C
|
|
C A test is done to see if the high limb is less the the divisor, and if so
|
|
C one less div is done. A div is between 20 and 40 cycles on the various
|
|
C x86s, so assuming high<divisor about half the time, then this test saves
|
|
C half that amount. The branch misprediction penalty on each chip is less
|
|
C than half a div.
|
|
C
|
|
C
|
|
C Notes for P5:
|
|
C
|
|
C It might be thought that moving the load down to pair with the store would
|
|
C save 1 cycle, but that doesn't seem to happen in practice, and in any case
|
|
C would be a mere 2.2% saving, so it's hardly worth bothering about.
|
|
C
|
|
C A mul-by-inverse might be a possibility for P5, as done in
|
|
C mpn/x86/pentium/mod_1.asm. The number of auxiliary instructions required
|
|
C is a hinderance, but there could be a 10-15% speedup available.
|
|
C
|
|
C
|
|
C Notes for K6:
|
|
C
|
|
C K6 has its own version of this code, using loop and paying attention to
|
|
C cache line boundary crossings. The target 20 c/l can be had with the
|
|
C decl+jnz of the present code by pairing up the load and store in the
|
|
C loops. But it's considered easier not to introduce complexity just for
|
|
C that, but instead let k6 have its own code.
|
|
C
|
|
|
|
defframe(PARAM_CARRY, 24)
|
|
defframe(PARAM_DIVISOR,20)
|
|
defframe(PARAM_SIZE, 16)
|
|
defframe(PARAM_SRC, 12)
|
|
defframe(PARAM_XSIZE, 8)
|
|
defframe(PARAM_DST, 4)
|
|
|
|
TEXT
|
|
ALIGN(16)
|
|
|
|
PROLOGUE(mpn_divrem_1c)
|
|
deflit(`FRAME',0)
|
|
|
|
movl PARAM_SIZE, %ecx
|
|
pushl %edi FRAME_pushl()
|
|
|
|
movl PARAM_SRC, %edi
|
|
pushl %esi FRAME_pushl()
|
|
|
|
movl PARAM_DIVISOR, %esi
|
|
pushl %ebx FRAME_pushl()
|
|
|
|
movl PARAM_DST, %ebx
|
|
pushl %ebp FRAME_pushl()
|
|
|
|
movl PARAM_XSIZE, %ebp
|
|
orl %ecx, %ecx
|
|
|
|
movl PARAM_CARRY, %edx
|
|
jz L(fraction)
|
|
|
|
leal -4(%ebx,%ebp,4), %ebx C dst one limb below integer part
|
|
jmp L(integer_top)
|
|
|
|
EPILOGUE()
|
|
|
|
|
|
PROLOGUE(mpn_divrem_1)
|
|
deflit(`FRAME',0)
|
|
|
|
movl PARAM_SIZE, %ecx
|
|
pushl %edi FRAME_pushl()
|
|
|
|
movl PARAM_SRC, %edi
|
|
pushl %esi FRAME_pushl()
|
|
|
|
movl PARAM_DIVISOR, %esi
|
|
orl %ecx,%ecx
|
|
|
|
jz L(size_zero)
|
|
pushl %ebx FRAME_pushl()
|
|
|
|
movl -4(%edi,%ecx,4), %eax C src high limb
|
|
xorl %edx, %edx
|
|
|
|
movl PARAM_DST, %ebx
|
|
pushl %ebp FRAME_pushl()
|
|
|
|
movl PARAM_XSIZE, %ebp
|
|
cmpl %esi, %eax
|
|
|
|
leal -4(%ebx,%ebp,4), %ebx C dst one limb below integer part
|
|
jae L(integer_entry)
|
|
|
|
|
|
C high<divisor, so high of dst is zero, and avoid one div
|
|
|
|
movl %edx, (%ebx,%ecx,4)
|
|
decl %ecx
|
|
|
|
movl %eax, %edx
|
|
jz L(fraction)
|
|
|
|
|
|
L(integer_top):
|
|
C eax scratch (quotient)
|
|
C ebx dst+4*xsize-4
|
|
C ecx counter
|
|
C edx scratch (remainder)
|
|
C esi divisor
|
|
C edi src
|
|
C ebp xsize
|
|
|
|
movl -4(%edi,%ecx,4), %eax
|
|
L(integer_entry):
|
|
|
|
divl %esi
|
|
|
|
movl %eax, (%ebx,%ecx,4)
|
|
decl %ecx
|
|
jnz L(integer_top)
|
|
|
|
|
|
L(fraction):
|
|
orl %ebp, %ecx
|
|
jz L(done)
|
|
|
|
movl PARAM_DST, %ebx
|
|
|
|
|
|
L(fraction_top):
|
|
C eax scratch (quotient)
|
|
C ebx dst
|
|
C ecx counter
|
|
C edx scratch (remainder)
|
|
C esi divisor
|
|
C edi
|
|
C ebp
|
|
|
|
xorl %eax, %eax
|
|
|
|
divl %esi
|
|
|
|
movl %eax, -4(%ebx,%ecx,4)
|
|
decl %ecx
|
|
jnz L(fraction_top)
|
|
|
|
|
|
L(done):
|
|
popl %ebp
|
|
movl %edx, %eax
|
|
popl %ebx
|
|
popl %esi
|
|
popl %edi
|
|
ret
|
|
|
|
|
|
L(size_zero):
|
|
deflit(`FRAME',8)
|
|
movl PARAM_XSIZE, %ecx
|
|
xorl %eax, %eax
|
|
|
|
movl PARAM_DST, %edi
|
|
|
|
cld C better safe than sorry, see mpn/x86/README
|
|
|
|
rep
|
|
stosl
|
|
|
|
popl %esi
|
|
popl %edi
|
|
ret
|
|
EPILOGUE()
|