195 lines
4.2 KiB
NASM
195 lines
4.2 KiB
NASM
|
dnl AMD K6 mpn_divrem_1 -- mpn by limb division.
|
||
|
|
||
|
dnl Copyright 1999, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
|
||
|
dnl
|
||
|
dnl This file is part of the GNU MP Library.
|
||
|
dnl
|
||
|
dnl The GNU MP Library is free software; you can redistribute it and/or
|
||
|
dnl modify it under the terms of the GNU Lesser General Public License as
|
||
|
dnl published by the Free Software Foundation; either version 2.1 of the
|
||
|
dnl License, or (at your option) any later version.
|
||
|
dnl
|
||
|
dnl The GNU MP Library is distributed in the hope that it will be useful,
|
||
|
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
dnl Lesser General Public License for more details.
|
||
|
dnl
|
||
|
dnl You should have received a copy of the GNU Lesser General Public
|
||
|
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
|
||
|
dnl not, write to the Free Software Foundation, Inc., 51 Franklin Street,
|
||
|
dnl Fifth Floor, Boston, MA 02110-1301, USA.
|
||
|
|
||
|
include(`../config.m4')
|
||
|
|
||
|
|
||
|
C K6: 20 cycles/limb
|
||
|
|
||
|
|
||
|
C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
|
||
|
C mp_srcptr src, mp_size_t size, mp_limb_t divisor);
|
||
|
C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
|
||
|
C mp_srcptr src, mp_size_t size, mp_limb_t divisor,
|
||
|
C mp_limb_t carry);
|
||
|
C
|
||
|
C The code here is basically the same as mpn/x86/divrem_1.asm, but uses loop
|
||
|
C instead of decl+jnz, since it comes out 2 cycles/limb faster.
|
||
|
C
|
||
|
C A test is done to see if the high limb is less than the divisor, and if so
|
||
|
C one less div is done. A div is 20 cycles, so assuming high<divisor about
|
||
|
C half the time, then this test saves half that amount. The branch
|
||
|
C misprediction penalty is less than that.
|
||
|
C
|
||
|
C Back-to-back div instructions run at 20 cycles, the same as the loop here,
|
||
|
C so it seems there's nothing to gain by rearranging the loop. Pairing the
|
||
|
C mov and loop instructions was found to gain nothing.
|
||
|
C
|
||
|
C Enhancements:
|
||
|
C
|
||
|
C The low-latency K6 multiply might be thought to suit a mul-by-inverse, but
|
||
|
C that algorithm has been found to suffer from the relatively poor carry
|
||
|
C handling on K6 and too many auxiliary instructions. The fractional part
|
||
|
C however could be done at about 13 c/l, if it mattered enough.
|
||
|
|
||
|
C Stack parameter slots.  The offsets rise in 4-byte steps in the same
C order as the C prototype arguments: dst, xsize, src, size, divisor, carry.
C Within this file PARAM_CARRY is read only by mpn_divrem_1c.
C NOTE(review): defframe is defined outside this file; presumably the
C offsets are relative to the stack pointer at function entry and are
C adjusted through FRAME as registers get pushed -- confirm in the m4 macros.
defframe(PARAM_CARRY, 24)
|
||
|
defframe(PARAM_DIVISOR,20)
|
||
|
defframe(PARAM_SIZE, 16)
|
||
|
defframe(PARAM_SRC, 12)
|
||
|
defframe(PARAM_XSIZE, 8)
|
||
|
defframe(PARAM_DST, 4)
|
||
|
|
||
|
TEXT
|
||
|
|
||
|
ALIGN(32)
|
||
|
C mpn_divrem_1c -- entry point taking an initial carry.
C
C Loads the arguments, saves edi, esi, ebx and ebp, puts the carry argument
C in edx (the high half of the dividend used by divl) and then continues in
C the loops that belong to mpn_divrem_1 further down: L(integer_top) when
C size is non-zero, L(fraction) when size is zero.  The saved registers are
C popped at L(done), which is also where the remainder is returned.
C
C Unlike mpn_divrem_1 there is no high-limb-below-divisor shortcut here.
C NOTE(review): divl faults when edx >= divisor, so callers presumably
C must pass carry < divisor -- confirm against the documented interface.
PROLOGUE(mpn_divrem_1c)
|
||
|
deflit(`FRAME',0)
|
||
|
|
||
|
C Argument loads are interleaved with the register saves.
movl PARAM_SIZE, %ecx
|
||
|
pushl %edi FRAME_pushl()
|
||
|
|
||
|
movl PARAM_SRC, %edi
|
||
|
pushl %esi FRAME_pushl()
|
||
|
|
||
|
movl PARAM_DIVISOR, %esi
|
||
|
pushl %ebx FRAME_pushl()
|
||
|
|
||
|
movl PARAM_DST, %ebx
|
||
|
pushl %ebp FRAME_pushl()
|
||
|
|
||
|
movl PARAM_XSIZE, %ebp
|
||
|
C Sets ZF when size is zero.  The movl that follows leaves the flags
C alone, so the jz after it still tests size.
orl %ecx, %ecx C size
|
||
|
|
||
|
C edx = carry, used as the starting remainder for the first divl.
movl PARAM_CARRY, %edx
|
||
|
C With size==0 only fraction limbs are produced; ecx is 0 and ebp is xsize
C on arrival at L(fraction).
jz L(fraction) C if size==0
|
||
|
|
||
|
C ebx = dst + 4*xsize - 4, matching the register layout documented at
C L(integer_top).
leal -4(%ebx,%ebp,4), %ebx C dst one limb below integer part
|
||
|
C Join the integer loop at its top so that the first source limb is loaded
C there.
jmp L(integer_top)
|
||
|
|
||
|
EPILOGUE()
|
||
|
|
||
|
|
||
|
ALIGN(16)
|
||
|
C mpn_divrem_1 -- divide the size-limb number at src by divisor, starting
C with a zero remainder.
C
C Limbs are handled as 32-bit words (movl with a scale of 4).  Quotient
C limbs are written to dst: the integer part goes to dst[xsize] ..
C dst[xsize+size-1], and xsize further fraction limbs, produced by carrying
C on the division with zero limbs, go to dst[0] .. dst[xsize-1].  The final
C remainder is returned in eax.
C
C When size is zero the code branches to L(size_zero), which stores xsize
C zero limbs at dst and returns 0.
C
C The labels L(integer_top), L(fraction) and L(done) are also reached from
C mpn_divrem_1c, so the register layout documented at those labels has to
C hold on every path into them.
C NOTE(review): divl faults on a zero divisor; presumably callers guarantee
C divisor != 0 -- confirm against the documented interface.
PROLOGUE(mpn_divrem_1)
|
||
|
deflit(`FRAME',0)
|
||
|
|
||
|
movl PARAM_SIZE, %ecx
|
||
|
pushl %edi FRAME_pushl()
|
||
|
|
||
|
movl PARAM_SRC, %edi
|
||
|
pushl %esi FRAME_pushl()
|
||
|
|
||
|
movl PARAM_DIVISOR, %esi
|
||
|
C Sets ZF when size is zero.
orl %ecx,%ecx C size
|
||
|
|
||
|
C Taken with only edi and esi pushed; L(size_zero) pops just those two.
jz L(size_zero)
|
||
|
pushl %ebx FRAME_pushl()
|
||
|
|
||
|
movl -4(%edi,%ecx,4), %eax C src high limb
|
||
|
C edx = 0 is the high half of the first dividend, and is also the zero
C stored as the top quotient limb when the first div is skipped.
xorl %edx, %edx
|
||
|
|
||
|
movl PARAM_DST, %ebx
|
||
|
pushl %ebp FRAME_pushl()
|
||
|
|
||
|
movl PARAM_XSIZE, %ebp
|
||
|
C Unsigned compare of the high limb against the divisor.  The leal that
C follows does not touch the flags, so the jae still sees this result.
cmpl %esi, %eax
|
||
|
|
||
|
leal -4(%ebx,%ebp,4), %ebx C dst one limb below integer part
|
||
|
C high >= divisor: divide the high limb normally, with eax already loaded
C and edx = 0.
jae L(integer_entry)
|
||
|
|
||
|
|
||
|
C high<divisor, so high of dst is zero, and avoid one div
|
||
|
|
||
|
C Store the zero in edx as the top quotient limb, dst[xsize+size-1].
movl %edx, (%ebx,%ecx,4)
|
||
|
C One limb fewer to divide; ZF is set when size was 1.
decl %ecx
|
||
|
|
||
|
C The high limb becomes the starting remainder.  movl keeps the flags from
C the decl, so the jz below still tests the new count.
movl %eax, %edx
|
||
|
jz L(fraction)
|
||
|
|
||
|
|
||
|
C Integer part: one divl per source limb, from the high limb downwards.
C With ecx = k the limb src[k-1] is divided and its quotient limb is stored
C at dst[xsize+k-1].
L(integer_top):
|
||
|
C eax scratch (quotient)
|
||
|
C ebx dst+4*xsize-4
|
||
|
C ecx counter
|
||
|
C edx scratch (remainder)
|
||
|
C esi divisor
|
||
|
C edi src
|
||
|
C ebp xsize
|
||
|
|
||
|
movl -4(%edi,%ecx,4), %eax
|
||
|
L(integer_entry):
|
||
|
|
||
|
C edx:eax / esi gives the quotient in eax and the remainder in edx, which
C carries into the next limb.
divl %esi
|
||
|
|
||
|
movl %eax, (%ebx,%ecx,4)
|
||
|
C loop decrements ecx and branches while it is non-zero.
loop L(integer_top)
|
||
|
|
||
|
|
||
|
C Fraction part.  ecx is zero on every path that reaches this label, so
C the orl both copies xsize into ecx and sets ZF when xsize is zero.
L(fraction):
|
||
|
orl %ebp, %ecx
|
||
|
jz L(done)
|
||
|
|
||
|
|
||
|
C Reload the unadjusted dst pointer; ebx may hold dst+4*xsize-4 here.
movl PARAM_DST, %ebx
|
||
|
|
||
|
|
||
|
C Each pass divides remainder:0 by the divisor.  With ecx = k the quotient
C limb is stored at dst[k-1], for k running from xsize down to 1.
L(fraction_top):
|
||
|
C eax scratch (quotient)
|
||
|
C ebx dst
|
||
|
C ecx counter
|
||
|
C edx scratch (remainder)
|
||
|
C esi divisor
|
||
|
C edi
|
||
|
C ebp
|
||
|
|
||
|
C A zero low limb is appended below the current remainder.
xorl %eax, %eax
|
||
|
|
||
|
divl %esi
|
||
|
|
||
|
movl %eax, -4(%ebx,%ecx,4)
|
||
|
loop L(fraction_top)
|
||
|
|
||
|
|
||
|
C Common exit: restore the four saved registers in reverse push order and
C return the remainder from edx in eax.
L(done):
|
||
|
popl %ebp
|
||
|
movl %edx, %eax
|
||
|
popl %ebx
|
||
|
popl %esi
|
||
|
popl %edi
|
||
|
ret
|
||
|
|
||
|
|
||
|
C size==0 for mpn_divrem_1: there is nothing to divide and the remainder
C is zero, so xsize zero limbs are stored at dst and 0 is returned.
C Only edi and esi have been pushed on this path, hence FRAME is set to 8
C for the PARAM_ references below.
L(size_zero):
|
||
|
deflit(`FRAME',8)
|
||
|
movl PARAM_XSIZE, %ecx
|
||
|
C eax = 0 is both the value stored by stosl and the return value.
xorl %eax, %eax
|
||
|
|
||
|
movl PARAM_DST, %edi
|
||
|
|
||
|
cld C better safe than sorry, see mpn/x86/README
|
||
|
|
||
|
C rep stosl stores eax at edi ecx times, and stores nothing when ecx is 0.
rep
|
||
|
stosl
|
||
|
|
||
|
popl %esi
|
||
|
popl %edi
|
||
|
ret
|
||
|
EPILOGUE()
|