138 lines
3.1 KiB
NASM
138 lines
3.1 KiB
NASM
dnl AMD K6 mpn_preinv_mod_1 -- mpn by 1 remainder, with pre-inverted divisor.
|
|
|
|
dnl Copyright 2000, 2002, 2003 Free Software Foundation, Inc.
|
|
dnl
|
|
dnl This file is part of the GNU MP Library.
|
|
dnl
|
|
dnl The GNU MP Library is free software; you can redistribute it and/or
|
|
dnl modify it under the terms of the GNU Lesser General Public License as
|
|
dnl published by the Free Software Foundation; either version 2.1 of the
|
|
dnl License, or (at your option) any later version.
|
|
dnl
|
|
dnl The GNU MP Library is distributed in the hope that it will be useful,
|
|
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
dnl Lesser General Public License for more details.
|
|
dnl
|
|
dnl You should have received a copy of the GNU Lesser General Public
|
|
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
|
|
dnl not, write to the Free Software Foundation, Inc., 51 Franklin Street,
|
|
dnl Fifth Floor, Boston, MA 02110-1301, USA.
|
|
|
|
include(`../config.m4')
|
|
|
|
|
|
C K6: 18.0 cycles/limb
|
|
|
|
|
|
C mp_limb_t mpn_preinv_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
|
|
C mp_limb_t inverse);
|
|
C
|
|
C This code is only 2 c/l faster than a simple divl, but that's 10% so it's
|
|
C considered worthwhile (just).
|
|
|
|
defframe(PARAM_INVERSE,16)
|
|
defframe(PARAM_DIVISOR,12)
|
|
defframe(PARAM_SIZE, 8)
|
|
defframe(PARAM_SRC, 4)
|
|
|
|
TEXT
|
|
ALIGN(32)
|
|
PROLOGUE(mpn_preinv_mod_1)
|
|
deflit(`FRAME',0)
|
|
|
|
ASSERT(ae,`cmpl $1, PARAM_SIZE')
|
|
ASSERT(nz,`testl $0x80000000, PARAM_DIVISOR')
|
|
|
|
movl PARAM_SIZE, %ecx
|
|
pushl %ebp FRAME_pushl()
|
|
|
|
movl PARAM_SRC, %ebp
|
|
pushl %edi FRAME_pushl()
|
|
|
|
movl PARAM_DIVISOR, %eax
|
|
pushl %esi FRAME_pushl()
|
|
|
|
movl -4(%ebp,%ecx,4), %esi C src high limb
|
|
pushl %ebx FRAME_pushl()
|
|
|
|
movl %edx, %edi C first n2 to cancel
|
|
subl %eax, %esi C first n1 = high-divisor
|
|
|
|
decl %ecx
|
|
jz L(done_sbbl)
|
|
|
|
L(top):
|
|
C eax scratch
|
|
C ebx n10, nadj, q1
|
|
C ecx counter, size to 1
|
|
C edx scratch
|
|
C esi n2
|
|
C edi old high, for underflow test
|
|
C ebp src
|
|
|
|
sbbl %edx, %edi C high n-(q1+1)*d, 0 or -1
|
|
|
|
L(entry):
|
|
andl PARAM_DIVISOR, %edi
|
|
L(q1_ff_top):
|
|
movl -4(%ebp,%ecx,4), %ebx
|
|
|
|
addl %esi, %edi C possible addback
|
|
movl %ebx, %esi C n10
|
|
|
|
sarl $31, %ebx C -n1 = 0 or -1
|
|
movl %edi, %eax C n2
|
|
|
|
movl PARAM_INVERSE, %edx
|
|
subl %ebx, %eax C n2+n1
|
|
|
|
mull %edx C m*(n2+n1)
|
|
|
|
andl PARAM_DIVISOR, %ebx C -n1 & d
|
|
addl %esi, %ebx C nadj = n10 + (-n1&d), ignoring overflow
|
|
|
|
addl %ebx, %eax C low m*(n2+n1) + nadj, giving carry flag
|
|
leal 1(%edi), %ebx C n2+1
|
|
|
|
adcl %ebx, %edx C 1+high(n2<<32+m*(n2+n1)+nadj) = q1+1
|
|
|
|
movl PARAM_DIVISOR, %eax C d
|
|
jz L(q1_ff)
|
|
|
|
mull %edx C (q1+1)*d
|
|
|
|
subl %eax, %esi C low n-(q1+1)*d
|
|
loop L(top)
|
|
|
|
|
|
|
|
L(done_sbbl):
|
|
sbbl %edx, %edi C high n-(q1+1)*d, 0 or -1
|
|
|
|
andl PARAM_DIVISOR, %edi
|
|
L(done_esi_edi):
|
|
popl %ebx
|
|
|
|
leal (%esi,%edi), %eax
|
|
popl %esi
|
|
|
|
popl %edi
|
|
popl %ebp
|
|
|
|
ret
|
|
|
|
|
|
C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
|
|
C of q*d is simply -d and the remainder n-q*d = n10+d. This is rarely
|
|
C reached.
|
|
|
|
L(q1_ff):
|
|
movl PARAM_DIVISOR, %edi
|
|
loop L(q1_ff_top)
|
|
|
|
jmp L(done_esi_edi)
|
|
|
|
|
|
EPILOGUE()
|