354 lines
6.1 KiB
NASM
354 lines
6.1 KiB
NASM
|
dnl AMD K6 mpn_gcd_1 -- mpn by 1 gcd.
|
||
|
|
||
|
dnl Copyright 2000, 2001, 2002, 2004 Free Software Foundation, Inc.
|
||
|
dnl
|
||
|
dnl This file is part of the GNU MP Library.
|
||
|
dnl
|
||
|
dnl The GNU MP Library is free software; you can redistribute it and/or
|
||
|
dnl modify it under the terms of the GNU Lesser General Public License as
|
||
|
dnl published by the Free Software Foundation; either version 2.1 of the
|
||
|
dnl License, or (at your option) any later version.
|
||
|
dnl
|
||
|
dnl The GNU MP Library is distributed in the hope that it will be useful,
|
||
|
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
dnl Lesser General Public License for more details.
|
||
|
dnl
|
||
|
dnl You should have received a copy of the GNU Lesser General Public
|
||
|
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
|
||
|
dnl not, write to the Free Software Foundation, Inc., 51 Franklin Street,
|
||
|
dnl Fifth Floor, Boston, MA 02110-1301, USA.
|
||
|
|
||
|
include(`../config.m4')
|
||
|
|
||
|
|
||
|
C K6: 9.5 cycles/bit (approx) 1x1 gcd
|
||
|
C 11.0 cycles/limb Nx1 reduction (modexact_1_odd)
|
||
|
|
||
|
|
||
|
C mp_limb_t mpn_gcd_1 (mp_srcptr src, mp_size_t size, mp_limb_t y);
|
||
|
C
|
||
|
C This code is nothing very special, but offers a speedup over what gcc 2.95
|
||
|
C can do with mpn/generic/gcd_1.c.
|
||
|
C
|
||
|
C Future:
|
||
|
C
|
||
|
C Using a lookup table to count trailing zeros seems a touch quicker, but
|
||
|
C after a slightly longer startup. Might be worthwhile if an mpn_gcd_2 used
|
||
|
C it too.
|
||
|
|
||
|
|
||
|
dnl If size==1 and x (the larger operand) is more than DIV_THRESHOLD bits
|
||
|
dnl bigger than y, then a division x%y is done to reduce it.
|
||
|
dnl
|
||
|
dnl A divl is 20 cycles and the loop runs at about 9.5 cycles/bitpair so
|
||
|
dnl there should be an advantage in the divl at about 4 or 5 bits, which is
|
||
|
dnl what's found.
|
||
|
|
||
|
deflit(DIV_THRESHOLD, 5)
|
||
|
|
||
|
|
||
|
defframe(PARAM_LIMB, 12)
|
||
|
defframe(PARAM_SIZE, 8)
|
||
|
defframe(PARAM_SRC, 4)
|
||
|
|
||
|
TEXT
|
||
|
ALIGN(16)
|
||
|
|
||
|
PROLOGUE(mpn_gcd_1)
|
||
|
deflit(`FRAME',0)
|
||
|
|
||
|
ASSERT(ne, `cmpl $0, PARAM_LIMB')
|
||
|
ASSERT(ae, `cmpl $1, PARAM_SIZE')
|
||
|
|
||
|
|
||
|
movl PARAM_SRC, %eax
|
||
|
pushl %ebx FRAME_pushl()
|
||
|
|
||
|
movl PARAM_LIMB, %edx
|
||
|
movl $-1, %ecx
|
||
|
|
||
|
movl (%eax), %ebx C src low limb
|
||
|
|
||
|
movl %ebx, %eax C src low limb
|
||
|
orl %edx, %ebx
|
||
|
|
||
|
L(common_twos):
|
||
|
shrl %ebx
|
||
|
incl %ecx
|
||
|
|
||
|
jnc L(common_twos) C 1/4 chance on random data
|
||
|
shrl %cl, %edx C y
|
||
|
|
||
|
cmpl $1, PARAM_SIZE
|
||
|
ja L(size_two_or_more)
|
||
|
|
||
|
|
||
|
ASSERT(nz, `orl %eax, %eax') C should have src limb != 0
|
||
|
|
||
|
shrl %cl, %eax C x
|
||
|
|
||
|
|
||
|
C Swap if necessary to make x>=y. Measures a touch quicker as a
|
||
|
C jump than a branch free calculation.
|
||
|
C
|
||
|
C eax x
|
||
|
C ebx
|
||
|
C ecx common twos
|
||
|
C edx y
|
||
|
|
||
|
movl %eax, %ebx
|
||
|
cmpl %eax, %edx
|
||
|
|
||
|
jb L(noswap)
|
||
|
movl %edx, %eax
|
||
|
|
||
|
movl %ebx, %edx
|
||
|
movl %eax, %ebx
|
||
|
L(noswap):
|
||
|
|
||
|
|
||
|
C See if it's worth reducing x with a divl.
|
||
|
C
|
||
|
C eax x
|
||
|
C ebx x
|
||
|
C ecx common twos
|
||
|
C edx y
|
||
|
|
||
|
shrl $DIV_THRESHOLD, %ebx
|
||
|
|
||
|
cmpl %ebx, %edx
|
||
|
ja L(nodiv)
|
||
|
|
||
|
|
||
|
C Reduce x to x%y.
|
||
|
C
|
||
|
C eax x
|
||
|
C ebx
|
||
|
C ecx common twos
|
||
|
C edx y
|
||
|
|
||
|
movl %edx, %ebx
|
||
|
xorl %edx, %edx
|
||
|
|
||
|
divl %ebx
|
||
|
|
||
|
orl %edx, %edx C y
|
||
|
nop C code alignment
|
||
|
|
||
|
movl %ebx, %eax C x
|
||
|
jz L(done_shll)
|
||
|
L(nodiv):
|
||
|
|
||
|
|
||
|
C eax x
|
||
|
C ebx
|
||
|
C ecx common twos
|
||
|
C edx y
|
||
|
C esi
|
||
|
C edi
|
||
|
C ebp
|
||
|
|
||
|
L(strip_y):
|
||
|
shrl %edx
|
||
|
jnc L(strip_y)
|
||
|
|
||
|
leal 1(%edx,%edx), %edx
|
||
|
movl %ecx, %ebx C common twos
|
||
|
|
||
|
leal 1(%eax), %ecx
|
||
|
jmp L(strip_x_and)
|
||
|
|
||
|
|
||
|
C Calculating a %cl shift based on the low bit 0 or 1 avoids doing a branch
|
||
|
C on a 50/50 chance of 0 or 1. The chance of the next bit also being 0 is
|
||
|
C only 1/4.
|
||
|
C
|
||
|
C A second computed %cl shift was tried, but that measured a touch slower
|
||
|
C than branching back.
|
||
|
C
|
||
|
C A branch-free abs(x-y) and min(x,y) calculation was tried, but that
|
||
|
C measured about 1 cycle/bit slower.
|
||
|
|
||
|
C eax x
|
||
|
C ebx common twos
|
||
|
C ecx scratch
|
||
|
C edx y
|
||
|
|
||
|
ALIGN(4)
|
||
|
L(swap):
|
||
|
addl %eax, %edx C x-y+y = x
|
||
|
negl %eax C -(x-y) = y-x
|
||
|
|
||
|
L(strip_x):
|
||
|
shrl %eax C odd-odd = even, so always one to strip
|
||
|
ASSERT(nz)
|
||
|
|
||
|
L(strip_x_leal):
|
||
|
leal 1(%eax), %ecx
|
||
|
|
||
|
L(strip_x_and):
|
||
|
andl $1, %ecx C (x^1)&1
|
||
|
|
||
|
shrl %cl, %eax C shift if x even
|
||
|
|
||
|
testb $1, %al
|
||
|
jz L(strip_x)
|
||
|
|
||
|
ASSERT(nz,`testl $1, %eax') C x, y odd
|
||
|
ASSERT(nz,`testl $1, %edx')
|
||
|
|
||
|
subl %edx, %eax
|
||
|
jb L(swap)
|
||
|
ja L(strip_x)
|
||
|
|
||
|
|
||
|
movl %edx, %eax
|
||
|
movl %ebx, %ecx
|
||
|
|
||
|
L(done_shll):
|
||
|
shll %cl, %eax
|
||
|
popl %ebx
|
||
|
|
||
|
ret
|
||
|
|
||
|
|
||
|
C -----------------------------------------------------------------------------
|
||
|
C Two or more limbs.
|
||
|
C
|
||
|
C x={src,size} is reduced modulo y using either a plain mod_1 style
|
||
|
C remainder, or a modexact_1 style exact division.
|
||
|
|
||
|
deflit(MODEXACT_THRESHOLD, ifdef(`PIC', 4, 4))
|
||
|
|
||
|
ALIGN(8)
|
||
|
L(size_two_or_more):
|
||
|
C eax
|
||
|
C ebx
|
||
|
C ecx common twos
|
||
|
C edx y, without common twos
|
||
|
C esi
|
||
|
C edi
|
||
|
C ebp
|
||
|
|
||
|
deflit(FRAME_TWO_OR_MORE, FRAME)
|
||
|
|
||
|
pushl %edi defframe_pushl(SAVE_EDI)
|
||
|
movl PARAM_SRC, %ebx
|
||
|
|
||
|
L(y_twos):
|
||
|
shrl %edx
|
||
|
jnc L(y_twos)
|
||
|
|
||
|
movl %ecx, %edi C common twos
|
||
|
movl PARAM_SIZE, %ecx
|
||
|
|
||
|
pushl %esi defframe_pushl(SAVE_ESI)
|
||
|
leal 1(%edx,%edx), %esi C y (odd)
|
||
|
|
||
|
movl -4(%ebx,%ecx,4), %eax C src high limb
|
||
|
|
||
|
cmpl %edx, %eax C carry if high<divisor
|
||
|
|
||
|
sbbl %edx, %edx C -1 if high<divisor
|
||
|
|
||
|
addl %edx, %ecx C skip one limb if high<divisor
|
||
|
andl %eax, %edx
|
||
|
|
||
|
cmpl $MODEXACT_THRESHOLD, %ecx
|
||
|
jae L(modexact)
|
||
|
|
||
|
|
||
|
L(divide_top):
|
||
|
C eax scratch (quotient)
|
||
|
C ebx src
|
||
|
C ecx counter, size-1 to 1
|
||
|
C edx carry (remainder)
|
||
|
C esi divisor (odd)
|
||
|
C edi
|
||
|
C ebp
|
||
|
|
||
|
movl -4(%ebx,%ecx,4), %eax
|
||
|
divl %esi
|
||
|
loop L(divide_top)
|
||
|
|
||
|
|
||
|
movl %edx, %eax C x
|
||
|
movl %esi, %edx C y (odd)
|
||
|
|
||
|
movl %edi, %ebx C common twos
|
||
|
popl %esi
|
||
|
|
||
|
popl %edi
|
||
|
leal 1(%eax), %ecx
|
||
|
|
||
|
orl %eax, %eax
|
||
|
jnz L(strip_x_and)
|
||
|
|
||
|
|
||
|
movl %ebx, %ecx
|
||
|
movl %edx, %eax
|
||
|
|
||
|
shll %cl, %eax
|
||
|
popl %ebx
|
||
|
|
||
|
ret
|
||
|
|
||
|
|
||
|
ALIGN(8)
|
||
|
L(modexact):
|
||
|
C eax
|
||
|
C ebx src ptr
|
||
|
C ecx size or size-1
|
||
|
C edx
|
||
|
C esi y odd
|
||
|
C edi common twos
|
||
|
C ebp
|
||
|
|
||
|
movl PARAM_SIZE, %eax
|
||
|
pushl %esi FRAME_pushl()
|
||
|
|
||
|
pushl %eax FRAME_pushl()
|
||
|
|
||
|
pushl %ebx FRAME_pushl()
|
||
|
|
||
|
ifdef(`PIC',`
|
||
|
nop C code alignment
|
||
|
call L(movl_eip_ebx)
|
||
|
L(here):
|
||
|
addl $_GLOBAL_OFFSET_TABLE_, %ebx
|
||
|
call GSYM_PREFIX`'mpn_modexact_1_odd@PLT
|
||
|
',`
|
||
|
call GSYM_PREFIX`'mpn_modexact_1_odd
|
||
|
')
|
||
|
|
||
|
movl %esi, %edx C y odd
|
||
|
movl SAVE_ESI, %esi
|
||
|
|
||
|
movl %edi, %ebx C common twos
|
||
|
movl SAVE_EDI, %edi
|
||
|
|
||
|
addl $eval(FRAME - FRAME_TWO_OR_MORE), %esp
|
||
|
orl %eax, %eax
|
||
|
|
||
|
leal 1(%eax), %ecx
|
||
|
jnz L(strip_x_and)
|
||
|
|
||
|
|
||
|
movl %ebx, %ecx
|
||
|
movl %edx, %eax
|
||
|
|
||
|
shll %cl, %eax
|
||
|
popl %ebx
|
||
|
|
||
|
ret
|
||
|
|
||
|
|
||
|
ifdef(`PIC',`
|
||
|
L(movl_eip_ebx):
|
||
|
movl (%esp), %ebx
|
||
|
ret_internal
|
||
|
')
|
||
|
|
||
|
EPILOGUE()
|