520 lines
8.6 KiB
NASM
520 lines
8.6 KiB
NASM
|
dnl Intel P5 mpn_sqr_basecase -- square an mpn number.
|
||
|
|
||
|
dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
|
||
|
dnl
|
||
|
dnl This file is part of the GNU MP Library.
|
||
|
dnl
|
||
|
dnl The GNU MP Library is free software; you can redistribute it and/or
|
||
|
dnl modify it under the terms of the GNU Lesser General Public License as
|
||
|
dnl published by the Free Software Foundation; either version 2.1 of the
|
||
|
dnl License, or (at your option) any later version.
|
||
|
dnl
|
||
|
dnl The GNU MP Library is distributed in the hope that it will be useful,
|
||
|
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
dnl Lesser General Public License for more details.
|
||
|
dnl
|
||
|
dnl You should have received a copy of the GNU Lesser General Public
|
||
|
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
|
||
|
dnl not, write to the Free Software Foundation, Inc., 51 Franklin Street,
|
||
|
dnl Fifth Floor, Boston, MA 02110-1301, USA.
|
||
|
|
||
|
include(`../config.m4')
|
||
|
|
||
|
|
||
|
C P5: approx 8 cycles per crossproduct, or 15.5 cycles per triangular
|
||
|
C product at around 20x20 limbs.
|
||
|
|
||
|
|
||
|
C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
|
||
|
C
|
||
|
C Calculate src,size squared, storing the result in dst,2*size.
|
||
|
C
|
||
|
C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
|
||
|
C lot of function call overheads are avoided, especially when the size is
|
||
|
C small.
|
||
|
|
||
|
defframe(PARAM_SIZE,12)
|
||
|
defframe(PARAM_SRC, 8)
|
||
|
defframe(PARAM_DST, 4)
|
||
|
|
||
|
TEXT
|
||
|
ALIGN(8)
|
||
|
PROLOGUE(mpn_sqr_basecase)
|
||
|
deflit(`FRAME',0)
|
||
|
|
||
|
movl PARAM_SIZE, %edx
|
||
|
movl PARAM_SRC, %eax
|
||
|
|
||
|
cmpl $2, %edx
|
||
|
movl PARAM_DST, %ecx
|
||
|
|
||
|
je L(two_limbs)
|
||
|
|
||
|
movl (%eax), %eax
|
||
|
ja L(three_or_more)
|
||
|
|
||
|
C -----------------------------------------------------------------------------
|
||
|
C one limb only
|
||
|
C eax src
|
||
|
C ebx
|
||
|
C ecx dst
|
||
|
C edx
|
||
|
|
||
|
mull %eax
|
||
|
|
||
|
movl %eax, (%ecx)
|
||
|
movl %edx, 4(%ecx)
|
||
|
|
||
|
ret
|
||
|
|
||
|
C -----------------------------------------------------------------------------
|
||
|
ALIGN(8)
|
||
|
L(two_limbs):
|
||
|
C eax src
|
||
|
C ebx
|
||
|
C ecx dst
|
||
|
C edx size
|
||
|
|
||
|
pushl %ebp
|
||
|
pushl %edi
|
||
|
|
||
|
pushl %esi
|
||
|
pushl %ebx
|
||
|
|
||
|
movl %eax, %ebx
|
||
|
movl (%eax), %eax
|
||
|
|
||
|
mull %eax C src[0]^2
|
||
|
|
||
|
movl %eax, (%ecx) C dst[0]
|
||
|
movl %edx, %esi C dst[1]
|
||
|
|
||
|
movl 4(%ebx), %eax
|
||
|
|
||
|
mull %eax C src[1]^2
|
||
|
|
||
|
movl %eax, %edi C dst[2]
|
||
|
movl %edx, %ebp C dst[3]
|
||
|
|
||
|
movl (%ebx), %eax
|
||
|
|
||
|
mull 4(%ebx) C src[0]*src[1]
|
||
|
|
||
|
addl %eax, %esi
|
||
|
popl %ebx
|
||
|
|
||
|
adcl %edx, %edi
|
||
|
|
||
|
adcl $0, %ebp
|
||
|
addl %esi, %eax
|
||
|
|
||
|
adcl %edi, %edx
|
||
|
movl %eax, 4(%ecx)
|
||
|
|
||
|
adcl $0, %ebp
|
||
|
popl %esi
|
||
|
|
||
|
movl %edx, 8(%ecx)
|
||
|
movl %ebp, 12(%ecx)
|
||
|
|
||
|
popl %edi
|
||
|
popl %ebp
|
||
|
|
||
|
ret
|
||
|
|
||
|
|
||
|
C -----------------------------------------------------------------------------
|
||
|
ALIGN(8)
|
||
|
L(three_or_more):
|
||
|
C eax src low limb
|
||
|
C ebx
|
||
|
C ecx dst
|
||
|
C edx size
|
||
|
|
||
|
cmpl $4, %edx
|
||
|
pushl %ebx
|
||
|
deflit(`FRAME',4)
|
||
|
|
||
|
movl PARAM_SRC, %ebx
|
||
|
jae L(four_or_more)
|
||
|
|
||
|
|
||
|
C -----------------------------------------------------------------------------
|
||
|
C three limbs
|
||
|
C eax src low limb
|
||
|
C ebx src
|
||
|
C ecx dst
|
||
|
C edx size
|
||
|
|
||
|
pushl %ebp
|
||
|
pushl %edi
|
||
|
|
||
|
mull %eax C src[0] ^ 2
|
||
|
|
||
|
movl %eax, (%ecx)
|
||
|
movl %edx, 4(%ecx)
|
||
|
|
||
|
movl 4(%ebx), %eax
|
||
|
xorl %ebp, %ebp
|
||
|
|
||
|
mull %eax C src[1] ^ 2
|
||
|
|
||
|
movl %eax, 8(%ecx)
|
||
|
movl %edx, 12(%ecx)
|
||
|
|
||
|
movl 8(%ebx), %eax
|
||
|
pushl %esi C risk of cache bank clash
|
||
|
|
||
|
mull %eax C src[2] ^ 2
|
||
|
|
||
|
movl %eax, 16(%ecx)
|
||
|
movl %edx, 20(%ecx)
|
||
|
|
||
|
movl (%ebx), %eax
|
||
|
|
||
|
mull 4(%ebx) C src[0] * src[1]
|
||
|
|
||
|
movl %eax, %esi
|
||
|
movl %edx, %edi
|
||
|
|
||
|
movl (%ebx), %eax
|
||
|
|
||
|
mull 8(%ebx) C src[0] * src[2]
|
||
|
|
||
|
addl %eax, %edi
|
||
|
movl %edx, %ebp
|
||
|
|
||
|
adcl $0, %ebp
|
||
|
movl 4(%ebx), %eax
|
||
|
|
||
|
mull 8(%ebx) C src[1] * src[2]
|
||
|
|
||
|
xorl %ebx, %ebx
|
||
|
addl %eax, %ebp
|
||
|
|
||
|
C eax
|
||
|
C ebx zero, will be dst[5]
|
||
|
C ecx dst
|
||
|
C edx dst[4]
|
||
|
C esi dst[1]
|
||
|
C edi dst[2]
|
||
|
C ebp dst[3]
|
||
|
|
||
|
adcl $0, %edx
|
||
|
addl %esi, %esi
|
||
|
|
||
|
adcl %edi, %edi
|
||
|
|
||
|
adcl %ebp, %ebp
|
||
|
|
||
|
adcl %edx, %edx
|
||
|
movl 4(%ecx), %eax
|
||
|
|
||
|
adcl $0, %ebx
|
||
|
addl %esi, %eax
|
||
|
|
||
|
movl %eax, 4(%ecx)
|
||
|
movl 8(%ecx), %eax
|
||
|
|
||
|
adcl %edi, %eax
|
||
|
movl 12(%ecx), %esi
|
||
|
|
||
|
adcl %ebp, %esi
|
||
|
movl 16(%ecx), %edi
|
||
|
|
||
|
movl %eax, 8(%ecx)
|
||
|
movl %esi, 12(%ecx)
|
||
|
|
||
|
adcl %edx, %edi
|
||
|
popl %esi
|
||
|
|
||
|
movl 20(%ecx), %eax
|
||
|
movl %edi, 16(%ecx)
|
||
|
|
||
|
popl %edi
|
||
|
popl %ebp
|
||
|
|
||
|
adcl %ebx, %eax C no carry out of this
|
||
|
popl %ebx
|
||
|
|
||
|
movl %eax, 20(%ecx)
|
||
|
|
||
|
ret
|
||
|
|
||
|
|
||
|
C -----------------------------------------------------------------------------
|
||
|
ALIGN(8)
|
||
|
L(four_or_more):
|
||
|
C eax src low limb
|
||
|
C ebx src
|
||
|
C ecx dst
|
||
|
C edx size
|
||
|
C esi
|
||
|
C edi
|
||
|
C ebp
|
||
|
C
|
||
|
C First multiply src[0]*src[1..size-1] and store at dst[1..size].
|
||
|
|
||
|
deflit(`FRAME',4)
|
||
|
|
||
|
pushl %edi
|
||
|
FRAME_pushl()
|
||
|
pushl %esi
|
||
|
FRAME_pushl()
|
||
|
|
||
|
pushl %ebp
|
||
|
FRAME_pushl()
|
||
|
leal (%ecx,%edx,4), %edi C dst end of this mul1
|
||
|
|
||
|
leal (%ebx,%edx,4), %esi C src end
|
||
|
movl %ebx, %ebp C src
|
||
|
|
||
|
negl %edx C -size
|
||
|
xorl %ebx, %ebx C clear carry limb and carry flag
|
||
|
|
||
|
leal 1(%edx), %ecx C -(size-1)
|
||
|
|
||
|
L(mul1):
|
||
|
C eax scratch
|
||
|
C ebx carry
|
||
|
C ecx counter, negative
|
||
|
C edx scratch
|
||
|
C esi &src[size]
|
||
|
C edi &dst[size]
|
||
|
C ebp src
|
||
|
|
||
|
adcl $0, %ebx
|
||
|
movl (%esi,%ecx,4), %eax
|
||
|
|
||
|
mull (%ebp)
|
||
|
|
||
|
addl %eax, %ebx
|
||
|
|
||
|
movl %ebx, (%edi,%ecx,4)
|
||
|
incl %ecx
|
||
|
|
||
|
movl %edx, %ebx
|
||
|
jnz L(mul1)
|
||
|
|
||
|
|
||
|
C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for
|
||
|
C n=1..size-2.
|
||
|
C
|
||
|
C The last two products, which are the end corner of the product
|
||
|
C triangle, are handled separately to save looping overhead. These
|
||
|
C are src[size-3]*src[size-2,size-1] and src[size-2]*src[size-1].
|
||
|
C If size is 4 then it's only these that need to be done.
|
||
|
C
|
||
|
C In the outer loop %esi is a constant, and %edi just advances by 1
|
||
|
C limb each time. The size of the operation decreases by 1 limb
|
||
|
C each time.
|
||
|
|
||
|
C eax
|
||
|
C ebx carry (needing carry flag added)
|
||
|
C ecx
|
||
|
C edx
|
||
|
C esi &src[size]
|
||
|
C edi &dst[size]
|
||
|
C ebp
|
||
|
|
||
|
adcl $0, %ebx
|
||
|
movl PARAM_SIZE, %edx
|
||
|
|
||
|
movl %ebx, (%edi)
|
||
|
subl $4, %edx
|
||
|
|
||
|
negl %edx
|
||
|
jz L(corner)
|
||
|
|
||
|
|
||
|
L(outer):
|
||
|
C ebx previous carry limb to store
|
||
|
C edx outer loop counter (negative)
|
||
|
C esi &src[size]
|
||
|
C edi dst, pointing at stored carry limb of previous loop
|
||
|
|
||
|
pushl %edx C new outer loop counter
|
||
|
leal -2(%edx), %ecx
|
||
|
|
||
|
movl %ebx, (%edi)
|
||
|
addl $4, %edi
|
||
|
|
||
|
addl $4, %ebp
|
||
|
xorl %ebx, %ebx C initial carry limb, clear carry flag
|
||
|
|
||
|
L(inner):
|
||
|
C eax scratch
|
||
|
C ebx carry (needing carry flag added)
|
||
|
C ecx counter, negative
|
||
|
C edx scratch
|
||
|
C esi &src[size]
|
||
|
C edi dst end of this addmul
|
||
|
C ebp &src[j]
|
||
|
|
||
|
adcl $0, %ebx
|
||
|
movl (%esi,%ecx,4), %eax
|
||
|
|
||
|
mull (%ebp)
|
||
|
|
||
|
addl %ebx, %eax
|
||
|
movl (%edi,%ecx,4), %ebx
|
||
|
|
||
|
adcl $0, %edx
|
||
|
addl %eax, %ebx
|
||
|
|
||
|
movl %ebx, (%edi,%ecx,4)
|
||
|
incl %ecx
|
||
|
|
||
|
movl %edx, %ebx
|
||
|
jnz L(inner)
|
||
|
|
||
|
|
||
|
adcl $0, %ebx
|
||
|
popl %edx C outer loop counter
|
||
|
|
||
|
incl %edx
|
||
|
jnz L(outer)
|
||
|
|
||
|
|
||
|
movl %ebx, (%edi)
|
||
|
|
||
|
L(corner):
|
||
|
C esi &src[size]
|
||
|
C edi &dst[2*size-4]
|
||
|
|
||
|
movl -8(%esi), %eax
|
||
|
movl -4(%edi), %ebx C risk of data cache bank clash here
|
||
|
|
||
|
mull -12(%esi) C src[size-2]*src[size-3]
|
||
|
|
||
|
addl %eax, %ebx
|
||
|
movl %edx, %ecx
|
||
|
|
||
|
adcl $0, %ecx
|
||
|
movl -4(%esi), %eax
|
||
|
|
||
|
mull -12(%esi) C src[size-1]*src[size-3]
|
||
|
|
||
|
addl %ecx, %eax
|
||
|
movl (%edi), %ecx
|
||
|
|
||
|
adcl $0, %edx
|
||
|
movl %ebx, -4(%edi)
|
||
|
|
||
|
addl %eax, %ecx
|
||
|
movl %edx, %ebx
|
||
|
|
||
|
adcl $0, %ebx
|
||
|
movl -4(%esi), %eax
|
||
|
|
||
|
mull -8(%esi) C src[size-1]*src[size-2]
|
||
|
|
||
|
movl %ecx, (%edi)
|
||
|
addl %eax, %ebx
|
||
|
|
||
|
adcl $0, %edx
|
||
|
movl PARAM_SIZE, %eax
|
||
|
|
||
|
negl %eax
|
||
|
movl %ebx, 4(%edi)
|
||
|
|
||
|
addl $1, %eax C -(size-1) and clear carry
|
||
|
movl %edx, 8(%edi)
|
||
|
|
||
|
|
||
|
C -----------------------------------------------------------------------------
|
||
|
C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
|
||
|
|
||
|
L(lshift):
|
||
|
C eax counter, negative
|
||
|
C ebx next limb
|
||
|
C ecx
|
||
|
C edx
|
||
|
C esi
|
||
|
C edi &dst[2*size-4]
|
||
|
C ebp
|
||
|
|
||
|
movl 12(%edi,%eax,8), %ebx
|
||
|
|
||
|
rcll %ebx
|
||
|
movl 16(%edi,%eax,8), %ecx
|
||
|
|
||
|
rcll %ecx
|
||
|
movl %ebx, 12(%edi,%eax,8)
|
||
|
|
||
|
movl %ecx, 16(%edi,%eax,8)
|
||
|
incl %eax
|
||
|
|
||
|
jnz L(lshift)
|
||
|
|
||
|
|
||
|
adcl %eax, %eax C high bit out
|
||
|
movl PARAM_SRC, %esi
|
||
|
|
||
|
movl PARAM_SIZE, %ecx C risk of cache bank clash
|
||
|
movl %eax, 12(%edi) C dst most significant limb
|
||
|
|
||
|
|
||
|
C -----------------------------------------------------------------------------
|
||
|
C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
|
||
|
C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the
|
||
|
C low limb of src[0]^2.
|
||
|
|
||
|
movl (%esi), %eax C src[0]
|
||
|
leal (%esi,%ecx,4), %esi C src end
|
||
|
|
||
|
negl %ecx
|
||
|
|
||
|
mull %eax
|
||
|
|
||
|
movl %eax, 16(%edi,%ecx,8) C dst[0]
|
||
|
movl %edx, %ebx
|
||
|
|
||
|
addl $1, %ecx C size-1 and clear carry
|
||
|
|
||
|
L(diag):
|
||
|
C eax scratch (low product)
|
||
|
C ebx carry limb
|
||
|
C ecx counter, negative
|
||
|
C edx scratch (high product)
|
||
|
C esi &src[size]
|
||
|
C edi &dst[2*size-4]
|
||
|
C ebp scratch (fetched dst limbs)
|
||
|
|
||
|
movl (%esi,%ecx,4), %eax
|
||
|
adcl $0, %ebx
|
||
|
|
||
|
mull %eax
|
||
|
|
||
|
movl 16-4(%edi,%ecx,8), %ebp
|
||
|
|
||
|
addl %ebp, %ebx
|
||
|
movl 16(%edi,%ecx,8), %ebp
|
||
|
|
||
|
adcl %eax, %ebp
|
||
|
movl %ebx, 16-4(%edi,%ecx,8)
|
||
|
|
||
|
movl %ebp, 16(%edi,%ecx,8)
|
||
|
incl %ecx
|
||
|
|
||
|
movl %edx, %ebx
|
||
|
jnz L(diag)
|
||
|
|
||
|
|
||
|
adcl $0, %edx
|
||
|
movl 16-4(%edi), %eax C dst most significant limb
|
||
|
|
||
|
addl %eax, %edx
|
||
|
popl %ebp
|
||
|
|
||
|
movl %edx, 16-4(%edi)
|
||
|
popl %esi C risk of cache bank clash
|
||
|
|
||
|
popl %edi
|
||
|
popl %ebx
|
||
|
|
||
|
ret
|
||
|
|
||
|
EPILOGUE()
|