351 lines
6.7 KiB
NASM
351 lines
6.7 KiB
NASM
dnl x86 generic mpn_sqr_basecase -- square an mpn number.
|
|
|
|
dnl Copyright 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
|
|
dnl
|
|
dnl This file is part of the GNU MP Library.
|
|
dnl
|
|
dnl The GNU MP Library is free software; you can redistribute it and/or modify
|
|
dnl it under the terms of the GNU Lesser General Public License as published
|
|
dnl by the Free Software Foundation; either version 2 of the License, or (at
|
|
dnl your option) any later version.
|
|
dnl
|
|
dnl The GNU MP Library is distributed in the hope that it will be useful, but
|
|
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
|
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
|
dnl License for more details.
|
|
dnl
|
|
dnl You should have received a copy of the GNU Lesser General Public License
|
|
dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
|
|
dnl the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
|
|
dnl Boston, MA 02110-1301, USA.
|
|
|
|
|
|
include(`../config.m4')
|
|
|
|
|
|
C cycles/crossproduct cycles/triangleproduct
|
|
C P5:
|
|
C P6:
|
|
C K6:
|
|
C K7:
|
|
C P4:
|
|
|
|
|
|
C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
|
|
C
|
|
C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
|
|
C lot of function call overheads are avoided, especially when the size is
|
|
C small.
|
|
C
|
|
C The mul1 loop is not unrolled like mul_1.asm, it doesn't seem worth the
|
|
C code size to do so here.
|
|
C
|
|
C Enhancements:
|
|
C
|
|
C The addmul loop here is also not unrolled like aorsmul_1.asm and
|
|
C mul_basecase.asm are. Perhaps it should be done. It'd add to the
|
|
C complexity, but if it's worth doing in the other places then it should be
|
|
C worthwhile here.
|
|
C
|
|
C A fully-unrolled style like other sqr_basecase.asm versions (k6, k7, p6)
|
|
C might be worth considering. That'd add quite a bit to the code size, but
|
|
C only as much as is used would be dragged into L1 cache.
|
|
|
|
defframe(PARAM_SIZE,12)
|
|
defframe(PARAM_SRC, 8)
|
|
defframe(PARAM_DST, 4)
|
|
|
|
TEXT
|
|
ALIGN(8)
|
|
PROLOGUE(mpn_sqr_basecase)
|
|
deflit(`FRAME',0)
|
|
|
|
movl PARAM_SIZE, %edx
|
|
|
|
movl PARAM_SRC, %eax
|
|
|
|
cmpl $2, %edx
|
|
movl PARAM_DST, %ecx
|
|
|
|
je L(two_limbs)
|
|
ja L(three_or_more)
|
|
|
|
|
|
C -----------------------------------------------------------------------------
|
|
C one limb only
|
|
C eax src
|
|
C ebx
|
|
C ecx dst
|
|
C edx
|
|
|
|
movl (%eax), %eax
|
|
mull %eax
|
|
movl %eax, (%ecx)
|
|
movl %edx, 4(%ecx)
|
|
ret
|
|
|
|
|
|
C -----------------------------------------------------------------------------
|
|
ALIGN(8)
|
|
L(two_limbs):
|
|
C eax src
|
|
C ebx
|
|
C ecx dst
|
|
C edx
|
|
|
|
pushl %ebx
|
|
pushl %ebp
|
|
|
|
movl %eax, %ebx
|
|
movl (%eax), %eax
|
|
|
|
mull %eax C src[0]^2
|
|
|
|
pushl %esi
|
|
pushl %edi
|
|
|
|
movl %edx, %esi C dst[1]
|
|
movl %eax, (%ecx) C dst[0]
|
|
|
|
movl 4(%ebx), %eax
|
|
mull %eax C src[1]^2
|
|
|
|
movl %eax, %edi C dst[2]
|
|
movl %edx, %ebp C dst[3]
|
|
|
|
movl (%ebx), %eax
|
|
mull 4(%ebx) C src[0]*src[1]
|
|
|
|
addl %eax, %esi
|
|
|
|
adcl %edx, %edi
|
|
|
|
adcl $0, %ebp
|
|
addl %esi, %eax
|
|
|
|
adcl %edi, %edx
|
|
movl %eax, 4(%ecx)
|
|
|
|
adcl $0, %ebp
|
|
|
|
movl %edx, 8(%ecx)
|
|
movl %ebp, 12(%ecx)
|
|
|
|
popl %edi
|
|
popl %esi
|
|
|
|
popl %ebp
|
|
popl %ebx
|
|
|
|
ret
|
|
|
|
|
|
C -----------------------------------------------------------------------------
|
|
ALIGN(8)
|
|
L(three_or_more):
|
|
deflit(`FRAME',0)
|
|
C eax src
|
|
C ebx
|
|
C ecx dst
|
|
C edx size
|
|
|
|
pushl %ebx FRAME_pushl()
|
|
pushl %edi FRAME_pushl()
|
|
|
|
pushl %esi FRAME_pushl()
|
|
pushl %ebp FRAME_pushl()
|
|
|
|
leal (%ecx,%edx,4), %edi C &dst[size], end of this mul1
|
|
leal (%eax,%edx,4), %esi C &src[size]
|
|
|
|
C First multiply src[0]*src[1..size-1] and store at dst[1..size].
|
|
|
|
movl (%eax), %ebp C src[0], multiplier
|
|
movl %edx, %ecx
|
|
|
|
negl %ecx C -size
|
|
xorl %ebx, %ebx C clear carry limb
|
|
|
|
incl %ecx C -(size-1)
|
|
|
|
L(mul1):
|
|
C eax scratch
|
|
C ebx carry
|
|
C ecx counter, limbs, negative
|
|
C edx scratch
|
|
C esi &src[size]
|
|
C edi &dst[size]
|
|
C ebp multiplier
|
|
|
|
movl (%esi,%ecx,4), %eax
|
|
mull %ebp
|
|
addl %eax, %ebx
|
|
adcl $0, %edx
|
|
movl %ebx, (%edi,%ecx,4)
|
|
movl %edx, %ebx
|
|
incl %ecx
|
|
jnz L(mul1)
|
|
|
|
movl %ebx, (%edi)
|
|
|
|
|
|
C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for
|
|
C n=1..size-2.
|
|
C
|
|
C The last products src[size-2]*src[size-1], which is the end corner
|
|
C of the product triangle, is handled separately at the end to save
|
|
C looping overhead. If size is 3 then it's only this that needs to
|
|
C be done.
|
|
C
|
|
C In the outer loop %esi is a constant, and %edi just advances by 1
|
|
C limb each time. The size of the operation decreases by 1 limb
|
|
C each time.
|
|
|
|
C eax
|
|
C ebx carry (needing carry flag added)
|
|
C ecx
|
|
C edx
|
|
C esi &src[size]
|
|
C edi &dst[size]
|
|
C ebp
|
|
|
|
movl PARAM_SIZE, %ecx
|
|
subl $3, %ecx
|
|
jz L(corner)
|
|
|
|
negl %ecx
|
|
|
|
dnl re-use parameter space
|
|
define(VAR_OUTER,`PARAM_DST')
|
|
|
|
L(outer):
|
|
C eax
|
|
C ebx
|
|
C ecx
|
|
C edx outer loop counter, -(size-3) to -1
|
|
C esi &src[size]
|
|
C edi dst, pointing at stored carry limb of previous loop
|
|
C ebp
|
|
|
|
movl %ecx, VAR_OUTER
|
|
addl $4, %edi C advance dst end
|
|
|
|
movl -8(%esi,%ecx,4), %ebp C next multiplier
|
|
subl $1, %ecx
|
|
|
|
xorl %ebx, %ebx C initial carry limb
|
|
|
|
L(inner):
|
|
C eax scratch
|
|
C ebx carry (needing carry flag added)
|
|
C ecx counter, -n-1 to -1
|
|
C edx scratch
|
|
C esi &src[size]
|
|
C edi dst end of this addmul
|
|
C ebp multiplier
|
|
|
|
movl (%esi,%ecx,4), %eax
|
|
mull %ebp
|
|
addl %ebx, %eax
|
|
adcl $0, %edx
|
|
addl %eax, (%edi,%ecx,4)
|
|
adcl $0, %edx
|
|
movl %edx, %ebx
|
|
addl $1, %ecx
|
|
jl L(inner)
|
|
|
|
|
|
movl %ebx, (%edi)
|
|
movl VAR_OUTER, %ecx
|
|
incl %ecx
|
|
jnz L(outer)
|
|
|
|
|
|
L(corner):
|
|
C esi &src[size]
|
|
C edi &dst[2*size-3]
|
|
|
|
movl -4(%esi), %eax
|
|
mull -8(%esi) C src[size-1]*src[size-2]
|
|
addl %eax, 0(%edi)
|
|
adcl $0, %edx
|
|
movl %edx, 4(%edi) C dst high limb
|
|
|
|
|
|
C -----------------------------------------------------------------------------
|
|
C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
|
|
|
|
movl PARAM_SIZE, %eax
|
|
negl %eax
|
|
addl $1, %eax C -(size-1) and clear carry
|
|
|
|
L(lshift):
|
|
C eax counter, negative
|
|
C ebx next limb
|
|
C ecx
|
|
C edx
|
|
C esi
|
|
C edi &dst[2*size-4]
|
|
C ebp
|
|
|
|
rcll 8(%edi,%eax,8)
|
|
rcll 12(%edi,%eax,8)
|
|
incl %eax
|
|
jnz L(lshift)
|
|
|
|
|
|
adcl %eax, %eax C high bit out
|
|
movl %eax, 8(%edi) C dst most significant limb
|
|
|
|
|
|
C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
|
|
C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the
|
|
C low limb of src[0]^2.
|
|
|
|
movl PARAM_SRC, %esi
|
|
movl (%esi), %eax C src[0]
|
|
mull %eax C src[0]^2
|
|
|
|
movl PARAM_SIZE, %ecx
|
|
leal (%esi,%ecx,4), %esi C src end
|
|
|
|
negl %ecx C -size
|
|
movl %edx, %ebx C initial carry
|
|
|
|
movl %eax, 12(%edi,%ecx,8) C dst[0]
|
|
incl %ecx C -(size-1)
|
|
|
|
L(diag):
|
|
C eax scratch (low product)
|
|
C ebx carry limb
|
|
C ecx counter, -(size-1) to -1
|
|
C edx scratch (high product)
|
|
C esi &src[size]
|
|
C edi &dst[2*size-3]
|
|
C ebp scratch (fetched dst limbs)
|
|
|
|
movl (%esi,%ecx,4), %eax
|
|
mull %eax
|
|
|
|
addl %ebx, 8(%edi,%ecx,8)
|
|
movl %edx, %ebx
|
|
|
|
adcl %eax, 12(%edi,%ecx,8)
|
|
adcl $0, %ebx
|
|
|
|
incl %ecx
|
|
jnz L(diag)
|
|
|
|
|
|
addl %ebx, 8(%edi) C dst most significant limb
|
|
|
|
popl %ebp
|
|
popl %esi
|
|
|
|
popl %edi
|
|
popl %ebx
|
|
|
|
ret
|
|
|
|
EPILOGUE()
|