a197a2d3eb
Removed directories for no longer supported architectures.
170 lines
4.0 KiB
NASM
170 lines
4.0 KiB
NASM
dnl Intel P6 mpn_copyd -- copy limb vector backwards.
|
|
|
|
dnl Copyright 2001, 2002 Free Software Foundation, Inc.
|
|
dnl
|
|
dnl This file is part of the GNU MP Library.
|
|
dnl
|
|
dnl The GNU MP Library is free software; you can redistribute it and/or
|
|
dnl modify it under the terms of the GNU Lesser General Public License as
|
|
dnl published by the Free Software Foundation; either version 2.1 of the
|
|
dnl License, or (at your option) any later version.
|
|
dnl
|
|
dnl The GNU MP Library is distributed in the hope that it will be useful,
|
|
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
dnl Lesser General Public License for more details.
|
|
dnl
|
|
dnl You should have received a copy of the GNU Lesser General Public
|
|
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
|
|
dnl not, write to the Free Software Foundation, Inc., 51 Franklin Street,
|
|
dnl Fifth Floor, Boston, MA 02110-1301, USA.
|
|
|
|
include(`../config.m4')
|
|
|
|
|
|
C P6: 1.75 cycles/limb, or 0.75 if no overlap
|
|
|
|
|
|
C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
|
|
C
|
|
C An explicit loop is used because a decrementing rep movsl is a bit slow at
|
|
C 2.4 c/l. That rep movsl also has about a 40 cycle startup time, and the
|
|
C code here stands a chance of being faster if the branches predict well.
|
|
C
|
|
C The slightly strange loop form seems necessary for the claimed speed.
|
|
C Maybe load/store ordering affects it.
|
|
C
|
|
C The source and destination are checked to see if they're actually
|
|
C overlapping, since it might be possible to use an incrementing rep movsl
|
|
C at 0.75 c/l. (It doesn't suffer the bad startup time of the decrementing
|
|
C version.)
|
|
C
|
|
C Enhancements:
|
|
C
|
|
C Top speed for an all-integer copy is probably 1.0 c/l, being one load and
|
|
C one store each cycle. Unrolling the loop below would approach 1.0, but
|
|
C it'd be good to know why something like store/load/subl + store/load/jnz
|
|
C doesn't already run at 1.0 c/l. It looks like it should decode in 2
|
|
C cycles, but doesn't run that way.
|
|
|
|
defframe(PARAM_SIZE,12)
|
|
defframe(PARAM_SRC, 8)
|
|
defframe(PARAM_DST, 4)
|
|
|
|
dnl re-using parameter space
|
|
define(SAVE_ESI,`PARAM_SIZE')
|
|
define(SAVE_EDI,`PARAM_SRC')
|
|
|
|
TEXT
|
|
ALIGN(16)
|
|
|
|
PROLOGUE(mpn_copyd)
|
|
deflit(`FRAME',0)
|
|
|
|
movl PARAM_SIZE, %ecx
|
|
|
|
movl %esi, SAVE_ESI
|
|
movl PARAM_SRC, %esi
|
|
|
|
movl %edi, SAVE_EDI
|
|
movl PARAM_DST, %edi
|
|
|
|
subl $1, %ecx
|
|
jb L(zero)
|
|
|
|
movl (%esi,%ecx,4), %eax C src[size-1]
|
|
jz L(one)
|
|
|
|
movl -4(%esi,%ecx,4), %edx C src[size-2]
|
|
subl $2, %ecx
|
|
jbe L(done_loop) C 2 or 3 limbs only
|
|
|
|
|
|
C The usual overlap is
|
|
C
|
|
C high low
|
|
C +------------------+
|
|
C | dst|
|
|
C +------------------+
|
|
C +------------------+
|
|
C | src|
|
|
C +------------------+
|
|
C
|
|
C We can use an incrementing copy in the following circumstances.
|
|
C
|
|
C src+4*size<=dst, since then the regions are disjoint
|
|
C
|
|
C src==dst, clearly (though this shouldn't occur normally)
|
|
C
|
|
C src>dst, since in that case it's a requirement of the
|
|
C parameters that src>=dst+size*4, and hence the
|
|
C regions are disjoint
|
|
C
|
|
|
|
leal (%edi,%ecx,4), %edx
|
|
cmpl %edi, %esi
|
|
jae L(use_movsl) C src >= dst
|
|
|
|
cmpl %edi, %edx
|
|
movl 4(%esi,%ecx,4), %edx C src[size-2] again
|
|
jbe L(use_movsl) C src+4*size <= dst
|
|
|
|
|
|
L(top):
|
|
C eax prev high limb
|
|
C ebx
|
|
C ecx counter, size-3 down to 0 or -1, inclusive, by 2s
|
|
C edx prev low limb
|
|
C esi src
|
|
C edi dst
|
|
C ebp
|
|
|
|
movl %eax, 8(%edi,%ecx,4)
|
|
movl (%esi,%ecx,4), %eax
|
|
|
|
movl %edx, 4(%edi,%ecx,4)
|
|
movl -4(%esi,%ecx,4), %edx
|
|
|
|
subl $2, %ecx
|
|
jnbe L(top)
|
|
|
|
|
|
L(done_loop):
|
|
movl %eax, 8(%edi,%ecx,4)
|
|
movl %edx, 4(%edi,%ecx,4)
|
|
|
|
C copy low limb (needed if size was odd, but will already have been
|
|
C done in the loop if size was even)
|
|
movl (%esi), %eax
|
|
L(one):
|
|
movl %eax, (%edi)
|
|
movl SAVE_EDI, %edi
|
|
movl SAVE_ESI, %esi
|
|
|
|
ret
|
|
|
|
|
|
L(use_movsl):
|
|
C eax
|
|
C ebx
|
|
C ecx size-3
|
|
C edx
|
|
C esi src
|
|
C edi dst
|
|
C ebp
|
|
|
|
addl $3, %ecx
|
|
|
|
cld C better safe than sorry, see mpn/x86/README
|
|
|
|
rep
|
|
movsl
|
|
|
|
L(zero):
|
|
movl SAVE_ESI, %esi
|
|
movl SAVE_EDI, %edi
|
|
|
|
ret
|
|
|
|
EPILOGUE()
|