dnl  mpn_copyi

dnl  Copyright 2009 Jason Moxham

dnl  This file is part of the MPIR Library.

dnl  The MPIR Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 2.1 of the License, or (at
dnl  your option) any later version.

dnl  The MPIR Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the MPIR Library; see the file COPYING.LIB.  If not, write
dnl  to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
dnl  Boston, MA 02110-1301, USA.

include(`../config.m4')

C  ret  mpn_copyi(mp_ptr, mp_ptr, mp_size_t)
C  rax            rdi,    rsi,    rdx
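
C  The notes below are added descriptive comments; the reference behaviour is
C  the usual mpn_copyi contract, an incrementing limb copy, roughly:
C
C      void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t n)
C      {
C          for (mp_size_t i = 0; i < n; i++)   /* ascending address order */
C              dst[i] = src[i];
C      }
C
C  The code does the copy with 16-byte SSE2 moves and (assuming limb-aligned
C  operands, as for all mpn routines) splits into three paths:
C    - aligned   : src and dst have the same alignment mod 16 (plain movdqa)
C    - fall-through below PROLOGUE: src is 16-byte aligned, dst is not
C    - srcisodd  : dst is 16-byte aligned, src is not
C  The two misaligned paths realign the data on the fly with shufpd.
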
ASM_START()
PROLOGUE(mpn_copyi)
        cmp     $0, %rdx                # needed for case n = 0
        jz      endfn
        mov     %rdi, %rax
        sub     %rsi, %rax
        test    $0xF, %rax              # same alignment mod 16?
        jz      aligned
        test    $0xF, %rdi              # dst aligned => src is the unaligned operand
        jz      srcisodd
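
C  Path for src 16-byte aligned, dst not (descriptive note, inferred from the
C  code): one limb is stored with movq so that dst becomes 16-byte aligned,
C  then the loop moves 4 limbs per iteration, stitching neighbouring limbs
C  together with shufpd before each aligned 16-byte store.  rcx starts at
C  5 - n and counts up towards zero.
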
        mov     $5, %rcx
        sub     %rdx, %rcx              # rcx = 5 - n
        lea     -40(%rsi,%rdx,8), %rsi
        lea     -40(%rdi,%rdx,8), %rdi
        movapd  (%rsi,%rcx,8), %xmm1
        movq    %xmm1, (%rdi,%rcx,8)    # copy the first limb; dst is now aligned
        add     $8, %rdi
        cmp     $1, %rdx                # needed for case n = 1
        jz      endfn
        cmp     $0, %rcx
        jge     skiplpud                # n <= 5: skip the main loop

        ALIGN(16)
lpud:   movapd  16(%rsi,%rcx,8), %xmm0
        add     $4, %rcx
        shufpd  $1, %xmm0, %xmm1        # xmm1.lo = old xmm1.hi, xmm1.hi = xmm0.lo
        movapd  %xmm1, -32(%rdi,%rcx,8)
        movapd  32-32(%rsi,%rcx,8), %xmm1
        shufpd  $1, %xmm1, %xmm0        # xmm0.lo = old xmm0.hi, xmm0.hi = xmm1.lo
        movapd  %xmm0, 16-32(%rdi,%rcx,8)
        jnc     lpud                    # continue while rcx is still negative
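
C  Leftover dispatch (descriptive note, inferred from the code): here rcx is
C  0..3, and the flags from a single cmp $2,%rcx select the tail: ja for
C  rcx = 3, jz for rcx = 2, jp for rcx = 1 (the parity of rcx - 2 separates 1
C  from 0), and rcx = 0 falls through to case3d.  The same flag trick is used
C  after skiplpus and, with cmp $-2,%rdx and jg/je/jnp, after skiplpa below.
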
skiplpud:
        cmp     $2, %rcx
        ja      case0d
        jz      case1d
        jp      case2d

        ALIGN(16)
case3d: movapd  16(%rsi,%rcx,8), %xmm0
        shufpd  $1, %xmm0, %xmm1
        movapd  %xmm1, (%rdi,%rcx,8)
        movapd  32(%rsi,%rcx,8), %xmm1  # top half is read past the end of src
        shufpd  $1, %xmm1, %xmm0
        movapd  %xmm0, 16(%rdi,%rcx,8)
        ret
        ALIGN(16)
case2d: movapd  16(%rsi,%rcx,8), %xmm0
        shufpd  $1, %xmm0, %xmm1
        movapd  %xmm1, (%rdi,%rcx,8)
        movhpd  %xmm0, 16(%rdi,%rcx,8)
        ret
        ALIGN(16)
case1d: movapd  16(%rsi,%rcx,8), %xmm0  # top half is read past the end of src
        shufpd  $1, %xmm0, %xmm1
        movapd  %xmm1, (%rdi,%rcx,8)
        ret
        ALIGN(16)
case0d: movhpd  %xmm1, (%rdi,%rcx,8)
endfn:  ret
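
C  srcisodd path (descriptive note, inferred from the code): dst is 16-byte
C  aligned and src is not.  The first aligned load starts 8 bytes before src
C  (that low half is never stored), and the same shufpd stitching loop as
C  above then runs with rcx starting at 4 - n.
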
srcisodd:
        mov     $4, %rcx
        sub     %rdx, %rcx              # rcx = 4 - n
        lea     -32(%rsi,%rdx,8), %rsi
        lea     -32(%rdi,%rdx,8), %rdi
        movapd  -8(%rsi,%rcx,8), %xmm1  # aligned load starting 8 bytes before src
        sub     $8, %rsi
        cmp     $0, %rcx
        jge     skiplpus                # n <= 4: skip the main loop

        ALIGN(16)
lpus:   movapd  16(%rsi,%rcx,8), %xmm0
        add     $4, %rcx
        shufpd  $1, %xmm0, %xmm1        # xmm1.lo = old xmm1.hi, xmm1.hi = xmm0.lo
        movapd  %xmm1, -32(%rdi,%rcx,8)
        movapd  32-32(%rsi,%rcx,8), %xmm1
        shufpd  $1, %xmm1, %xmm0        # xmm0.lo = old xmm0.hi, xmm0.hi = xmm1.lo
        movapd  %xmm0, 16-32(%rdi,%rcx,8)
        jnc     lpus                    # continue while rcx is still negative
skiplpus:
        cmp     $2, %rcx                # same leftover dispatch as after skiplpud
        ja      case0s
        jz      case1s
        jp      case2s

        ALIGN(16)
case3s: movapd  16(%rsi,%rcx,8), %xmm0
        shufpd  $1, %xmm0, %xmm1
        movapd  %xmm1, (%rdi,%rcx,8)
        movapd  32(%rsi,%rcx,8), %xmm1  # top half is read past the end of src
        shufpd  $1, %xmm1, %xmm0
        movapd  %xmm0, 16(%rdi,%rcx,8)
        ret
        ALIGN(16)
case2s: movapd  16(%rsi,%rcx,8), %xmm0
        shufpd  $1, %xmm0, %xmm1
        movapd  %xmm1, (%rdi,%rcx,8)
        movhpd  %xmm0, 16(%rdi,%rcx,8)
        ret
        ALIGN(16)
case1s: movapd  16(%rsi,%rcx,8), %xmm0  # top half is read past the end of src
        shufpd  $1, %xmm0, %xmm1
        movapd  %xmm1, (%rdi,%rcx,8)
        ret
        ALIGN(16)
case0s: movhpd  %xmm1, (%rdi,%rcx,8)
        ret
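
C  aligned path (descriptive note, inferred from the code): src and dst have
C  the same alignment mod 16.  If both are 8 bytes off, one limb is copied
C  with a plain mov so that both become 16-byte aligned; the loop then copies
C  4 limbs (32 bytes) per iteration with movdqa, rdx counting down by 4, and
C  the 0..3 leftover limbs are handled by casea0..casea3.
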
        ALIGN(16)
aligned:
        sub     $4, %rdx
        test    $0xF, %rdi
        jz      notodda
        mov     (%rsi), %rax            # copy one limb to align both pointers
        mov     %rax, (%rdi)
        sub     $1, %rdx
        lea     8(%rsi), %rsi
        lea     8(%rdi), %rdi
notodda:
        cmp     $0, %rdx
        jl      skiplpa                 # fewer than 4 limbs left: skip the loop

        ALIGN(16)
lpa:    movdqa  (%rsi), %xmm0
        sub     $4, %rdx
        movdqa  16(%rsi), %xmm1
        lea     32(%rsi), %rsi
        movdqa  %xmm0, (%rdi)
        lea     32(%rdi), %rdi
        movdqa  %xmm1, 16-32(%rdi)
        jnc     lpa                     # continue until the count goes negative
skiplpa:
        cmp     $-2, %rdx               # rdx is -4..-1, i.e. rdx + 4 leftover limbs
        jg      casea3
        je      casea2
        jnp     casea0
                                        # rdx = -3 falls through to casea1
casea1: mov     (%rsi), %rax
        mov     %rax, (%rdi)
        ret
casea3: movdqa  (%rsi), %xmm0
        mov     16(%rsi), %rax
        movdqa  %xmm0, (%rdi)
        mov     %rax, 16(%rdi)
casea0: ret
casea2: movdqa  (%rsi), %xmm0
        movdqa  %xmm0, (%rdi)
        ret

EPILOGUE()