mpir/mpn/x86_64/atom/copyi.asm

dnl mpn_copyi
dnl Copyright 2009 Jason Moxham
dnl This file is part of the MPIR Library.
dnl The MPIR Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 2.1 of the License, or (at
dnl your option) any later version.
dnl The MPIR Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the MPIR Library; see the file COPYING.LIB. If not, write
dnl to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
dnl Boston, MA 02110-1301, USA.
include(`../config.m4')
C ret mpn_copyi(mp_ptr, mp_ptr, mp_size_t)
C rax           rdi     rsi     rdx
ASM_START()
PROLOGUE(mpn_copyi)
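# mpn_copyi: copy n limbs from {rsi, n} to {rdi, n} in order of increasing
# addresses.  Three paths, chosen by the relative 16-byte alignment of the
# two pointers:
#   - both pointers share the same offset mod 16 -> 'aligned' (movdqa copy)
#   - src 16-byte aligned, dst offset by 8       -> fall-through path below
#   - dst 16-byte aligned, src offset by 8       -> 'srcisodd'
# The two mixed-alignment paths keep every vector load and store aligned and
# use shufpd to shift the data by one limb between load and store.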
cmp $0,%rdx #needed for case n=0
jz endfn #needed for case n=0
mov %rdi,%rax
sub %rsi,%rax
test $0xF,%rax
jz aligned
test $0xF,%rdi
jz srcisodd
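# Mixed alignment, dst odd: src is 16-byte aligned, dst is offset by 8.
# Both pointers are biased towards the top of the block and indexed with
# the negative counter rcx = 5-n, so (%rsi,%rcx,8) initially addresses
# src[0].  The first limb is copied with a movq so that the remaining
# stores to dst are 16-byte aligned.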
mov $5,%rcx
sub %rdx,%rcx
lea -40(%rsi,%rdx,8),%rsi
lea -40(%rdi,%rdx,8),%rdi
movapd (%rsi,%rcx,8),%xmm1
movq %xmm1,(%rdi,%rcx,8)
add $8,%rdi
cmp $1,%rdx #needed for case n=1
jz endfn #needed for case n=1
cmp $0,%rcx
jge skiplpud
ALIGN(16)
lpud: movapd 16(%rsi,%rcx,8),%xmm0
add $4,%rcx
shufpd $1,%xmm0,%xmm1
movapd %xmm1,-32(%rdi,%rcx,8)
movapd 32-32(%rsi,%rcx,8),%xmm1
shufpd $1,%xmm1,%xmm0
movapd %xmm0,16-32(%rdi,%rcx,8)
jnc lpud
skiplpud:
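# Tail: rcx is now 0..3 and caseKd stores the last K+1 limbs.  A single
# 'cmp $2,%rcx' drives the dispatch: ja for rcx=3, jz for rcx=2, and jp
# (parity of rcx-2) separates rcx=1 from rcx=0.  The cases marked
# "read past" load up to 8 bytes beyond the end of src.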
cmp $2,%rcx
ja case0d
jz case1d
jp case2d
ALIGN(16)
case3d: movapd 16(%rsi,%rcx,8),%xmm0
shufpd $1,%xmm0,%xmm1
movapd %xmm1,(%rdi,%rcx,8)
movapd 32(%rsi,%rcx,8),%xmm1 # top is read past
shufpd $1,%xmm1,%xmm0
movapd %xmm0,16(%rdi,%rcx,8)
ret
ALIGN(16)
case2d: movapd 16(%rsi,%rcx,8),%xmm0
shufpd $1,%xmm0,%xmm1
movapd %xmm1,(%rdi,%rcx,8)
movhpd %xmm0,16(%rdi,%rcx,8)
ret
ALIGN(16)
case1d: movapd 16(%rsi,%rcx,8),%xmm0 # top read past
shufpd $1,%xmm0,%xmm1
movapd %xmm1,(%rdi,%rcx,8)
ret
ALIGN(16)
case0d: movhpd %xmm1,(%rdi,%rcx,8)
endfn: ret
srcisodd:
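# Mixed alignment, src odd: dst is 16-byte aligned, src is offset by 8.
# The preload fetches the aligned 16 bytes starting 8 bytes below src[0]
# (still inside the 16-byte block that holds src[0], so no page below src
# is touched); its high half is src[0].  The loop and tail then mirror the
# path above.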
mov $4,%rcx
sub %rdx,%rcx
lea -32(%rsi,%rdx,8),%rsi
lea -32(%rdi,%rdx,8),%rdi
movapd -8(%rsi,%rcx,8),%xmm1
sub $8,%rsi
cmp $0,%rcx
jge skiplpus
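# Same four-limb shufpd merge loop as lpud above.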
ALIGN(16)
lpus: movapd 16(%rsi,%rcx,8),%xmm0
add $4,%rcx
shufpd $1,%xmm0,%xmm1
movapd %xmm1,-32(%rdi,%rcx,8)
movapd 32-32(%rsi,%rcx,8),%xmm1
shufpd $1,%xmm1,%xmm0
movapd %xmm0,16-32(%rdi,%rcx,8)
jnc lpus
skiplpus:
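# Same tail dispatch as skiplpud: caseKs stores the last K+1 limbs.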
cmp $2,%rcx
ja case0s
jz case1s
jp case2s
ALIGN(16)
case3s: movapd 16(%rsi,%rcx,8),%xmm0
shufpd $1,%xmm0,%xmm1
movapd %xmm1,(%rdi,%rcx,8)
movapd 32(%rsi,%rcx,8),%xmm1 # read past
shufpd $1,%xmm1,%xmm0
movapd %xmm0,16(%rdi,%rcx,8)
ret
ALIGN(16)
case2s: movapd 16(%rsi,%rcx,8),%xmm0
shufpd $1,%xmm0,%xmm1
movapd %xmm1,(%rdi,%rcx,8)
movhpd %xmm0,16(%rdi,%rcx,8)
ret
ALIGN(16)
case1s: movapd 16(%rsi,%rcx,8),%xmm0 # read past
shufpd $1,%xmm0,%xmm1
movapd %xmm1,(%rdi,%rcx,8)
ret
ALIGN(16)
case0s: movhpd %xmm1,(%rdi,%rcx,8)
ret
ALIGN(16)
aligned:
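# dst and src share the same offset mod 16.  n is biased by -4 so it can be
# used as a borrow-terminated loop counter.  If dst is not 16-byte aligned,
# one limb is copied first so that both pointers become aligned for the
# movdqa loop.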
sub $4,%rdx
test $0xF,%rdi
jz notodda
mov (%rsi),%rax
mov %rax,(%rdi)
sub $1,%rdx
lea 8(%rsi),%rsi
lea 8(%rdi),%rdi
notodda:
cmp $0,%rdx
jl skiplpa
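# Main aligned loop: four limbs (32 bytes) per iteration with movdqa.
# jnc tests the borrow from 'sub $4,%rdx'; the loop exits once rdx goes
# negative.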
ALIGN(16)
lpa: movdqa (%rsi),%xmm0
sub $4,%rdx
movdqa 16(%rsi),%xmm1
lea 32(%rsi),%rsi
movdqa %xmm0,(%rdi)
lea 32(%rdi),%rdi
movdqa %xmm1,16-32(%rdi)
jnc lpa
skiplpa:
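# Tail: rdx is now -4..-1, i.e. rdx+4 limbs remain.  'cmp $-2,%rdx' drives
# the dispatch: jg for 3 limbs, je for 2, and jnp (parity of rdx+2)
# separates 1 remaining limb (casea1) from none (casea0).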
cmp $-2,%rdx
jg casea3
je casea2
jnp casea0
casea1: mov (%rsi),%rax
mov %rax,(%rdi)
ret
casea3: movdqa (%rsi),%xmm0
mov 16(%rsi),%rax
movdqa %xmm0,(%rdi)
mov %rax,16(%rdi)
casea0: ret
casea2: movdqa (%rsi),%xmm0
movdqa %xmm0,(%rdi)
ret
EPILOGUE()