mpir/mpn/x86_64/atom/addlsh_n.asm

dnl  AMD64 mpn_addlsh_n

dnl  Copyright 2009 Jason Moxham

dnl  This file is part of the MPIR Library.

dnl  The MPIR Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 2.1 of the License, or (at
dnl  your option) any later version.

dnl  The MPIR Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the MPIR Library; see the file COPYING.LIB.  If not, write
dnl  to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
dnl  Boston, MA 02110-1301, USA.

include(`../config.m4')

C    carry + (xp,n) = (yp,n) + ((zp,n) << c), with carry in ci
C mp_limb_t	mpn_addlsh_nc(mp_ptr xp, mp_srcptr yp, mp_srcptr zp, mp_size_t n, unsigned int c, mp_limb_t ci)
C xp in rdi	yp in rsi	zp in rdx	n in rcx	c in r8	ci in r9

MULFUNC_PROLOGUE(mpn_addlsh_n mpn_addlsh_nc)

ASM_START()
PROLOGUE(mpn_addlsh_n)
C Entry point that takes no carry-in: clear r9, the register in which
C mpn_addlsh_nc receives ci.  There is no ret or jmp in this stub, so
C execution runs on into the code that follows it, which in this file is
C mpn_addlsh_nc.
C NOTE(review): this relies on PROLOGUE and EPILOGUE emitting nothing
C non-executable between the two entry points -- confirm in config.m4.
	xorl	%r9d,%r9d
EPILOGUE()
PROLOGUE(mpn_addlsh_nc)
C Register roles in the code below:
C   rdi, rsi, rdx   xp, yp, zp, each advanced by 8*n bytes so that the limbs
C                   are reached with a negative index
C   r10             limb index, counts from -n up to 0
C   rcx             shift count taken from cl; negated back and forth so it
C                   is c for the left shifts and -c for the right shifts
C   r9              bits shifted out of the previous zp limb, OR-ed into the
C                   bottom of the next shifted limb; seeded from ci
C   rax             0 or -1: the carry flag of the previous add, kept by sbb
C   r8, r11, rbx, rbp, r12-r15   temporaries
C Returns in rax the carry of the last add plus the final contents of r9.
	movq	%rcx,%r10
	leaq	(%rdi,%r10,8),%rdi
	leaq	(%rsi,%r10,8),%rsi
	leaq	(%rdx,%r10,8),%rdx
	movq	%r8,%rcx
C Seed r9 by shifting ci right by -c.  A 64-bit shift uses only the low six
C bits of cl, so -c behaves as 64-c.
C NOTE(review): presumably c is in 1..63 -- confirm against the callers.
	negq	%rcx
	shrq	%cl,%r9
	negq	%r10
	xorl	%eax,%eax
C Skip the single-limb loop when the index is already a multiple of 4.
	testq	$3,%r10
	jz	unrolled_entry
C One limb per pass until the index becomes a multiple of 4.
peel_limb:
	movq	(%rdx,%r10,8),%r8
	movq	%r8,%r11
C cl = c: low part of the shifted limb.
	negq	%rcx
	shlq	%cl,%r8
C cl = -c: bits that spill into the next limb.
	negq	%rcx
	shrq	%cl,%r11
	orq	%r9,%r8
	movq	%r11,%r9
C rax is 0 or -1, so adding 1 sets CF exactly when the saved carry was set.
	addq	$1,%rax
	adcq	(%rsi,%r10,8),%r8
C Save the new carry as 0 or -1 before the flags are overwritten.
	sbbq	%rax,%rax
	movq	%r8,(%rdi,%r10,8)
	incq	%r10
	testq	$3,%r10
	jnz	peel_limb
unrolled_entry:
C Nothing left to do when the index has reached zero.
	testq	%r10,%r10
	jz	finish
C The four-limb loop uses rbx, rbp and r12-r15 as temporaries; save them.
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	ALIGN(16)
C Four limbs per pass.
unrolled_loop:
	movq	(%rdx,%r10,8),%r8
	movq	8(%rdx,%r10,8),%rbp
	movq	16(%rdx,%r10,8),%rbx
	movq	24(%rdx,%r10,8),%r12
C Copies of the four limbs, used for the bits that spill upwards.
	movq	%r8,%r11
	movq	%rbp,%r13
	movq	%rbx,%r14
	movq	%r12,%r15
C cl = c: low parts.
	negq	%rcx
	shlq	%cl,%r8
	shlq	%cl,%rbp
	shlq	%cl,%rbx
	shlq	%cl,%r12
C cl = -c: spilled bits.
	negq	%rcx
	shrq	%cl,%r11
	shrq	%cl,%r13
	shrq	%cl,%r14
	shrq	%cl,%r15
C Each shifted limb takes the spilled bits of the limb below it; r9 supplies
C them for the first limb and receives those of the fourth.
	orq	%r9,%r8
	orq	%r11,%rbp
	orq	%r13,%rbx
	orq	%r14,%r12
	movq	%r15,%r9
C Restore the saved carry into CF, add the four yp limbs, save the carry.
	addq	$1,%rax
	adcq	(%rsi,%r10,8),%r8
	adcq	8(%rsi,%r10,8),%rbp
	adcq	16(%rsi,%r10,8),%rbx
	adcq	24(%rsi,%r10,8),%r12
	sbbq	%rax,%rax
	movq	%r8,(%rdi,%r10,8)
	movq	%rbp,8(%rdi,%r10,8)
	movq	%rbx,16(%rdi,%r10,8)
	movq	%r12,24(%rdi,%r10,8)
C The add sets ZF when the index reaches zero, which ends the loop.
	addq	$4,%r10
	jnz	unrolled_loop
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbp
	popq	%rbx
finish:
C Turn the saved 0 or -1 into a carry of 0 or 1, then add the bits left in r9.
	negq	%rax
	addq	%r9,%rax
	ret
EPILOGUE()
Select best asm functions from existing for Atom cpu 64bit 2009-09-06 08:49:19 -04:00
			`dnl AMD64 mpn_addlsh_n`

			`dnl Copyright 2009 Jason Moxham`

			`dnl This file is part of the MPIR Library.`

			`dnl The MPIR Library is free software; you can redistribute it and/or modify`
			`dnl it under the terms of the GNU Lesser General Public License as published`
			`dnl by the Free Software Foundation; either version 2.1 of the License, or (at`
			`dnl your option) any later version.`

			`dnl The MPIR Library is distributed in the hope that it will be useful, but`
			`dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY`
			`dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public`
			`dnl License for more details.`

			`dnl You should have received a copy of the GNU Lesser General Public License`
			`dnl along with the MPIR Library; see the file COPYING.LIB. If not, write`
			`dnl to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,`
			`dnl Boston, MA 02110-1301, USA.`

			include(`../config.m4')

			`C carry+(xp,n)= (yp,n) + (zp,n)<<c with carry in ci`
			`C mp_limb_t mpn_addlsh_nc(mp_ptr xp, mp_srcptr yp,mp_srcptr zp,mp_size_t n,unsigned int c,mp_limb_t ci)`
			`C xp in rdi yp in rsi zp in rdx n in rcx c in r8 ci in r9`

			`MULFUNC_PROLOGUE(mpn_addlsh_n mpn_addlsh_nc)`

			`ASM_START()`
			`PROLOGUE(mpn_addlsh_n)`
			`xor %r9,%r9`
			`EPILOGUE()`
			`PROLOGUE(mpn_addlsh_nc)`
			`mov %rcx,%r10`
			`lea (%rdi,%r10,8),%rdi`
			`lea (%rsi,%r10,8),%rsi`
			`lea (%rdx,%r10,8),%rdx`
			`mov %r8,%rcx`
			`neg %rcx`
			`shr %cl,%r9`
			`neg %r10`
			`xor %rax,%rax`
			`test $3,%r10`
			`jz next`
			`lp:`
			`mov (%rdx,%r10,8),%r8`
			`mov %r8,%r11`
			`neg %rcx`
			`shl %cl,%r8`
			`neg %rcx`
			`shr %cl,%r11`
			`or %r9,%r8`
			`mov %r11,%r9`
			`add $1,%rax`
			`adc (%rsi,%r10,8),%r8`
			`sbb %rax,%rax`
			`mov %r8,(%rdi,%r10,8)`
			`inc %r10`
			`test $3,%r10`
			`jnz lp`
			`next:`
			`cmp $0,%r10`
			`jz end`
			`push %rbx`
			`push %rbp`
			`push %r12`
			`push %r13`
			`push %r14`
			`push %r15`
			`ALIGN(16)`
			`loop:`
			`mov (%rdx,%r10,8),%r8`
			`mov 8(%rdx,%r10,8),%rbp`
			`mov 16(%rdx,%r10,8),%rbx`
			`mov 24(%rdx,%r10,8),%r12`
			`mov %r8,%r11`
			`mov %rbp,%r13`
			`mov %rbx,%r14`
			`mov %r12,%r15`
			`neg %rcx`
			`shl %cl,%r8`
			`shl %cl,%rbp`
			`shl %cl,%rbx`
			`shl %cl,%r12`
			`neg %rcx`
			`shr %cl,%r11`
			`shr %cl,%r13`
			`shr %cl,%r14`
			`shr %cl,%r15`
			`or %r9,%r8`
			`or %r11,%rbp`
			`or %r13,%rbx`
			`or %r14,%r12`
			`mov %r15,%r9`
			`add $1,%rax`
			`adc (%rsi,%r10,8),%r8`
			`adc 8(%rsi,%r10,8),%rbp`
			`adc 16(%rsi,%r10,8),%rbx`
			`adc 24(%rsi,%r10,8),%r12`
			`sbb %rax,%rax`
			`mov %r8,(%rdi,%r10,8)`
			`mov %rbp,8(%rdi,%r10,8)`
			`mov %rbx,16(%rdi,%r10,8)`
			`mov %r12,24(%rdi,%r10,8)`
			`add $4,%r10`
			`jnz loop`
			`pop %r15`
			`pop %r14`
			`pop %r13`
			`pop %r12`
			`pop %rbp`
			`pop %rbx`
			`end:`
			`neg %rax`
			`add %r9,%rax`
			`ret`
			`EPILOGUE()`