dnl  mpir/mpn/x86_64/core2/sumdiff_n.asm

dnl  core2 mpn_sumdiff_n

dnl  Copyright 2009 Jason Moxham

dnl  This file is part of the MPIR Library.

dnl  The MPIR Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 2.1 of the License, or (at
dnl  your option) any later version.

dnl  The MPIR Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the MPIR Library; see the file COPYING.LIB.  If not, write
dnl  to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
dnl  Boston, MA 02110-1301, USA.

include(`../config.m4')

C	(rdi,r8)=(rdx,r8)+(rcx,r8)  (rsi,r8)=(rdx,r8)-(rcx,r8)
C	return 2*add_carry+sub_borrow

C  mpn_sumdiff_n -- form the sum and the difference of two operands of r8
C  64-bit limbs in a single pass over the inputs.
C
C	(rdi,r8) = (rdx,r8) + (rcx,r8)
C	(rsi,r8) = (rdx,r8) - (rcx,r8)
C	returns rax = 2*add_carry + sub_borrow
C
C  Syntax: AT&T operand order, preprocessed by m4.
C  ABI assumptions: arguments arrive in rdi, rsi, rdx, rcx, r8, and the 48
C  bytes just below rsp may be written without moving rsp (red zone).
C
C  Register use:
C	rdi	sum destination, rebased to the end of the operand
C	rsi	difference destination, rebased likewise
C	rdx	first source, rebased likewise
C	rcx	second source, rebased likewise
C	r8	negated limb count, used as an index that counts up to zero
C	r9	saved add carry, always 0 or -1
C	r10	saved sub borrow, always 0 or -1
C	rax, r11		scratch in both loops
C	rbx, rbp, r12-r15	scratch in the unrolled loop, saved and restored
C  Clobbers: rax, rcx, rdx, rsi, rdi, r8, r9, r10, r11 and the flags.

ASM_START()
PROLOGUE(mpn_sumdiff_n)
C Version 1.0.4

	C Point every operand at its end and index with a negative count, so
	C that r8 serves as both array index and loop counter.
	lea	(%rsi,%r8,8),%rsi
	lea	(%rdx,%r8,8),%rdx
	lea	(%rdi,%r8,8),%rdi
	lea	(%rcx,%r8,8),%rcx
	neg	%r8
	xor	%r9,%r9			C no add carry yet
	xor	%r10,%r10		C no sub borrow yet
	test	$3,%r8
	jz	L(next)			C count is already a multiple of 4

C  Tail loop: one limb per pass until the remaining count is a multiple of 4.
C  The add and sub chains share the flags register, so each chain parks its
C  carry in a register as 0 or -1 between uses.
L(lp1):
	mov	(%rdx,%r8,8),%rax
	mov	%rax,%r11
	add	$1,%r9			C CF = saved add carry (-1 + 1 wraps)
	adc	(%rcx,%r8,8),%rax
	sbb	%r9,%r9			C r9 = -CF, add carry parked again
	add	$1,%r10			C CF = saved sub borrow
	sbb	(%rcx,%r8,8),%r11
	sbb	%r10,%r10		C r10 = -CF, sub borrow parked again
	mov	%rax,(%rdi,%r8,8)
	mov	%r11,(%rsi,%r8,8)
	C The original kept an inc of r8 commented out here in favour of add;
	C presumably a tuning choice, since both carries are already parked.
	add	$1,%r8
	test	$3,%r8
	jnz	L(lp1)

L(next):
	test	%r8,%r8
	jz	L(skiploop)		C nothing left for the unrolled loop

	C Spill the callee-saved registers used by the unrolled loop.  They go
	C below rsp without adjusting it (red zone); this function makes no
	C calls and never pushes, so nothing here overwrites them.
	C Switching to push/pop would allow moving this to the prologue and
	C epilogue.
	mov	%rbx,-8(%rsp)
	mov	%rbp,-16(%rsp)
	mov	%r12,-24(%rsp)
	mov	%r13,-32(%rsp)
	mov	%r14,-40(%rsp)
	mov	%r15,-48(%rsp)

C  Main loop: four limbs per pass.  Each source limb from rdx is loaded once
C  and copied, one copy feeding the adc chain and the other the sbb chain.
	ALIGN(16)
L(loop):
	mov	(%rdx,%r8,8),%rax
	mov	8(%rdx,%r8,8),%rbx
	mov	16(%rdx,%r8,8),%rbp
	mov	24(%rdx,%r8,8),%r12
	mov	%rax,%r11
	mov	%rbx,%r13
	mov	%rbp,%r14
	mov	%r12,%r15
	add	$1,%r9			C CF = saved add carry
	adc	(%rcx,%r8,8),%rax
	adc	8(%rcx,%r8,8),%rbx
	adc	16(%rcx,%r8,8),%rbp
	adc	24(%rcx,%r8,8),%r12
	sbb	%r9,%r9			C park add carry as 0 or -1
	add	$1,%r10			C CF = saved sub borrow
	sbb	(%rcx,%r8,8),%r11
	sbb	8(%rcx,%r8,8),%r13
	sbb	16(%rcx,%r8,8),%r14
	sbb	24(%rcx,%r8,8),%r15
	sbb	%r10,%r10		C park sub borrow as 0 or -1
	mov	%rax,(%rdi,%r8,8)
	mov	%rbx,8(%rdi,%r8,8)
	mov	%rbp,16(%rdi,%r8,8)
	mov	%r12,24(%rdi,%r8,8)
	mov	%r11,(%rsi,%r8,8)
	mov	%r13,8(%rsi,%r8,8)
	mov	%r14,16(%rsi,%r8,8)
	mov	%r15,24(%rsi,%r8,8)
	add	$4,%r8
	jnz	L(loop)

	C Reload the callee-saved registers spilled above.
	mov	-8(%rsp),%rbx
	mov	-16(%rsp),%rbp
	mov	-24(%rsp),%r12
	mov	-32(%rsp),%r13
	mov	-40(%rsp),%r14
	mov	-48(%rsp),%r15

L(skiploop):
	C r9 and r10 are each 0 or -1, so r10 + 2*r9 = -(2*carry + borrow);
	C negating gives the documented return value.
	lea	(%r10,%r9,2),%rax
	neg	%rax
	ret
EPILOGUE()