From 30e7a89da6eb85fb46a161bc5f2b51a369070f84 Mon Sep 17 00:00:00 2001
From: jasonmoxham
Date: Fri, 22 Jul 2011 21:47:35 +0000
Subject: [PATCH] New mpn_sumdiff_n for nehalem

---
 mpn/x86_64/nehalem/sumdiff_n.asm | 161 +++++++++++++++++++++++++++++++
 1 file changed, 161 insertions(+)
 create mode 100644 mpn/x86_64/nehalem/sumdiff_n.asm

diff --git a/mpn/x86_64/nehalem/sumdiff_n.asm b/mpn/x86_64/nehalem/sumdiff_n.asm
new file mode 100644
index 00000000..7e719cc2
--- /dev/null
+++ b/mpn/x86_64/nehalem/sumdiff_n.asm
@@ -0,0 +1,161 @@
+dnl mpn_sumdiff
+
+dnl Copyright 2011 The Code Cavern
+
+dnl This file is part of the MPIR Library.
+
+dnl The MPIR Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The MPIR Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the MPIR Library; see the file COPYING.LIB. If not, write
+dnl to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+dnl Boston, MA 02110-1301, USA.
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_sumdiff_n)
+push %rbx
+xor %r9,%r9
+push %rbp
+xor %rax,%rax
+push %r12
+mov $3,%r10d
+push %r13
+lea -24(%rdi,%r8,8),%rdi
+push %r14
+lea -24(%rsi,%r8,8),%rsi
+push %r15
+sub %r8,%r10
+lea -24(%rdx,%r8,8),%rdx
+lea -24(%rcx,%r8,8),%rcx
+jnc skiplp
+.align 16
+lp: sahf
+ mov (%rdx,%r10,8),%r8
+ mov 24(%rdx,%r10,8),%r12
+ mov %r8,%r11
+ adc (%rcx,%r10,8),%r8
+ mov 8(%rdx,%r10,8),%rbx
+ mov %rbx,%r13
+ adc 8(%rcx,%r10,8),%rbx
+ mov 16(%rdx,%r10,8),%rbp
+ mov %rbp,%r14
+ adc 16(%rcx,%r10,8),%rbp
+ mov %r12,%r15
+ adc 24(%rcx,%r10,8),%r12
+ lahf
+ add $255,%r9b
+ sbb (%rcx,%r10,8),%r11
+ mov %r11,(%rsi,%r10,8)
+ sbb 8(%rcx,%r10,8),%r13
+ sbb 16(%rcx,%r10,8),%r14
+ sbb 24(%rcx,%r10,8),%r15
+ setc %r9b
+ add $4,%r10
+ mov %r8,-32(%rdi,%r10,8)
+ mov %rbp,16-32(%rdi,%r10,8)
+ mov %r13,8-32(%rsi,%r10,8)
+ mov %r15,24-32(%rsi,%r10,8)
+ mov %r12,24-32(%rdi,%r10,8)
+ mov %r14,16-32(%rsi,%r10,8)
+ jnc lp
+skiplp:
+cmp $2,%r10
+jg case0
+je case1
+jp case2
+case3: sahf
+ mov (%rdx),%r8
+ mov %r8,%r11
+ adc (%rcx),%r8
+ mov 8(%rdx),%rbx
+ mov %rbx,%r13
+ adc 8(%rcx),%rbx
+ mov 16(%rdx),%rbp
+ mov %rbp,%r14
+ adc 16(%rcx),%rbp
+ lahf
+ add $255,%r9b
+ sbb (%rcx),%r11
+ mov %r11,(%rsi)
+ sbb 8(%rcx),%r13
+ sbb 16(%rcx),%r14
+ setc %r9b
+ mov %r8,(%rdi)
+ mov %rbp,16(%rdi)
+ mov %r13,8(%rsi)
+ mov %r14,16(%rsi)
+ mov %rbx,8(%rdi)
+ sahf
+ mov $0,%rax
+ adc $0,%rax
+ add $255,%r9b
+ rcl $1,%rax
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ ret
+case2: sahf
+ mov 8(%rdx),%r8
+ mov %r8,%r11
+ adc 8(%rcx),%r8
+ mov 16(%rdx),%rbx
+ mov %rbx,%r13
+ adc 16(%rcx),%rbx
+ lahf
+ add $255,%r9b
+ sbb 8(%rcx),%r11
+ mov %r11,8(%rsi)
+ sbb 16(%rcx),%r13
+ setc %r9b
+ mov %r8,8(%rdi)
+ mov %r13,16(%rsi)
+ mov %rbx,16(%rdi)
+ sahf
+ mov $0,%rax
+ adc $0,%rax
+ add $255,%r9b
+ rcl $1,%rax
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ ret
+case1: sahf
+ mov 16(%rdx),%r8
+ mov %r8,%r11
+ adc 16(%rcx),%r8
+ lahf
+ add $255,%r9b
+ sbb 16(%rcx),%r11
+ mov %r11,16(%rsi)
+ setc %r9b
+ mov %r8,16(%rdi)
+case0: sahf
+ mov $0,%rax
+ adc $0,%rax
+ add $255,%r9b
+ rcl $1,%rax
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ ret
+EPILOGUE()
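
Note (not part of the patch): the routine runs the addition and subtraction chains in one four-limb unrolled loop by parking the add carry in AH with lahf/sahf and the sub borrow in %r9b with setc / add $255; the final rcl $1,%rax packs them into the return value as 2*carry + borrow. Below is a plain C sketch of the operation the assembly computes, useful as a reference model when testing; the name ref_sumdiff_n and the limb_t typedef are illustrative only and do not appear in MPIR.

/* Reference model of the operation implemented above:
   {sp,n} = {xp,n} + {yp,n},  {dp,n} = {xp,n} - {yp,n},
   return value = 2*carry_out_of_add + borrow_out_of_sub.
   Illustrative sketch only; not part of the patch. */
#include <stdint.h>
#include <stddef.h>

typedef uint64_t limb_t;

limb_t ref_sumdiff_n(limb_t *sp, limb_t *dp,
                     const limb_t *xp, const limb_t *yp, size_t n)
{
    limb_t cy = 0, bw = 0;
    for (size_t i = 0; i < n; i++) {
        limb_t x = xp[i], y = yp[i];

        /* sum limb with carry propagation */
        limb_t s  = x + y;
        limb_t c1 = s < x;            /* carry out of x + y          */
        limb_t s2 = s + cy;
        cy = c1 | (s2 < s);           /* carry out of this limb      */
        sp[i] = s2;

        /* difference limb with borrow propagation */
        limb_t d  = x - y;
        limb_t b1 = x < y;            /* borrow out of x - y         */
        limb_t d2 = d - bw;
        bw = b1 | (d < bw);           /* borrow out of this limb     */
        dp[i] = d2;
    }
    return 2 * cy + bw;               /* matches the final rcl $1,%rax */
}

A test harness would compare a candidate mpn_sumdiff_n against this model on random operands, making sure to cover n mod 4 = 1, 2, 3 so that the case1/case2/case3 tails above are exercised as well as the main loop.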