dnl  x86_64 mpn_add_n -- Add two limb vectors of the same length > 0 and store
dnl  sum in a third limb vector.

dnl  Copyright 2004 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 2.1 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library; see the file COPYING.LIB.  If not, write
dnl  to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
dnl  Boston, MA 02110-1301, USA.

dnl **********************************************************************
dnl Actually, this file was created by Jason Martin <martin@math.jmu.edu>
dnl in an attempt to get GMP to run well on the Woodcrest CPU (aka Xeon).
dnl The GMP developers do not maintain this code and should not be
dnl bother with questions about it.  If you find errors in it, please
dnl let me know!
dnl **********************************************************************

dnl
dnl This is just a check to see if we are in my code testing sandbox
dnl or if we are actually in the GMP source tree
dnl
ifdef(`__JWM_Test_Code__',`
include(`./config.m4')
define(`MPN_PREFIX',`jwm_mpn_')',`
include(`../config.m4')')

dnl
dnl This is just a little test to see if the lahf and sahf
dnl instructions are available.  These instructions allow
dnl us to quickly save the EFLAGS register into AH and
dnl restore the EFLAGS from AH.  However, the earliest 64 bit
dnl CPUs didn't support this function, so the GNU assembler
dnl doesn't allow the lahf and sahf operands on 64 bit machines.
dnl To get around this, we check to see if the instructions are
dnl available.  If they are, then we use hand assembled bytes.
dnl If they aren't available then we fall back to using the
dnl setc and bt instructions which are slightly slower.
dnl
ifdef(`__JWM_Test_Code__',`
define(`LAHF_SAHF_AVAIL',esyscmd(`./lahf_sahf_test.sh'))',`
define(`LAHF_SAHF_AVAIL',esyscmd(`x86_64/lahf_sahf_test.sh'))')
ifelse(LAHF_SAHF_AVAIL,`Yes',`
define(`save_CF_to_reg_a',`.byte	0x9f')
define(`get_CF_from_reg_a',`.byte	0x9e')',`
define(`save_CF_to_reg_a',`setc	%al')
define(`get_CF_from_reg_a',`bt	`$'0x0,%rax')')


C         cycles/limb
C Hammer:     2.5 (for 1024 limbs)
C Woodcrest:  2.6 (for 1024 limbs)	

C INPUT PARAMETERS
C rp	rdi
C up	rsi
C vp	rdx
C n	rcx

ASM_START()
PROLOGUE(mpn_add_n)
	pushq	%rbp		C Save off callee-save registers
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	xor	%r15,%r15		C r15 will be our index, so
					C I'll call it i here after
	save_CF_to_reg_a		C Save CF

	mov	%rcx,%r9
	sub	$4,%r9			C r9 = n-(i+4)

	ALIGN(4)			C aligning for loop
L_mpn_add_n_main_loop:
	C The goal of our main unrolled loop is to keep all the
	C execution units as busy as possible.  Since
	C there are three ALUs, we try to perform three
	C adds at a time.  Of course, we will have the
	C carry dependency, so there is at least one
	C clock cycle between each adc.  However, we'll
	C try to keep the other execution units busy
	C with loads and stores at the same time so that
	C our net throughput is close to one add per clock
	C cycle.  Hopefully this function will have asymptotic
	C behavior of taking 3*n clock cycles where n is the
	C number of limbs to add.
	C
	C Note that I'm using FOUR adds at a time, this is just
	C because I wanted to use up all available registers since
	C I'm hoping the out-of-order and loop-pipeline logic in
	C the Xeon will help us out.

	C See if we are still looping
	jle	L_mpn_add_n_loop_done

	get_CF_from_reg_a		C recover CF
	
	C Load inputs into rbx and r8
	C add with carry, and put result in r8
	C then store r8 to output.
	movq	(%rsi,%r15,8),%rbx
	movq	(%rdx,%r15,8),%r8
	adc	%rbx,%r8
	movq	%r8,(%rdi,%r15,8)
	
	C Load inputs into r9 and r10
	C add with carry, and put result in r10
	C then store r10 to output.
	movq	8(%rsi,%r15,8),%r9
	movq	8(%rdx,%r15,8),%r10
	adc	%r9,%r10
	movq	%r10,8(%rdi,%r15,8)
	
	C Load inputs into r11 and r12
	C add with carry, and put result in r12
	C then store r12 to output.
	movq	16(%rsi,%r15,8),%r11
	movq	16(%rdx,%r15,8),%r12
	adc	%r11,%r12
	movq	%r12,16(%rdi,%r15,8)

	C Load inputs into r13 and r14
	C add with carry, and put result in r14
	C then store r14 to output.
	movq	24(%rsi,%r15,8),%r13
	movq	24(%rdx,%r15,8),%r14
	adc	%r13,%r14
	movq	%r14,24(%rdi,%r15,8)

	save_CF_to_reg_a		C save CF

	mov	%r15,%r10
	add	$8,%r10
	add	$4,%r15		C increment by 4.
	
	mov	%rcx,%r9
	sub	%r10,%r9	C r9 = n-(i+4)
	jmp	L_mpn_add_n_main_loop

L_mpn_add_n_loop_done:
	mov	%rcx,%r15	C 
	sub	%r9,%r15	C r15 = n-(n-(i+4))=i+4
	sub	$4,%r15		C r15 = i
	cmp	%rcx,%r15
L_mpn_add_n_post_loop:
	je	L_mpn_add_n_exit
	get_CF_from_reg_a		C recover CF
	
	C Load inputs into rbx and r8
	C add with carry, and put result in r8
	C then store r8 to output.
	movq	(%rsi,%r15,8),%rbx
	movq	(%rdx,%r15,8),%r8
	adc	%rbx,%r8
	movq	%r8,(%rdi,%r15,8)
	save_CF_to_reg_a		C save CF
	add	$1,%r15
	cmp	%rcx,%r15
	jmp	L_mpn_add_n_post_loop
	
	
L_mpn_add_n_exit:
	xor	%rcx,%rcx
	get_CF_from_reg_a	C recover the CF
	mov	%rcx,%rax	C Clears rax without affecting carry flag 
	adc	%rax,%rax	C returns carry status.
	
	popq	%r15		C restore callee-save registers
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	popq	%rbp
	ret

EPILOGUE()