;; *********************************************************************
;; Intel64 mpn_submul_1 -- Multiply a limb vector with a limb and
;; subtract the result from a second limb vector.
;;
;; Copyright (C) 2006 Jason Worth Martin
;;
;; This program is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 2 of the License, or
;; (at your option) any later version.
;;
;; This program is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with this program; if not, write to the Free Software
;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
;; MA 02110-1301 USA.
;;
;; *********************************************************************
;;
;;
;; CREDITS
;;
;; This code is based largely on Pierrick Gaudry's excellent assembly
;; support for the AMD64 architecture.  (Note that Intel64 and AMD64,
;; while using the same instruction set, have very different
;; microarchitectures.  So, this code performs very poorly on AMD64
;; machines even though it is near-optimal on Intel64.)
;;
;; Roger Golliver works for Intel and provided insightful improvements,
;; particularly in using the "lea" instruction to perform additions
;; and register-to-register moves.
;;
;; Eric Bainville has a brilliant exposition of optimizing arithmetic
;; for AMD64 (http://www.bealto.com).  I adapted many of the ideas he
;; describes to Intel64.
;;
;; Agner Fog is a demigod in the x86 world.  If you are reading assembly
;; code files and you haven't heard of Agner Fog, then take a minute to
;; look over his software optimization manuals (http://www.agner.org/).
;; They are superb.
;;
;; *********************************************************************
;;
;; With a 4-way unroll the code runs at:
;;
;;                  cycles/limb
;;     Hammer:          4.6
;;     Woodcrest:       4.6
;;
;; With increased unrolling, it appears to converge to 4 cycles/limb
;; on Intel Core 2 machines.  I believe that this is optimal; however,
;; it requires such absurd unrolling that it becomes unusable for all
;; but the largest inputs.  A 4-way unroll seems like a good balance
;; to me because then commonly used input sizes (e.g. 1024-bit public
;; keys) still benefit from the speed-up.
;;
;; This is just a check to see if we are in my code testing sandbox
;; or if we are actually in the GMP source tree.
;;
%ifdef __JWM_Test_Code__
%include 'yasm_mac.inc'
%else
%include '../yasm_mac.inc'
%endif

;; *********************************************************************
;; *********************************************************************
;;
;; Here are the important macro parameters for the code:
;;
;; BpL is Bytes per Limb (8 since this is 64-bit code).
;;
;; UNROLL_SIZE is the power of 2 by which we unroll the code.
;;             Possible values are 2,4,8,16,...,256.  A reasonable
;;             value is probably 4.  If really large inputs
;;             are expected, then 16 is probably good.  Larger
;;             values are really only useful for flashy
;;             benchmarks and testing asymptotic behavior.
;;
;; THRESHOLD is the minimum number of limbs needed before we bother
;;           using the complicated loop.  A reasonable value is
;;           2*UNROLL_SIZE + 6.
;;
;; *********************************************************************
;; *********************************************************************
%define BpL         8
%define UNROLL_SIZE 4
%define UNROLL_MASK UNROLL_SIZE-1
%define THRESHOLD   2*UNROLL_SIZE+6

;; Here is a convenient macro for addressing
;; memory.  Entries of the form
;;
;;      ADDR(ptr,index,displacement)
;;
;; get converted to
;;
;;      [displacement*BpL + ptr + index*BpL]
;;
%define ADDR(a,b,c) [c*BpL+a+b*BpL]
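;;
;; For example, ADDR(rp,index,2) expands to
;;
;;      [2*BpL + rp + index*BpL]
;;
;; which addresses the limb two positions past rp[index]
;; (with BpL = 8).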

;; Register  Usage
;; --------  -----
;; rax       low word from mul
;; rbx*
;; rcx       s2limb
;; rdx       high word from mul
;; rsi       s1p
;; rdi       rp
;; rbp*      Base Pointer
;; rsp*      Stack Pointer
;; r8        A_x
;; r9        A_y
;; r10       A_z
;; r11       B_x
;; r12*      B_y
;; r13*      B_z
;; r14*      temp
;; r15*      index
;;
;; * indicates that the register must be
;;   preserved for the caller.
%define s2limb rcx
%define s1p    rsi
%define rp     rdi
%define A_x    r8
%define A_y    r9
%define A_z    r10
%define B_x    r11
%define B_y    r12
%define B_z    r13
%define temp   r14
%define index  r15

;; INPUT PARAMETERS
;; rp      rdi
;; s1p     rsi
;; n       rdx
;; s2limb  rcx
BITS 64

GLOBAL_FUNC mpn_submul_1
;; Compare the limb count with the threshold
;; value.  If the limb count is small we just
;; use the small loop, otherwise we jump to
;; the more complicated loop.
        cmp     rdx, THRESHOLD
        jge     L_mpn_submul_1_main_loop_prep
        mov     r11, rdx
        lea     rsi, [rsi+rdx*8]
        lea     rdi, [rdi+rdx*8]
        neg     r11
        xor     r8, r8
        xor     r10, r10
        jmp     L_mpn_submul_1_small_loop
align 16
L_mpn_submul_1_small_loop:
        mov     rax, [rsi+r11*8]
        mul     rcx
        add     rax, r8
        adc     rdx, r10
        sub     [rdi+r11*8], rax
        mov     r8, r10
        adc     r8, rdx
        inc     r11
        jne     L_mpn_submul_1_small_loop
        mov     rax, r8
        ret

L_mpn_submul_1_main_loop_prep:
        push    r15
        push    r14
        push    r13
        push    r12
;; If n is even, we need to do three
;; pre-multiplies; if n is odd, we only
;; need to do two.
        mov     temp, rdx
        mov     index, 0
        mov     A_x, 0
        mov     A_y, 0
        and     rdx, 1
        jnz     L_mpn_submul_1_odd_n
;; Case n is even
        mov     rax, ADDR(s1p,index,0)
        mul     s2limb
        sub     ADDR(rp,index,0), rax
        adc     A_x, rdx
        add     index, 1
;; At this point
;;      temp  = n (even)
;;      index = 1
L_mpn_submul_1_odd_n:
;; Now
;;      temp  = n
;;      index = 1 if n even
;;            = 0 if n odd
;;
        mov     rax, ADDR(s1p,index,0)
        mul     s2limb
        mov     A_z, ADDR(rp,index,0)
        add     A_x, rax
        adc     A_y, rdx
        mov     rax, ADDR(s1p,index,1)
        mul     s2limb
        mov     B_z, ADDR(rp,index,1)
        mov     B_x, rax
        mov     B_y, rdx
        mov     rax, ADDR(s1p,index,2)
        add     index, 3
        lea     s1p, ADDR(s1p,temp,-1)
        lea     rp, ADDR(rp,temp,-1)
        neg     temp
        add     index, temp
;; At this point:
;;      s1p   = address of last s1limb
;;      rp    = address of last rplimb
;;      temp  = -n
;;      index = 4 - n if n even
;;            = 3 - n if n odd
;;
;; So, index is a (negative) even number.
;;
;; *****************************************
;; ATTENTION:
;;
;; From here on, I will use array indexing
;; notation in the comments because it is
;; convenient.  So, I will pretend that
;; index is positive, because then a
;; comment like
;;      B_z = rp[index-1]
;; is easier to read.
;; However, keep in mind that index is
;; actually a negative number indexing
;; back from the end of the array.
;; This is a common trick to remove one
;; compare operation from the main loop.
;; *****************************************
;;
;; Now we enter a spin-up loop that will
;; make sure that index is a multiple of
;; UNROLL_SIZE before going to our main
;; unrolled loop.
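;;
;; For example, with UNROLL_SIZE = 4 and n = 9 (odd), we arrive here
;; with index = 3 - 9 = -6, so temp = 6 & UNROLL_MASK = 2 and the
;; spin-up loop runs temp/2 = 1 time.  That one pass handles two limbs
;; and leaves index = -4, a multiple of UNROLL_SIZE, as required.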
        mov     temp, index
        neg     temp
        and     temp, UNROLL_MASK
        jz      L_mpn_submul_1_main_loop
        shr     temp, 1
L_mpn_submul_1_main_loop_spin_up:
;; At this point we should have:
;;
;;      A_x = low_mul[index-2] + carry_in
;;      A_y = high_mul[index-2] + CF
;;      A_z = rp[index-2]
;;
;;      B_x = low_mul[index-1]
;;      B_y = high_mul[index-1]
;;      B_z = rp[index-1]
;;
;;      rax = s1p[index]
        mul     s2limb
        sub     A_z, A_x
        lea     A_x, [rax]
        mov     rax, ADDR(s1p,index,1)
        adc     B_x, A_y
        mov     ADDR(rp,index,-2), A_z
        mov     A_z, ADDR(rp,index,0)
        adc     B_y, 0
        lea     A_y, [rdx]
;; At this point we should have:
;;
;;      B_x = low_mul[index-1] + carry_in
;;      B_y = high_mul[index-1] + CF
;;      B_z = rp[index-1]
;;
;;      A_x = low_mul[index]
;;      A_y = high_mul[index]
;;      A_z = rp[index]
;;
;;      rax = s1p[index+1]
        mul     s2limb
        sub     B_z, B_x
        lea     B_x, [rax]
        mov     rax, ADDR(s1p,index,2)
        adc     A_x, B_y
        mov     ADDR(rp,index,-1), B_z
        mov     B_z, ADDR(rp,index,1)
        adc     A_y, 0
        lea     B_y, [rdx]
        add     index, 2
        sub     temp, 1
        jnz     L_mpn_submul_1_main_loop_spin_up
        jmp     L_mpn_submul_1_main_loop
align 16
L_mpn_submul_1_main_loop:
;; The code here is really the same logic as
;; the spin-up loop.  It's just been unrolled.
%assign unroll_index 0
%rep UNROLL_SIZE/2
        mul     s2limb
        sub     A_z, A_x
        lea     A_x, [rax]
        mov     rax, ADDR(s1p,index,(2*unroll_index+1))
        adc     B_x, A_y
        mov     ADDR(rp,index,(2*unroll_index-2)), A_z
        mov     A_z, ADDR(rp,index,(2*unroll_index))
        adc     B_y, 0
        lea     A_y, [rdx]

        mul     s2limb
        sub     B_z, B_x
        lea     B_x, [rax]
        mov     rax, ADDR(s1p,index,(2*unroll_index+2))
        adc     A_x, B_y
        mov     ADDR(rp,index,(2*unroll_index-1)), B_z
        mov     B_z, ADDR(rp,index,(2*unroll_index+1))
        adc     A_y, 0
        lea     B_y, [rdx]
%assign unroll_index unroll_index+1
%endrep
        add     index, UNROLL_SIZE
        jnz     L_mpn_submul_1_main_loop

L_mpn_submul_1_finish:
;; At this point we should have:
;;
;;      index = n-1
;;
;;      A_x = low_mul[index-2] + carry_in
;;      A_y = high_mul[index-2] + CF
;;      A_z = rp[index-2]
;;
;;      B_x = low_mul[index-1]
;;      B_y = high_mul[index-1]
;;      B_z = rp[index-1]
;;
;;      rax = s1p[index]
        mul     s2limb
        sub     A_z, A_x
        mov     A_x, rax
        mov     ADDR(rp,index,-2), A_z
        mov     A_z, ADDR(rp,index,0)
        adc     B_x, A_y
        adc     B_y, 0
        mov     A_y, rdx
;; At this point we should have:
;;
;;      index = n-1
;;
;;      B_x = low_mul[index-1] + carry_in
;;      B_y = high_mul[index-1] + CF
;;      B_z = rp[index-1]
;;
;;      A_x = low_mul[index]
;;      A_y = high_mul[index]
;;      A_z = rp[index]
        sub     B_z, B_x
        mov     ADDR(rp,index,-1), B_z
        adc     A_x, B_y
        adc     A_y, 0
;; At this point we should have:
;;
;;      index = n-1
;;
;;      A_x = low_mul[index] + carry_in
;;      A_y = high_mul[index] + CF
;;      A_z = rp[index]
;;
        sub     A_z, A_x
        mov     ADDR(rp,index,0), A_z
        adc     A_y, 0
        mov     rax, A_y
        pop     r12
        pop     r13
        pop     r14
        pop     r15
        ret
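
;; *********************************************************************
;; For reference, here is a rough C sketch of what this routine
;; computes: {rp,n} -= {s1p,n} * s2limb, returning the borrow out
;; (the high limb of the product plus propagated borrows).  It is
;; illustrative only and not part of the build; the name ref_submul_1,
;; the use of uint64_t from <stdint.h>, and the reliance on a compiler
;; providing unsigned __int128 are my own choices here, not anything
;; the surrounding sources mandate.
;;
;;      uint64_t ref_submul_1(uint64_t *rp, const uint64_t *s1p,
;;                            long n, uint64_t s2limb)
;;      {
;;          uint64_t carry = 0;
;;          for (long i = 0; i < n; i++) {
;;              /* full 128-bit product of one limb by s2limb */
;;              unsigned __int128 p = (unsigned __int128)s1p[i] * s2limb;
;;              uint64_t lo = (uint64_t)p + carry;
;;              uint64_t hi = (uint64_t)(p >> 64) + (lo < carry);
;;              uint64_t r  = rp[i];
;;              rp[i] = r - lo;
;;              carry = hi + (r < lo);   /* borrow out of the sub */
;;          }
;;          return carry;
;;      }
;; *********************************************************************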