;; *********************************************************************
;; Intel64 mpn_addmul_1 -- Multiply a limb vector with a limb and
;; add the result to a second limb vector.
;;
;; Copyright (C) 2006 Jason Worth Martin <jason.worth.martin@gmail.com>
;;
;; This program is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 2 of the License, or
;; (at your option) any later version.
;;
;; This program is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License along
;; with this program; if not, write to the Free Software Foundation, Inc.,
;; 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
;;
;; **************************************************************************
;;
;;
;; CREDITS
;;
;; This code is based largely on Pierrick Gaudry's excellent assembly
;; support for the AMD64 architecture.  (Note that Intel64 and AMD64,
;; while using the same instruction set, have very different
;; microarchitectures.  So, this code performs very poorly on AMD64
;; machines even though it is near-optimal on Intel64.)
;;
;; Roger Golliver works for Intel and provided insightful improvements,
;; particularly in using the "lea" instruction to perform additions
;; and register-to-register moves.
;;
;; Eric Bainville has a brilliant exposition of optimizing arithmetic for
;; AMD64 (http://www.bealto.it).  I adapted many of the ideas he
;; describes to Intel64.
;;
;; Agner Fog is a demigod in the x86 world.  If you are reading assembly
;; code files and you haven't heard of Agner Fog, then take a minute to
;; look over his software optimization manuals (http://www.agner.org/).
;; They are superb.
;;
;; *********************************************************************


;; With a 4-way unroll the code has
;;
;;                   cycles/limb
;;   Hammer:            4.8
;;   Woodcrest:         4.6
;;
;; With increased unrolling, it appears to converge to 4 cycles/limb
;; on Intel Core 2 machines.  I believe that this is optimal; however,
;; it requires such absurd unrolling that it becomes unusable for all
;; but the largest inputs.  A 4-way unroll seems like a good balance
;; to me because then commonly used input sizes (e.g. 1024-bit public
;; keys) still benefit from the speed-up.
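;;
;; (For scale: a 1024-bit operand is 1024/64 = 16 limbs, so at
;; roughly 4.6 cycles/limb one pass of mpn_addmul_1 over such an
;; operand costs on the order of 74 cycles.)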

;;
;; This is just a check to see if we are in my code testing sandbox
;; or if we are actually in the GMP source tree.
;;
%ifdef __JWM_Test_Code__
%include 'yasm_mac.inc'
%else
%include '../yasm_mac.inc'
%endif


;; *********************************************************************
;; *********************************************************************
;;
;; Here are the important macro parameters for the code
;;
;; BpL is Bytes per Limb (8 since this is 64bit code)
;;
;; UNROLL_SIZE is a power of 2 for which we unroll the code.
;;      Possible values are 2,4,8,16,...,256.  A reasonable
;;      value is probably 4.  If really large inputs
;;      are expected, then 16 is probably good.  Larger
;;      values are really only useful for flashy
;;      benchmarks and testing asymptotic behavior.
;;
;; THRESHOLD is the minimum number of limbs needed before we bother
;;      using the complicated loop.  A reasonable value is
;;      2*UNROLL_SIZE + 6
;;
;; *********************************************************************
;; *********************************************************************
%define BpL 8
%define UNROLL_SIZE 4
%define UNROLL_MASK UNROLL_SIZE-1
%define THRESHOLD 2*UNROLL_SIZE+6
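;; (With the values above, UNROLL_MASK resolves to 3 and THRESHOLD
;; to 2*4+6 = 14 limbs.)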

;; Here is a convenient Macro for addressing
;; memory.  Entries of the form
;;
;; ADDR(ptr,index,displacement)
;;
;; get converted to
;;
;; [displacement*BpL + ptr + index*BpL]
;;
%define ADDR(a,b,c) [c*BpL+a+b*BpL]
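;; For example, ADDR(rp,index,-2) expands to [-2*BpL+rp+index*BpL],
;; i.e. with BpL = 8 it addresses the limb rp[index-2].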


;; Register  Usage
;; --------  -----
;; rax       low word from mul
;; rbx*
;; rcx       s2limb
;; rdx       high word from mul
;; rsi       s1p
;; rdi       rp
;; rbp*      Base Pointer
;; rsp*      Stack Pointer
;; r8        A_x
;; r9        A_y
;; r10       A_z
;; r11       B_x
;; r12*      B_y
;; r13*      B_z
;; r14*      temp
;; r15*      index
;;
;; * indicates that the register must be
;; preserved for the caller.
%define s2limb rcx
%define s1p rsi
%define rp rdi
%define A_x r8
%define A_y r9
%define A_z r10
%define B_x r11
%define B_y r12
%define B_z r13
%define temp r14
%define index r15


;; INPUT PARAMETERS
;; rp      rdi
;; s1p     rsi
;; n       rdx
;; s2limb  rcx
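;;
;; In C terms, the function computes the following (a sketch of the
;; semantics only, not part of the build; it assumes 64-bit limbs
;; and a GCC-style unsigned __int128):
;;
;;   mp_limb_t mpn_addmul_1(mp_ptr rp, mp_srcptr s1p,
;;                          mp_size_t n, mp_limb_t s2limb)
;;   {
;;       mp_limb_t carry = 0;
;;       for (mp_size_t i = 0; i < n; i++) {
;;           unsigned __int128 t = (unsigned __int128)s1p[i] * s2limb
;;                                 + rp[i] + carry;
;;           rp[i] = (mp_limb_t)t;          /* low word back into rp */
;;           carry = (mp_limb_t)(t >> 64);  /* high word carries on  */
;;       }
;;       return carry;                      /* returned in rax */
;;   }
;;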
BITS 64

GLOBAL_FUNC mpn_addmul_1
        ;; Compare the limb count with the threshold value.  If the
        ;; limb count is small we just use the small loop, otherwise
        ;; we jump to the more complicated loop.
        cmp     rdx, THRESHOLD
        jge     L_mpn_addmul_1_main_loop_prep
        mov     r11, rdx
        lea     rsi, [rsi+rdx*8]
        lea     rdi, [rdi+rdx*8]
        neg     r11
        xor     r8, r8
        xor     r10, r10
        jmp     L_mpn_addmul_1_small_loop
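
        ;; In the small loop below, r11 runs from -n up to 0 while
        ;; rsi/rdi point just past the vectors, so [rsi+r11*8] walks
        ;; the limbs without a separate compare.  r8 holds the carry
        ;; limb between iterations and r10 is kept at zero so that
        ;; "adc rdx,r10" merely folds the carry flag into the high
        ;; word of the product.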

align 16
L_mpn_addmul_1_small_loop:
        mov     rax, [rsi+r11*8]
        mul     rcx
        add     rax, [rdi+r11*8]
        adc     rdx, r10
        add     rax, r8
        mov     r8, r10
        mov     [rdi+r11*8], rax
        adc     r8, rdx
        inc     r11
        jne     L_mpn_addmul_1_small_loop

        mov     rax, r8
        ret

L_mpn_addmul_1_main_loop_prep:
        push    r15
        push    r14
        push    r13
        push    r12
        ;; If n is even, we need to do three pre-multiplies; if n is
        ;; odd, we only need to do two.
        mov     temp, rdx
        mov     index, 0
        mov     A_x, 0
        mov     A_y, 0
        and     rdx, 1
        jnz     L_mpn_addmul_1_odd_n

        ;; Case n is even
        mov     rax, ADDR(s1p,index,0)
        mul     s2limb
        add     ADDR(rp,index,0), rax
        adc     A_x, rdx
        add     index, 1
        ;; At this point
        ;; temp = n (even)
        ;; index = 1

L_mpn_addmul_1_odd_n:
        ;; Now
        ;; temp = n
        ;; index = 1 if n even
        ;;       = 0 if n odd
        ;;
        mov     rax, ADDR(s1p,index,0)
        mul     s2limb
        mov     A_z, ADDR(rp,index,0)
        add     A_x, rax
        adc     A_y, rdx

        mov     rax, ADDR(s1p,index,1)
        mul     s2limb
        mov     B_z, ADDR(rp,index,1)
        mov     B_x, rax
        mov     B_y, rdx
        mov     rax, ADDR(s1p,index,2)
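
        ;; The two multiplies above (plus the extra one on the
        ;; even-n path) prime the A and B register groups and leave
        ;; rax preloaded with the next s1 limb, so the unrolled loop
        ;; can start mid-pipeline.  They also make the remaining
        ;; limb count even, since the loops below retire limbs in
        ;; pairs.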

        add     index, 3
        lea     s1p, ADDR(s1p,temp,-1)
        lea     rp, ADDR(rp,temp,-1)
        neg     temp
        add     index, temp
        ;; At this point:
        ;; s1p = address of last s1limb
        ;; rp = address of last rplimb
        ;; temp = -n
        ;; index = 4 - n if n even
        ;;       = 3 - n if n odd
        ;;
        ;; So, index is a (negative) even number.
        ;;
        ;; *****************************************
        ;; ATTENTION:
        ;;
        ;; From here on, I will use array indexing notation in the
        ;; comments because it is convenient.  So, I will pretend
        ;; that index is positive, because then a comment like
        ;;      B_z = rp[index-1]
        ;; is easier to read.  However, keep in mind that index is
        ;; actually a negative number indexing back from the end of
        ;; the array.  This is a common trick to remove one compare
        ;; operation from the main loop.
        ;; *****************************************
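        ;; (Concretely: rp now points at the last limb, so with, say,
        ;; index = -6 the operand ADDR(rp,index,0) addresses the limb
        ;; six positions before the last one, and index simply counts
        ;; up toward zero as the loop runs.)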

        ;;
        ;; Now we enter a spin-up loop that will make sure that the
        ;; index is a multiple of UNROLL_SIZE before going to our
        ;; main unrolled loop.
        mov     temp, index
        neg     temp
        and     temp, UNROLL_MASK
        jz      L_mpn_addmul_1_main_loop
        shr     temp, 1
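        ;; (Each spin-up pass retires two limbs, one through the A_*
        ;; registers and one through the B_*, so temp is halved here
        ;; to count pairs; index is even, so the shift loses nothing.
        ;; Interleaving the A and B groups lets the latency of one
        ;; mul overlap with the adds of the other.)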
L_mpn_addmul_1_main_loop_spin_up:
        ;; At this point we should have:
        ;;
        ;; A_x = low_mul[index-2] + carry_in
        ;; A_y = high_mul[index-2] + CF
        ;; A_z = rp[index-2]
        ;;
        ;; B_x = low_mul[index-1]
        ;; B_y = high_mul[index-1]
        ;; B_z = rp[index-1]
        ;;
        ;; rax = s1p[index]
        mul     s2limb
        add     A_z, A_x
        lea     A_x, [rax]
        mov     rax, ADDR(s1p,index,1)
        adc     B_x, A_y
        mov     ADDR(rp,index,-2), A_z
        mov     A_z, ADDR(rp,index,0)
        adc     B_y, 0
        lea     A_y, [rdx]
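        ;; ("lea A_x,[rax]" and "lea A_y,[rdx]" are register-to-
        ;; register moves done with lea; see the credit to Roger
        ;; Golliver above.  Like mov, lea leaves CF alone, so the
        ;; adc chain stays intact; presumably it also helps balance
        ;; work across the execution ports.)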
        ;; At this point we should have:
        ;;
        ;; B_x = low_mul[index-1] + carry_in
        ;; B_y = high_mul[index-1] + CF
        ;; B_z = rp[index-1]
        ;;
        ;; A_x = low_mul[index]
        ;; A_y = high_mul[index]
        ;; A_z = rp[index]
        ;;
        ;; rax = s1p[index+1]
        mul     s2limb
        add     B_z, B_x
        lea     B_x, [rax]
        mov     rax, ADDR(s1p,index,2)
        adc     A_x, B_y
        mov     ADDR(rp,index,-1), B_z
        mov     B_z, ADDR(rp,index,1)
        adc     A_y, 0
        lea     B_y, [rdx]

        add     index, 2
        sub     temp, 1
        jnz     L_mpn_addmul_1_main_loop_spin_up

        jmp     L_mpn_addmul_1_main_loop

align 16
L_mpn_addmul_1_main_loop:
        ;; The code here is really the same logic as the spin-up
        ;; loop.  It's just been unrolled.
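        ;; With UNROLL_SIZE = 4, the %rep below expands the two-limb
        ;; body twice, so each pass retires four limbs; index then
        ;; advances by UNROLL_SIZE and reaches exactly zero, since
        ;; the spin-up loop made it a multiple of UNROLL_SIZE.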
%assign unroll_index 0
%rep UNROLL_SIZE/2
        mul     s2limb
        add     A_z, A_x
        lea     A_x, [rax]
        mov     rax, ADDR(s1p,index,(2*unroll_index+1))
        adc     B_x, A_y
        mov     ADDR(rp,index,(2*unroll_index-2)), A_z
        mov     A_z, ADDR(rp,index,(2*unroll_index))
        adc     B_y, 0
        lea     A_y, [rdx]

        mul     s2limb
        add     B_z, B_x
        lea     B_x, [rax]
        mov     rax, ADDR(s1p,index,(2*unroll_index+2))
        adc     A_x, B_y
        mov     ADDR(rp,index,(2*unroll_index-1)), B_z
        mov     B_z, ADDR(rp,index,(2*unroll_index+1))
        adc     A_y, 0
        lea     B_y, [rdx]
%assign unroll_index unroll_index+1
%endrep

        add     index, UNROLL_SIZE
        jnz     L_mpn_addmul_1_main_loop

L_mpn_addmul_1_finish:
        ;; At this point we should have:
        ;;
        ;; index = n-1
        ;;
        ;; A_x = low_mul[index-2] + carry_in
        ;; A_y = high_mul[index-2] + CF
        ;; A_z = rp[index-2]
        ;;
        ;; B_x = low_mul[index-1]
        ;; B_y = high_mul[index-1]
        ;; B_z = rp[index-1]
        ;;
        ;; rax = s1p[index]
        mul     s2limb
        add     A_z, A_x
        mov     A_x, rax
        mov     ADDR(rp,index,-2), A_z
        mov     A_z, ADDR(rp,index,0)
        adc     B_x, A_y
        adc     B_y, 0
        mov     A_y, rdx
        ;; At this point we should have:
        ;;
        ;; index = n-1
        ;;
        ;; B_x = low_mul[index-1] + carry_in
        ;; B_y = high_mul[index-1] + CF
        ;; B_z = rp[index-1]
        ;;
        ;; A_x = low_mul[index]
        ;; A_y = high_mul[index]
        ;; A_z = rp[index]
        add     B_z, B_x
        mov     ADDR(rp,index,-1), B_z
        adc     A_x, B_y
        adc     A_y, 0
        ;; At this point we should have:
        ;;
        ;; index = n-1
        ;;
        ;; A_x = low_mul[index] + carry_in
        ;; A_y = high_mul[index] + CF
        ;; A_z = rp[index]
        ;;
        add     A_z, A_x
        mov     ADDR(rp,index,0), A_z
        adc     A_y, 0

        mov     rax, A_y
        pop     r12
        pop     r13
        pop     r14
        pop     r15
        ret