mpir/mpn/x86_64/atom/submul_1.as

; **************************************************************************
;  Intel64 mpn_submul_1 -- Multiply a limb vector with a limb and
;  subtract the result from a second limb vector.
;
;  Copyright (C) 2006  Jason Worth Martin <jason.worth.martin@gmail.com>
;
;  This program is free software; you can redistribute it and/or modify
;  it under the terms of the GNU Lesser General Public License as published
;  by the Free Software Foundation; either version 2 of the License, or
;  (at your option) any later version.
;
;  This program is distributed in the hope that it will be useful,
;  but WITHOUT ANY WARRANTY; without even the implied warranty of
;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;  GNU General Public License for more details.
;
;  You should have received a copy of the GNU Lesser General Public License
;  along with this program; if not, write to the Free Software Foundation,
;  Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
;
; **************************************************************************
;
;
; CREDITS
;
; This code is based largely on Pierrick Gaudry's excellent assembly
; support for the AMD64 architecture.  (Note that Intel64 and AMD64,
; while using the same instruction set, have very different
; microarchitectures.  So, this code performs very poorly on AMD64
; machines even though it is near-optimal on Intel64.)
;
; Roger Golliver works for Intel and provided insightful improvements
; particularly in using the "lea" instruction to perform additions
; and register-to-register moves.
;
; Eric Bainville has a brilliant exposition of optimizing arithmetic for
; AMD64 (http://www.bealto.it).  I adapted many of the ideas he
; describes to Intel64.
;
; Agner Fog is a demigod in the x86 world.  If you are reading assembly
; code files and you haven't heard of Agner Fog, then take a minute to
; look over his software optimization manuals (http://www.agner.org/).
; They are superb.
;
; *********************************************************************


; With a 4-way unroll the code has
;
;         	cycles/limb
; Hammer:           4.6
; Woodcrest:        4.6
;
; With increased unrolling, it appears to converge to 4 cycles/limb
; on Intel Core 2 machines.  I believe that this is optimal, however
; it requires such absurd unrolling that it becomes unusable for all
; but the largest inputs.  A 4-way unroll seems like a good balance
; to me because then commonly used input sizes (e.g. 1024bit Public
; keys) still benifit from the speed up.

;
; This is just a check to see if we are in my code testing sandbox
; or if we are actually in the GMP source tree
;

%include 'yasm_mac.inc'


; *********************************************************************
; *********************************************************************
;
; Here are the important macro parameters for the code
;
;      BpL is Bytes per Limb (8 since this is 64bit code)
;
;	UNROLL_SIZE is a power of 2 for which we unroll the code.
;                  possible values are 2,4,8,15,..., 256.  A reasonable
;                  value is probably 4.  If really large inputs
;                  are expected, then 16 is probably good.  Larger
;                  values are really only useful for flashy
;                  benchmarks and testing asymptotic behavior.
;
;      THRESHOLD is the minimum number of limbs needed before we bother
;                using the complicated loop.  A reasonable value is
;                2*UNROLL_SIZE + 6
;
; *********************************************************************
; *********************************************************************
%define	BpL	8
%define	UNROLL_SIZE	4
%define	UNROLL_MASK	UNROLL_SIZE-1
%define	THRESHOLD	2*UNROLL_SIZE+6

; Here is a convenient Macro for addressing
; memory.  Entries of the form
;
;      ADDR(ptr,index,displacement)
;
; get converted to
;
;      [displacement*BpL + ptr + index*BpL]
;
%define	ADDR(a,b,c)	[c*BpL+a+b*BpL]


; Register	Usage
; --------	-----
; rax		low word from mul
; rbx*
; rcx		s2limb
; rdx		high word from mul
; rsi		s1p
; rdi		rp
; rbp*		Base Pointer
; rsp*		Stack Pointer
; r8		A_x
; r9		A_y
; r10		A_z
; r11		B_x
; r12*		B_y
; r13*		B_z
; r14*		temp
; r15*		index
;
; * indicates that the register must be
; preserved for the caller.
%define	s2limb	rcx
%define	s1p	rsi
%define	rp	rdi
%define	A_x	r8
%define	A_y	r9
%define	A_z	r10
%define	B_x	r11
%define	B_y	r12
%define	B_z	r13
%define	temp	r14
%define	index	r15

	
; INPUT PARAMETERS
; rp		rdi
; s1p		rsi
; n		rdx
; s2limb	rcx
	BITS	64
GLOBAL_FUNC mpn_submul_1
					; Compare the limb count
					; with the threshold value.
					; If the limb count is small
					; we just use the small loop,
					; otherwise we jump to the
					; more complicated loop.
	cmp	rdx,THRESHOLD
	jge	L_mpn_submul_1_main_loop_prep
	mov	r11,rdx
	lea	rsi,[rsi+rdx*8]
	lea	rdi,[rdi+rdx*8]
	neg	r11
	xor	r8, r8
	xor	r10, r10
	jmp	L_mpn_submul_1_small_loop
	
	align	16
L_mpn_submul_1_small_loop:
	mov	rax,[rsi+r11*8]
	mul	rcx
	add	rax,r8
	adc	rdx,r10
	sub	[rdi+r11*8],rax
	mov	r8,r10
	adc	r8,rdx
	inc	r11
	jne	L_mpn_submul_1_small_loop

	mov	rax,r8
	ret

L_mpn_submul_1_main_loop_prep:
	push	r15
	push	r14
	push	r13
	push	r12
				; If n is even, we need to do three
				; pre-multiplies, if n is odd we only
				; need to do two.
	mov	temp,rdx
	mov	index,0
	mov	A_x,0
	mov	A_y,0
	and	rdx,1
	jnz	L_mpn_submul_1_odd_n

					; Case n is even
	mov	rax,ADDR(s1p,index,0)
	mul	s2limb
	sub	ADDR(rp,index,0),rax
	adc	A_x,rdx
	add	index,1
					; At this point
					;  temp = n (even)
					; index = 1

L_mpn_submul_1_odd_n:
					; Now
					; temp = n
					; index = 1 if n even
					;       = 0 if n odd
					;
	mov	rax,ADDR(s1p,index,0)
	mul	s2limb
	mov	A_z,ADDR(rp,index,0)
	add	A_x,rax
	adc	A_y,rdx

	mov	rax,ADDR(s1p,index,1)
	mul	s2limb
	mov	B_z,ADDR(rp,index,1)
	mov	B_x,rax
	mov	B_y,rdx
	mov	rax,ADDR(s1p,index,2)

	add	index,3
	lea	s1p,ADDR(s1p,temp,-1)
	lea	rp,ADDR(rp,temp,-1)
	neg	temp
	add	index,temp
				; At this point:
				; s1p   = address of last s1limb
				; rp    = address of last rplimb
				; temp  = -n
				; index = 4 - n if n even
				;       = 3 - n if n odd
				;
				; So, index is a (negative) even
				; number.
				;
				; *****************************************
				; ATTENTION:
				;
				; From here on, I will use array
				; indexing notation in the comments
				; because it is convenient.  So, I
				; will pretend that index is positive
				; because then a comment like
				;      B_z = rp[index-1]
				; is easier to read.
				; However, keep in mind that index is
				; actually a negative number indexing
				; back from the end of the array.
				; This is a common trick to remove one
				; compare operation from the main loop.
				; *****************************************

				;
				; Now we enter a spin-up loop the
				; will make sure that the index is
				; a multiple of UNROLL_SIZE before
				; going to our main unrolled loop.
	mov	temp,index
	neg	temp
	and	temp,UNROLL_MASK
	jz	L_mpn_submul_1_main_loop
	shr	temp,1
L_mpn_submul_1_main_loop_spin_up:
				; At this point we should have:
				;
				; A_x = low_mul[index-2] + carry_in
				; A_y = high_mul[index-2] + CF
				; A_z = rp[index-2]
				;
				; B_x = low_mul[index-1]
				; B_y = high_mul[index-1]
				; B_z = rp[index-1]
				;
				; rax = s1p[index]
	mul	s2limb
	sub	A_z,A_x
	lea	A_x,[rax]
	mov	rax,ADDR(s1p,index,1)
	adc	B_x,A_y
	mov	ADDR(rp,index,-2),A_z
	mov	A_z,ADDR(rp,index,0)
	adc	B_y,0
	lea	A_y,[rdx]
				; At this point we should have:
				;
				; B_x = low_mul[index-1] + carry_in
				; B_y = high_mul[index-1] + CF
				; B_z = rp[index-1]
				;
				; A_x = low_mul[index]
				; A_y = high_mul[index]
				; A_z = rp[index]
				;
				; rax = s1p[index+1]
	mul	s2limb
	sub	B_z,B_x
	lea	B_x,[rax]
	mov	rax,ADDR(s1p,index,2)
	adc	A_x,B_y
	mov	ADDR(rp,index,-1),B_z
	mov	B_z,ADDR(rp,index,1)
	adc	A_y,0
	lea	B_y,[rdx]

	add	index,2
	sub	temp,1
	jnz	L_mpn_submul_1_main_loop_spin_up
	
	jmp	L_mpn_submul_1_main_loop
	
	align	16
L_mpn_submul_1_main_loop:
				; The code here is really the same
				; logic as the spin-up loop.  It's
				; just been unrolled.
%assign	unroll_index 0
%rep	UNROLL_SIZE/2
	mul	s2limb
	sub	A_z,A_x
	lea	A_x,[rax]
	mov	rax,ADDR(s1p,index,(2*unroll_index+1))
	adc	B_x,A_y
	mov	ADDR(rp,index,(2*unroll_index-2)),A_z
	mov	A_z,ADDR(rp,index,(2*unroll_index))
	adc	B_y,0
	lea	A_y,[rdx]

	mul	s2limb
	sub	B_z,B_x
	lea	B_x,[rax]
	mov	rax,ADDR(s1p,index,(2*unroll_index+2))
	adc	A_x,B_y
	mov	ADDR(rp,index,(2*unroll_index-1)),B_z
	mov	B_z,ADDR(rp,index,(2*unroll_index+1))
	adc	A_y,0
	lea	B_y,[rdx]
%assign	unroll_index	unroll_index+1
%endrep


	add	index,UNROLL_SIZE
	jnz	L_mpn_submul_1_main_loop

L_mpn_submul_1_finish:	
				; At this point we should have:
				;
				; index = n-1
				;
				; A_x = low_mul[index-2] + carry_in
				; A_y = high_mul[index-2] + CF
				; A_z = rp[index-2]
				;
				; B_x = low_mul[index-1]
				; B_y = high_mul[index-1]
				; B_z = rp[index-1]
				;
				; rax = s1p[index]
	mul	s2limb
	sub	A_z,A_x
	mov	A_x,rax
	mov	ADDR(rp,index,-2),A_z
	mov	A_z,ADDR(rp,index,0)
	adc	B_x,A_y
	adc	B_y,0
	mov	A_y,rdx
				; At this point we should have:
				;
				; index = n-1
				;
				; B_x = low_mul[index-1] + carry_in
				; B_y = high_mul[index-1] + CF
				; B_z = rp[index-1]
				;
				; A_x = low_mul[index]
				; A_y = high_mul[index]
				; A_z = rp[index]
	sub	B_z,B_x
	mov	ADDR(rp,index,-1),B_z
	adc	A_x,B_y
	adc	A_y,0
				; At this point we should have:
				;
				; index = n-1
				;
				; A_x = low_mul[index] + carry_in
				; A_y = high_mul[index] + CF
				; A_z = rp[index]
				;
	sub	A_z,A_x
	mov	ADDR(rp,index,0),A_z
	adc	A_y,0

	mov	rax,A_y
	pop	r12
	pop	r13
	pop	r14
	pop	r15
	ret
Select best asm functions from existing for Atom cpu 64bit 2009-09-06 08:49:19 -04:00			`; **************************************************************************`
			`; Intel64 mpn_submul_1 -- Multiply a limb vector with a limb and`
			`; subtract the result from a second limb vector.`
			`;`
			`; Copyright (C) 2006 Jason Worth Martin <jason.worth.martin@gmail.com>`
			`;`
			`; This program is free software; you can redistribute it and/or modify`
			`; it under the terms of the GNU Lesser General Public License as published`
			`; by the Free Software Foundation; either version 2 of the License, or`
			`; (at your option) any later version.`
			`;`
			`; This program is distributed in the hope that it will be useful,`
			`; but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`; GNU General Public License for more details.`
			`;`
			`; You should have received a copy of the GNU Lesser General Public License`
			`; along with this program; if not, write to the Free Software Foundation,`
			`; Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.`
			`;`
			`; **************************************************************************`
			`;`
			`;`
			`; CREDITS`
			`;`
			`; This code is based largely on Pierrick Gaudry's excellent assembly`
			`; support for the AMD64 architecture. (Note that Intel64 and AMD64,`
			`; while using the same instruction set, have very different`
			`; microarchitectures. So, this code performs very poorly on AMD64`
			`; machines even though it is near-optimal on Intel64.)`
			`;`
			`; Roger Golliver works for Intel and provided insightful improvements`
			`; particularly in using the "lea" instruction to perform additions`
			`; and register-to-register moves.`
			`;`
			`; Eric Bainville has a brilliant exposition of optimizing arithmetic for`
			`; AMD64 (http://www.bealto.it). I adapted many of the ideas he`
			`; describes to Intel64.`
			`;`
			`; Agner Fog is a demigod in the x86 world. If you are reading assembly`
			`; code files and you haven't heard of Agner Fog, then take a minute to`
			`; look over his software optimization manuals (http://www.agner.org/).`
			`; They are superb.`
			`;`
			`; *********************************************************************`


			`; With a 4-way unroll the code has`
			`;`
			`; cycles/limb`
			`; Hammer: 4.6`
			`; Woodcrest: 4.6`
			`;`
			`; With increased unrolling, it appears to converge to 4 cycles/limb`
			`; on Intel Core 2 machines. I believe that this is optimal, however`
			`; it requires such absurd unrolling that it becomes unusable for all`
			`; but the largest inputs. A 4-way unroll seems like a good balance`
			`; to me because then commonly used input sizes (e.g. 1024bit Public`
			`; keys) still benifit from the speed up.`

			`;`
			`; This is just a check to see if we are in my code testing sandbox`
			`; or if we are actually in the GMP source tree`
			`;`

			`%include 'yasm_mac.inc'`



			`; *********************************************************************`
			`; *********************************************************************`
			`;`
			`; Here are the important macro parameters for the code`
			`;`
			`; BpL is Bytes per Limb (8 since this is 64bit code)`
			`;`
			`; UNROLL_SIZE is a power of 2 for which we unroll the code.`
			`; possible values are 2,4,8,15,..., 256. A reasonable`
			`; value is probably 4. If really large inputs`
			`; are expected, then 16 is probably good. Larger`
			`; values are really only useful for flashy`
			`; benchmarks and testing asymptotic behavior.`
			`;`
			`; THRESHOLD is the minimum number of limbs needed before we bother`
			`; using the complicated loop. A reasonable value is`
			`; 2*UNROLL_SIZE + 6`
			`;`
			`; *********************************************************************`
			`; *********************************************************************`
			`%define BpL 8`
			`%define UNROLL_SIZE 4`
			`%define UNROLL_MASK UNROLL_SIZE-1`
			`%define THRESHOLD 2*UNROLL_SIZE+6`

			`; Here is a convenient Macro for addressing`
			`; memory. Entries of the form`
			`;`
			`; ADDR(ptr,index,displacement)`
			`;`
			`; get converted to`
			`;`
			`; [displacementBpL + ptr + indexBpL]`
			`;`
			`%define ADDR(a,b,c) [cBpL+a+bBpL]`


			`; Register Usage`
			`; -------- -----`
			`; rax low word from mul`
			`; rbx*`
			`; rcx s2limb`
			`; rdx high word from mul`
			`; rsi s1p`
			`; rdi rp`
			`; rbp* Base Pointer`
			`; rsp* Stack Pointer`
			`; r8 A_x`
			`; r9 A_y`
			`; r10 A_z`
			`; r11 B_x`
			`; r12* B_y`
			`; r13* B_z`
			`; r14* temp`
			`; r15* index`
			`;`
			`; * indicates that the register must be`
			`; preserved for the caller.`
			`%define s2limb rcx`
			`%define s1p rsi`
			`%define rp rdi`
			`%define A_x r8`
			`%define A_y r9`
			`%define A_z r10`
			`%define B_x r11`
			`%define B_y r12`
			`%define B_z r13`
			`%define temp r14`
			`%define index r15`


			`; INPUT PARAMETERS`
			`; rp rdi`
			`; s1p rsi`
			`; n rdx`
			`; s2limb rcx`
			`BITS 64`
			`GLOBAL_FUNC mpn_submul_1`
			`; Compare the limb count`
			`; with the threshold value.`
			`; If the limb count is small`
			`; we just use the small loop,`
			`; otherwise we jump to the`
			`; more complicated loop.`
			`cmp rdx,THRESHOLD`
			`jge L_mpn_submul_1_main_loop_prep`
			`mov r11,rdx`
			`lea rsi,[rsi+rdx*8]`
			`lea rdi,[rdi+rdx*8]`
			`neg r11`
			`xor r8, r8`
			`xor r10, r10`
			`jmp L_mpn_submul_1_small_loop`

			`align 16`
			`L_mpn_submul_1_small_loop:`
			`mov rax,[rsi+r11*8]`
			`mul rcx`
			`add rax,r8`
			`adc rdx,r10`
			`sub [rdi+r11*8],rax`
			`mov r8,r10`
			`adc r8,rdx`
			`inc r11`
			`jne L_mpn_submul_1_small_loop`

			`mov rax,r8`
			`ret`

			`L_mpn_submul_1_main_loop_prep:`
			`push r15`
			`push r14`
			`push r13`
			`push r12`
			`; If n is even, we need to do three`
			`; pre-multiplies, if n is odd we only`
			`; need to do two.`
			`mov temp,rdx`
			`mov index,0`
			`mov A_x,0`
			`mov A_y,0`
			`and rdx,1`
			`jnz L_mpn_submul_1_odd_n`

			`; Case n is even`
			`mov rax,ADDR(s1p,index,0)`
			`mul s2limb`
			`sub ADDR(rp,index,0),rax`
			`adc A_x,rdx`
			`add index,1`
			`; At this point`
			`; temp = n (even)`
			`; index = 1`

			`L_mpn_submul_1_odd_n:`
			`; Now`
			`; temp = n`
			`; index = 1 if n even`
			`; = 0 if n odd`
			`;`
			`mov rax,ADDR(s1p,index,0)`
			`mul s2limb`
			`mov A_z,ADDR(rp,index,0)`
			`add A_x,rax`
			`adc A_y,rdx`

			`mov rax,ADDR(s1p,index,1)`
			`mul s2limb`
			`mov B_z,ADDR(rp,index,1)`
			`mov B_x,rax`
			`mov B_y,rdx`
			`mov rax,ADDR(s1p,index,2)`

			`add index,3`
			`lea s1p,ADDR(s1p,temp,-1)`
			`lea rp,ADDR(rp,temp,-1)`
			`neg temp`
			`add index,temp`
			`; At this point:`
			`; s1p = address of last s1limb`
			`; rp = address of last rplimb`
			`; temp = -n`
			`; index = 4 - n if n even`
			`; = 3 - n if n odd`
			`;`
			`; So, index is a (negative) even`
			`; number.`
			`;`
			`; *****************************************`
			`; ATTENTION:`
			`;`
			`; From here on, I will use array`
			`; indexing notation in the comments`
			`; because it is convenient. So, I`
			`; will pretend that index is positive`
			`; because then a comment like`
			`; B_z = rp[index-1]`
			`; is easier to read.`
			`; However, keep in mind that index is`
			`; actually a negative number indexing`
			`; back from the end of the array.`
			`; This is a common trick to remove one`
			`; compare operation from the main loop.`
			`; *****************************************`

			`;`
			`; Now we enter a spin-up loop the`
			`; will make sure that the index is`
			`; a multiple of UNROLL_SIZE before`
			`; going to our main unrolled loop.`
			`mov temp,index`
			`neg temp`
			`and temp,UNROLL_MASK`
			`jz L_mpn_submul_1_main_loop`
			`shr temp,1`
			`L_mpn_submul_1_main_loop_spin_up:`
			`; At this point we should have:`
			`;`
			`; A_x = low_mul[index-2] + carry_in`
			`; A_y = high_mul[index-2] + CF`
			`; A_z = rp[index-2]`
			`;`
			`; B_x = low_mul[index-1]`
			`; B_y = high_mul[index-1]`
			`; B_z = rp[index-1]`
			`;`
			`; rax = s1p[index]`
			`mul s2limb`
			`sub A_z,A_x`
			`lea A_x,[rax]`
			`mov rax,ADDR(s1p,index,1)`
			`adc B_x,A_y`
			`mov ADDR(rp,index,-2),A_z`
			`mov A_z,ADDR(rp,index,0)`
			`adc B_y,0`
			`lea A_y,[rdx]`
			`; At this point we should have:`
			`;`
			`; B_x = low_mul[index-1] + carry_in`
			`; B_y = high_mul[index-1] + CF`
			`; B_z = rp[index-1]`
			`;`
			`; A_x = low_mul[index]`
			`; A_y = high_mul[index]`
			`; A_z = rp[index]`
			`;`
			`; rax = s1p[index+1]`
			`mul s2limb`
			`sub B_z,B_x`
			`lea B_x,[rax]`
			`mov rax,ADDR(s1p,index,2)`
			`adc A_x,B_y`
			`mov ADDR(rp,index,-1),B_z`
			`mov B_z,ADDR(rp,index,1)`
			`adc A_y,0`
			`lea B_y,[rdx]`

			`add index,2`
			`sub temp,1`
			`jnz L_mpn_submul_1_main_loop_spin_up`

			`jmp L_mpn_submul_1_main_loop`

			`align 16`
			`L_mpn_submul_1_main_loop:`
			`; The code here is really the same`
			`; logic as the spin-up loop. It's`
			`; just been unrolled.`
			`%assign unroll_index 0`
			`%rep UNROLL_SIZE/2`
			`mul s2limb`
			`sub A_z,A_x`
			`lea A_x,[rax]`
			`mov rax,ADDR(s1p,index,(2*unroll_index+1))`
			`adc B_x,A_y`
			`mov ADDR(rp,index,(2*unroll_index-2)),A_z`
			`mov A_z,ADDR(rp,index,(2*unroll_index))`
			`adc B_y,0`
			`lea A_y,[rdx]`

			`mul s2limb`
			`sub B_z,B_x`
			`lea B_x,[rax]`
			`mov rax,ADDR(s1p,index,(2*unroll_index+2))`
			`adc A_x,B_y`
			`mov ADDR(rp,index,(2*unroll_index-1)),B_z`
			`mov B_z,ADDR(rp,index,(2*unroll_index+1))`
			`adc A_y,0`
			`lea B_y,[rdx]`
			`%assign unroll_index unroll_index+1`
			`%endrep`


			`add index,UNROLL_SIZE`
			`jnz L_mpn_submul_1_main_loop`

			`L_mpn_submul_1_finish:`
			`; At this point we should have:`
			`;`
			`; index = n-1`
			`;`
			`; A_x = low_mul[index-2] + carry_in`
			`; A_y = high_mul[index-2] + CF`
			`; A_z = rp[index-2]`
			`;`
			`; B_x = low_mul[index-1]`
			`; B_y = high_mul[index-1]`
			`; B_z = rp[index-1]`
			`;`
			`; rax = s1p[index]`
			`mul s2limb`
			`sub A_z,A_x`
			`mov A_x,rax`
			`mov ADDR(rp,index,-2),A_z`
			`mov A_z,ADDR(rp,index,0)`
			`adc B_x,A_y`
			`adc B_y,0`
			`mov A_y,rdx`
			`; At this point we should have:`
			`;`
			`; index = n-1`
			`;`
			`; B_x = low_mul[index-1] + carry_in`
			`; B_y = high_mul[index-1] + CF`
			`; B_z = rp[index-1]`
			`;`
			`; A_x = low_mul[index]`
			`; A_y = high_mul[index]`
			`; A_z = rp[index]`
			`sub B_z,B_x`
			`mov ADDR(rp,index,-1),B_z`
			`adc A_x,B_y`
			`adc A_y,0`
			`; At this point we should have:`
			`;`
			`; index = n-1`
			`;`
			`; A_x = low_mul[index] + carry_in`
			`; A_y = high_mul[index] + CF`
			`; A_z = rp[index]`
			`;`
			`sub A_z,A_x`
			`mov ADDR(rp,index,0),A_z`
			`adc A_y,0`

			`mov rax,A_y`
			`pop r12`
			`pop r13`
			`pop r14`
			`pop r15`
			`ret`