;  mpir/mpn/x86_64/haswell/avx/addmul_1.as

;  AMD64 mpn_addmul_1 optimised for Intel Haswell.

;  Contributed to the GNU project by Torbjörn Granlund.
;  Converted to MPIR by Alexander Kruppa.

;  Copyright 2013 Free Software Foundation, Inc.

;  This file is part of the GNU MP Library.
;
;  The GNU MP Library is free software; you can redistribute it and/or modify
;  it under the terms of either:
;
;    * the GNU Lesser General Public License as published by the Free
;      Software Foundation; either version 3 of the License, or (at your
;      option) any later version.
;
;  or
;
;    * the GNU General Public License as published by the Free Software
;      Foundation; either version 2 of the License, or (at your option) any
;      later version.
;
;  or both in parallel, as here.
;
;  The GNU MP Library is distributed in the hope that it will be useful, but
;  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
;  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
;  for more details.
;
;  You should have received copies of the GNU General Public License and the
;  GNU Lesser General Public License along with the GNU MP Library.  If not,
;  see https://www.gnu.org/licenses/.


%include 'yasm_mac.inc'

BITS 64

;  Symbolic register names used by mpn_addmul_1.
;
;  Scratch registers that are the same under both calling conventions.
%define Tmp0    r12
%define Tmp1    r13
%define Tmp2    rax
%define Tmp3    rbx
%define Tmp6    r10
%define Tmp7    r11

;  Names whose register depends on the calling convention.  Under either
;  convention one of S1P/Size lives in rbp; the function prologue fills rbp
;  from rdx.  Tmp8 deliberately shares Limb's register: the prologue copies
;  Limb into rdx before Tmp8 is first written.
%ifdef USE_WIN64
    %define RP      rcx
    %define S1P     rbp
    %define Size    r8
    %define Sizeb   r8b
    %define Limb    r9

    ;  rsi/rdi are nonvolatile in the Microsoft x64 convention.
    %define Tmp4    rsi
    %define Tmp5    rdi
    %define Tmp8    r9
%else
    %define RP      rdi
    %define S1P     rsi
    %define Size    rbp
    %define Sizeb   bpl
    %define Limb    rcx

    %define Tmp4    r8
    %define Tmp5    r9
    %define Tmp8    rcx
%endif

;  Mnemonics applied to the destination limbs; add/adc in this file.
%define ADDSUB  add
%define ADCSBB  adc

align 16

;-----------------------------------------------------------------------
;  mpn_addmul_1
;
;  Multiplies the Size-limb vector at S1P by the single limb Limb, adds the
;  product in place to the Size-limb vector at RP, and returns the carry-out
;  limb in rax.
;
;  Incoming registers (third argument always arrives in rdx):
;    default : rdi = RP,  rsi = S1P,  rdx = Size,  rcx = Limb
;    Win64   : rcx = RP,  rdx = S1P,  r8  = Size,  r9  = Limb
;  The prologue moves rdx to rbp (which is Size by default, S1P on Win64)
;  and then loads Limb into rdx, the implicit source operand of mulx.
;
;  There is no Size == 0 path: the code assumes Size >= 1.
;
;  Saved and restored: rbx, rbp, r12, r13, plus rsi and rdi on Win64 where
;  they back Tmp4/Tmp5 and are callee-saved.  All other Tmp registers, rdx
;  and the flags are clobbered.
;
;  NOTE(review): no unwind (SEH) data is emitted for the Win64 pushes.
;
;  Structure: Size mod 4 selects one of four lead-in sequences, each of which
;  enters the 4-limb unrolled loop at a different point; .Lend finishes the
;  last two limbs.  mulx, mov and lea leave the flags alone, which is what
;  lets the adc chains run across them.
;-----------------------------------------------------------------------
GLOBAL_FUNC mpn_addmul_1

	push	rbx
	push	rbp
	push	r12
	push	r13
%ifdef USE_WIN64
	push	rsi			; Tmp4, callee-saved on Win64
	push	rdi			; Tmp5, callee-saved on Win64
%endif

	mov	rbp, rdx		; mulx requires one input in rdx
	mov	rdx, Limb

	test	Sizeb, 1
	jnz	.Lbx1

	; Size even.  shr leaves bit 1 of the original Size in CF and sets ZF
	; when Size/4 is zero; both flags are consumed further down.
.Lbx0:
	shr	Size, 2
	jc	.Lb10 ;ajs:notshortform

	; Size mod 4 == 0
.Lb00:
	mulx	Tmp0, Tmp1, [S1P]
	mulx	Tmp2, Tmp3, [S1P+8]
	add	Tmp3, Tmp0
	adc	Tmp2, 0
	mov	Tmp0, [RP]
	mov	Tmp8, [RP+8]
	mulx	Tmp4, Tmp5, [S1P+16]
	lea	RP, [RP-16]
	lea	S1P, [S1P+16]
	ADDSUB	Tmp0, Tmp1
	jmp	.Llo0 ;ajs:notshortform

	; Size odd
.Lbx1:
	shr	Size, 2
	jc	.Lb11

	; Size mod 4 == 1
.Lb01:
	mulx	Tmp6, Tmp7, [S1P]
	jnz	.Lgt1			; ZF still from the shr: Size/4 != 0

	; Size == 1
.Ln1:
	ADDSUB	[RP], Tmp7
	mov	eax, 0			; mov, not xor: CF from ADDSUB must survive
	adc	Tmp2, Tmp6		; return value = high limb + carry
	jmp	.Lret ;ajs:notshortform

.Lgt1:
	mulx	Tmp0, Tmp1, [S1P+8]
	mulx	Tmp2, Tmp3, [S1P+16]
	lea	S1P, [S1P+24]
	add	Tmp1, Tmp6
	adc	Tmp3, Tmp0
	adc	Tmp2, 0
	mov	Tmp6, [RP]
	mov	Tmp0, [RP+8]
	mov	Tmp8, [RP+16]
	lea	RP, [RP-8]
	ADDSUB	Tmp6, Tmp7
	jmp	.Llo1

	; Size mod 4 == 3
.Lb11:
	mulx	Tmp2, Tmp3, [S1P]
	mov	Tmp8, [RP]
	mulx	Tmp4, Tmp5, [S1P+8]
	lea	S1P, [S1P+8]
	lea	RP, [RP-24]
	inc	Size			; offsets the dec that follows .Llo3
	ADDSUB	Tmp8, Tmp3
	jmp	.Llo3

	; Size mod 4 == 2
.Lb10:
	mulx	Tmp4, Tmp5, [S1P]
	mulx	Tmp6, Tmp7, [S1P+8]
	lea	RP, [RP-32]
	mov	eax, 0			; Tmp2 = 0 without touching ZF from the shr
	clc				; start the loop with no carry-in
	jz	.Lend ;ajs:notshortform

	; Main loop, four limbs per pass.  On entry CF is the carry out of the
	; previous destination-limb addition (or clear on first entry).
	align 16
.Ltop:
	adc	Tmp5, Tmp2
	lea	RP, [RP+32]
	adc	Tmp7, Tmp4
	mulx	Tmp0, Tmp1, [S1P+16]
	mov	Tmp4, [RP]
	mulx	Tmp2, Tmp3, [S1P+24]
	lea	S1P, [S1P+32]
	adc	Tmp1, Tmp6
	adc	Tmp3, Tmp0
	adc	Tmp2, 0
	mov	Tmp6, [RP+8]
	mov	Tmp0, [RP+16]
	ADDSUB	Tmp4, Tmp5
	mov	Tmp8, [RP+24]
	mov	[RP], Tmp4
	ADCSBB	Tmp6, Tmp7
.Llo1:
	mulx	Tmp4, Tmp5, [S1P]
	mov	[RP+8], Tmp6
	ADCSBB	Tmp0, Tmp1
.Llo0:
	mov	[RP+16], Tmp0
	ADCSBB	Tmp8, Tmp3
.Llo3:
	mulx	Tmp6, Tmp7, [S1P+8]
	mov	[RP+24], Tmp8
	dec	Size			; dec leaves CF intact for the next adc
	jnz	.Ltop

	; Wind-down: the last two product limbs and the returned carry limb.
.Lend:
	adc	Tmp5, Tmp2
	adc	Tmp7, Tmp4
	mov	Tmp4, [RP+32]
	mov	Tmp2, Tmp6
	adc	Tmp2, 0
	mov	Tmp6, [RP+40]
	ADDSUB	Tmp4, Tmp5
	mov	[RP+32], Tmp4
	ADCSBB	Tmp6, Tmp7
	mov	[RP+40], Tmp6
	adc	Tmp2, 0

	; Common exit for every path; rax (Tmp2) holds the return value.
.Lret:
%ifdef USE_WIN64
	pop	rdi
	pop	rsi
%endif
	pop	r13
	pop	r12
	pop	rbp
	pop	rbx

	ret
addmul_1 and submul_1, converted from GMP 2016-12-05 16:55:21 -05:00			`; AMD64 mpn_addmul_1 optimised for Intel Haswell.`

			`; Contributed to the GNU project by Torbjörn Granlund.`
			`; Converted to MPIR by Alexander Kruppa.`

			`; Copyright 2013 Free Software Foundation, Inc.`

			`; This file is part of the GNU MP Library.`
			`;`
			`; The GNU MP Library is free software; you can redistribute it and/or modify`
			`; it under the terms of either:`
			`;`
			`; * the GNU Lesser General Public License as published by the Free`
			`; Software Foundation; either version 3 of the License, or (at your`
			`; option) any later version.`
			`;`
			`; or`
			`;`
			`; * the GNU General Public License as published by the Free Software`
			`; Foundation; either version 2 of the License, or (at your option) any`
			`; later version.`
			`;`
			`; or both in parallel, as here.`
			`;`
			`; The GNU MP Library is distributed in the hope that it will be useful, but`
			`; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY`
			`; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License`
			`; for more details.`
			`;`
			`; You should have received copies of the GNU General Public License and the`
			`; GNU Lesser General Public License along with the GNU MP Library. If not,`
			`; see https://www.gnu.org/licenses/.`


			`%include 'yasm_mac.inc'`

			`BITS 64`

			`%ifdef USE_WIN64`
			`%define RP rcx`
			`%define S1P rbp`
			`%define Size r8`
			`%define Sizeb r8b`
			`%define Limb r9`

			`%define Tmp0 r12`
			`%define Tmp1 r13`
			`%define Tmp2 rax`
			`%define Tmp3 rbx`
			`%define Tmp4 rsi`
			`%define Tmp5 rdi`
			`%define Tmp6 r10`
			`%define Tmp7 r11`
			`%define Tmp8 r9`
			`%else`
			`%define RP rdi`
			`%define S1P rsi`
			`%define Size rbp`
			`%define Sizeb bpl`
			`%define Limb rcx`

			`%define Tmp0 r12`
			`%define Tmp1 r13`
			`%define Tmp2 rax`
			`%define Tmp3 rbx`
			`%define Tmp4 r8`
			`%define Tmp5 r9`
			`%define Tmp6 r10`
			`%define Tmp7 r11`
			`%define Tmp8 rcx`
			`%endif`

			`%define ADDSUB add`
			`%define ADCSBB adc`

			`align 16`

			`GLOBAL_FUNC mpn_addmul_1`

			`push rbx`
			`push rbp`
			`push r12`
			`push r13`

			`mov rbp, rdx ; mulx requires one input in rdx`
			`mov rdx, Limb`

			`test Sizeb, 1`
			`jnz .Lbx1`

			`.Lbx0: shr Size, 2`
			`jc .Lb10 ;ajs:notshortform`

			`.Lb00: mulx Tmp0, Tmp1, [S1P]`
			`mulx Tmp2, Tmp3, [S1P+8]`
			`add Tmp3, Tmp0`
			`adc Tmp2, 0`
			`mov Tmp0, [RP]`
			`mov Tmp8, [RP+8]`
			`mulx Tmp4, Tmp5, [S1P+16]`
			`lea RP, [RP-16]`
			`lea S1P, [S1P+16]`
			`ADDSUB Tmp0, Tmp1`
			`jmp .Llo0 ;ajs:notshortform`

			`.Lbx1: shr Size, 2`
			`jc .Lb11`

			`.Lb01: mulx Tmp6, Tmp7, [S1P]`
			`jnz .Lgt1`
			`.Ln1: ADDSUB [RP], Tmp7`
			`mov eax, 0`
			`adc Tmp2, Tmp6`
			`jmp .Lret ;ajs:notshortform`

			`.Lgt1: mulx Tmp0, Tmp1, [S1P+8]`
			`mulx Tmp2, Tmp3, [S1P+16]`
			`lea S1P, [S1P+24]`
			`add Tmp1, Tmp6`
			`adc Tmp3, Tmp0`
			`adc Tmp2, 0`
			`mov Tmp6, [RP]`
			`mov Tmp0, [RP+8]`
			`mov Tmp8, [RP+16]`
			`lea RP, [RP-8]`
			`ADDSUB Tmp6, Tmp7`
			`jmp .Llo1`

			`.Lb11: mulx Tmp2, Tmp3, [S1P]`
			`mov Tmp8, [RP]`
			`mulx Tmp4, Tmp5, [S1P+8]`
			`lea S1P, [S1P+8]`
			`lea RP, [RP-24]`
			`inc Size`
			`ADDSUB Tmp8, Tmp3`
			`jmp .Llo3`

			`.Lb10: mulx Tmp4, Tmp5, [S1P]`
			`mulx Tmp6, Tmp7, [S1P+8]`
			`lea RP, [RP-32]`
			`mov eax, 0`
			`clc`
			`jz .Lend ;ajs:notshortform`

			`align 16`
			`.Ltop: adc Tmp5, Tmp2`
			`lea RP, [RP+32]`
			`adc Tmp7, Tmp4`
			`mulx Tmp0, Tmp1, [S1P+16]`
			`mov Tmp4, [RP]`
			`mulx Tmp2, Tmp3, [S1P+24]`
			`lea S1P, [S1P+32]`
			`adc Tmp1, Tmp6`
			`adc Tmp3, Tmp0`
			`adc Tmp2, 0`
			`mov Tmp6, [RP+8]`
			`mov Tmp0, [RP+16]`
			`ADDSUB Tmp4, Tmp5`
			`mov Tmp8, [RP+24]`
			`mov [RP], Tmp4`
			`ADCSBB Tmp6, Tmp7`
			`.Llo1: mulx Tmp4, Tmp5, [S1P]`
			`mov [RP+8], Tmp6`
			`ADCSBB Tmp0, Tmp1`
			`.Llo0: mov [RP+16], Tmp0`
			`ADCSBB Tmp8, Tmp3`
			`.Llo3: mulx Tmp6, Tmp7, [S1P+8]`
			`mov [RP+24], Tmp8`
			`dec Size`
			`jnz .Ltop`

			`.Lend: adc Tmp5, Tmp2`
			`adc Tmp7, Tmp4`
			`mov Tmp4, [RP+32]`
			`mov Tmp2, Tmp6`
			`adc Tmp2, 0`
			`mov Tmp6, [RP+40]`
			`ADDSUB Tmp4, Tmp5`
			`mov [RP+32], Tmp4`
			`ADCSBB Tmp6, Tmp7`
			`mov [RP+40], Tmp6`
			`adc Tmp2, 0`

			`.Lret: pop r13`
			`pop r12`
			`pop rbp`
			`pop rbx`

			`ret`