mpir/mpn/x86_64w/core2/submul_1.asm

;  x86-64 mpn_addmul_1 and mpn_submul_1, optimized for "Core 2".

;  Copyright 2003, 2004, 2005, 2007, 2008, 2009, 2011, 2012 Free Software
;  Foundation, Inc.

;  This file is part of the GNU MP Library.

;  The GNU MP Library is free software; you can redistribute it and/or modify
;  it under the terms of the GNU Lesser General Public License as published
;  by the Free Software Foundation; either version 3 of the License, or (at
;  your option) any later version.

;  The GNU MP Library is distributed in the hope that it will be useful, but
;  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
;  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
;  License for more details.

;  You should have received a copy of the GNU Lesser General Public License
;  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

; mp_limb_t mpn_addmul_1 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t)
;  rax                       rdi        rsi        rdx        rcx
;  rax                       rcx        rdx         r8         r9

%include 'yasm_mac.inc'

%define reg_save_list   rbx, rbp, rsi, rdi

	TEXT

	xalign  16
	WIN64_GCC_PROC mpn_submul_1c, 4
	lea     rbx, [rdx]
	neg     rbx
	mov     rax, [rsi]
	mov     r10, [rdi]
	lea     rdi, [rdi+rdx*8-16]
	lea     rsi, [rsi+rdx*8]
	mul     rcx
	add     rax, r8
	adc     rdx, 0
	bt      ebx, 0
	jc      .1
	lea     r11, [rax]
	mov     rax, [rsi+rbx*8+8]
	lea     rbp, [rdx]
	mul     rcx
	add     rbx, 2
	jns     .5
	lea     r8, [rax]
	mov     rax, [rsi+rbx*8]
	lea     r9, [rdx]
	jmp     .3
.1:	add     rbx, 1
	jns     .6
	lea     r8, [rax]
	mov     rax, [rsi+rbx*8]
	lea     r9, [rdx]
	mul     rcx
	lea     r11, [rax]
	mov     rax, [rsi+rbx*8+8]
	lea     rbp, [rdx]
	jmp     .4

	xalign  16
.2:	mul     rcx
	sub     r10, r8
	lea     r8, [rax]
	mov     rax, [rsi+rbx*8]
	adc     r11, r9
	mov     [rdi+rbx*8-8], r10
	mov     r10, [rdi+rbx*8]
	lea     r9, [rdx]
	adc     rbp, 0
.3:	mul     rcx
	sub     r10, r11
	lea     r11, [rax]
	mov     rax, [rsi+rbx*8+8]
	adc     r8, rbp
	mov     [rdi+rbx*8], r10
	mov     r10, [rdi+rbx*8+8]
	lea     rbp, [rdx]
	adc     r9, 0
.4:	add     rbx, 2
	js      .2
	mul     rcx
	sub     r10, r8
	adc     r11, r9
	mov     [rdi-8], r10
	adc     rbp, 0
.5:	mov     r10, [rdi]
	sub     r10, r11
	adc     rax, rbp
	mov     [rdi], r10
	adc     rdx, 0
.6:	mov     r10, [rdi+8]
	sub     r10, rax
	mov     [rdi+8], r10
	mov     eax, ebx
	adc     rax, rdx
	WIN64_GCC_END

	xalign  16
	WIN64_GCC_PROC mpn_submul_1, 4
	lea     rbx, [rdx]
	neg     rbx
	mov     rax, [rsi]
	mov     r10, [rdi]
	lea     rdi, [rdi+rdx*8-16]
	lea     rsi, [rsi+rdx*8]
	mul     rcx
	bt      ebx, 0
	jc      .1
	lea     r11, [rax]
	mov     rax, [rsi+rbx*8+8]
	lea     rbp, [rdx]
	mul     rcx
	add     rbx, 2
	jns     .5
	lea     r8, [rax]
	mov     rax, [rsi+rbx*8]
	lea     r9, [rdx]
	jmp     .3
.1:	add     rbx, 1
	jns     .6
	lea     r8, [rax]
	mov     rax, [rsi+rbx*8]
	lea     r9, [rdx]
	mul     rcx
	lea     r11, [rax]
	mov     rax, [rsi+rbx*8+8]
	lea     rbp, [rdx]
	jmp     .4

	xalign  16
.2:	mul     rcx
	sub     r10, r8
	lea     r8, [rax]
	mov     rax, [rsi+rbx*8]
	adc     r11, r9
	mov     [rdi+rbx*8-8], r10
	mov     r10, [rdi+rbx*8]
	lea     r9, [rdx]
	adc     rbp, 0
.3:	mul     rcx
	sub     r10, r11
	lea     r11, [rax]
	mov     rax, [rsi+rbx*8+8]
	adc     r8, rbp
	mov     [rdi+rbx*8], r10
	mov     r10, [rdi+rbx*8+8]
	lea     rbp, [rdx]
	adc     r9, 0
.4:	add     rbx, 2
	js      .2
	mul     rcx
	sub     r10, r8
	adc     r11, r9
	mov     [rdi-8], r10
	adc     rbp, 0
.5:	mov     r10, [rdi]
	sub     r10, r11
	adc     rax, rbp
	mov     [rdi], r10
	adc     rdx, 0
.6:	mov     r10, [rdi+8]
	sub     r10, rax
	mov     [rdi+8], r10
	mov     eax, ebx
	adc     rax, rdx
	WIN64_GCC_END

	end