; mpir/mpn/x86_64w/haswell/mul_basecase.asm
; 2017-01-22 22:41:01 +00:00
; (426 lines, 6.5 KiB, NASM)
; AMD64 mpn_mul_basecase optimised for Intel Haswell.
; Contributed to the GNU project by Torbjörn Granlund.
; Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
; This file is part of the GNU MP Library.
;
; The GNU MP Library is free software; you can redistribute it and/or modify
; it under the terms of either:
;
; * the GNU Lesser General Public License as published by the Free
; Software Foundation; either version 3 of the License, or (at your
; option) any later version.
;
; or
;
; * the GNU General Public License as published by the Free Software
; Foundation; either version 2 of the License, or (at your option) any
; later version.
;
; or both in parallel, as here.
;
; The GNU MP Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
; for more details.
;
; You should have received copies of the GNU General Public License and the
; GNU Lesser General Public License along with the GNU MP Library. If not,
; see https://www.gnu.org/licenses/.
;
; mp_limb_t mpn_mul_basecase(mp_ptr, mp_ptr, mp_size_t, mp_ptr, mp_size_t)
; rax rdi rsi rdx rcx r8
; rax rcx rdx r8 r9 [rsp+40]
%include 'yasm_mac.inc'
%define reg_save_list rbx, rbp, rsi, rdi, r12, r13, r14, r15
%define rp rdi
%define up rsi
%define un_param rdx
%define vp rcx
%define un rbx
%define un8 bl
%define w0 r10
%define w1 r11
%define w2 r12
%define w3 r13
%define n rbp
BITS 64
align 16
FRAME_PROC mpn_mul_basecase, 0, reg_save_list
mov rdi, rcx
mov rsi, rdx
mov rdx, r8
mov rcx, r9
mov r8, [rsp+stack_use+40]
mov un, rdx
neg un
mov n, rdx
sar n, 2
test r8b, 1
jz .Ldo_mul_2
mov rdx, [vp]
.Ldo_mul_1:
test un8, 1
jnz .Lm1x1
.Lm1x0:
test un8, 2
jnz .Lm110
.Lm100:
mulx w2, r14, [up]
mulx w3, w1, [up+8]
lea rp, [rp-24]
jmp .Lm1l0
.Lm110:
mulx r9, w3, [up]
mulx r14, w1, [up+8]
lea rp, [rp-8]
test n, n
jz .Lcj2
mulx w2, w0, [up+16]
lea up, [up+16]
jmp .Lm1l2
.Lm1x1:
test un8, 2
jz .Lm111
.Lm101:
mulx r14, r9, [up]
lea rp, [rp-16]
test n, n
jz .Lcj1
mulx w2, w0, [up+8]
lea up, [up+8]
jmp .Lm1l1
.Lm111:
mulx w3, w2, [up]
mulx r9, w0, [up+8]
mulx r14, w1, [up+16]
lea up, [up+24]
test n, n
jnz .Lgt3
add w3, w0
jmp .Lcj3
.Lgt3:
add w3, w0
jmp .Lm1l3
align 32
.Lm1tp:
lea rp, [rp+32]
.Lm1l3:
mov [rp], w2
mulx w2, w0, [up]
.Lm1l2:
mov [rp+8], w3
adc r9, w1
.Lm1l1:
adc r14, w0
mov [rp+16], r9
mulx w3, w1, [up+8]
.Lm1l0:
mov [rp+24], r14
mulx r9, w0, [up+16]
adc w2, w1
mulx r14, w1, [up+24]
adc w3, w0
lea up, [up+32]
dec n
jnz .Lm1tp
.Lm1ed:
lea rp, [rp+32]
.Lcj3:
mov [rp], w2
.Lcj2:
mov [rp+8], w3
adc r9, w1
.Lcj1:
mov [rp+16], r9
adc r14, 0
mov [rp+24], r14
dec r8d
jz .Lret5
lea vp, [vp+8]
lea rp, [rp+32]
jmp .Ldo_addmul
.Ldo_mul_2:
mov r9, [vp]
mov r14, [vp+8]
lea n, [un]
sar n, 2
test un8, 1
jnz .Lm2x1
.Lm2x0:
xor w0, w0
test un8, 2
mov rdx, [up]
mulx w1, w2, r9
jz .Lm2l0
.Lm210:
lea rp, [rp-16]
lea up, [up-16]
jmp .Lm2l2
.Lm2x1:
xor w2, w2
test un8, 2
mov rdx, [up]
mulx w3, w0, r9
jz .Lm211
.Lm201:
lea rp, [rp-24]
lea up, [up+8]
jmp .Lm2l1
.Lm211:
lea rp, [rp-8]
lea up, [up-8]
jmp .Lm2l3
align 16
.Lm2tp:
mulx w0, rax, r14
add w2, rax
mov rdx, [up]
mulx w1, rax, r9
adc w0, 0
add w2, rax
adc w1, 0
add w2, w3
.Lm2l0:
mov [rp], w2
adc w1, 0
mulx w2, rax, r14
add w0, rax
mov rdx, [up+8]
adc w2, 0
mulx w3, rax, r9
add w0, rax
adc w3, 0
add w0, w1
.Lm2l3:
mov [rp+8], w0
adc w3, 0
mulx w0, rax, r14
add w2, rax
mov rdx, [up+16]
mulx w1, rax, r9
adc w0, 0
add w2, rax
adc w1, 0
add w2, w3
.Lm2l2:
mov [rp+16], w2
adc w1, 0
mulx w2, rax, r14
add w0, rax
mov rdx, [up+24]
adc w2, 0
mulx w3, rax, r9
add w0, rax
adc w3, 0
add w0, w1
lea up, [up+32]
.Lm2l1:
mov [rp+24], w0
adc w3, 0
inc n
lea rp, [rp+32]
jnz .Lm2tp
.Lm2ed:
mulx rax, rdx, r14
add w2, rdx
adc rax, 0
add w2, w3
mov [rp], w2
adc rax, 0
mov [rp+8], rax
add r8d, -2
jz .Lret5
lea vp, [vp+16]
lea rp, [rp+16]
.Ldo_addmul:
mov [rsp+stack_use+8], r8
lea rp, [un*8+rp]
lea up, [un*8+up]
.Louter:
mov r9, [vp]
mov r8, [vp+8]
lea n, [un+2]
sar n, 2
mov rdx, [up]
test un8, 1
jnz .Lbx1
.Lbx0:
mov r14, [rp]
mov r15, [rp+8]
mulx w1, rax, r9
add r14, rax
mulx w2, rax, r8
adc w1, 0
mov [rp], r14
add r15, rax
adc w2, 0
mov rdx, [up+8]
test un8, 2
jnz .Lb10
.Lb00:
lea up, [up+16]
lea rp, [rp+16]
jmp .Llo0
.Lb10:
mov r14, [rp+16]
lea up, [up+32]
mulx w3, rax, r9
jmp .Llo2
.Lbx1:
mov r15, [rp]
mov r14, [rp+8]
mulx w3, rax, r9
add r15, rax
adc w3, 0
mulx w0, rax, r8
add r14, rax
adc w0, 0
mov rdx, [up+8]
mov [rp], r15
mulx w1, rax, r9
test un8, 2
jz .Lb11
.Lb01:
mov r15, [rp+16]
lea rp, [rp+24]
lea up, [up+24]
jmp .Llo1
.Lb11:
lea rp, [rp+8]
lea up, [up+8]
jmp .Llo3
align 16
.Ltop:
mulx w3, rax, r9
add r15, w0
adc w2, 0
.Llo2:
add r15, rax
adc w3, 0
mulx w0, rax, r8
add r14, rax
adc w0, 0
lea rp, [rp+32]
add r15, w1
mov rdx, [up-16]
mov [rp-24], r15
adc w3, 0
add r14, w2
mov r15, [rp-8]
mulx w1, rax, r9
adc w0, 0
.Llo1:
add r14, rax
mulx w2, rax, r8
adc w1, 0
add r14, w3
mov [rp-16], r14
adc w1, 0
add r15, rax
adc w2, 0
add r15, w0
mov rdx, [up-8]
adc w2, 0
.Llo0:
mulx w3, rax, r9
add r15, rax
adc w3, 0
mov r14, [rp]
mulx w0, rax, r8
add r14, rax
adc w0, 0
add r15, w1
mov [rp-8], r15
adc w3, 0
mov rdx, [up]
add r14, w2
mulx w1, rax, r9
adc w0, 0
.Llo3:
add r14, rax
adc w1, 0
mulx w2, rax, r8
add r14, w3
mov r15, [rp+8]
mov [rp], r14
mov r14, [rp+16]
adc w1, 0
add r15, rax
adc w2, 0
mov rdx, [up+8]
lea up, [up+32]
inc n
jnz .Ltop
.Lend:
mulx w3, rax, r9
add r15, w0
adc w2, 0
add r15, rax
adc w3, 0
mulx rax, rdx, r8
add r15, w1
mov [rp+8], r15
adc w3, 0
add rdx, w2
adc rax, 0
add rdx, w3
mov [rp+16], rdx
adc rax, 0
mov [rp+24], rax
add DWORD [rsp+stack_use+8], -2
lea vp, [vp+16]
lea up, [up+un*8-16]
lea rp, [rp+un*8+32]
jnz .Louter
mov rax, [rsp+stack_use+8]
.Lret5:
END_PROC reg_save_list