; mpir/mpn/x86_64w/skylake/avx/addsub_n.asm

; 158 lines
; 4.0 KiB
; NASM

; AMD64 mpn_addsub_n
;
; Copyright 2017 Jens Nurmann
;
; This file is part of the MPIR Library.
; The MPIR Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Lesser General Public License as published
; by the Free Software Foundation; either version 2.1 of the License, or (at
; your option) any later version.
; The MPIR Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
; License for more details.
; You should have received a copy of the GNU Lesser General Public License
; along with the MPIR Library; see the file COPYING.LIB. If not, write
; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
; Boston, MA 02110-1301, USA.
; (rdi,r8) = (rsi,r8)+(rdx,r8)-(rcx,r8)
; rax = summed carry and borrow in range [ -1..1 ]
; the main loop has been enhanced with the MPIR SuperOptimizer
; the gain was roughly 4% execution speed for operands in LD1$
;
;           mp_limb_t mpn_addsub_n(mp_ptr, mp_ptr, mp_ptr, mp_ptr, mp_size_t)
; Linux     rax                    rdi     rsi     rdx     rcx     r8
; Windows   rax                    rcx     rdx     r8      r9      [rsp+40]
; cycles per limb with all operands in:
;               LD1$        LD2$
; Haswell       ???         ???
; Broadwell     ???         ???
; Skylake       1.6-1.7     1.7-1.85
; FRAME_PROC, END_PROC and stack_use are not defined in this file; they are
; presumably supplied by this include - confirm against yasm_mac.inc
%include 'yasm_mac.inc'
; register list handed to FRAME_PROC / END_PROC below: rdi and rsi are
; overwritten with arguments by the function body and r12 is used as Limb3
%define reg_save_list rsi, rdi, r12
; definition according to Linux 64 bit ABI
; (the function body first moves its incoming arguments into these registers)
; ResP   - result pointer
; Src1P  - first addend pointer
; Src2P  - second addend pointer
; Src3P  - subtrahend pointer
; Size   - number of limbs still to process
%define ResP rdi
%define Src1P rsi
%define Src2P rdx
%define Src3P rcx
%define Size r8
; Spills (eax) contains both Carry (al) and Borrow (ah), so clearing Spills
; clears the saved carry and the saved borrow at once
%define Spills eax
%define Carry al
%define Borrow ah
; working registers for the four limbs handled per main loop pass
%define Limb0 r9
%define Limb1 r10
%define Limb2 r11
%define Limb3 r12
; align the code that follows to a 32 byte boundary and assemble for 64 bit mode
align 32
BITS 64
; mpn_addsub_n
;
; (ResP,Size) = (Src1P,Size) + (Src2P,Size) - (Src3P,Size)
;
; In:   rcx = result pointer, rdx = first addend pointer,
;       r8  = second addend pointer, r9 = subtrahend pointer,
;       [rsp+stack_use+40] = number of limbs
; Out:  rax = carry of the addition minus borrow of the subtraction,
;       sign extended, i.e. a value in [ -1..1 ]
;
; The carry of the add chain is kept in al and the borrow of the sub chain
; in ah between the two chains; 'shr reg8, 1' moves a saved bit back into CF
; and 'setc reg8' saves CF again. Pointer updates inside the chains use lea
; and stores use mov because neither modifies CF.
;
; NOTE(review): FRAME_PROC / END_PROC are defined outside this file; they are
; presumed to save and restore the registers named in reg_save_list.
FRAME_PROC mpn_addsub_n, 0, reg_save_list
    ; move the incoming arguments into the registers used by the body; the
    ; order matters because rdx, rcx and r8 are both sources and destinations
    mov     rdi, rcx
    mov     rsi, rdx
    mov     rdx, r8
    mov     rcx, r9
    mov     r8, [rsp+stack_use+40]
    ; Src3P and ResP are advanced at the top of each loop pass, before they
    ; are used, so start them one pass (4 limbs = 32 bytes) early
    sub     Src3P, 32
    sub     ResP, 32
    xor     Spills, Spills              ; clears carry & borrow
    jmp     .Check

    align   16
.Loop:
    ; do not delete!
    ; this seemingly unreasoned AVX instruction optimizes the allocation of
    ; read/write operations to ports 2, 3 & 7 (write always ending up
    ; on port 7) which allows a sustained 2r1w execution per cycle
    vpor    ymm0, ymm0, ymm0
    shr     Carry, 1                    ; CF = carry saved by the previous pass
    mov     Limb0, [Src1P]
    mov     Limb1, [Src1P+8]
    mov     Limb2, [Src1P+16]
    mov     Limb3, [Src1P+24]
    lea     Src3P, [Src3P+32]
    lea     ResP, [ResP+32]
    adc     Limb0, [Src2P]
    adc     Limb1, [Src2P+8]
    adc     Limb2, [Src2P+16]
    adc     Limb3, [Src2P+24]
    setc    Carry                       ; save the carry of the add chain
    lea     Src2P, [Src2P+32]
    shr     Borrow, 1                   ; CF = borrow saved by the previous pass
    sbb     Limb0, [Src3P]
    sbb     Limb1, [Src3P+8]
    lea     Src1P, [Src1P+32]
    mov     [ResP], Limb0
    sbb     Limb2, [Src3P+16]
    mov     [ResP+8], Limb1
    mov     [ResP+16], Limb2
    sbb     Limb3, [Src3P+24]
    setc    Borrow                      ; save the borrow of the sub chain
    mov     [ResP+24], Limb3
    ; label @ $a (mod $10) seems ok from benchmark figures
.Check:
    sub     Size, 4
    jnc     .Loop                       ; at least 4 limbs were left
    ; fewer than 4 limbs remain: undo the early start of Src3P / ResP and the
    ; last subtraction from Size, leaving Size = number of remaining limbs
    add     Src3P, 32
    add     ResP, 32
    add     Size, 4
    je      .Exit

    ; first remaining limb
    shr     Carry, 1
    mov     Limb0, [Src1P]
    adc     Limb0, [Src2P]
    setc    Carry
    shr     Borrow, 1
    sbb     Limb0, [Src3P]
    setc    Borrow
    mov     [ResP], Limb0
    dec     Size
    je      .Exit

    ; second remaining limb
    shr     Carry, 1
    mov     Limb0, [Src1P+8]
    adc     Limb0, [Src2P+8]
    setc    Carry
    shr     Borrow, 1
    sbb     Limb0, [Src3P+8]
    setc    Borrow
    mov     [ResP+8], Limb0
    dec     Size
    je      .Exit

    ; third remaining limb
    shr     Carry, 1
    mov     Limb0, [Src1P+16]
    adc     Limb0, [Src2P+16]
    setc    Carry
    shr     Borrow, 1
    sbb     Limb0, [Src3P+16]
    mov     [ResP+16], Limb0
    setc    Borrow
    ; label @ $2 (mod $10) ok from benchmark figures
.Exit:
    sub     Carry, Borrow               ; al = carry - borrow
    movsx   rax, Carry                  ; sign extend the result into rax
    ; the vpor in .Loop is a 256 bit VEX instruction; clear the upper YMM
    ; state before returning so that legacy SSE code executed by the caller
    ; does not pay the AVX/SSE transition penalty. vzeroupper changes neither
    ; flags nor general purpose registers.
    vzeroupper
END_PROC reg_save_list