mpir/mpn/x86_64/haswell/sumdiff_n.as
Alexander Kruppa cfc589609e Move to haswell/
This sumdiff_n is much slower on Haswell (2.6c/l) than on Skylake (2c/l)
but it still provides a ~3% speed up for a 1M limb FFT compared to having
no sumdiff_n at all.
2016-12-08 16:23:48 +01:00


; ============================================================================
; Copyright 2016 Jens Nurmann and Alexander Kruppa
; This file is part of the MPIR Library.
; The MPIR Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Lesser General Public License as published
; by the Free Software Foundation; either version 2.1 of the License, or (at
; your option) any later version.
; The MPIR Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
; License for more details.
; You should have received a copy of the GNU Lesser General Public License
; along with the MPIR Library; see the file COPYING.LIB. If not, write
; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
; Boston, MA 02110-1301, USA.
; mp_limb_t mpn_sumdiff_n(mp_ptr Op3, mp_ptr Op4, mp_srcptr Op1, mp_srcptr Op2, mp_size_t Size)
;
;            ret   Op3   Op4   Op1   Op2   Size
; Linux      RAX   RDI   RSI   RDX   RCX   R8
; Win7       RAX   RCX   RDX   R8    R9    Stack
;
; Description:
; The function adds Op2 to Op1, storing the sum in Op3, and at the same time
; subtracts Op2 from Op1, storing the difference in Op4. The final carry from
; the addition and the final borrow from the subtraction are handed back
; combined in a single mp_limb_t (2*add-carry + sub-borrow). Compared with a
; separate addition and subtraction the combined routine gains speed by
; reducing memory access (Op1 and Op2 are read only once); the gain depends
; on the operand size, i.e. on the cache level the operands fit into.
;
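; For reference, the operation is equivalent to the following plain C sketch
; (illustrative only, not part of MPIR; the helper name ref_sumdiff_n is an
; assumption, the types are the usual mpir.h types):
;
;   mp_limb_t ref_sumdiff_n (mp_ptr Op3, mp_ptr Op4, mp_srcptr Op1, mp_srcptr Op2, mp_size_t Size)
;   {
;       mp_limb_t cy_add = 0, cy_sub = 0;                /* running carry and borrow */
;       for (mp_size_t i = 0; i < Size; i++)
;       {
;           mp_limb_t u = Op1[i], v = Op2[i];
;           mp_limb_t s = u + v + cy_add;                /* add with carry-in  */
;           mp_limb_t d = u - v - cy_sub;                /* sub with borrow-in */
;           cy_add = (s < u) || (cy_add && s == u);      /* carry out  */
;           cy_sub = (u < v) || (cy_sub && u == v);      /* borrow out */
;           Op3[i] = s;
;           Op4[i] = d;
;       }
;       return 2 * cy_add + cy_sub;                      /* combined return value */
;   }
;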
; Comments:
; - asm version implemented, tested & benched on 10.06.2015 by jn
; - On Nehalem the per-limb saving is 0.5 cycles in LD1$, LD2$ and LD3$
; - includes prefetching
; - includes XMM save & restore
; ============================================================================
%include 'yasm_mac.inc'
%ifdef USE_WIN64
%define Op3 RCX
%define Op4 RDX
%define Op1 R8
%define Op2 R9
%define Size RBX ; SAVE!
%define Limb0 RBP ; SAVE!
%define Limb1 RSI ; SAVE!
%define Limb2 RDI ; SAVE!
%define Limb3 R10
%define Limb4 R11
%define Limb5 R12 ; SAVE!
%define Limb6 R13 ; SAVE!
%define Limb7 R14 ; SAVE!
%define Limb8 R15 ; SAVE!
%ifdef USE_PREFETCH
%define Offs PREFETCH_STRIDE ; no more regs avail. => fallback to const
%endif
%define SaveRBX XMM0 ; use scratch XMM for fast save & restore
%define SaveRBP XMM1 ; R14 and R15 handled via stack
%define SaveRSI XMM2
%define SaveRDI XMM3
%define SaveR12 XMM4
%define SaveR13 XMM5
%else
%define Op3 RDI
%define Op4 RSI
%define Op1 RDX
%define Op2 RCX
%define Size R8
%define Limb0 RBP ; SAVE!
%define Limb1 RBX ; SAVE!
%define Limb2 R9
%define Limb3 R10
%define Limb4 R11
%define Limb5 R12 ; SAVE!
%define Limb6 R13 ; SAVE!
%define Limb7 R14 ; SAVE!
%define Limb8 R15 ; SAVE!
%ifdef USE_PREFETCH
%define Offs PREFETCH_STRIDE ; no more regs avail. => fallback to const
%endif
%define SaveRBX XMM0 ; use scratch XMM for fast save & restore
%define SaveRBP XMM1
%define SaveR12 XMM2
%define SaveR13 XMM3
%define SaveR14 XMM4
%define SaveR15 XMM5
%endif
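; The addition and the subtraction are interleaved, but there is only one
; carry flag, so the carry of the chain that is currently inactive is parked
; in a register: the add-carry lives in AL (0 or 1), the sub-borrow in AH
; (0 or 0xFF). The macros below save CF into AL/AH and re-create CF from them.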
%define SaveAC setc AL ; AL = add-carry (CF)
%define LoadAC shr AL, 1 ; CF = add-carry (bit 0 of AL), AL = 0
%define SaveSC sbb AH, AH ; AH = -sub-borrow (0 or 0xFF)
%define LoadSC add AH, AH ; CF = sub-borrow (bit 7 of AH)
BITS 64
align 32
GLOBAL_FUNC mpn_sumdiff_n
%ifdef USE_WIN64
sub RSP, 16
mov [RSP+8], R15
mov [RSP], R14
movq SaveR13, R13
movq SaveR12, R12
movq SaveRDI, RDI
movq SaveRSI, RSI
movq SaveRBP, RBP
movq SaveRBX, RBX
mov Size, [RSP+56] ; 5th argument is on the stack under Win64 ([RSP+40] at entry, +16 from the sub above)
%else
movq SaveR15, R15
movq SaveR14, R14
movq SaveR13, R13
movq SaveR12, R12
movq SaveRBP, RBP
movq SaveRBX, RBX
%endif
xor EAX, EAX ; clear add & sub carry
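; Size is consumed bit by bit: bit 0 selects a single-limb head below, bit 1
; a two-limb block, bit 2 a four-limb block, and the remaining Size/8 passes
; of the main loop handle eight limbs each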
shr Size, 1
jnc .sumdiff_n_two
mov Limb1, [Op1]
mov Limb5, [Op2]
mov Limb2, Limb1
add Limb2, Limb5
mov [Op3], Limb2
SaveAC
sub Limb1, Limb5
mov [Op4], Limb1
SaveSC
add Op1, 8
add Op2, 8
add Op3, 8
add Op4, 8
.sumdiff_n_two:
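; handle two limbs if bit 1 of the original Size is set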
shr Size, 1
jnc .sumdiff_n_four
mov Limb1, [Op1]
mov Limb2, [Op1+8]
mov Limb5, [Op2]
mov Limb6, [Op2+8]
LoadAC
mov Limb3, Limb1
adc Limb3, Limb5
mov [Op3], Limb3
mov Limb4, Limb2
adc Limb4, Limb6
mov [Op3+8], Limb4
SaveAC
LoadSC
sbb Limb1, Limb5
mov [Op4], Limb1
sbb Limb2, Limb6
mov [Op4+8], Limb2
SaveSC
add Op1, 16
add Op2, 16
add Op3, 16
add Op4, 16
.sumdiff_n_four:
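; handle four limbs if bit 2 of the original Size is set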
shr Size, 1
jnc .sumdiff_n_loop_pre ;ajs:notshortform
LoadAC
; slight change of scheme here - avoid too many consecutive
; memory-to-register or register-to-memory moves
mov Limb1, [Op1]
mov Limb5, [Op2]
mov Limb0, Limb1
adc Limb0, Limb5
mov [Op3], Limb0
mov Limb2, [Op1+8]
mov Limb6, [Op2+8]
mov Limb0, Limb2
adc Limb0, Limb6
mov [Op3+8], Limb0
mov Limb3, [Op1+16]
mov Limb7, [Op2+16]
mov Limb0, Limb3
adc Limb0, Limb7
mov [Op3+16], Limb0
mov Limb4, [Op1+24]
mov Limb8, [Op2+24]
mov Limb0, Limb4
adc Limb0, Limb8
mov [Op3+24], Limb0
SaveAC
LoadSC
sbb Limb1, Limb5
mov [Op4], Limb1
sbb Limb2, Limb6
mov [Op4+8], Limb2
sbb Limb3, Limb7
mov [Op4+16], Limb3
sbb Limb4, Limb8
mov [Op4+24], Limb4
SaveSC
add Op1, 32
add Op2, 32
add Op3, 32
add Op4, 32
test Size, Size
.sumdiff_n_loop_pre: ; If we jump here, ZF=1 iff Size=0
jz .sumdiff_n_post ;ajs:notshortform
LoadAC ; set carry for addition
; main loop - values below are best case - up to 50% fluctuation possible!
; - 3.50 cycles per limb in LD1$
; - 3.50 cycles per limb in LD2$
; - 5.10-5.50 cycles per limb in LD3$
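; each pass handles eight limbs: quad-limb 0 is added, then quad-limbs 0 and 1
; are subtracted, then quad-limb 1 is added, so the carry/borrow swap through
; AL/AH is needed only twice per eight limbs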
align 16
.sumdiff_n_loop:
%ifdef USE_PREFETCH
prefetchnta [Op1+Offs]
prefetchnta [Op2+Offs]
%endif
mov Limb1, [Op1] ; add the first quad-limb
mov Limb5, [Op2]
mov Limb0, Limb1
adc Limb0, Limb5
mov [Op3], Limb0
mov Limb2, [Op1+8]
mov Limb6, [Op2+8]
mov Limb0, Limb2
adc Limb0, Limb6
mov [Op3+8], Limb0
mov Limb3, [Op1+16]
mov Limb7, [Op2+16]
mov Limb0, Limb3
adc Limb0, Limb7
mov [Op3+16], Limb0
mov Limb4, [Op1+24]
mov Limb8, [Op2+24]
mov Limb0, Limb4
adc Limb0, Limb8
mov [Op3+24], Limb0
lea Op3, [Op3 + 64] ; Op3 is advanced early - the second quad-limb is stored below at offsets 32-64 .. 56-64
SaveAC ; memorize add-carry
LoadSC ; set carry for subtraction
sbb Limb1, Limb5 ; now sub the first quad-limb
mov [Op4], Limb1
sbb Limb2, Limb6
mov [Op4+8], Limb2
sbb Limb3, Limb7
mov [Op4+16], Limb3
sbb Limb4, Limb8
mov [Op4+24], Limb4
mov Limb1, [Op1+32] ; sub the second quad-limb
mov Limb5, [Op2+32]
mov Limb0, Limb1
sbb Limb0, Limb5
mov [Op4+32], Limb0
mov Limb2, [Op1+40]
mov Limb6, [Op2+40]
mov Limb0, Limb2
sbb Limb0, Limb6
mov [Op4+40], Limb0
mov Limb3, [Op1+48]
mov Limb7, [Op2+48]
mov Limb0, Limb3
sbb Limb0, Limb7
mov [Op4+48], Limb0
mov Limb4, [Op1+56]
mov Limb8, [Op2+56]
mov Limb0, Limb4
sbb Limb0, Limb8
mov [Op4+56], Limb0
lea Op4, [Op4 + 64]
SaveSC ; memorize sub-carry
LoadAC ; set carry for addition
adc Limb1, Limb5 ; add the second quad-limb
mov [Op3+32-64], Limb1
adc Limb2, Limb6
mov [Op3+40-64], Limb2
adc Limb3, Limb7
mov [Op3+48-64], Limb3
adc Limb4, Limb8
mov [Op3+56-64], Limb4
lea Op1, [Op1 + 64]
lea Op2, [Op2 + 64]
dec Size
jnz .sumdiff_n_loop ;ajs:notshortform
SaveAC ; memorize add-carry
; hand back carries
.sumdiff_n_post:
; AL = add_carry, AH = -sub_carry
LoadSC ; AL = add_carry, CY = sub_carry
adc AL, AL ; AL = 2*add_carry + sub_carry
movsx RAX, AL
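; AL is at most 3, so the sign extension behaves like a zero extension;
; e.g. add-carry = 1 and sub-borrow = 1 gives a return value of 3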
.Exit:
%ifdef USE_WIN64
movq RBX, SaveRBX
movq RBP, SaveRBP
movq RSI, SaveRSI
movq RDI, SaveRDI
movq R12, SaveR12
movq R13, SaveR13
mov R14, [RSP]
mov R15, [RSP+8]
add RSP, 16
%else
movq R15, SaveR15
movq R14, SaveR14
movq R13, SaveR13
movq R12, SaveR12
movq RBP, SaveRBP
movq RBX, SaveRBX
%endif
ret
.end: