mpir/mpn/x86_64/haswell/sublsh1_n.as
; Copyright 2016 Jens Nurmann and Alexander Kruppa
; This file is part of the MPIR Library.
; The MPIR Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Lesser General Public License as published
; by the Free Software Foundation; either version 2.1 of the License, or (at
; your option) any later version.
; The MPIR Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
; License for more details.
; You should have received a copy of the GNU Lesser General Public License
; along with the MPIR Library; see the file COPYING.LIB. If not, write
; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
; Boston, MA 02110-1301, USA.
; mp_limb_t sublsh1_n(mp_ptr Op3, mp_srcptr Op2, mp_srcptr Op1, mp_size_t Size)
; Linux RAX RDI RSI RDX RCX
; Win7 RAX RCX RDX R8 R9
;
; Description:
; The function shifts Op1 left one bit, subtracts it from Op2, stores the result
; in Op3 and hands back the total carry. Interleaving the elementary operations
; and reducing the memory accesses gives a gain in execution speed compared to a
; separate shift and subtract. The factor depends on the size of the operands
; (the cache level in which the operands can be held).
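;
; For reference, a minimal C sketch of the operation this routine performs.
; It is illustrative only and assumes 64-bit limbs; ref_sublsh1_n is not an
; MPIR entry point, the real prototype is the one given above.
;
;   mp_limb_t ref_sublsh1_n(mp_limb_t *op3, const mp_limb_t *op2,
;                           const mp_limb_t *op1, mp_size_t n)
;   {
;       mp_limb_t borrow = 0, top = 0;               /* top = bit shifted out of op1 */
;       for (mp_size_t i = 0; i < n; i++) {
;           mp_limb_t shifted = (op1[i] << 1) | top; /* next limb of op1 << 1 */
;           mp_limb_t t = op2[i] - shifted;
;           mp_limb_t b = op2[i] < shifted;          /* borrow of this subtraction */
;           top    = op1[i] >> 63;                   /* bit carried into the next limb */
;           op3[i] = t - borrow;                     /* apply the running borrow */
;           borrow = b + (t < borrow);
;       }
;       return borrow + top;                         /* total carry, in [0..2] */
;   }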
;
; Caveats:
; - for asm the processor MUST support LAHF/SAHF in 64 bit mode!
; - the total carry is in [0..2]! (the bit shifted out of Op1 and the final
;   borrow of the subtraction can each contribute 1)
;
; Comments:
; - asm version implemented, tested & benched on 16.05.2015 by jn
; - improved asm version implemented, tested & benched on 30.07.2015 by jn
; - On Nehalem the per-limb saving is 0.7 cycles in L1D$ and L2D$, and 1-2 cycles in L3D$
; - includes LAHF / SAHF
; - includes prefetching
; - includes XMM save & restore
;
; Linux: (rdi, rcx) = (rsi, rcx) - (rdx, rcx)<<1
; ============================================================================
%define ADDSUB sub
%define ADCSBB sbb
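; ADDSUB/ADCSBB select the arithmetic, keeping this body in sync with the largely
; unified addlsh1_n.as variant (add/adc there, sub/sbb here)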
%include "yasm_mac.inc"
BITS 64
%ifdef USE_WIN64
%define Op3 RCX
%define Op2 RDX
%define Op1 R8
%define Size R9
%ifdef USE_PREFETCH
%define Offs RBP ; SAVE!
%endif
%define Limb0 RBX ; SAVE!
%define Limb1 RDI ; SAVE!
%define Limb2 RSI ; SAVE!
%define Limb3 R10
%define Limb4 R11
%define Limb5 R12 ; SAVE!
%define Limb6 R13 ; SAVE!
%define Limb7 R14 ; SAVE!
%define Limb8 R15 ; SAVE!
%define SaveRBX XMM0 ; use available scratch XMM to
%define SaveRSI XMM1 ; save as many regs as possible
%define SaveRDI XMM2
%define SaveR12 XMM3
%define SaveR13 XMM4
%define SaveR14 XMM5
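; (XMM0-5 are volatile under the Win64 ABI, so they are free scratch here; the
; registers marked SAVE! are callee-saved and must be preserved)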
%else
%define Op3 RDI
%define Op2 RSI
%define Op1 RDX
%define Size RCX
%ifdef USE_PREFETCH
%define Offs RBP ; SAVE!
%endif
%define Limb0 RBX ; SAVE!
%define Limb1 R8
%define Limb2 R9
%define Limb3 R10
%define Limb4 R11
%define Limb5 R12 ; SAVE!
%define Limb6 R13 ; SAVE!
%define Limb7 R14 ; SAVE!
%define Limb8 R15 ; SAVE!
%define SaveRBX XMM0 ; use available scratch XMM to save all regs
%define SaveR12 XMM1
%define SaveR13 XMM2
%define SaveR14 XMM3
%define SaveR15 XMM4
%ifdef USE_PREFETCH
%define SaveRBP XMM5
%endif
%endif
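; ACCUMULATE k: subtract (with borrow) the shifted limb held in Limb<k> from
; limb k of Op2 and store the difference to limb k of Op3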
%macro ACCUMULATE 1
mov rax, [Op2 + 8 * %1]
ADCSBB rax, Limb%1
mov [Op3 + 8 * %1], rax
%endmacro
align 32
GLOBAL_FUNC mpn_sublsh1_n
%ifdef USE_WIN64
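; R15 (plus RBP when prefetching) goes to the stack; the six volatile XMM0-5 are
; already taken by the other callee-saved GPRs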
%ifdef USE_PREFETCH
sub RSP, 16
mov [RSP+8], R15
mov [RSP], RBP
%else
sub RSP, 8
mov [RSP], R15
%endif
movq SaveRBX, RBX
movq SaveRSI, RSI
movq SaveRDI, RDI
movq SaveR12, R12
movq SaveR13, R13
movq SaveR14, R14
%else
%ifdef USE_PREFETCH
movq SaveRBP, RBP
%endif
movq SaveRBX, RBX
movq SaveR12, R12
movq SaveR13, R13
movq SaveR14, R14
movq SaveR15, R15
%endif
%ifdef USE_PREFETCH
mov Offs, PREFETCH_STRIDE ; Attn: check if redefining Offs
%endif
; prepare shift & subtraction with loop-unrolling 8
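; each "shrd Limb<k>, Limb<k+1>, 63" forms (Limb<k+1> << 1) | (Limb<k> >> 63), i.e.
; the next limb of Op1<<1 including the bit carried over from the previous limb;
; LAHF/SAHF preserve the borrow in AH across the pointer updates, whose ADDs would
; otherwise clobber the carry flag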
xor Limb0, Limb0
lahf ; memorize clear carry (from "xor" above)
test Size, 1
je .n_two
mov Limb1, [Op1]
shrd Limb0, Limb1, 63
mov rax, [Op2]
ADDSUB rax, Limb0
mov [Op3], rax
lahf
add Op1, 8
add Op2, 8
add Op3, 8
mov Limb0, Limb1
.n_two:
test Size, 2
je .n_four
mov Limb1, [Op1]
mov Limb2, [Op1+8]
shrd Limb0, Limb1, 63
shrd Limb1, Limb2, 63
sahf
ACCUMULATE 0
ACCUMULATE 1
lahf
add Op1, 16
add Op2, 16
add Op3, 16
mov Limb0, Limb2
.n_four:
test Size, 4
je .n_test ;ajs:notshortform
mov Limb1, [Op1]
mov Limb2, [Op1+8]
shrd Limb0, Limb1, 63
shrd Limb1, Limb2, 63
mov Limb3, [Op1+16]
mov Limb4, [Op1+24]
shrd Limb2, Limb3, 63
shrd Limb3, Limb4, 63
sahf
ACCUMULATE 0
ACCUMULATE 1
ACCUMULATE 2
ACCUMULATE 3
lahf
add Op1, 32
add Op2, 32
add Op3, 32
mov Limb0, Limb4
jmp .n_test ;ajs:notshortform
; main loop
; - 2.40-2.50 cycles per limb in L1D$
; - 2.6 cycles per limb in L2D$
; - 2.80-3.30 cycles per limb in L3D$
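; - each pass handles eight limbs: 8 loads from Op1, 8 SHRDs, and per limb one
;   load from Op2, one SBB and one store to Op3 via ACCUMULATE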
align 16
.n_loop:
%ifdef USE_PREFETCH
prefetchnta [Op1+Offs]
prefetchnta [Op2+Offs]
%endif
mov Limb1, [Op1] ; prepare shifted oct-limb from Op1
mov Limb2, [Op1+8]
mov Limb3, [Op1+16]
shrd Limb0, Limb1, 63
shrd Limb1, Limb2, 63
shrd Limb2, Limb3, 63
mov Limb4, [Op1+24]
mov Limb5, [Op1+32]
mov Limb6, [Op1+40]
shrd Limb3, Limb4, 63
shrd Limb4, Limb5, 63
shrd Limb5, Limb6, 63
mov Limb7, [Op1+48]
mov Limb8, [Op1+56]
shrd Limb6, Limb7, 63
shrd Limb7, Limb8, 63
sahf ; restore carry
ACCUMULATE 0 ; sub shifted Op1 from Op2 with result in Op3
ACCUMULATE 1
ACCUMULATE 2
ACCUMULATE 3
ACCUMULATE 4
ACCUMULATE 5
ACCUMULATE 6
ACCUMULATE 7
lahf ; remember carry for next round
add Op1, 64
add Op2, 64
add Op3, 64
mov Limb0, Limb8
.n_test:
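; the low three bits of Size were already handled above, so counting down by 8
; until the subtraction borrows runs the loop floor(Size/8) times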
sub Size, 8
jnc .n_loop
; housekeeping - hand back total carry
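; Limb0 still holds the last original limb of Op1: its top bit is the bit shifted
; out of Op1<<1, and SAHF restores the borrow of the final subtraction, so the ADC
; below combines both into the total carry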
shr Limb0, 63
sahf
adc Limb0, 0 ; Limb0=0/1/2 depending on final carry and shift
mov RAX, Limb0
.Exit:
%ifdef USE_WIN64
movq R14, SaveR14
movq R13, SaveR13
movq R12, SaveR12
movq RDI, SaveRDI
movq RSI, SaveRSI
movq RBX, SaveRBX
%ifdef USE_PREFETCH
mov RBP, [RSP]
mov R15, [RSP+8]
add RSP, 16
%else
mov R15, [RSP]
add RSP, 8
%endif
%else
movq R15, SaveR15
movq R14, SaveR14
movq R13, SaveR13
movq R12, SaveR12
movq RBX, SaveRBX
%ifdef USE_PREFETCH
movq RBP, SaveRBP
%endif
%endif
ret
.end: