e3d7be3b31
addlsh1_n.as and sublsh1_n.as mostly unified now
311 lines
7.3 KiB
ActionScript
311 lines
7.3 KiB
ActionScript
|
|
; Copyright 2016 Jens Nurmann and Alexander Kruppa
|
|
|
|
; This file is part of the MPIR Library.
|
|
|
|
; The MPIR Library is free software; you can redistribute it and/or modify
|
|
; it under the terms of the GNU Lesser General Public License as published
|
|
; by the Free Software Foundation; either version 2.1 of the License, or (at
|
|
; your option) any later version.
|
|
|
|
; The MPIR Library is distributed in the hope that it will be useful, but
|
|
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
|
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
|
; License for more details.
|
|
|
|
; You should have received a copy of the GNU Lesser General Public License
|
|
; along with the MPIR Library; see the file COPYING.LIB. If not, write
|
|
; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
|
|
; Boston, MA 02110-1301, USA.
|
|
|
|
; mp_limb_t sublsh1_n(mp_ptr Op3, mp_srcptr Op2, mp_srcptr Op1, mp_size_t Size)
|
|
; Linux RAX RDI RSI RDX RCX
|
|
; Win7 RAX RCX RDX R8 R9
|
|
;
|
|
; Description:
|
|
; The function shifts Op1 left one bit, subtracts it from Op2, stores the result
|
|
; in Op3 and hands back the total carry. There is a gain in execution speed
|
|
; compared to separate shift and subtract by interleaving the elementary operations
|
|
; and reducing memory access. The factor depends on the size of the operands
|
|
; (the cache hierarchy in which the operands can be handled).
|
|
;
|
|
; Caveats:
|
|
; - for asm the processor MUST support LAHF/SAHF in 64 bit mode!
|
|
; - the total carry is in [0..2]!
|
|
;
|
|
; Comments:
|
|
; - asm version implemented, tested & benched on 16.05.2015 by jn
|
|
; - improved asm version implemented, tested & benched on 30.07.2015 by jn
|
|
; - On Nehalem per limb saving is 0.7 cycles in L1D$, L2D$ and 1-2 in L3D$
|
|
; - includes LAHF / SAHF
|
|
; - includes prefetching
|
|
; - includes XMM save & restore
|
|
;
|
|
; Linux: (rdi, rcx) = (rsi, rcx) - (rdx, rcx)<<1
|
|
; ============================================================================
|
|
|
|
; ----------------------------------------------------------------------------
; Build-time configuration
;
; ADDSUB / ADCSBB name the plain and the carry-consuming arithmetic
; instruction used by the code below; in this file they expand to sub / sbb.
; ----------------------------------------------------------------------------
%define ADDSUB      sub
%define ADCSBB      sbb

%include "yasm_mac.inc"

BITS 64

; ----------------------------------------------------------------------------
; Register aliases that are identical under both calling conventions.
; "SAVE!" marks a register that the function saves on entry and restores
; on exit.
; ----------------------------------------------------------------------------
%ifdef USE_PREFETCH
%define Offs        RBP         ; SAVE!  (prefetch offset)
%endif

%define Limb0       RBX         ; SAVE!
%define Limb3       R10
%define Limb4       R11
%define Limb5       R12         ; SAVE!
%define Limb6       R13         ; SAVE!
%define Limb7       R14         ; SAVE!
%define Limb8       R15         ; SAVE!

%define SaveRBX     XMM0        ; XMM registers serve as save slots

; ----------------------------------------------------------------------------
; Aliases that depend on the calling convention.
; ----------------------------------------------------------------------------
%ifdef USE_WIN64

; Win64 argument registers
%define Op3         RCX
%define Op2         RDX
%define Op1         R8
%define Size        R9

%define Limb1       RDI         ; SAVE!
%define Limb2       RSI         ; SAVE!

; XMM0..XMM5 hold as many saved registers as possible; the function body
; puts R15 (and RBP when prefetching) on the stack instead.
%define SaveRSI     XMM1
%define SaveRDI     XMM2
%define SaveR12     XMM3
%define SaveR13     XMM4
%define SaveR14     XMM5

%else

; Linux argument registers
%define Op3         RDI
%define Op2         RSI
%define Op1         RDX
%define Size        RCX

%define Limb1       R8
%define Limb2       R9

; Every saved register gets an XMM save slot here.
%define SaveR12     XMM1
%define SaveR13     XMM2
%define SaveR14     XMM3
%define SaveR15     XMM4
%ifdef USE_PREFETCH
%define SaveRBP     XMM5
%endif

%endif
|
|
|
|
|
|
; ----------------------------------------------------------------------------
; ACCUMULATE k
;
; Processes limb number k of the current group:
;     [Op3 + 8*k] = [Op2 + 8*k]  ADCSBB  Limb<k>
; The incoming carry flag is consumed and the resulting carry flag is left
; for the next invocation (the two mov instructions do not modify flags).
; Clobbers RAX, including AH - callers must SAHF before the first
; invocation and LAHF after the last one.
; ----------------------------------------------------------------------------
%macro ACCUMULATE 1
    mov     rax, [Op2 + %1 * 8]
    ADCSBB  rax, Limb%1
    mov     [Op3 + %1 * 8], rax
%endmacro
|
|
|
|
|
|
; ----------------------------------------------------------------------------
; mpn_sublsh1_n
;
;   (Op3, Size) = (Op2, Size) - ((Op1, Size) << 1)
;
; In:    Op3  = destination limb pointer
;        Op2  = minuend limb pointer
;        Op1  = limb pointer of the operand that is shifted left by one bit
;        Size = number of limbs
; Out:   RAX  = total carry in [0..2]: the borrow of the subtraction plus the
;               bit shifted out of the top limb of Op1
;
; Method:
;   - Limb0 always carries the previous, unshifted Op1 limb so that
;     "shrd Limb(k), Limb(k+1), 63" yields (Limb(k+1) << 1) | (Limb(k) >> 63).
;   - The carry flag of the ADCSBB chain is parked in AH (LAHF) while pointer
;     arithmetic and shifts destroy the flags, and reloaded with SAHF.
;   - Size bits 0, 1 and 2 are handled by blocks of 1, 2 and 4 limbs; the
;     rest is processed by the 8-way unrolled main loop.
;
; Register preservation:
;   - Win64: RBX, RSI, RDI, R12-R14 are parked in XMM0-XMM5; R15 (and RBP if
;     USE_PREFETCH is defined) are stored on the stack.
;   - otherwise: RBX, R12-R15 (and RBP if USE_PREFETCH) are parked in XMM
;     registers.
;   The Win64 epilogue loads the saved values back into the general purpose
;   registers / reads them back from the stack (operand order is register
;   first, save slot second).
; ----------------------------------------------------------------------------

    align   32

GLOBAL_FUNC mpn_sublsh1_n

%ifdef USE_WIN64
%ifdef USE_PREFETCH
    sub     RSP, 16
    mov     [RSP+8], R15
    mov     [RSP], RBP
%else
    sub     RSP, 8
    mov     [RSP], R15
%endif
    movq    SaveRBX, RBX
    movq    SaveRSI, RSI
    movq    SaveRDI, RDI
    movq    SaveR12, R12
    movq    SaveR13, R13
    movq    SaveR14, R14
%else
%ifdef USE_PREFETCH
    movq    SaveRBP, RBP
%endif
    movq    SaveRBX, RBX
    movq    SaveR12, R12
    movq    SaveR13, R13
    movq    SaveR14, R14
    movq    SaveR15, R15
%endif

%ifdef USE_PREFETCH
    mov     Offs, PREFETCH_STRIDE   ; Attn: check if redefining Offs
%endif

    ; prepare shift & subtraction with loop-unrolling 8
    xor     Limb0, Limb0            ; no bit shifted in below the first limb
    lahf                            ; memorize clear carry (from "xor" above)

    ; ---- Size bit 0: one limb ----------------------------------------------
    test    Size, 1
    je      .n_two

    mov     Limb1, [Op1]
    shrd    Limb0, Limb1, 63        ; Limb0 = Limb1 << 1 (Limb0 was zero)

    mov     rax, [Op2]
    ADDSUB  rax, Limb0              ; no incoming carry yet, plain op suffices
    mov     [Op3], rax
    lahf                            ; park carry in AH

    add     Op1, 8
    add     Op2, 8
    add     Op3, 8
    mov     Limb0, Limb1            ; keep unshifted limb for the next shrd

.n_two:

    ; ---- Size bit 1: two limbs ---------------------------------------------
    test    Size, 2
    je      .n_four

    mov     Limb1, [Op1]
    mov     Limb2, [Op1+8]
    shrd    Limb0, Limb1, 63
    shrd    Limb1, Limb2, 63

    sahf                            ; reload carry parked in AH
    ACCUMULATE 0
    ACCUMULATE 1
    lahf

    add     Op1, 16
    add     Op2, 16
    add     Op3, 16
    mov     Limb0, Limb2            ; last unshifted limb of this block

.n_four:

    ; ---- Size bit 2: four limbs --------------------------------------------
    test    Size, 4
    je      .n_test                 ;ajs:notshortform

    mov     Limb1, [Op1]
    mov     Limb2, [Op1+8]
    shrd    Limb0, Limb1, 63
    shrd    Limb1, Limb2, 63
    mov     Limb3, [Op1+16]
    mov     Limb4, [Op1+24]
    shrd    Limb2, Limb3, 63
    shrd    Limb3, Limb4, 63

    sahf
    ACCUMULATE 0
    ACCUMULATE 1
    ACCUMULATE 2
    ACCUMULATE 3
    lahf

    add     Op1, 32
    add     Op2, 32
    add     Op3, 32
    mov     Limb0, Limb4            ; last unshifted limb of this block
    jmp     .n_test                 ;ajs:notshortform

    ; main loop
    ; - 2.40-2.50 cycles per limb in L1D$
    ; - 2.6 cycles per limb in L2D$
    ; - 2.80-3.30 cycles per limb in L3D$
    align   16
.n_loop:

%ifdef USE_PREFETCH
    prefetchnta [Op1+Offs]
    prefetchnta [Op2+Offs]
%endif

    mov     Limb1, [Op1]            ; prepare shifted oct-limb from Op1
    mov     Limb2, [Op1+8]
    mov     Limb3, [Op1+16]
    shrd    Limb0, Limb1, 63
    shrd    Limb1, Limb2, 63
    shrd    Limb2, Limb3, 63
    mov     Limb4, [Op1+24]
    mov     Limb5, [Op1+32]
    mov     Limb6, [Op1+40]
    shrd    Limb3, Limb4, 63
    shrd    Limb4, Limb5, 63
    shrd    Limb5, Limb6, 63
    mov     Limb7, [Op1+48]
    mov     Limb8, [Op1+56]
    shrd    Limb6, Limb7, 63
    shrd    Limb7, Limb8, 63

    sahf                            ; restore carry
    ACCUMULATE 0                    ; sub shifted Op1 from Op2 with result in Op3
    ACCUMULATE 1
    ACCUMULATE 2
    ACCUMULATE 3
    ACCUMULATE 4
    ACCUMULATE 5
    ACCUMULATE 6
    ACCUMULATE 7
    lahf                            ; remember carry for next round

    add     Op1, 64
    add     Op2, 64
    add     Op3, 64
    mov     Limb0, Limb8            ; last unshifted limb of this round

.n_test:

    sub     Size, 8                 ; borrow => fewer than 8 limbs were left
    jnc     .n_loop

    ; housekeeping - hand back total carry
    shr     Limb0, 63               ; bit shifted out of the top limb of Op1
    sahf                            ; carry of the last ADDSUB/ADCSBB
    adc     Limb0, 0                ; Limb0=0/1/2 depending on final carry and shift
    mov     RAX, Limb0

.Exit:

%ifdef USE_WIN64

    ; restore: general purpose register <- save slot
    movq    R14, SaveR14
    movq    R13, SaveR13
    movq    R12, SaveR12
    movq    RDI, SaveRDI
    movq    RSI, SaveRSI
    movq    RBX, SaveRBX
%ifdef USE_PREFETCH
    mov     RBP, [RSP]
    mov     R15, [RSP+8]
    add     RSP, 16
%else
    mov     R15, [RSP]
    add     RSP, 8
%endif

%else

    movq    R15, SaveR15
    movq    R14, SaveR14
    movq    R13, SaveR13
    movq    R12, SaveR12
    movq    RBX, SaveRBX
%ifdef USE_PREFETCH
    movq    RBP, SaveRBP
%endif

%endif

    ret

.end:
|