; x86-64 assembly source, NASM/YASM (Intel) syntax
; Copyright 2016 Jens Nurmann and Alexander Kruppa
|
||
|
|
||
|
; This file is part of the MPIR Library.
|
||
|
|
||
|
; The MPIR Library is free software; you can redistribute it and/or modify
|
||
|
; it under the terms of the GNU Lesser General Public License as published
|
||
|
; by the Free Software Foundation; either version 2.1 of the License, or (at
|
||
|
; your option) any later version.
|
||
|
|
||
|
; The MPIR Library is distributed in the hope that it will be useful, but
|
||
|
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||
|
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||
|
; License for more details.
|
||
|
|
||
|
; You should have received a copy of the GNU Lesser General Public License
|
||
|
; along with the MPIR Library; see the file COPYING.LIB. If not, write
|
||
|
; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
|
||
|
; Boston, MA 02110-1301, USA.
|
||
|
|
||
|
; mp_limb_t mpn_lshift1(mp_ptr Op2, mp_srcptr Op1, mp_size_t Size1)
; Linux:    return = RAX,  Op2 = RDI,  Op1 = RSI,  Size1 = RDX
; Win7:     return = RAX,  Op2 = RCX,  Op1 = RDX,  Size1 = R8
|
||
|
;
|
||
|
; Description:
|
||
|
; The function shifts Op1 left by one bit, stores the result in Op2 (non-
; destructive shl) and returns the shifted-out most significant bit of Op1.
; The limbs are processed from the highest address down to the lowest, which
; allows in-place operation (Op2 = Op1).
|
||
|
;
|
||
|
; Caveats:
|
||
|
; - the AVX version uses mnemonics only available on Haswell, Broadwell and
;   Skylake cores
; - the behaviour of cache prefetching in combination with AVX shifting seems
;   somewhat erratic:
;    - slight (a few clock cycles) degradation for 1/2 LD1$ sizes
;    - slight (a few percent) improvement for full LD1$ sizes
;    - substantial (>10%) improvement for 1/2 LD2$ sizes
;    - slight (a few percent) improvement for full LD2$ sizes
;    - slight (a few percent) degradation for 1/2 LD3$ sizes
;    - substantial (around 10%) degradation for full LD3$ sizes
|
||
|
;
|
||
|
; Comments:
|
||
|
; - implemented, tested and benched on 21.02.2016 by jn
|
||
|
; - includes cache prefetching
|
||
|
|
||
|
%define USE_WIN64                       ; defined unconditionally here, so the
                                        ; Win64 mapping below is the one that
                                        ; is selected (NOTE(review): unless
                                        ; yasm_mac.inc undefines it - confirm)

%include 'yasm_mac.inc'

BITS 64

; ---------------------------------------------------------------------------
; General purpose registers - the only part that differs between the ABIs.
;   Op2   : result pointer        Op1   : source pointer
;   Size1 : limb count            Limb1 / Limb2 : scalar scratch limbs
; ---------------------------------------------------------------------------
%ifdef USE_WIN64

    %define Op2     RCX
    %define Op1     RDX
    %define Size1   R8
    %define Limb1   R9
    %define Limb2   R10

%else

    %define Op2     RDI
    %define Op1     RSI
    %define Size1   RDX
    %define Limb1   R8
    %define Limb2   R9

%endif

; ---------------------------------------------------------------------------
; Definitions that are the same for both register mappings.
; ---------------------------------------------------------------------------

; prefetch displacement relative to Op1; a direct constant instead of a
; register so that the code stays within the Win64 scratch registers
%define Offs    -512

; 128 bit views of the shift registers
%define ShlDL0  XMM2                    ; ATTN: this must match ShlQL0 definition
%define ShrDL0  XMM3                    ; ATTN: this must match ShrQL0 definition

; 256 bit registers, each holding one quad-limb (4 x 64 bit)
%define QLimb0  YMM0                    ; loaded source quad-limbs
%define QLimb1  YMM1
%define ShlQL0  YMM2                    ; source limbs shifted left by one
%define ShrQL0  YMM3                    ; top bits of source limbs (>> 63)
%define ShlQL1  YMM4
%define ShrQL1  YMM5

    align   32
|
||
|
|
||
|
LEAF_PROC mpn_lshift1

    ; mp_limb_t mpn_lshift1(mp_ptr Op2, mp_srcptr Op1, mp_size_t Size1)
    ;
    ; Shifts the Size1 limbs at Op1 left by one bit, stores the result at Op2
    ; and returns the bit shifted out of the most significant limb in RAX.
    ; For Size1 = 0 nothing is read or written and 0 is returned.
    ;
    ; Limbs are processed from the highest address downwards. Invariant of
    ; the scalar parts: Limb1 holds the source limb that belongs to [Op2] and
    ; has not been stored yet; Size1 is the number of limbs below it.
    ;
    ; Modifies: RAX, Op1, Op2, Size1, Limb1, Limb2, QLimb0/1, ShlQL0/1,
    ;           ShrQL0/1 and the flags.

    xor     EAX, EAX                    ; result for Size1 = 0, base for SHLD
    sub     Size1, 1
    jc      .Exit               ;ajs:notshortform           ; Size1=0 =>

    lea     Op1, [Op1+8*Size1]          ; -> most significant source limb
    lea     Op2, [Op2+8*Size1]          ; -> most significant result limb

    mov     Limb1, [Op1]
    shld    RAX, Limb1, 1               ; RAX = top bit of top limb (return value)

    test    Size1, Size1                ; zero check without rewriting Size1
    je      .lShl1EquPost       ;ajs:notshortform           ; Size1=1 =>

    cmp     Size1, 8
    jc      .lShl1EquFour       ;ajs:notshortform           ; AVX inefficient =>

    ; first align Op2 to 32 bytes: peel one and / or two limbs with SHLD until
    ; bits 3 and 4 of Op2 are both set, so that Op2-24 - the address used by
    ; the aligned stores below - is a multiple of 32 (for a limb-aligned Op2)
    test    Op2, 8
    jne     .lShl1EquA16

    mov     Limb2, [Op1-8]
    shld    Limb1, Limb2, 1
    mov     [Op2], Limb1
    mov     Limb1, Limb2

    sub     Op1, 8
    sub     Op2, 8
    sub     Size1, 1

  .lShl1EquA16:

    test    Op2, 16
    jne     .lShl1EquAVX

    mov     Limb2, [Op1-8]
    shld    Limb1, Limb2, 1
    mov     [Op2], Limb1
    mov     Limb1, [Op1-16]
    shld    Limb2, Limb1, 1
    mov     [Op2-8], Limb2

    sub     Op1, 16
    sub     Op2, 16
    sub     Size1, 2

  .lShl1EquAVX:

    ; pre-fetch first quad-limb and isolate the top bit of each of its limbs.
    ; VPERMQ with 0b10010011 rotates the qwords up by one position, so every
    ; bit ends up in the qword of the next higher limb; qword 0 receives the
    ; wrapped-around bit and is replaced later by the bit of the limb below.
    vmovdqu QLimb0, [Op1-24]
    vpsrlq  ShrQL0, QLimb0, 63
    vpermq  ShrQL0, ShrQL0, 147         ; 0b10010011

    sub     Op1, 32                     ; Op1 is now one quad-limb ahead of Op2
    sub     Size1, 4
    jmp     .lShl1EquAVXCheck

    ; main loop, 8 limbs (two quad-limbs) per pass. On entry:
    ; - QLimb0 holds a loaded quad-limb whose result is not stored yet
    ; - ShrQL0 holds the rotated top bits of QLimb0
    ; - Op1 points 32 bytes below Op2
    ; Each pass loads the next two quad-limbs, completes and stores the
    ; pending one and the first new one, and leaves the second new one pending.
    ; Timings noted for this loop:
    ; - 0.60      cycles per limb in LD1$
    ; - 0.60-0.75 cycles per limb in LD2$
    ; - 0.75-1.00 cycles per limb in LD3$
    align   16
  .lShl1EquAVXLoop:

  %ifdef USE_PREFETCH
    prefetchnta [Op1+Offs]
  %endif

    vmovdqu   QLimb1, [Op1-24]
    vpsllq    ShlQL0, QLimb0, 1
    vmovdqu   QLimb0, [Op1-56]
    vpsrlq    ShrQL1, QLimb1, 63
    vpermq    ShrQL1, ShrQL1, 147       ; 0b10010011
    vpblendd  ShrQL0, ShrQL0, ShrQL1, 3 ; 0b00000011
    vpor      ShlQL0, ShlQL0, ShrQL0
    vpsllq    ShlQL1, QLimb1, 1
    vpsrlq    ShrQL0, QLimb0, 63
    vpermq    ShrQL0, ShrQL0, 147       ; 0b10010011
    vpblendd  ShrQL1, ShrQL1, ShrQL0, 3 ; 0b00000011
    vmovdqa   [Op2-24], ShlQL0
    vpor      ShlQL1, ShlQL1, ShrQL1
    vmovdqa   [Op2-56], ShlQL1

    sub     Op1, 64
    sub     Op2, 64

  .lShl1EquAVXCheck:

    sub     Size1, 8
    jnc     .lShl1EquAVXLoop            ; at least 8 limbs below [Op1] =>

    ; finish the pending quad-limb: its lowest limb needs the top bit of the
    ; limb at [Op1], which also becomes the start value of the scalar tail
    mov     Limb2, [Op1]
    mov     Limb1, Limb2
    shr     Limb2, 63

    ; Insert the bit into qword 0 of ShrQL0 using VEX encoded instructions
    ; only: VMOVQ places it in the currently unused ShlQL0 (via its 128 bit
    ; view ShlDL0) and VPBLENDD merges that qword into ShrQL0. A single
    ; SSE4.1 'pinsrq ShrDL0, Limb2, 0' would mix a non-VEX instruction into
    ; the AVX2 code, which generates stalls on Haswell & Broadwell (Agner Fog).
    vmovq     ShlDL0, Limb2
    vpblendd  ShrQL0, ShrQL0, ShlQL0, 3

    vpsllq  ShlQL0, QLimb0, 1
    vpor    ShlQL0, ShlQL0, ShrQL0
    vmovdqa [Op2-24], ShlQL0

    sub     Op2, 32                     ; Op2 matches the limb held in Limb1 again
    add     Size1, 8                    ; undo the failed loop check: 0..7 left

    ; shift remaining max. 7 limbs with SHLD mnemonic
  .lShl1EquFour:

    sub     Op1, 8                      ; -> limb below the one held in Limb1
    test    Size1, 4
    je      .lShl1EquTwo

    mov     Limb2, [Op1]
    shld    Limb1, Limb2, 1
    mov     [Op2], Limb1
    mov     Limb1, [Op1-8]
    shld    Limb2, Limb1, 1
    mov     [Op2-8], Limb2
    mov     Limb2, [Op1-16]
    shld    Limb1, Limb2, 1
    mov     [Op2-16], Limb1
    mov     Limb1, [Op1-24]
    shld    Limb2, Limb1, 1
    mov     [Op2-24], Limb2

    sub     Op1, 32
    sub     Op2, 32

  .lShl1EquTwo:

    test    Size1, 2
    je      .lShl1EquOne

    mov     Limb2, [Op1]
    shld    Limb1, Limb2, 1
    mov     [Op2], Limb1
    mov     Limb1, [Op1-8]
    shld    Limb2, Limb1, 1
    mov     [Op2-8], Limb2

    sub     Op1, 16
    sub     Op2, 16

  .lShl1EquOne:

    test    Size1, 1
    je      .lShl1EquPost

    mov     Limb2, [Op1]
    shld    Limb1, Limb2, 1
    mov     [Op2], Limb1
    mov     Limb1, Limb2

    sub     Op2, 8

  .lShl1EquPost:

    ; least significant limb: nothing below it, so a zero bit is shifted in
    shl     Limb1, 1
    mov     [Op2], Limb1

  .Exit:

    vzeroupper
    ret
.end:
|