AVX-based rshift for 4-issue Intel cpus (Haswell and newer)

This commit is contained in:
Alexander Kruppa 2016-11-22 23:18:52 +01:00
parent 105c26c466
commit 4660be16f6

View File

@ -0,0 +1,277 @@
; Copyright 2016 Jens Nurmann and Alexander Kruppa
; This file is part of the MPIR Library.
; The MPIR Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Lesser General Public License as published
; by the Free Software Foundation; either version 2.1 of the License, or (at
; your option) any later version.
; The MPIR Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
; License for more details.
; You should have received a copy of the GNU Lesser General Public License
; along with the MPIR Library; see the file COPYING.LIB. If not, write
; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
; Boston, MA 02110-1301, USA.
; mp_limb_t mpn_rshift(mp_ptr Op2, mp_srcptr Op1, mp_size_t Size1, unsigned int Shift)
; Linux RAX RDI RSI RDX RCX
; Win7 RAX RCX RDX R8 R9
;
; Description:
; The function shifts Op1 right by Shift bits, stores the result in Op2 (non-
; destructive shr) and hands back the shifted-out least significant bits of
; Op1. The function operates increasing in memory supporting in place shifts.
;
; Result:
; - Op2[ Size1-1..0 ] := ( 0:Op1[ Size1-1..0 ] ) >> Shift (zeros shifted in)
; - RAX := Op1[ 0 ] << ( 64-Shift ) (the shifted-out bits, returned to caller)
;
; Caveats:
; - caller must ensure that Shift is in [ 1..63 ]!
; - currently Linux64 support only!
; - the AVX version uses mnemonics only available on Haswell, Broadwell and
; Skylake cores
; - the behaviour of cache prefetching in combination with AVX shifting seems
; somewhat erratic
; - slight (a few clock cycles) degradation for 1/2 LD1$ sizes
; - slight (a few percent) improvement for full LD1$ sizes
; - substantial (>10%) improvement for 1/2 LD2$ sizes
; - slight (a few percent) improvement for full LD2$ sizes
; - slight (a few percent) degradation for 1/2 LD3$ sizes
; - substantial (around 10%) degradation for full LD3$ sizes
;
; Comments:
; - implemented, tested and benchmarked on 30.03.2016 by jn
; - includes prefetching
; ============================================================================
%include 'yasm_mac.inc'

BITS 64

; ----------------------------------------------------------------------------
; Symbolic register names.
;
;   Op2, Op1, Size1, Shift   the four arguments, in ABI order
;   Limb1, Limb2             scalar scratch limbs
;   Offs                     prefetch distance in bytes (USE_PREFETCH only);
;                            a register when OFFS_REG is defined, otherwise
;                            an immediate that must never be written to
;
; NOTE(review): the function body in this file takes the shift count from
; CL/RCX directly instead of going through the Shift macro, so the USE_WIN64
; mapping below (Shift = R9, Op2 = RCX) cannot work as is. The file header
; states Linux64 support only. XMM6/XMM7 are also callee-saved under the
; Microsoft x64 convention - confirm before enabling this path.
; ----------------------------------------------------------------------------
%ifdef USE_WIN64
    %define Op2         RCX
    %define Op1         RDX
    %define Size1       R8
    %define Shift       R9
    %define Limb1       R10
    %define Limb2       R11
  %ifdef USE_PREFETCH
    ; No caller-saves regs left, use immediate. Positive, because the routine
    ; walks upwards through memory (same distance as the register variant).
    %define Offs        512
  %endif
%else
    %define Op2         RDI
    %define Op1         RSI
    %define Size1       RDX
    %define Shift       RCX
    %define Limb1       R8
    %define Limb2       R9
  %ifdef USE_PREFETCH
    %define OFFS_REG    1           ; Offs is a register and must be loaded
    %define Offs        R10
  %endif
%endif

; XMM aliases: each one must be the low half of the YMM register named in
; its comment, because the code writes the XMM name and then reads the YMM.
%define ShrDL0      XMM2            ; Attn: this must match ShrQL0 definition
%define ShlDL0      XMM3            ; Attn: this must match ShlQL0 definition
%define ShrDLCnt    XMM6            ; Attn: this must match ShrQLCnt definition
%define ShlDLCnt    XMM7            ; Attn: this must match ShlQLCnt definition

; YMM working set: two quad-limb inputs, their right/left shifted parts and
; the two broadcast shift counts.
%define QLimb0      YMM0
%define QLimb1      YMM1
%define ShrQL0      YMM2
%define ShlQL0      YMM3
%define ShrQL1      YMM4
%define ShlQL1      YMM5
%define ShrQLCnt    YMM6
%define ShlQLCnt    YMM7
; ----------------------------------------------------------------------------
; mp_limb_t mpn_rshift(mp_ptr Op2, mp_srcptr Op1, mp_size_t Size1,
;                      unsigned int Shift)
;
; Shifts the Size1 limbs at Op1 right by Shift bits, stores them at Op2 and
; returns Op1[0] << (64-Shift) in RAX. Memory is walked upwards; every store
; is preceded by the loads it depends on.
;
; In:    Op2, Op1, Size1 via the macros of the same name; the shift count is
;        read from CL / RCX directly (not through the Shift macro).
; Out:   RAX = bits shifted out of Op1[0], left-justified. 0 if Size1 = 0.
; Clobb: Op1, Op2, Size1, RCX, Limb1, Limb2, Offs (if a register),
;        YMM0-YMM7, flags.
;
; Requirements visible from the code:
;   - Op2 must be 8-byte aligned: only address bits 3 and 4 are fixed up
;     before the 32-byte aligned vmovdqa stores.
;   - Shift in [1..63] (caller's duty, see file header).
;
; Invariant at every section boundary of the scalar code:
;   Limb1 = next source limb not yet stored,
;   Size1 = number of limbs still to store, minus one (the last one is
;           written by .lShrEquPost).
; ----------------------------------------------------------------------------
    align   32
GLOBAL_FUNC mpn_rshift

    xor     EAX, EAX
    test    Size1, Size1
    je      .Exit                   ; Size1=0 => return 0, nothing stored

    mov     Limb1, [Op1]
    shrd    RAX, Limb1, CL          ; RAX = Op1[0] << (64-Shift), the result
    sub     Size1, 1
    je      .lShrEquPost            ; Size1=1 =>

    ; Offs may only be loaded when it is a register; in the immediate
    ; variant the distance is already baked into the prefetch address.
  %ifdef OFFS_REG
    mov     Offs, 512
  %endif

    cmp     Size1, 8
    jc      .lShrEquFour            ; AVX inefficient =>

    ; first align Op2 to 32 bytes
    test    Op2, 8
    je      .lShrEquAlign16

    mov     Limb2, [Op1+8]
    shrd    Limb1, Limb2, CL
    mov     [Op2], Limb1
    mov     Limb1, Limb2

    add     Op1, 8
    add     Op2, 8
    sub     Size1, 1

.lShrEquAlign16:

    test    Op2, 16
    je      .lShrEquAVX

    mov     Limb2, [Op1+8]
    shrd    Limb1, Limb2, CL
    mov     [Op2], Limb1
    mov     Limb1, [Op1+16]
    shrd    Limb2, Limb1, CL
    mov     [Op2+8], Limb2

    add     Op1, 16
    add     Op2, 16
    sub     Size1, 2

.lShrEquAVX:

    ; initialize AVX shift counter
    ; Shift is an 'unsigned int': bits 63..32 of RCX are not guaranteed to be
    ; zero on entry. The variable AVX shifts look at the full 64-bit count and
    ; yield 0 for anything above 63, so clean the count before copying it.
    and     ECX, 63                 ; also zero-extends into RCX
    vmovq   ShrDLCnt, RCX
    neg     RCX
    and     RCX, 63                 ; must do, as AVX shifts set result=0 if Shift>63!
    vmovq   ShlDLCnt, RCX           ; left count = 64-Shift
    neg     RCX
    and     RCX, 63                 ; must do, as AVX shifts set result=0 if Shift>63!
                                    ; (RCX holds Shift again for the SHRD tail)
    vpbroadcastq ShrQLCnt, ShrDLCnt
    vpbroadcastq ShlQLCnt, ShlDLCnt

    ; pre-fetch first quad-limb
    vmovdqu QLimb0, [Op1]
    vpsllvq ShlQL0, QLimb0, ShlQLCnt

    add     Op1, 32
    sub     Size1, 4
    jmp     .lShrEquAVXCheck

    ; main loop (prefetching enabled, unloaded data cache)
    ; - 0.60      cycles per limb in LD1$
    ; - 0.60-0.70 cycles per limb in LD2$
    ; - 0.70-0.90 cycles per limb in LD3$
    ;
    ; On entry QLimb0 holds the four limbs just below Op1 and ShlQL0 their
    ; left-shifted image. Each pass stores eight result limbs and leaves the
    ; same state for the next pass.
    align   16
.lShrEquAVXLoop:

  %ifdef USE_PREFETCH
    prefetchnta [Op1+Offs]
  %endif

    vmovdqu   QLimb1, [Op1]
    vpsrlvq   ShrQL0, QLimb0, ShrQLCnt
    vmovdqu   QLimb0, [Op1+32]
    vpsllvq   ShlQL1, QLimb1, ShlQLCnt
    vpblendd  ShlQL0, ShlQL0, ShlQL1, 00000011b    ; qword 0 := carry-in from next quad
    vpermq    ShlQL0, ShlQL0, 00111001b            ; rotate down by one qword
    vpor      ShrQL0, ShrQL0, ShlQL0
    vpsrlvq   ShrQL1, QLimb1, ShrQLCnt
    vpsllvq   ShlQL0, QLimb0, ShlQLCnt
    vpblendd  ShlQL1, ShlQL1, ShlQL0, 00000011b
    vpermq    ShlQL1, ShlQL1, 00111001b
    vmovdqa   [Op2], ShrQL0
    vpor      ShrQL1, ShrQL1, ShlQL1
    vmovdqa   [Op2+32], ShrQL1

    add     Op1, 64
    add     Op2, 64

.lShrEquAVXCheck:

    sub     Size1, 8
    jnc     .lShrEquAVXLoop

    ; Flush the quad-limb still pending in QLimb0. Its top limb needs the
    ; carry-in from the next source limb, which is fetched with scalar code.
    mov     Limb1, [Op1]
    xor     Limb2, Limb2
    shrd    Limb2, Limb1, CL        ; Limb2 = Limb1 << (64-Shift)
  %if 1
    vmovq    ShrDL0, Limb2
    vpblendd ShlQL0, ShlQL0, ShrQL0, 3
  %else
    ; I am mixing in a single SSE4.1 instruction into otherwise pure AVX2
    ; this is generating stalls on Haswell & Broadwell architecture (Agner Fog)
    ; but it is only executed once and there is no AVX2 based alternative
    pinsrq  ShlDL0, Limb2, 0        ; SSE4.1
  %endif
    vpsrlvq ShrQL0, QLimb0, ShrQLCnt
    vpermq  ShlQL0, ShlQL0, 00111001b
    vpor    ShrQL0, ShrQL0, ShlQL0
    vmovdqa [Op2], ShrQL0

    add     Op2, 32
    add     Size1, 8                ; undo the failed loop-check subtraction

    ; shift remaining max. 7 limbs with SHRD mnemonic
.lShrEquFour:

    add     Op1, 8                  ; Limb1 already holds [Op1]; step past it
    test    Size1, 4
    je      .lShrEquTwo

    mov     Limb2, [Op1]
    shrd    Limb1, Limb2, CL
    mov     [Op2], Limb1
    mov     Limb1, [Op1+8]
    shrd    Limb2, Limb1, CL
    mov     [Op2+8], Limb2
    mov     Limb2, [Op1+16]
    shrd    Limb1, Limb2, CL
    mov     [Op2+16], Limb1
    mov     Limb1, [Op1+24]
    shrd    Limb2, Limb1, CL
    mov     [Op2+24], Limb2

    add     Op1, 32
    add     Op2, 32

.lShrEquTwo:

    test    Size1, 2
    je      .lShrEquOne

    mov     Limb2, [Op1]
    shrd    Limb1, Limb2, CL
    mov     [Op2], Limb1
    mov     Limb1, [Op1+8]
    shrd    Limb2, Limb1, CL
    mov     [Op2+8], Limb2

    add     Op1, 16
    add     Op2, 16

.lShrEquOne:

    test    Size1, 1
    je      .lShrEquPost

    mov     Limb2, [Op1]
    shrd    Limb1, Limb2, CL
    mov     [Op2], Limb1
    mov     Limb1, Limb2

    add     Op2, 8

    ; store most significant limb considering shift-in part
.lShrEquPost:

    shr     Limb1, CL               ; zeros enter at the top
    mov     [Op2], Limb1

.Exit:

    vzeroupper                      ; leave no dirty upper YMM state behind
    ret
.end: