AVX-based rshift for 4-issue Intel cpus (Haswell and newer)

2016-11-22 23:18:52 +01:00 · 2016-11-22 23:18:52 +01:00 · 4660be16f6
commit 4660be16f6
parent 105c26c466
1 changed files with 277 additions and 0 deletions
--- a/mpn/x86_64/haswell/rshift.as
+++ b/mpn/x86_64/haswell/rshift.as
@ -0,0 +1,277 @@
+
+;  Copyright 2016 Jens Nurmann and Alexander Kruppa
+
+;  This file is part of the MPIR Library.
+
+;  The MPIR Library is free software; you can redistribute it and/or modify
+;  it under the terms of the GNU Lesser General Public License as published
+;  by the Free Software Foundation; either version 2.1 of the License, or (at
+;  your option) any later version.
+
+;  The MPIR Library is distributed in the hope that it will be useful, but
+;  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+;  License for more details.
+
+;  You should have received a copy of the GNU Lesser General Public License
+;  along with the MPIR Library; see the file COPYING.LIB.  If not, write
+;  to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+;  Boston, MA 02110-1301, USA.
+
+; mp_limb_t mpn_rshift(mp_ptr Op2, mp_srcptr Op1, mp_size_t Size1, unsigned int Shift)
+; Linux     RAX        RDI         RSI            RDX              RCX
+; Win7      RAX        RCX         RDX            R8               R9
+;
+; Description:
+; The function shifts Op1 right by Shift bits, stores the result in Op2 (non-
+; destructive shr) and hands back the shifted-out least significant bits of
+; Op1. The function operates increasing in memory supporting in place shifts.
+;
+; Result:
+; - Op2[ Size1-1..0 ] := ( ShrIn:Op1[ Size1-1..0 ] ) >> Shift
+; - Op1[ 0 ] << ( 64-Shift )
+;
+; Caveats:
+; - caller must ensure that Shift is in [ 1..63 ]!
+; - currently Linux64 support only!
+; - the AVX version uses mnemonics only available on Haswell, Broadwell and
+;   Skylake cores
+; - the behaviour of cache prefetching in combination with AVX shifting seems
+;   somewhat erratic
+;    - slight (a few clock cycles) degradation for 1/2 LD1$ sizes
+;    - slight (a few percent) improvement for full LD1$ sizes
+;    - substantial (>10%) improvement for 1/2 LD2$ sizes
+;    - slight (a few percent) improvement for full LD2$ sizes
+;    - slight (a few percent) degradation for 1/2 LD3$ sizes
+;    - substantial (around 10%) degradation for full LD3$ sizes
+;
+; Comments:
+; - implemented, tested and benchmarked on 30.03.2016 by jn
+; - includes prefetching
+; ============================================================================
+
+%include 'yasm_mac.inc'
+
+BITS 64
+
+%ifdef USE_WIN64
+    %define Op2         RCX
+    %define Op1         RDX
+    %define Size1       R8
+    %define Shift       R9
+    %define Limb1       R10
+    %define Limb2       R11
+  %ifdef USE_PREFETCH
+    %define Offs        -512    ; No caller-saves regs left, use immediate
+  %endif
+%else
+    %define Op2         RDI
+    %define Op1         RSI
+    %define Size1       RDX
+    %define Shift       RCX
+    %define Limb1       R8
+    %define Limb2       R9
+  %ifdef USE_PREFETCH
+    %define OFFS_REG 1
+    %define Offs        R10
+  %endif
+%endif
+
+%define ShrDL0      XMM2    ; Attn: this must match ShrQL0 definition
+%define ShlDL0      XMM3    ; Attn: this must match ShlQL0 definition
+%define ShrDLCnt    XMM6    ; Attn: this must match ShrQlCnt definition
+%define ShlDLCnt    XMM7    ; Attn: this must match ShlQlCnt definition
+
+%define QLimb0      YMM0
+%define QLimb1      YMM1
+%define ShrQL0      YMM2
+%define ShlQL0      YMM3
+%define ShrQL1      YMM4
+%define ShlQL1      YMM5
+%define ShrQLCnt    YMM6
+%define ShlQLCnt    YMM7
+
+    align   32
+GLOBAL_FUNC mpn_rshift
+
+    xor     EAX, EAX
+    or      Size1, Size1
+    je      .Exit
+
+    mov     Limb1, [Op1]
+    shrd    RAX, Limb1, CL
+
+    sub     Size1, 1
+    je      .lShrEquPost        ; Size1=1 =>
+
+  %ifdef USE_PREFETCH
+    mov     Offs, 512
+  %endif
+
+    cmp     Size1, 8
+    jc      .lShrEquFour        ; AVX inefficient =>
+
+    ; first align Op2 to 32 bytes
+    test    Op2, 8
+    je      .lShrEquAlign16
+
+    mov     Limb2, [Op1+8]
+    shrd    Limb1, Limb2, CL
+    mov     [Op2], Limb1
+    mov     Limb1, Limb2
+
+    add     Op1, 8
+    add     Op2, 8
+    sub     Size1, 1
+
+  .lShrEquAlign16:
+
+    test    Op2, 16
+    je      .lShrEquAVX
+
+    mov     Limb2, [Op1+8]
+    shrd    Limb1, Limb2, CL
+    mov     [Op2], Limb1
+    mov     Limb1, [Op1+16]
+    shrd    Limb2, Limb1, CL
+    mov     [Op2+8], Limb2
+
+    add     Op1, 16
+    add     Op2, 16
+    sub     Size1, 2
+
+  .lShrEquAVX:
+
+    ; initialize AVX shift counter
+    vmovq   ShrDLCnt, RCX
+    neg     RCX
+    and     RCX, 63             ; must do, as AVX shifts set result=0 if Shift>63!
+    vmovq   ShlDLCnt, RCX
+    neg     RCX
+    and     RCX, 63             ; must do, as AVX shifts set result=0 if Shift>63!
+    vpbroadcastq ShrQLCnt, ShrDLCnt
+    vpbroadcastq ShlQLCnt, ShlDLCnt
+
+    ; pre-fetch first quad-limb
+    vmovdqu QLimb0, [Op1]
+    vpsllvq ShlQL0, QLimb0, ShlQLCnt
+
+    add     Op1, 32
+    sub     Size1, 4
+    jmp     .lShrEquAVXCheck
+
+    ; main loop (prefetching enabled, unloaded data cache)
+    ; - 0.60      cycles per limb in LD1$
+    ; - 0.60-0.70 cycles per limb in LD2$
+    ; - 0.70-0.90 cycles per limb in LD3$
+    align   16
+  .lShrEquAVXLoop:
+
+  %ifdef USE_PREFETCH
+    prefetchnta [Op1+Offs]
+  %endif
+
+    vmovdqu   QLimb1, [Op1]
+    vpsrlvq   ShrQL0, QLimb0, ShrQLCnt
+    vmovdqu   QLimb0, [Op1+32]
+    vpsllvq   ShlQL1, QLimb1, ShlQLCnt
+    vpblendd  ShlQL0, ShlQL0, ShlQL1, 00000011b
+    vpermq    ShlQL0, ShlQL0, 00111001b
+    vpor      ShrQL0, ShrQL0, ShlQL0
+    vpsrlvq   ShrQL1, QLimb1, ShrQLCnt
+    vpsllvq   ShlQL0, QLimb0, ShlQLCnt
+    vpblendd  ShlQL1, ShlQL1, ShlQL0, 00000011b
+    vpermq    ShlQL1, ShlQL1, 00111001b
+    vmovdqa   [Op2], ShrQL0
+    vpor      ShrQL1, ShrQL1, ShlQL1
+    vmovdqa   [Op2+32], ShrQL1
+
+    add     Op1, 64
+    add     Op2, 64
+
+  .lShrEquAVXCheck:
+
+    sub     Size1, 8
+    jnc     .lShrEquAVXLoop
+
+    mov     Limb1, [Op1]
+    xor     Limb2, Limb2
+    shrd    Limb2, Limb1, CL
+%if 1
+    vmovq   ShrDL0, Limb2
+    vpblendd ShlQL0, ShlQL0, ShrQL0, 3
+%else
+    ; I am mixing in a single SSE4.1 instruction into otherwise pure AVX2
+    ; this is generating stalls on Haswell & Broadwell architecture (Agner Fog)
+    ; but it is only executed once and there is no AVX2 based alternative
+    pinsrq  ShlDL0, Limb2, 0            ; SSE4.1
+%endif
+    vpsrlvq ShrQL0, QLimb0, ShrQLCnt
+    vpermq  ShlQL0, ShlQL0, 00111001b
+    vpor    ShrQL0, ShrQL0, ShlQL0
+    vmovdqa [Op2], ShrQL0
+
+    add     Op2, 32
+    add     Size1, 8
+
+    ; shift remaining max. 7 limbs with SHRD mnemonic
+  .lShrEquFour:
+
+    add     Op1, 8
+    test    Size1, 4
+    je      .lShrEquTwo
+
+    mov     Limb2, [Op1]
+    shrd    Limb1, Limb2, CL
+    mov     [Op2], Limb1
+    mov     Limb1, [Op1+8]
+    shrd    Limb2, Limb1, CL
+    mov     [Op2+8], Limb2
+    mov     Limb2, [Op1+16]
+    shrd    Limb1, Limb2, CL
+    mov     [Op2+16], Limb1
+    mov     Limb1, [Op1+24]
+    shrd    Limb2, Limb1, CL
+    mov     [Op2+24], Limb2
+
+    add     Op1, 32
+    add     Op2, 32
+
+  .lShrEquTwo:
+
+    test    Size1, 2
+    je      .lShrEquOne
+
+    mov     Limb2, [Op1]
+    shrd    Limb1, Limb2, CL
+    mov     [Op2], Limb1
+    mov     Limb1, [Op1+8]
+    shrd    Limb2, Limb1, CL
+    mov     [Op2+8], Limb2
+
+    add     Op1, 16
+    add     Op2, 16
+
+  .lShrEquOne:
+
+    test    Size1, 1
+    je      .lShrEquPost
+
+    mov     Limb2, [Op1]
+    shrd    Limb1, Limb2, CL
+    mov     [Op2], Limb1
+    mov     Limb1, Limb2
+
+    add     Op2, 8
+
+    ; store most significant limb considering shift-in part
+  .lShrEquPost:
+
+    shr     Limb1, CL
+    mov     [Op2], Limb1
+
+  .Exit:
+
+    vzeroupper
+    ret
+.end: