mpir/mpn/x86_64w/skylake/avx/lshift1.asm


;  Copyright 2016 Jens Nurmann and Alexander Kruppa

;  This file is part of the MPIR Library.

;  The MPIR Library is free software; you can redistribute it and/or modify
;  it under the terms of the GNU Lesser General Public License as published
;  by the Free Software Foundation; either version 2.1 of the License, or (at
;  your option) any later version.

;  The MPIR Library is distributed in the hope that it will be useful, but
;  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
;  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
;  License for more details.

;  You should have received a copy of the GNU Lesser General Public License
;  along with the MPIR Library; see the file COPYING.LIB.  If not, write
;  to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
;  Boston, MA 02110-1301, USA.

; mp_limb_t mpn_lshift1(mp_ptr Op2, mp_srcptr Op1, mp_size_t Size1 )
; Linux     RAX         RDI         RSI            RDX
; Win7      RAX         RCX         RDX            R8
;
; Description:
; The function shifts Op1 left by one bit, stores the result in Op2 (non-
; destructive shl) and hands back the shifted-out most significant bit of Op1.
; The function operates decreasing in memory supporting in-place operation.
;
; Caveats:
; - the AVX version uses mnemonics only available on Haswell, Broadwell and
;   Skylake cores
; - the behaviour of cache prefetching in combination with AVX shifting seems
;   somewhat erratic
;    - slight (a few clock cycles) degradation for 1/2 LD1$ sizes
;    - slight (a few percent) improvement for full LD1$ sizes
;    - substantial (>10%) improvement for 1/2 LD2$ sizes
;    - slight (a few percent) improvement for full LD2$ sizes
;    - slight (a few percent) degradation for 1/2 LD3$ sizes
;    - substantial (around 10%) degradation for full LD3$ sizes
;
; Comments:
; - implemented, tested and benched on 21.02.2016 by jn
; - includes cache prefetching

%define USE_WIN64

%include 'yasm_mac.inc'

BITS 64

; ---------------------------------------------------------------------------
; Symbolic names for the registers and constants used by mpn_lshift1.
; Only the general-purpose registers differ between the two calling
; conventions; the vector aliases and the prefetch offset are shared.
; ---------------------------------------------------------------------------

%ifdef USE_WIN64

    ; Win64 convention: the three arguments arrive in RCX, RDX, R8
    %define Op2         RCX
    %define Op1         RDX
    %define Size1       R8
    %define Limb1       R9
    %define Limb2       R10

%else

    ; non-Windows convention: the three arguments arrive in RDI, RSI, RDX
    %define Op2         RDI
    %define Op1         RSI
    %define Size1       RDX
    %define Limb1       R8
    %define Limb2       R9

%endif

    ; Displacement added to Op1 by the optional prefetch in the AVX loop.
    ; Kept as a literal (per the original note) to stay within the Windows
    ; scratch registers.
    %define Offs        -512

    ; 256-bit working registers: two quad-limb inputs plus, for each of them,
    ; a "shifted left" and a "carried-in bits" companion
    %define QLimb0      YMM0
    %define QLimb1      YMM1
    %define ShlQL0      YMM2
    %define ShrQL0      YMM3
    %define ShlQL1      YMM4
    %define ShrQL1      YMM5

    ; 128-bit views of the registers above.
    ; ATTN: ShlDL0 must be the low half of ShlQL0, ShrDL0 the low half of ShrQL0
    %define ShlDL0      XMM2
    %define ShrDL0      XMM3

    align   32

; ---------------------------------------------------------------------------
; mp_limb_t mpn_lshift1(mp_ptr Op2, mp_srcptr Op1, mp_size_t Size1)
;
; Stores Op1 shifted left by one bit into Op2, working from the most
; significant limb downwards, and returns in RAX the bit shifted out of the
; top limb (RAX = 0 when Size1 is 0).
;
; Register roles (the aliases are %define'd earlier in this file):
;   Op1 / Op2   running source / destination pointers, only ever decreased
;   Size1       after the initial decrement: count of limbs that lie below
;               the limb currently held in Limb1
;   Limb1/Limb2 scalar limbs; every scalar stage is entered with Limb1
;               holding the limb whose shifted value is stored next
;   QLimb*, ShlQL*, ShrQL*   quad-limb vectors of the AVX2 stage
; Only RAX, the aliases above and the flags are written.
; ---------------------------------------------------------------------------
LEAF_PROC mpn_lshift1

    xor     eax, eax                    ; default return value
    sub     Size1, 1                    ; Size1 = limbs below the top limb
    jb      .Exit               ;ajs:notshortform ; size was 0 =>

    lea     Op1, [Op1 + 8*Size1]        ; move both pointers to the top limb
    lea     Op2, [Op2 + 8*Size1]

    mov     Limb1, [Op1]
    shld    rax, Limb1, 1               ; rax = bit 63 of the top limb

    test    Size1, Size1
    jz      .StoreLast          ;ajs:notshortform ; single limb =>

    cmp     Size1, 8
    jb      .TailFour           ;ajs:notshortform ; too short for AVX =>

    ; The vector stage stores with vmovdqa at [Op2-24], so Op2 has to reach an
    ; address with bits 3 and 4 both set (assuming limb-aligned pointers).
    ; Peel one limb if bit 3 is clear ...
    test    Op2, 8
    jnz     .Align16

    mov     Limb2, [Op1 - 8]
    shld    Limb1, Limb2, 1
    mov     [Op2], Limb1
    mov     Limb1, Limb2                ; next limb becomes the pending one

    sub     Op1, 8
    sub     Op2, 8
    sub     Size1, 1

  .Align16:

    ; ... and two more limbs if bit 4 is clear
    test    Op2, 16
    jnz     .AvxPrime

    mov     Limb2, [Op1 - 8]
    shld    Limb1, Limb2, 1
    mov     [Op2], Limb1
    mov     Limb1, [Op1 - 16]
    shld    Limb2, Limb1, 1
    mov     [Op2 - 8], Limb2

    sub     Op1, 16
    sub     Op2, 16
    sub     Size1, 2

  .AvxPrime:

    ; Load the first quad-limb and prepare its carry vector: the top bit of
    ; every limb, rotated one qword lane upwards. Lane 0 is still missing the
    ; carry from the limb below the quad; it is patched in later.
    vmovdqu QLimb0, [Op1 - 24]
    vpsrlq  ShrQL0, QLimb0, 63
    vpermq  ShrQL0, ShrQL0, 0x93        ; lane order 2,1,0,3 (0b10010011)

    sub     Op1, 32
    sub     Size1, 4
    jmp     .AvxCheck

    ; Two quad-limbs per iteration. Timings noted by the original author:
    ; - 0.60      cycles per limb in LD1$
    ; - 0.60-0.75 cycles per limb in LD2$
    ; - 0.75-1.00 cycles per limb in LD3$
    align   16
  .AvxLoop:

  %ifdef USE_PREFETCH
    prefetchnta [Op1 + Offs]
  %endif

    vmovdqu   QLimb1, [Op1 - 24]            ; next lower quad
    vpsllq    ShlQL0, QLimb0, 1
    vmovdqu   QLimb0, [Op1 - 56]            ; quad below that one
    vpsrlq    ShrQL1, QLimb1, 63
    vpermq    ShrQL1, ShrQL1, 0x93          ; lane order 2,1,0,3
    vpblendd  ShrQL0, ShrQL0, ShrQL1, 3     ; lane 0 <- carry out of QLimb1
    vpor      ShlQL0, ShlQL0, ShrQL0
    vpsllq    ShlQL1, QLimb1, 1
    vpsrlq    ShrQL0, QLimb0, 63
    vpermq    ShrQL0, ShrQL0, 0x93          ; lane order 2,1,0,3
    vpblendd  ShrQL1, ShrQL1, ShrQL0, 3     ; lane 0 <- carry out of new QLimb0
    vmovdqa   [Op2 - 24], ShlQL0
    vpor      ShlQL1, ShlQL1, ShrQL1
    vmovdqa   [Op2 - 56], ShlQL1

    sub     Op1, 64
    sub     Op2, 64

  .AvxCheck:

    sub     Size1, 8
    jae     .AvxLoop                    ; at least 8 limbs were still left =>

    ; Finish the quad that is still pending in QLimb0. Its lowest lane takes
    ; the top bit of the limb just below it, which also becomes the pending
    ; scalar limb for the stages that follow.
    mov     Limb1, [Op1]
    mov     Limb2, Limb1
    shr     Limb2, 63
%if 1
    vmovq   ShlDL0, Limb2               ; carry bit into the low qword of ShlQL0
    vpblendd ShrQL0, ShrQL0, ShlQL0, 3  ; ... and from there into ShrQL0 lane 0
%else
    ; Alternative kept from the original: one SSE4.1 instruction in otherwise
    ; pure AVX2 code. The original note says this stalls on Haswell and
    ; Broadwell (Agner Fog) but is only executed once.

    ; Insert value of Limb2 into the 0-th qword of ShrDL0
    pinsrq  ShrDL0, Limb2, 0            ; SSE4.1
%endif
    vpsllq  ShlQL0, QLimb0, 1
    vpor    ShlQL0, ShlQL0, ShrQL0
    vmovdqa [Op2 - 24], ShlQL0

    sub     Op2, 32
    add     Size1, 8                    ; undo the borrow from the loop test

    ; At most 7 limbs remain below Limb1; bits 2, 1 and 0 of Size1 select the
    ; four-, two- and one-limb SHLD stages.
  .TailFour:

    sub     Op1, 8                      ; Op1 -> limb just below Limb1
    test    Size1, 4
    jz      .TailTwo

    mov     Limb2, [Op1]
    shld    Limb1, Limb2, 1
    mov     [Op2], Limb1
    mov     Limb1, [Op1 - 8]
    shld    Limb2, Limb1, 1
    mov     [Op2 - 8], Limb2
    mov     Limb2, [Op1 - 16]
    shld    Limb1, Limb2, 1
    mov     [Op2 - 16], Limb1
    mov     Limb1, [Op1 - 24]
    shld    Limb2, Limb1, 1
    mov     [Op2 - 24], Limb2

    sub     Op1, 32
    sub     Op2, 32

  .TailTwo:

    test    Size1, 2
    jz      .TailOne

    mov     Limb2, [Op1]
    shld    Limb1, Limb2, 1
    mov     [Op2], Limb1
    mov     Limb1, [Op1 - 8]
    shld    Limb2, Limb1, 1
    mov     [Op2 - 8], Limb2

    sub     Op1, 16
    sub     Op2, 16

  .TailOne:

    test    Size1, 1
    jz      .StoreLast

    mov     Limb2, [Op1]
    shld    Limb1, Limb2, 1
    mov     [Op2], Limb1
    mov     Limb1, Limb2

    sub     Op2, 8

  .StoreLast:

    ; Lowest limb: nothing below it, so a zero bit is shifted in
    shl     Limb1, 1
    mov     [Op2], Limb1

  .Exit:

    vzeroupper                          ; also reached by the scalar-only paths
    ret
.end:
Add assembler code for haswell, skylake and skylake_avx to the Win64 build 2016-12-06 07:01:20 -05:00
			`; Copyright 2016 Jens Nurmann and Alexander Kruppa`

			`; This file is part of the MPIR Library.`

			`; The MPIR Library is free software; you can redistribute it and/or modify`
			`; it under the terms of the GNU Lesser General Public License as published`
			`; by the Free Software Foundation; either version 2.1 of the License, or (at`
			`; your option) any later version.`

			`; The MPIR Library is distributed in the hope that it will be useful, but`
			`; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY`
			`; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public`
			`; License for more details.`

			`; You should have received a copy of the GNU Lesser General Public License`
			`; along with the MPIR Library; see the file COPYING.LIB. If not, write`
			`; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,`
			`; Boston, MA 02110-1301, USA.`

			`; mp_limb_t mpn_lshift1(mp_ptr Op2, mp_srcptr Op1, mp_size_t Size1 )`
			`; Linux RAX RDI RSI RDX`
			`; Win7 RAX RCX RDX R8`
			`;`
			`; Description:`
			`; The function shifts Op1 left by one bit, stores the result in Op2 (non-`
			`; destructive shl) and hands back the shifted-out most significant bit of Op1.`
			`; The function operates decreasing in memory supporting in-place operation.`
			`;`
			`; Caveats:`
			`; - the AVX version uses mnemonics only available on Haswell, Broadwell and`
			`; Skylake cores`
			`; - the behaviour of cache prefetching in combination with AVX shifting seems`
			`; somewhat erratic`
			`; - slight (a few clock cycles) degradation for 1/2 LD1$ sizes`
			`; - slight (a few percent) improvement for full LD1$ sizes`
			`; - substantial (>10%) improvement for 1/2 LD2$ sizes`
			`; - slight (a few percent) improvement for full LD2$ sizes`
			`; - slight (a few percent) degradation for 1/2 LD3$ sizes`
			`; - substantial (around 10%) degradation for full LD3$ sizes`
			`;`
			`; Comments:`
			`; - implemented, tested and benched on 21.02.2016 by jn`
			`; - includes cache prefetching`

			`%define USE_WIN64`

			`%include 'yasm_mac.inc'`

			`BITS 64`

			`%ifdef USE_WIN64`

			`%define Op2 RCX`
			`%define Op1 RDX`
			`%define Size1 R8`
			`%define Limb1 R9`
			`%define Limb2 R10`
			`%define Offs -512 ; used direct def. to stay in Win scratch regs`

			`%define ShlDL0 XMM2 ; ATTN: this must match ShlQL0 definition`
			`%define ShrDL0 XMM3 ; ATTN: this must match ShrQL0 definition`

			`%define QLimb0 YMM0`
			`%define QLimb1 YMM1`
			`%define ShlQL0 YMM2`
			`%define ShrQL0 YMM3`
			`%define ShlQL1 YMM4`
			`%define ShrQL1 YMM5`

			`%else`

			`%define Op2 RDI`
			`%define Op1 RSI`
			`%define Size1 RDX`
			`%define Limb1 R8`
			`%define Limb2 R9`
			`%define Offs -512 ; used direct def. to stay in Win scratch regs`

			`%define ShlDL0 XMM2 ; ATTN: this must match ShlQL0 definition`
			`%define ShrDL0 XMM3 ; ATTN: this must match ShrQL0 definition`

			`%define QLimb0 YMM0`
			`%define QLimb1 YMM1`
			`%define ShlQL0 YMM2`
			`%define ShrQL0 YMM3`
			`%define ShlQL1 YMM4`
			`%define ShrQL1 YMM5`

			`%endif`

			`align 32`

			`LEAF_PROC mpn_lshift1`

			`xor EAX, EAX`
			`sub Size1, 1`
			`jc .Exit ;ajs:notshortform ; Size1=0 =>`

			`lea Op1, [Op1+8*Size1]`
			`lea Op2, [Op2+8*Size1]`

			`mov Limb1, [Op1]`
			`shld RAX, Limb1, 1`

			`or Size1, Size1`
			`je .lShl1EquPost ;ajs:notshortform ; Size1=1 =>`

			`cmp Size1, 8`
			`jc .lShl1EquFour ;ajs:notshortform ; AVX inefficient =>`

			`; first align Op2 to 32 bytes`
			`test Op2, 8`
			`jne .lShl1EquA16`

			`mov Limb2, [Op1-8]`
			`shld Limb1, Limb2, 1`
			`mov [Op2], Limb1`
			`mov Limb1, Limb2`

			`sub Op1, 8`
			`sub Op2, 8`
			`sub Size1, 1`

			`.lShl1EquA16:`

			`test Op2, 16`
			`jne .lShl1EquAVX`

			`mov Limb2, [Op1-8]`
			`shld Limb1, Limb2, 1`
			`mov [Op2], Limb1`
			`mov Limb1, [Op1-16]`
			`shld Limb2, Limb1, 1`
			`mov [Op2-8], Limb2`

			`sub Op1, 16`
			`sub Op2, 16`
			`sub Size1, 2`

			`.lShl1EquAVX:`

			`; pre-fetch first quad-limb`
			`vmovdqu QLimb0, [Op1-24]`
			`vpsrlq ShrQL0, QLimb0, 63`
			`vpermq ShrQL0, ShrQL0, 147 ; 0b10010011`

			`sub Op1, 32`
			`sub Size1, 4`
			`jmp .lShl1EquAVXCheck`

			`; main loop requires on entry:`
			`; - 0.60 cycles per limb in LD1$`
			`; - 0.60-0.75 cycles per limb in LD2$`
			`; - 0.75-1.00 cycles per limb in LD3$`
			`align 16`
			`.lShl1EquAVXLoop:`

			`%ifdef USE_PREFETCH`
			`prefetchnta [Op1+Offs]`
			`%endif`

			`vmovdqu QLimb1, [Op1-24]`
			`vpsllq ShlQL0, QLimb0, 1`
			`vmovdqu QLimb0, [Op1-56]`
			`vpsrlq ShrQL1, QLimb1, 63`
			`vpermq ShrQL1, ShrQL1, 147 ; 0b10010011`
			`vpblendd ShrQL0, ShrQL0, ShrQL1, 3 ; 0b00000011`
			`vpor ShlQL0, ShlQL0, ShrQL0`
			`vpsllq ShlQL1, QLimb1, 1`
			`vpsrlq ShrQL0, QLimb0, 63`
			`vpermq ShrQL0, ShrQL0, 147 ; 0b10010011`
			`vpblendd ShrQL1, ShrQL1, ShrQL0, 3 ; 0b00000011`
			`vmovdqa [Op2-24], ShlQL0`
			`vpor ShlQL1, ShlQL1, ShrQL1`
			`vmovdqa [Op2-56], ShlQL1`

			`sub Op1, 64`
			`sub Op2, 64`

			`.lShl1EquAVXCheck:`

			`sub Size1, 8`
			`jnc .lShl1EquAVXLoop`

			`mov Limb2, [Op1]`
			`mov Limb1, Limb2`
			`shr Limb2, 63`
			`%if 1`
			`vmovq ShlDL0, Limb2`
			`vpblendd ShrQL0, ShrQL0, ShlQL0, 3`
			`%else`
			`; I am mixing in a single SSE4.1 instruction into otherwise pure AVX2`
			`; this is generating stalls on Haswell & Broadwell architecture (Agner Fog)`
			`; but it is only executed once and there is no AVX2 based alternative`

			`; Insert value of Limb2 into the 0-th qword of ShrDL0`
			`pinsrq ShrDL0, Limb2, 0 ; SSE4.1`
			`%endif`
			`vpsllq ShlQL0, QLimb0, 1`
			`vpor ShlQL0, ShlQL0, ShrQL0`
			`vmovdqa [Op2-24], ShlQL0`

			`sub Op2, 32`
			`add Size1, 8`

			`; shift remaining max. 7 limbs with SHLD mnemonic`
			`.lShl1EquFour:`

			`sub Op1, 8`
			`test Size1, 4`
			`je .lShl1EquTwo`

			`mov Limb2, [Op1]`
			`shld Limb1, Limb2, 1`
			`mov [Op2], Limb1`
			`mov Limb1, [Op1-8]`
			`shld Limb2, Limb1, 1`
			`mov [Op2-8], Limb2`
			`mov Limb2, [Op1-16]`
			`shld Limb1, Limb2, 1`
			`mov [Op2-16], Limb1`
			`mov Limb1, [Op1-24]`
			`shld Limb2, Limb1, 1`
			`mov [Op2-24], Limb2`

			`sub Op1, 32`
			`sub Op2, 32`

			`.lShl1EquTwo:`

			`test Size1, 2`
			`je .lShl1EquOne`

			`mov Limb2, [Op1]`
			`shld Limb1, Limb2, 1`
			`mov [Op2], Limb1`
			`mov Limb1, [Op1-8]`
			`shld Limb2, Limb1, 1`
			`mov [Op2-8], Limb2`

			`sub Op1, 16`
			`sub Op2, 16`

			`.lShl1EquOne:`

			`test Size1, 1`
			`je .lShl1EquPost`

			`mov Limb2, [Op1]`
			`shld Limb1, Limb2, 1`
			`mov [Op2], Limb1`
			`mov Limb1, Limb2`

			`sub Op2, 8`

			`.lShl1EquPost:`

			`shl Limb1, 1`
			`mov [Op2], Limb1`

			`.Exit:`

			`vzeroupper`
			`ret`
			`.end:`