; mpir/mpn/x86_64w/skylake/com_n.asm


;  AMD64 mpn_com_n
;  Copyright 2016 Jens Nurmann and Alexander Kruppa
;  This file is part of the MPIR Library.
;  The MPIR Library is free software; you can redistribute it and/or modify
;  it under the terms of the GNU Lesser General Public License as published
;  by the Free Software Foundation; either version 2.1 of the License, or (at
;  your option) any later version.
;  The MPIR Library is distributed in the hope that it will be useful, but
;  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
;  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
;  License for more details.
;  You should have received a copy of the GNU Lesser General Public License
;  along with the MPIR Library; see the file COPYING.LIB.  If not, write
;  to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
;  Boston, MA 02110-1301, USA.

; USE_WIN64 is defined unconditionally here, so the Win64 register mapping
; below is always the one selected in this file.
%define USE_WIN64

BITS 64

%include 'yasm_mac.inc'

; General-purpose register roles used by mpn_com_n:
;   Op2   - destination pointer (written, and aligned to 32 bytes by the code)
;   Op1   - source pointer (read)
;   Size1 - number of limbs still to process
;   Limb  - scratch register holding one 64-bit limb
;   Offs  - holds the main-loop pointer increment (set to 128)
;   FFFF  - all-ones mask (set to -1) used to complement a limb via xor
%ifdef USE_WIN64
    ; Win64 mapping: RCX, RDX, R8 carry the first three integer arguments
    %define Op2     RCX
    %define Op1     RDX
    %define Size1   R8
    %define Limb    R9
    %define Offs    R10
    %define FFFF    R11
%else
    ; System V mapping: RDI, RSI, RDX carry the first three integer arguments
    %define Op2     RDI
    %define Op1     RSI
    %define Size1   RDX
    %define Limb    RCX
    %define Offs    R10
    %define FFFF    R8
%endif

; Vector register roles:
;   DLimb0         - 128-bit data register (XMM0, low half of QLimb0)
;   QLimb0..QLimb3 - 256-bit data registers, four limbs each
;   QFFFF / DFFFF  - 256-bit all-ones mask and its 128-bit low half
%define DLimb0  XMM0
%define QLimb0  YMM0
%define QLimb1  YMM1
%define QLimb2  YMM2
%define QLimb3  YMM3
%define QFFFF   YMM4
%define DFFFF   XMM4

    ; start the following routine on a 32-byte boundary
    align   32

;-----------------------------------------------------------------------
; mpn_com_n - store the one's complement of every source limb
;
; In:    Op2   = destination limb pointer
;        Op1   = source limb pointer
;        Size1 = number of 64-bit limbs (0 is accepted)
; Out:   dest[i] = NOT src[i] for 0 <= i < Size1; no register result
; Clobb: Op1, Op2, Size1, Limb, Offs, FFFF, YMM0-YMM4, flags
; Note:  the aligned 32-byte stores rely on the destination being at
;        least 8-byte aligned on entry; the prologue only corrects
;        bits 3 and 4 of the destination address.
;        All vector instructions are VEX encoded, so the routine never
;        mixes legacy SSE with AVX.
;-----------------------------------------------------------------------
LEAF_PROC mpn_com_n
    test    Size1, Size1
    je      .Exit      ;ajs:notshortform
                       ; size=0 => nothing to do

    ; Set a GPR to 0xFF...FF
    mov     FFFF, -1

    ; align the destination (Op2) to 32 byte
    ; step 1: one limb if bit 3 of the destination address is set
    test    Op2, 8
    je      .A32

    mov     Limb, [Op1]
    xor     Limb, FFFF
    mov     [Op2], Limb
    dec     Size1
    je      .Exit      ;ajs:notshortform

    add     Op1, 8
    add     Op2, 8

  .A32:

    ; step 2: two limbs if bit 4 of the destination address is set;
    ; the limb count may run out after either of them
    test    Op2, 16
    je      .AVX

    mov     Limb, [Op1]
    xor     Limb, FFFF
    mov     [Op2], Limb
    dec     Size1
    je      .Exit       ;ajs:notshortform

    mov     Limb, [Op1+8]
    xor     Limb, FFFF
    mov     [Op2+8], Limb
    dec     Size1
    je      .Exit       ;ajs:notshortform

    add     Op1, 16
    add     Op2, 16

  .AVX:

    ; Set an AVX2 reg to 0xFF...FF
    ; (vmovq instead of movq keeps the routine free of legacy SSE)
    vmovq   DFFFF, FFFF
    vbroadcastsd QFFFF, DFFFF

    ; pointer increment of the main loop: 16 limbs = 128 bytes
    mov     Offs, 128
    jmp     .AVXCheck

    ; main loop, 16 limbs per iteration: unaligned loads from the source,
    ; aligned stores to the destination
    ; timings noted by the original authors
    ; (prefetching disabled; unloaded cache)
    ; - lCpyInc is slightly slower than lCpyDec through all cache levels?!
    ; - 0.30      cycles / limb in L1$
    ; - 0.60      cycles / limb in L2$
    ; - 0.70-0.90 cycles / limb in L3$
    align   16
  .AVXLoop:

    vmovdqu QLimb0, [Op1]
    vpxor   QLimb0, QLimb0, QFFFF
    vmovdqu QLimb1, [Op1+32]
    vpxor   QLimb1, QLimb1, QFFFF
    vmovdqu QLimb2, [Op1+64]
    vpxor   QLimb2, QLimb2, QFFFF
    vmovdqu QLimb3, [Op1+96]
    vpxor   QLimb3, QLimb3, QFFFF
    vmovdqa [Op2], QLimb0
    vmovdqa [Op2+32], QLimb1
    vmovdqa [Op2+64], QLimb2
    vmovdqa [Op2+96], QLimb3

    add     Op1, Offs
    add     Op2, Offs

  .AVXCheck:

    ; loop while at least 16 limbs remain (no borrow from the subtraction)
    sub     Size1, 16
    jnc     .AVXLoop

    ; undo the last subtraction: Size1 = remaining limbs, 0..15
    add     Size1, 16
    je      .Exit ;ajs:notshortform
                  ; main loop handled the operand fully =>

    ; complement the remaining max. 15 limb, one bit of Size1 at a time
    test    Size1, 8
    je      .Four

    vmovdqu QLimb0, [Op1]
    vpxor   QLimb0, QLimb0, QFFFF
    vmovdqu QLimb1, [Op1+32]
    vpxor   QLimb1, QLimb1, QFFFF
    vmovdqa [Op2], QLimb0
    vmovdqa [Op2+32], QLimb1

    add     Op1, 64
    add     Op2, 64

  .Four:

    test    Size1, 4
    je      .Two

    vmovdqu QLimb0, [Op1]
    vpxor   QLimb0, QLimb0, QFFFF
    vmovdqa [Op2], QLimb0

    add     Op1, 32
    add     Op2, 32

  .Two:

    test    Size1, 2
    je      .One

%if 1
    ; two limbs through the GPR
    ; (original note: avoid SSE2 instruction due to stall on Haswell)
    mov     Limb, [Op1]
    xor     Limb, FFFF
    mov     [Op2], Limb
    mov     Limb, [Op1+8]
    xor     Limb, FFFF
    mov     [Op2+8], Limb
%else
    ; disabled alternative: one 128-bit vector operation, VEX encoded
    vmovdqu DLimb0, [Op1]
    vpxor   DLimb0, DLimb0, DFFFF
    vmovdqa [Op2], DLimb0
%endif

    add     Op1, 16
    add     Op2, 16

  .One:

    test    Size1, 1
    je      .Exit

    mov     Limb, [Op1]
    xor     Limb, FFFF
    mov     [Op2], Limb

  .Exit:

    ; clear the upper YMM halves before returning to the caller
    vzeroupper
    ret
.end:
add assembler code for haswell, skylake and skylake_avx to the WIn64 build 2016-12-06 07:01:20 -05:00
Add the latest haswell and skylake code to the Windows x64 build 2016-12-10 09:15:40 -05:00			`; AMD64 mpn_com_n`
			`; Copyright 2016 Jens Nurmann and Alexander Kruppa`
add assembler code for haswell, skylake and skylake_avx to the WIn64 build 2016-12-06 07:01:20 -05:00			`; This file is part of the MPIR Library.`
			`; The MPIR Library is free software; you can redistribute it and/or modify`
			`; it under the terms of the GNU Lesser General Public License as published`
			`; by the Free Software Foundation; either version 2.1 of the License, or (at`
			`; your option) any later version.`
			`; The MPIR Library is distributed in the hope that it will be useful, but`
			`; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY`
			`; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public`
			`; License for more details.`
			`; You should have received a copy of the GNU Lesser General Public License`
			`; along with the MPIR Library; see the file COPYING.LIB. If not, write`
			`; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,`
			`; Boston, MA 02110-1301, USA.`

Add the latest haswell and skylake code to the Windows x64 build 2016-12-10 09:15:40 -05:00			`%define USE_WIN64`

			`BITS 64`
add assembler code for haswell, skylake and skylake_avx to the WIn64 build 2016-12-06 07:01:20 -05:00
			`%include 'yasm_mac.inc'`

Add the latest haswell and skylake code to the Windows x64 build 2016-12-10 09:15:40 -05:00			`%ifdef USE_WIN64`
			`%define Op2 RCX`
			`%define Op1 RDX`
			`%define Size1 R8`
			`%define Limb R9`
			`%define Offs R10`
			`%define FFFF R11`
			`%else`
			`%define Op2 RDI`
			`%define Op1 RSI`
			`%define Size1 RDX`
			`%define Limb RCX`
			`%define Offs R10`
			`%define FFFF R8`
			`%endif`

			`%define DLimb0 XMM0`
			`%define QLimb0 YMM0`
			`%define QLimb1 YMM1`
			`%define QLimb2 YMM2`
			`%define QLimb3 YMM3`
			`%define QFFFF YMM4`
			`%define DFFFF XMM4`

			`align 32`

			`LEAF_PROC mpn_com_n`
			`mov RAX, Size1`
			`or RAX, RAX`
			`je .Exit ;ajs:notshortform`
			`; size=0 =>`

			`; Set a GPR to 0xFF...FF`
			`mov FFFF, -1`

			`; align the destination (Op2) to 32 byte`
			`test Op2, 8`
			`je .A32`

			`mov Limb, [Op1]`
			`xor Limb, FFFF`
			`mov [Op2], Limb`
			`dec Size1`
			`je .Exit ;ajs:notshortform`

			`add Op1, 8`
			`add Op2, 8`

			`.A32:`

			`test Op2, 16`
			`je .AVX`

			`mov Limb, [Op1]`
			`xor Limb, FFFF`
			`mov [Op2], Limb`
			`dec Size1`
			`je .Exit ;ajs:notshortform`

			`mov Limb, [Op1+8]`
			`xor Limb, FFFF`
			`mov [Op2+8], Limb`
			`dec Size1`
			`je .Exit ;ajs:notshortform`

			`add Op1, 16`
			`add Op2, 16`

			`.AVX:`

			`; Set an AVX2 reg to 0xFF...FF`
			`movq DFFFF, FFFF`
			`vbroadcastsd QFFFF, DFFFF`

			`mov Offs, 128`
			`jmp .AVXCheck`

			`; main loop (prefetching disabled; unloaded cache)`
			`; - lCpyInc is slightly slower than lCpyDec through all cache levels?!`
			`; - 0.30 cycles / limb in L1$`
			`; - 0.60 cycles / limb in L2$`
			`; - 0.70-0.90 cycles / limb in L3$`
			`align 16`
			`.AVXLoop:`

			`vmovdqu QLimb0, [Op1]`
			`vpxor QLimb0, QLimb0, QFFFF`
			`vmovdqu QLimb1, [Op1+32]`
			`vpxor QLimb1, QLimb1, QFFFF`
			`vmovdqu QLimb2, [Op1+64]`
			`vpxor QLimb2, QLimb2, QFFFF`
			`vmovdqu QLimb3, [Op1+96]`
			`vpxor QLimb3, QLimb3, QFFFF`
			`vmovdqa [Op2], QLimb0`
			`vmovdqa [Op2+32], QLimb1`
			`vmovdqa [Op2+64], QLimb2`
			`vmovdqa [Op2+96], QLimb3`

			`add Op1, Offs`
			`add Op2, Offs`

			`.AVXCheck:`

			`sub Size1, 16`
			`jnc .AVXLoop`

			`add Size1, 16`
			`je .Exit ;ajs:notshortform`
			`; AVX copied operand fully =>`

			`; copy remaining max. 15 limb`
			`test Size1, 8`
			`je .Four`

			`vmovdqu QLimb0, [Op1]`
			`vpxor QLimb0, QLimb0, QFFFF`
			`vmovdqu QLimb1, [Op1+32]`
			`vpxor QLimb1, QLimb1, QFFFF`
			`vmovdqa [Op2], QLimb0`
			`vmovdqa [Op2+32], QLimb1`

			`add Op1, 64`
			`add Op2, 64`

			`.Four:`

			`test Size1, 4`
			`je .Two`

			`vmovdqu QLimb0, [Op1]`
			`vpxor QLimb0, QLimb0, QFFFF`
			`vmovdqa [Op2], QLimb0`

			`add Op1, 32`
			`add Op2, 32`

			`.Two:`

			`test Size1, 2`
			`je .One`

			`%if 1`
			`; Avoid SSE2 instruction due to stall on Haswell`
			`mov Limb, [Op1]`
			`xor Limb, FFFF`
			`mov [Op2], Limb`
			`mov Limb, [Op1+8]`
			`xor Limb, FFFF`
			`mov [Op2+8], Limb`
			`%else`
			`movdqu DLimb0, [Op1]`
			`pxor DLimb0, DLimb0, DFFFF`
			`movdqa [Op2], DLimb0`
			`%endif`

			`add Op1, 16`
			`add Op2, 16`

			`.One:`

			`test Size1, 1`
			`je .Exit`

			`mov Limb, [Op1]`
			`xor Limb, FFFF`
			`mov [Op2], Limb`

			`.Exit:`

			`vzeroupper`
			`ret`
			`.end:`