mpir/mpn/x86_64w/skylake/avx/iorn_n.asm

;  AVX mpn_iorn_n
;
;  Copyright 2017 Jens Nurmann
;
;  This file is part of the MPIR Library.
;  The MPIR Library is free software; you can redistribute it and/or modify
;  it under the terms of the GNU Lesser General Public License as published
;  by the Free Software Foundation; either version 2.1 of the License, or (at
;  your option) any later version.
;  The MPIR Library is distributed in the hope that it will be useful, but
;  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
;  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
;  License for more details.
;  You should have received a copy of the GNU Lesser General Public License
;  along with the MPIR Library; see the file COPYING.LIB.  If not, write
;  to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
;  Boston, MA 02110-1301, USA.

;   (rdi,rcx) = (rsi,rcx) and not (rdx,rcx)

; There is no initial pointer alignment lead in code below. The argument
; why not is based on some statistical reasoning and measurement points.
; All statements below strictly refer to the Intel i6xxx (Skylake) 
; microarchitecture.

; The function is intended to be used with arbitrary pointer alignment on
; entry. That is there are 8 possible cases to consider:

; - A: 1 x all pointers mis-aligned (mod 32 byte)
; - B: 3 x one pointer aligned (mod 32 byte))
; - C: 3 x two pointers aligned (mod 32 byte)
; - D: 1 x all pointers aligned (mod 32 byte)

; All sub cases under B show equivalent performance, as do all sub cases of
; C. B is 7% faster than A, C is 11% faster than A and D is 39% faster than A.

; To do a proper alignment would require a complex decision tree to allways 
; advance the alignment situation in the best possible manner - e.g. pointer
; 1 is off by 8 while pointer 2 & 3 are off by 16. To do the alignment
; requires some arith and at least one branch in the function proloque - a
; reasonable impact for small sized operands. And all this for a small gain
; (around 6% all summed up) in the average case.

; In a specific application scenario this might be the wrong choice.

; The execution speed of VMOVDQU is equivalent to VMOVDQA in case of aligned
; pointers. This may be different for earlier generations of Intel core 
; architectures like Broadwell, Haswell, ...

; cycles per limb with all operands aligned and in:

;                   LD1$      LD2$
;   Haswell         ???       ???
;   Broadwell       ???       ???
;   Skylake         0.29-0.31 0.39-0.40

%include 'yasm_mac.inc'

; definition according to Linux 64 bit ABI

%define ResP    RCX
%define Src1P   RDX
%define Src2P    R8
%define Size     R9
%define SizeD   R9D
%define Count   RAX
%define CountD  EAX
%define Limb0   R10
%define Limb0D R10D
%define QLimb0 YMM0
%define	QLimb1 YMM1

    align   32
    BITS    64

LEAF_PROC   mpn_iorn_n

    mov     CountD, 3
    mov     Limb0, Size
    sub     Count, Size
    jnc     .PostGPR            ; dispatch size 0-3 immediately

    vpcmpeqq QLimb1, QLimb1, QLimb1

    mov     SizeD, 3
    shr     Limb0, 2
    or      Count, -4
    sub     Size, Limb0
    jnc     .PostAVX            ; dispatch size 4, 8 & 12 immediately

    mov     Limb0D, 128

  .Loop:

    vpxor   QLimb0, QLimb1, [Src2P]
    vpor    QLimb0, QLimb0, [Src1P]
    vmovdqu [ResP], QLimb0
    vpxor   QLimb0, QLimb1, [Src2P+32]
    vpor    QLimb0, QLimb0, [Src1P+32]
    vmovdqu [ResP+32], QLimb0
    vpxor   QLimb0, QLimb1, [Src2P+64]
    vpor    QLimb0, QLimb0, [Src1P+64]
    vmovdqu [ResP+64], QLimb0
    vpxor   QLimb0, QLimb1, [Src2P+96]
    vpor    QLimb0, QLimb0, [Src1P+96]
    vmovdqu [ResP+96], QLimb0

    lea     Src2P, [Src2P+Limb0]
    lea     Src1P, [Src1P+Limb0]
    lea     ResP, [ResP+Limb0]

    add     Size, 4
    jnc     .Loop

  .PostAVX:

    mov     Limb0D, 0           ; to allow pointer correction on exit
    cmp     Size, 2             ; fastest way to dispatch values 0-3
    ja      .PostAVX0
    je      .PostAVX1
    jp      .PostAVX2

  .PostAVX3:

    add     Limb0, 32
    vpxor   QLimb0, QLimb1, [Src2P+64]
    vpor    QLimb0, QLimb0, [Src1P+64]
    vmovdqu [ResP+64], QLimb0

  .PostAVX2:

    add     Limb0, 32
    vpxor   QLimb0, QLimb1, [Src2P+32]
    vpor    QLimb0, QLimb0, [Src1P+32]
    vmovdqu [ResP+32], QLimb0

  .PostAVX1:

    add     Limb0, 32
    vpxor   QLimb0, QLimb1, [Src2P]
    vpor    QLimb0, QLimb0, [Src1P]
    vmovdqu [ResP], QLimb0

  .PostAVX0:

    add     Src2P, Limb0
    add     Src1P, Limb0
    add     ResP, Limb0
    add     Count, 4

  .PostGPR:

    cmp     Count, 2            ; fastest way to dispatch values 0-3
    ja      .Exit
    je      .PostGPR1
    jp      .PostGPR2

  .PostGPR3:

    mov     Limb0, [Src2P+16]
    not     Limb0
    or      Limb0, [Src1P+16]
    mov     [ResP+16], Limb0

  .PostGPR2:

    mov     Limb0, [Src2P+8]
    not     Limb0
    or      Limb0, [Src1P+8]
    mov     [ResP+8], Limb0

  .PostGPR1:

    mov     Limb0, [Src2P]
    not     Limb0
    or      Limb0, [Src1P]
    mov     [ResP], Limb0

  .Exit:

    ret
add assembler support for mpn logic functions on Windows x64 2017-05-23 09:52:29 -04:00			`; AVX mpn_iorn_n`
			`;`
			`; Copyright 2017 Jens Nurmann`
			`;`
			`; This file is part of the MPIR Library.`
			`; The MPIR Library is free software; you can redistribute it and/or modify`
			`; it under the terms of the GNU Lesser General Public License as published`
			`; by the Free Software Foundation; either version 2.1 of the License, or (at`
			`; your option) any later version.`
			`; The MPIR Library is distributed in the hope that it will be useful, but`
			`; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY`
			`; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public`
			`; License for more details.`
			`; You should have received a copy of the GNU Lesser General Public License`
			`; along with the MPIR Library; see the file COPYING.LIB. If not, write`
			`; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,`
			`; Boston, MA 02110-1301, USA.`

add AVX mpn_and_n assembler to Windows x64; remove definition error in mpn_andn_n and mpn_iorn_n 2017-05-24 04:23:11 -04:00			`; (rdi,rcx) = (rsi,rcx) and not (rdx,rcx)`
add assembler support for mpn logic functions on Windows x64 2017-05-23 09:52:29 -04:00
			`; There is no initial pointer alignment lead in code below. The argument`
			`; why not is based on some statistical reasoning and measurement points.`
			`; All statements below strictly refer to the Intel i6xxx (Skylake)`
			`; microarchitecture.`

			`; The function is intended to be used with arbitrary pointer alignment on`
			`; entry. That is there are 8 possible cases to consider:`

			`; - A: 1 x all pointers mis-aligned (mod 32 byte)`
			`; - B: 3 x one pointer aligned (mod 32 byte))`
			`; - C: 3 x two pointers aligned (mod 32 byte)`
			`; - D: 1 x all pointers aligned (mod 32 byte)`

			`; All sub cases under B show equivalent performance, as do all sub cases of`
			`; C. B is 7% faster than A, C is 11% faster than A and D is 39% faster than A.`

			`; To do a proper alignment would require a complex decision tree to allways`
			`; advance the alignment situation in the best possible manner - e.g. pointer`
			`; 1 is off by 8 while pointer 2 & 3 are off by 16. To do the alignment`
			`; requires some arith and at least one branch in the function proloque - a`
			`; reasonable impact for small sized operands. And all this for a small gain`
			`; (around 6% all summed up) in the average case.`

			`; In a specific application scenario this might be the wrong choice.`

			`; The execution speed of VMOVDQU is equivalent to VMOVDQA in case of aligned`
			`; pointers. This may be different for earlier generations of Intel core`
			`; architectures like Broadwell, Haswell, ...`

			`; cycles per limb with all operands aligned and in:`

			`; LD1$ LD2$`
			`; Haswell ??? ???`
			`; Broadwell ??? ???`
			`; Skylake 0.29-0.31 0.39-0.40`

			`%include 'yasm_mac.inc'`

			`; definition according to Linux 64 bit ABI`

			`%define ResP RCX`
			`%define Src1P RDX`
			`%define Src2P R8`
			`%define Size R9`
			`%define SizeD R9D`
			`%define Count RAX`
			`%define CountD EAX`
			`%define Limb0 R10`
			`%define Limb0D R10D`
			`%define QLimb0 YMM0`
			`%define QLimb1 YMM1`

			`align 32`
			`BITS 64`

			`LEAF_PROC mpn_iorn_n`

			`mov CountD, 3`
			`mov Limb0, Size`
			`sub Count, Size`
			`jnc .PostGPR ; dispatch size 0-3 immediately`

			`vpcmpeqq QLimb1, QLimb1, QLimb1`

			`mov SizeD, 3`
			`shr Limb0, 2`
			`or Count, -4`
			`sub Size, Limb0`
			`jnc .PostAVX ; dispatch size 4, 8 & 12 immediately`

			`mov Limb0D, 128`

			`.Loop:`

add AVX mpn_and_n assembler to Windows x64; remove definition error in mpn_andn_n and mpn_iorn_n 2017-05-24 04:23:11 -04:00			`vpxor QLimb0, QLimb1, [Src2P]`
			`vpor QLimb0, QLimb0, [Src1P]`
add assembler support for mpn logic functions on Windows x64 2017-05-23 09:52:29 -04:00			`vmovdqu [ResP], QLimb0`
add AVX mpn_and_n assembler to Windows x64; remove definition error in mpn_andn_n and mpn_iorn_n 2017-05-24 04:23:11 -04:00			`vpxor QLimb0, QLimb1, [Src2P+32]`
			`vpor QLimb0, QLimb0, [Src1P+32]`
add assembler support for mpn logic functions on Windows x64 2017-05-23 09:52:29 -04:00			`vmovdqu [ResP+32], QLimb0`
add AVX mpn_and_n assembler to Windows x64; remove definition error in mpn_andn_n and mpn_iorn_n 2017-05-24 04:23:11 -04:00			`vpxor QLimb0, QLimb1, [Src2P+64]`
			`vpor QLimb0, QLimb0, [Src1P+64]`
add assembler support for mpn logic functions on Windows x64 2017-05-23 09:52:29 -04:00			`vmovdqu [ResP+64], QLimb0`
add AVX mpn_and_n assembler to Windows x64; remove definition error in mpn_andn_n and mpn_iorn_n 2017-05-24 04:23:11 -04:00			`vpxor QLimb0, QLimb1, [Src2P+96]`
			`vpor QLimb0, QLimb0, [Src1P+96]`
add assembler support for mpn logic functions on Windows x64 2017-05-23 09:52:29 -04:00			`vmovdqu [ResP+96], QLimb0`

			`lea Src2P, [Src2P+Limb0]`
add AVX mpn_and_n assembler to Windows x64; remove definition error in mpn_andn_n and mpn_iorn_n 2017-05-24 04:23:11 -04:00			`lea Src1P, [Src1P+Limb0]`
add assembler support for mpn logic functions on Windows x64 2017-05-23 09:52:29 -04:00			`lea ResP, [ResP+Limb0]`

			`add Size, 4`
			`jnc .Loop`

			`.PostAVX:`

			`mov Limb0D, 0 ; to allow pointer correction on exit`
			`cmp Size, 2 ; fastest way to dispatch values 0-3`
			`ja .PostAVX0`
			`je .PostAVX1`
			`jp .PostAVX2`

			`.PostAVX3:`

			`add Limb0, 32`
add AVX mpn_and_n assembler to Windows x64; remove definition error in mpn_andn_n and mpn_iorn_n 2017-05-24 04:23:11 -04:00			`vpxor QLimb0, QLimb1, [Src2P+64]`
			`vpor QLimb0, QLimb0, [Src1P+64]`
add assembler support for mpn logic functions on Windows x64 2017-05-23 09:52:29 -04:00			`vmovdqu [ResP+64], QLimb0`

			`.PostAVX2:`

			`add Limb0, 32`
add AVX mpn_and_n assembler to Windows x64; remove definition error in mpn_andn_n and mpn_iorn_n 2017-05-24 04:23:11 -04:00			`vpxor QLimb0, QLimb1, [Src2P+32]`
			`vpor QLimb0, QLimb0, [Src1P+32]`
add assembler support for mpn logic functions on Windows x64 2017-05-23 09:52:29 -04:00			`vmovdqu [ResP+32], QLimb0`

			`.PostAVX1:`

			`add Limb0, 32`
add AVX mpn_and_n assembler to Windows x64; remove definition error in mpn_andn_n and mpn_iorn_n 2017-05-24 04:23:11 -04:00			`vpxor QLimb0, QLimb1, [Src2P]`
			`vpor QLimb0, QLimb0, [Src1P]`
add assembler support for mpn logic functions on Windows x64 2017-05-23 09:52:29 -04:00			`vmovdqu [ResP], QLimb0`

			`.PostAVX0:`

			`add Src2P, Limb0`
add AVX mpn_and_n assembler to Windows x64; remove definition error in mpn_andn_n and mpn_iorn_n 2017-05-24 04:23:11 -04:00			`add Src1P, Limb0`
add assembler support for mpn logic functions on Windows x64 2017-05-23 09:52:29 -04:00			`add ResP, Limb0`
			`add Count, 4`

			`.PostGPR:`

			`cmp Count, 2 ; fastest way to dispatch values 0-3`
			`ja .Exit`
			`je .PostGPR1`
			`jp .PostGPR2`

			`.PostGPR3:`

add AVX mpn_and_n assembler to Windows x64; remove definition error in mpn_andn_n and mpn_iorn_n 2017-05-24 04:23:11 -04:00			`mov Limb0, [Src2P+16]`
add assembler support for mpn logic functions on Windows x64 2017-05-23 09:52:29 -04:00			`not Limb0`
add AVX mpn_and_n assembler to Windows x64; remove definition error in mpn_andn_n and mpn_iorn_n 2017-05-24 04:23:11 -04:00			`or Limb0, [Src1P+16]`
add assembler support for mpn logic functions on Windows x64 2017-05-23 09:52:29 -04:00			`mov [ResP+16], Limb0`

			`.PostGPR2:`

add AVX mpn_and_n assembler to Windows x64; remove definition error in mpn_andn_n and mpn_iorn_n 2017-05-24 04:23:11 -04:00			`mov Limb0, [Src2P+8]`
add assembler support for mpn logic functions on Windows x64 2017-05-23 09:52:29 -04:00			`not Limb0`
add AVX mpn_and_n assembler to Windows x64; remove definition error in mpn_andn_n and mpn_iorn_n 2017-05-24 04:23:11 -04:00			`or Limb0, [Src1P+8]`
add assembler support for mpn logic functions on Windows x64 2017-05-23 09:52:29 -04:00			`mov [ResP+8], Limb0`

			`.PostGPR1:`

add AVX mpn_and_n assembler to Windows x64; remove definition error in mpn_andn_n and mpn_iorn_n 2017-05-24 04:23:11 -04:00			`mov Limb0, [Src2P]`
add assembler support for mpn logic functions on Windows x64 2017-05-23 09:52:29 -04:00			`not Limb0`
add AVX mpn_and_n assembler to Windows x64; remove definition error in mpn_andn_n and mpn_iorn_n 2017-05-24 04:23:11 -04:00			`or Limb0, [Src1P]`
add assembler support for mpn logic functions on Windows x64 2017-05-23 09:52:29 -04:00			`mov [ResP], Limb0`

			`.Exit:`

			`ret`