; AVX mpn_xnor_n
;
; Copyright 2017 Jens Nurmann
;
; This file is part of the MPIR Library.
;
; The MPIR Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Lesser General Public License as published
; by the Free Software Foundation; either version 2.1 of the License, or (at
; your option) any later version.
;
; The MPIR Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
; License for more details.
;
; You should have received a copy of the GNU Lesser General Public License
; along with the MPIR Library; see the file COPYING.LIB. If not, write
; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
; Boston, MA 02110-1301, USA.

; (rcx, r9) = not(rdx, r9) xor (r8, r9)
;
; There is no initial pointer-alignment lead-in code below. The argument
; against it is based on statistical reasoning and some measurement points.
; All statements below refer strictly to the Intel i6xxx (Skylake)
; microarchitecture.
;
; The function is intended to be used with arbitrary pointer alignment on
; entry. That is, there are 8 possible cases to consider:
;
; - A: 1 x all pointers mis-aligned (mod 32 byte)
; - B: 3 x one pointer aligned (mod 32 byte)
; - C: 3 x two pointers aligned (mod 32 byte)
; - D: 1 x all pointers aligned (mod 32 byte)
;
; All sub-cases under B show equivalent performance, as do all sub-cases of
; C. B is 7% faster than A, C is 11% faster than A and D is 39% faster than A.
;
; Doing proper alignment would require a complex decision tree to always
; advance the alignment situation in the best possible manner - e.g. pointer
; 1 is off by 8 while pointers 2 & 3 are off by 16. Doing the alignment
; requires some arithmetic and at least one branch in the function prologue -
; a noticeable impact for small operand sizes. And all this for a small gain
; (around 6% summed over all cases) on average. In a specific application
; scenario this might be the wrong choice.
;
; The execution speed of VMOVDQU is equivalent to that of VMOVDQA in the
; case of aligned pointers. This may be different for earlier generations of
; Intel core architectures like Broadwell, Haswell, ...
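; For reference, a minimal C sketch of the operation this routine implements
; (illustration only - the helper name xnor_n_ref and the plain uint64_t limb
; type are assumptions, not MPIR's actual declarations):
;
;     #include <stddef.h>
;     #include <stdint.h>
;
;     /* rp[i] = xnor(s1p[i], s2p[i]); note that ~a ^ b == ~(a ^ b) */
;     static void xnor_n_ref(uint64_t *rp, const uint64_t *s1p,
;                            const uint64_t *s2p, size_t n)
;     {
;         for (size_t i = 0; i < n; i++)
;             rp[i] = ~(s1p[i] ^ s2p[i]);
;     }
;
; The AVX loop below computes not(s1) xor s2 as (ones xor s1) xor s2, with
; the all-ones mask generated once by VPCMPEQQ into QLimb1.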
; cycles per limb with all operands aligned and in:
;
;              LD1$        LD2$
; Haswell      ???         ???
; Broadwell    ???         ???
; Skylake      0.29-0.31   0.39-0.40

%include 'yasm_mac.inc'

; register definitions according to the Windows 64-bit (x64) calling
; convention

%define ResP    RCX
%define Src1P   RDX
%define Src2P   R8
%define Size    R9
%define SizeD   R9D
%define Count   RAX
%define CountD  EAX
%define Limb0   R10
%define Limb0D  R10D
%define QLimb0  YMM0
%define QLimb1  YMM1

    align   32
    BITS    64

LEAF_PROC mpn_xnor_n
    mov     CountD, 3
    mov     Limb0, Size
    sub     Count, Size                 ; Count = 3 - n; no borrow iff n <= 3
    jnc     .PostGPR                    ; dispatch size 0-3 immediately

    vpcmpeqq QLimb1, QLimb1, QLimb1     ; all-ones mask, realizes the NOT
    mov     SizeD, 3
    shr     Limb0, 2                    ; Limb0 = number of full 4-limb blocks
    or      Count, -4                   ; Count = -(n mod 4) - 1 for the GPR tail
    sub     Size, Limb0                 ; Size = 3 - #blocks
    jnc     .PostAVX                    ; dispatch size 4, 8 & 12 immediately

    mov     Limb0D, 128                 ; bytes consumed per unrolled iteration

.Loop:
    vpxor   QLimb0, QLimb1, [Src1P]     ; not(src1)
    vpxor   QLimb0, QLimb0, [Src2P]     ; not(src1) xor src2
    vmovdqu [ResP], QLimb0
    vpxor   QLimb0, QLimb1, [Src1P+32]
    vpxor   QLimb0, QLimb0, [Src2P+32]
    vmovdqu [ResP+32], QLimb0
    vpxor   QLimb0, QLimb1, [Src1P+64]
    vpxor   QLimb0, QLimb0, [Src2P+64]
    vmovdqu [ResP+64], QLimb0
    vpxor   QLimb0, QLimb1, [Src1P+96]
    vpxor   QLimb0, QLimb0, [Src2P+96]
    vmovdqu [ResP+96], QLimb0
    lea     Src1P, [Src1P+Limb0]
    lea     Src2P, [Src2P+Limb0]
    lea     ResP, [ResP+Limb0]
    add     Size, 4
    jnc     .Loop

.PostAVX:
    mov     Limb0D, 0                   ; to allow pointer correction on exit
    cmp     Size, 2                     ; fastest way to dispatch values 0-3
    ja      .PostAVX0                   ; (see the flag table at the end of file)
    je      .PostAVX1
    jp      .PostAVX2

.PostAVX3:                              ; three 32-byte blocks remain
    add     Limb0, 32
    vpxor   QLimb0, QLimb1, [Src1P+64]
    vpxor   QLimb0, QLimb0, [Src2P+64]
    vmovdqu [ResP+64], QLimb0

.PostAVX2:                              ; two 32-byte blocks remain
    add     Limb0, 32
    vpxor   QLimb0, QLimb1, [Src1P+32]
    vpxor   QLimb0, QLimb0, [Src2P+32]
    vmovdqu [ResP+32], QLimb0

.PostAVX1:                              ; one 32-byte block remains
    add     Limb0, 32
    vpxor   QLimb0, QLimb1, [Src1P]
    vpxor   QLimb0, QLimb0, [Src2P]
    vmovdqu [ResP], QLimb0

.PostAVX0:
    add     Src1P, Limb0
    add     Src2P, Limb0
    add     ResP, Limb0
    add     Count, 4                    ; Count = 3 - (n mod 4)

.PostGPR:
    cmp     Count, 2                    ; fastest way to dispatch values 0-3
    ja      .Exit                       ; (see the flag table at the end of file)
    je      .PostGPR1
    jp      .PostGPR2

.PostGPR3:                              ; three tail limbs remain
    mov     Limb0, [Src1P+16]
    not     Limb0
    xor     Limb0, [Src2P+16]
    mov     [ResP+16], Limb0

.PostGPR2:                              ; two tail limbs remain
    mov     Limb0, [Src1P+8]
    not     Limb0
    xor     Limb0, [Src2P+8]
    mov     [ResP+8], Limb0

.PostGPR1:                              ; one tail limb remains
    mov     Limb0, [Src1P]
    not     Limb0
    xor     Limb0, [Src2P]
    mov     [ResP], Limb0

.Exit:
    ret
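; ----------------------------------------------------------------------------
; Flag table for the "cmp X, 2 / ja / je / jp" dispatch used at .PostAVX and
; .PostGPR. CMP sets the flags from X - 2, and PF holds the parity of the low
; 8 bits of that result (PF = 1 for an even number of one-bits), so the three
; branches plus the fall-through separate all values of X in 0..3:
;
;   X = 3: X - 2 = 0x01 -> CF = 0, ZF = 0     -> ja taken
;   X = 2: X - 2 = 0x00 -> ZF = 1             -> je taken
;   X = 1: X - 2 = 0xFF -> 8 one-bits, PF = 1 -> jp taken
;   X = 0: X - 2 = 0xFE -> 7 one-bits, PF = 0 -> falls through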