mpir/mpn/x86_64w/skylake/avx/iorn_n.asm

; AVX mpn_iorn_n
;
; Copyright 2017 Jens Nurmann
;
; This file is part of the MPIR Library.
; The MPIR Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Lesser General Public License as published
; by the Free Software Foundation; either version 2.1 of the License, or (at
; your option) any later version.
; The MPIR Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
; License for more details.
; You should have received a copy of the GNU Lesser General Public License
; along with the MPIR Library; see the file COPYING.LIB. If not, write
; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
; Boston, MA 02110-1301, USA.
; (rcx, r9) = (rdx, r9) or not (r8, r9)
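; For reference, the limb-wise operation implemented below is equivalent to
; the following C sketch (illustration only, not part of the build; the
; helper name mpn_iorn_n_ref is hypothetical, mp_limb_t/mp_size_t are the
; MPIR types from mpir.h):
;
;   void mpn_iorn_n_ref(mp_limb_t *rp, const mp_limb_t *s1p,
;                       const mp_limb_t *s2p, mp_size_t n)
;   {
;       for (mp_size_t i = 0; i < n; i++)
;           rp[i] = s1p[i] | ~s2p[i];
;   }
;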
; There is no initial pointer-alignment lead-in code below. The argument for
; omitting it is based on statistical reasoning and measurements. All
; statements below refer strictly to the Intel i6xxx (Skylake)
; microarchitecture.
; The function is intended to be used with arbitrary pointer alignment on
; entry. That is, there are 8 possible cases to consider:
; - A: 1 x all pointers mis-aligned (mod 32 bytes)
; - B: 3 x one pointer aligned (mod 32 bytes)
; - C: 3 x two pointers aligned (mod 32 bytes)
; - D: 1 x all pointers aligned (mod 32 bytes)
; All sub-cases under B show equivalent performance, as do all sub-cases of
; C. B is 7% faster than A, C is 11% faster than A, and D is 39% faster than A.
; Doing proper alignment would require a complex decision tree to always
; advance the alignment situation in the best possible manner - e.g. pointer
; 1 is off by 8 while pointers 2 & 3 are off by 16. Doing the alignment
; requires some arithmetic and at least one branch in the function prologue -
; a noticeable cost for small operand sizes. And all this for a small gain
; (around 6% overall) in the average case.
; In a specific application scenario this trade-off might be the wrong choice.
; The execution speed of VMOVDQU is equivalent to VMOVDQA when the pointers
; are aligned. This may differ on earlier generations of Intel Core
; architectures like Broadwell, Haswell, ...
; cycles per limb with all operands aligned and in:
;
;                LD1$        LD2$
;   Haswell      ???         ???
;   Broadwell    ???         ???
;   Skylake      0.29-0.31   0.39-0.40
%include 'yasm_mac.inc'
; definitions according to the Windows 64 bit ABI
%define ResP    RCX
%define Src1P   RDX
%define Src2P   R8
%define Size    R9
%define SizeD   R9D
%define Count   RAX
%define CountD  EAX
%define Limb0   R10
%define Limb0D  R10D
%define QLimb0  YMM0
%define QLimb1  YMM1
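; Code structure: the main .Loop handles 4 YMM words (16 limbs) per
; iteration; .PostAVX handles the remaining 0-3 blocks of 4 limbs; .PostGPR
; handles the last 0-3 limbs with scalar code.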
        align   32
BITS 64
LEAF_PROC mpn_iorn_n
        mov     CountD, 3
        mov     Limb0, Size
        sub     Count, Size                 ; Count = 3 - n (n = limb count)
        jnc     .PostGPR                    ; dispatch size 0-3 immediately
        vpcmpeqq QLimb1, QLimb1, QLimb1     ; QLimb1 = all ones; XOR with it gives NOT
        mov     SizeD, 3
        shr     Limb0, 2                    ; Limb0 = number of 4-limb (32 byte) blocks
        or      Count, -4                   ; preserve (3 - n) mod 4 for the scalar tail dispatch
        sub     Size, Limb0                 ; Size = 3 - number of blocks
        jnc     .PostAVX                    ; dispatch 1-3 blocks (sizes 4-15) immediately
        mov     Limb0D, 128                 ; loop stride in bytes (16 limbs per iteration)
.Loop:
        vpxor   QLimb0, QLimb1, [Src2P]
        vpor    QLimb0, QLimb0, [Src1P]
        vmovdqu [ResP], QLimb0
        vpxor   QLimb0, QLimb1, [Src2P+32]
        vpor    QLimb0, QLimb0, [Src1P+32]
        vmovdqu [ResP+32], QLimb0
        vpxor   QLimb0, QLimb1, [Src2P+64]
        vpor    QLimb0, QLimb0, [Src1P+64]
        vmovdqu [ResP+64], QLimb0
        vpxor   QLimb0, QLimb1, [Src2P+96]
        vpor    QLimb0, QLimb0, [Src1P+96]
        vmovdqu [ResP+96], QLimb0
        lea     Src2P, [Src2P+Limb0]
        lea     Src1P, [Src1P+Limb0]
        lea     ResP, [ResP+Limb0]
        add     Size, 4
        jnc     .Loop
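; The cmp/ja/je/jp chains below dispatch a value of 0-3 with a single compare
; against 2: 3 is above (ja), 2 is equal (je), 1 yields a difference of -1
; whose low byte has an even number of set bits (PF set, jp taken), and 0
; falls through.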
.PostAVX:
        mov     Limb0D, 0                   ; to allow pointer correction on exit
        cmp     Size, 2                     ; fastest way to dispatch values 0-3
        ja      .PostAVX0
        je      .PostAVX1
        jp      .PostAVX2
.PostAVX3:
        add     Limb0, 32
        vpxor   QLimb0, QLimb1, [Src2P+64]
        vpor    QLimb0, QLimb0, [Src1P+64]
        vmovdqu [ResP+64], QLimb0
.PostAVX2:
        add     Limb0, 32
        vpxor   QLimb0, QLimb1, [Src2P+32]
        vpor    QLimb0, QLimb0, [Src1P+32]
        vmovdqu [ResP+32], QLimb0
.PostAVX1:
        add     Limb0, 32
        vpxor   QLimb0, QLimb1, [Src2P]
        vpor    QLimb0, QLimb0, [Src1P]
        vmovdqu [ResP], QLimb0
.PostAVX0:
        add     Src2P, Limb0
        add     Src1P, Limb0
        add     ResP, Limb0
        add     Count, 4
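; At this point Count holds the dispatch value for the 0-3 remaining limbs
; (3 = nothing left, 2/1/0 = one/two/three limbs left); each tail limb is
; computed with a scalar not + or.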
.PostGPR:
        cmp     Count, 2                    ; fastest way to dispatch values 0-3
        ja      .Exit
        je      .PostGPR1
        jp      .PostGPR2
.PostGPR3:
        mov     Limb0, [Src2P+16]
        not     Limb0
        or      Limb0, [Src1P+16]
        mov     [ResP+16], Limb0
.PostGPR2:
        mov     Limb0, [Src2P+8]
        not     Limb0
        or      Limb0, [Src1P+8]
        mov     [ResP+8], Limb0
.PostGPR1:
        mov     Limb0, [Src2P]
        not     Limb0
        or      Limb0, [Src1P]
        mov     [ResP], Limb0
.Exit:
        ret