mpir/mpn/x86_64w/skylake/copyi.asm

; Copyright 2016 Jens Nurmann and Alexander Kruppa
; This file is part of the MPIR Library.
; The MPIR Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Lesser General Public License as published
; by the Free Software Foundation; either version 2.1 of the License, or (at
; your option) any later version.
; The MPIR Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
; License for more details.
; You should have received a copy of the GNU Lesser General Public License
; along with the MPIR Library; see the file COPYING.LIB. If not, write
; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
; Boston, MA 02110-1301, USA.
; mpn_copyi(mp_ptr Op2, mp_srcptr Op1, mp_size_t Size1)
; Linux            RDI            RSI            RDX
; Win7             RCX            RDX            R8
;
; Description:
; The function copies a given number of limbs from source to destination
; (moving from low to high addresses in memory) and hands back the size (in
; limbs) of the destination. A C sketch of the intended behaviour follows at
; the end of this header.
;
; Result:
; - Op2[ 0..size-1 ] = Op1[ 0..size-1 ]
; - number of copied limbs: range [ 0..max tCounter ]
;
; Caveats:
; - if size 0 is given the content of the destination will remain untouched!
; - if Op1=Op2 no copy is done!
;
; Comments:
; - AVX-based version implemented, tested & benched on 05.01.2016 by jn
; - did some experiments with the AVX-based version, with the following results
; - AVX can be faster in L1$ (30%) and L2$ (10%) if the destination is aligned
;   on 32 bytes
; - AVX is generally faster on small operands (<=100 limbs) due to the
;   start-up overhead of "rep movsq" - however this could also be achieved by
;   a simple copy loop
; - the break-even between AVX and "rep movsq" is around 10,000 limbs
; - the prologue & epilogue can still be optimized!
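;
; For orientation only, the intended behaviour roughly corresponds to the C
; sketch below (the name mpn_copyi_ref is made up here for illustration and is
; not part of MPIR):
;
;   mp_size_t mpn_copyi_ref(mp_ptr Op2, mp_srcptr Op1, mp_size_t Size1)
;   {
;       mp_size_t i;
;       for (i = 0; i < Size1; i++)   /* walk from low limb to high limb */
;           Op2[i] = Op1[i];
;       return Size1;                 /* size handed back in RAX */
;   }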
%define USE_WIN64

%include 'yasm_mac.inc'

    BITS    64

%ifdef USE_WIN64
    %define Op2     RCX
    %define Op1     RDX
    %define Size1   R8
    %define Limb    R9
    %define Offs    R10
%else
    %define Op2     RDI
    %define Op1     RSI
    %define Size1   RDX
    %define Limb    RCX
    %define Offs    R10
%endif

%define DLimb0  XMM0
%define QLimb0  YMM0
%define QLimb1  YMM1
%define QLimb2  YMM2
%define QLimb3  YMM3
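; Limb is scratch for single-limb copies, Offs holds the 128 byte (16 limb)
; stride of the main loop; DLimb0 and QLimb0..3 are the SSE/AVX copy registers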

    align   32
LEAF_PROC mpn_copyi
    mov     RAX, Size1
    cmp     Op1, Op2
    je      .Exit               ; no copy required =>

    or      RAX, RAX
    je      .Exit               ; size=0 =>

; align the destination (Op2) to 32 bytes
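; (at most one limb to reach 16 byte alignment, then at most two more limbs to
;  reach 32 byte alignment, so the aligned stores in the AVX loop are valid)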
    test    Op2, 8
    je      .lCpyIncA32

    mov     Limb, [Op1]
    mov     [Op2], Limb
    dec     Size1
    je      .Exit

    add     Op1, 8
    add     Op2, 8

.lCpyIncA32:
    test    Op2, 16
    je      .lCpyIncAVX

    mov     Limb, [Op1]
    mov     [Op2], Limb
    dec     Size1
    je      .Exit

    mov     Limb, [Op1+8]
    mov     [Op2+8], Limb
    dec     Size1
    je      .Exit

    add     Op1, 16
    add     Op2, 16

.lCpyIncAVX:
    mov     Offs, 128
    jmp     .lCpyIncAVXCheck

; main loop (prefetching disabled; unloaded cache)
; - lCpyInc is slightly slower than lCpyDec through all cache levels?!
; - 0.30 cycles / limb in L1$
; - 0.60 cycles / limb in L2$
; - 0.70-0.90 cycles / limb in L3$
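; - each iteration copies 16 limbs (128 bytes): four unaligned 32 byte loads
;   from Op1 followed by four aligned 32 byte stores to Op2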
    align   16
.lCpyIncAVXLoop:
    vmovdqu QLimb0, [Op1]
    vmovdqu QLimb1, [Op1+32]
    vmovdqu QLimb2, [Op1+64]
    vmovdqu QLimb3, [Op1+96]
    vmovdqa [Op2], QLimb0
    vmovdqa [Op2+32], QLimb1
    vmovdqa [Op2+64], QLimb2
    vmovdqa [Op2+96], QLimb3

    add     Op1, Offs
    add     Op2, Offs

.lCpyIncAVXCheck:
    sub     Size1, 16
    jnc     .lCpyIncAVXLoop

    add     Size1, 16
    je      .Exit               ; AVX copied operand fully =>

; copy remaining max. 15 limbs
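; (handled via the bits of Size1: first 8 limbs, then 4, then 2, then 1)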
    test    Size1, 8
    je      .lCpyIncFour

    vmovdqu QLimb0, [Op1]
    vmovdqu QLimb1, [Op1+32]
    vmovdqa [Op2], QLimb0
    vmovdqa [Op2+32], QLimb1

    add     Op1, 64
    add     Op2, 64

.lCpyIncFour:
    test    Size1, 4
    je      .lCpyIncTwo

    vmovdqu QLimb0, [Op1]
    vmovdqa [Op2], QLimb0

    add     Op1, 32
    add     Op2, 32

.lCpyIncTwo:
    test    Size1, 2
    je      .lCpyIncOne

%if 1
; Avoid SSE2 instruction due to stall on Haswell
    mov     Limb, [Op1]
    mov     [Op2], Limb
    mov     Limb, [Op1+8]
    mov     [Op2+8], Limb
%else
    movdqu  DLimb0, [Op1]
    movdqa  [Op2], DLimb0
%endif

    add     Op1, 16
    add     Op2, 16

.lCpyIncOne:
    test    Size1, 1
    je      .Exit

    mov     Limb, [Op1]
    mov     [Op2], Limb

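; vzeroupper below clears the upper YMM lanes to avoid AVX-to-SSE transition
; penalties in subsequently executed SSE code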
.Exit:
    vzeroupper
    ret
.end: