;  Copyright 2016 Jens Nurmann and Alexander Kruppa

;  This file is part of the MPIR Library.

;  The MPIR Library is free software; you can redistribute it and/or modify
;  it under the terms of the GNU Lesser General Public License as published
;  by the Free Software Foundation; either version 2.1 of the License, or (at
;  your option) any later version.

;  The MPIR Library is distributed in the hope that it will be useful, but
;  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
;  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
;  License for more details.

;  You should have received a copy of the GNU Lesser General Public License
;  along with the MPIR Library; see the file COPYING.LIB.  If not, write
;  to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
;  Boston, MA 02110-1301, USA.

; mpn_copyi(mp_ptr Op2, mp_srcptr Op1, mp_size_t Size1)
; Linux     RDI         RSI            RDX
; Win7      RCX         RDX            R8
;
; Description:
; The function copies a given number of limbs from source to destination
; (while moving low to high in memory) and hands back the size (in limbs) of
; the destination.
;
; Result:
; - Op2[ 0..size-1 ] = Op1[ 0..size-1 ]
; - number of copied limbs: range [ 0..max tCounter ]
;
; Caveats:
; - if size 0 is given the content of the destination will remain untouched!
; - if Op1=Op2 no copy is done!
;
; Comments:
; - AVX-based version implemented, tested & benched on 05.01.2016 by jn
; - did some experiments with AVX based version with following results
;   - AVX can be faster in L1$ (30%), L2$ (10%) if dest. is aligned on 32 byte
;   - AVX is generally faster on small sized operands (<=100 limbs) due to
;     start-up overhead of "rep movsq" - however this could also be achieved
;     by a simple copy loop
;   - the break-even between AVX and "rep movsq" is around 10,000 limbs
; - the prologue & epilogue can still be optimized!

; Build configuration and register assignment for mpn_copyi.
;
; USE_WIN64 is defined unconditionally right here, so the first register
; mapping below (RCX / RDX / R8 arguments) is the one that gets assembled.
; The %else mapping (RDI / RSI / RDX arguments) only takes effect if that
; define is removed.

%define USE_WIN64

%include 'yasm_mac.inc'

BITS 64

; Op2, Op1 and Size1 name the three arguments of
; mpn_copyi(Op2, Op1, Size1) in that order; Limb and Offs are scratch.
%ifdef USE_WIN64
    %define Op2     RCX             ; destination pointer
    %define Op1     RDX             ; source pointer
    %define Size1   R8              ; number of limbs to copy
    %define Limb    R9              ; scratch: one limb in flight
    %define Offs    R10             ; scratch: byte stride of the AVX loop
%else
    %define Op2     RDI             ; destination pointer
    %define Op1     RSI             ; source pointer
    %define Size1   RDX             ; number of limbs to copy
    %define Limb    RCX             ; scratch: one limb in flight
    %define Offs    R10             ; scratch: byte stride of the AVX loop
%endif

; Vector scratch registers: DLimb0 holds two limbs (128 bit), each QLimbN
; holds four limbs (256 bit). DLimb0 aliases the low half of QLimb0.
%define DLimb0  XMM0
%define QLimb0  YMM0
%define QLimb1  YMM1
%define QLimb2  YMM2
%define QLimb3  YMM3

;-----------------------------------------------------------------------------
; mpn_copyi - copy Size1 limbs from Op1 to Op2, walking upwards in memory
;
; In:    Op2   = destination pointer
;        Op1   = source pointer
;        Size1 = number of 64 bit limbs to copy
; Out:   RAX   = Size1 as passed in
; Uses:  Limb, Offs, QLimb0..QLimb3 (and DLimb0 only in the disabled branch)
;
; Flow:
;   1. leave at once if Op1 = Op2 or Size1 = 0
;   2. copy up to three single limbs until bits 3 and 4 of Op2 are clear
;   3. copy blocks of 16 limbs: four unaligned 32 byte loads followed by
;      four aligned 32 byte stores
;   4. copy the remaining 0..15 limbs, steered by bits 3..0 of Size1
;
; Notes:
;   - the aligned stores (vmovdqa) require Op2 to be a multiple of 32 once
;     step 2 is done; step 2 only clears bits 3 and 4, so this assumes Op2
;     is at least 8 byte aligned on entry - TODO confirm against callers
;   - within every block all loads are issued before the stores
;   - every exit path, including the early ones that never touched a YMM
;     register, runs through the vzeroupper at .Exit
;   - LEAF_PROC comes from yasm_mac.inc (not visible here); presumably it
;     emits the exported entry label for the given name
;-----------------------------------------------------------------------------

    align   32

LEAF_PROC mpn_copyi
    mov     RAX, Size1              ; result = number of limbs requested
    cmp     Op1, Op2
    je      .Exit                   ; no copy required =>

    test    RAX, RAX                ; same flags as "or RAX, RAX", no write
    jz      .Exit                   ; size=0 =>

    ; align the destination (Op2) to 32 byte
    test    Op2, 8
    je      .lCpyIncA32             ; bit 3 clear: no single limb needed =>

    mov     Limb, [Op1]
    mov     [Op2], Limb
    dec     Size1
    je      .Exit                   ; that was the last limb =>

    add     Op1, 8
    add     Op2, 8

.lCpyIncA32:

    test    Op2, 16
    je      .lCpyIncAVX             ; bit 4 clear: no limb pair needed =>

    mov     Limb, [Op1]
    mov     [Op2], Limb
    dec     Size1
    je      .Exit                   ; ran out of limbs after the first =>

    mov     Limb, [Op1+8]
    mov     [Op2+8], Limb
    dec     Size1
    je      .Exit                   ; ran out of limbs after the second =>

    add     Op1, 16
    add     Op2, 16

.lCpyIncAVX:

    mov     Offs, 128               ; byte stride of one loop pass (4 x 32)
    jmp     .lCpyIncAVXCheck        ; test the count before the first pass

    ; main loop (prefetching disabled; unloaded cache)
    ; - lCpyInc is slightly slower than lCpyDec through all cache levels?!
    ; - 0.30      cycles / limb in L1$
    ; - 0.60      cycles / limb in L2$
    ; - 0.70-0.90 cycles / limb in L3$
    align   16
.lCpyIncAVXLoop:

    vmovdqu QLimb0, [Op1]
    vmovdqu QLimb1, [Op1+32]
    vmovdqu QLimb2, [Op1+64]
    vmovdqu QLimb3, [Op1+96]
    vmovdqa [Op2], QLimb0
    vmovdqa [Op2+32], QLimb1
    vmovdqa [Op2+64], QLimb2
    vmovdqa [Op2+96], QLimb3

    add     Op1, Offs
    add     Op2, Offs

.lCpyIncAVXCheck:

    sub     Size1, 16
    jnc     .lCpyIncAVXLoop         ; no borrow: a full 16 limb block is left

    add     Size1, 16               ; undo the last subtraction: Size1 = 0..15
    je      .Exit                   ; AVX copied operand fully =>

    ; copy remaining max. 15 limb
    test    Size1, 8
    je      .lCpyIncFour

    vmovdqu QLimb0, [Op1]
    vmovdqu QLimb1, [Op1+32]
    vmovdqa [Op2], QLimb0
    vmovdqa [Op2+32], QLimb1

    add     Op1, 64
    add     Op2, 64

.lCpyIncFour:

    test    Size1, 4
    je      .lCpyIncTwo

    vmovdqu QLimb0, [Op1]
    vmovdqa [Op2], QLimb0

    add     Op1, 32
    add     Op2, 32

.lCpyIncTwo:

    test    Size1, 2
    je      .lCpyIncOne

%if 1
    ; Avoid SSE2 instruction due to stall on Haswell
    mov     Limb, [Op1]
    mov     [Op2], Limb
    mov     Limb, [Op1+8]
    mov     [Op2+8], Limb
%else
    movdqu  DLimb0, [Op1]
    movdqa  [Op2], DLimb0
%endif

    add     Op1, 16
    add     Op2, 16

.lCpyIncOne:

    test    Size1, 1
    je      .Exit

    mov     Limb, [Op1]
    mov     [Op2], Limb

.Exit:

    vzeroupper                      ; clear upper YMM halves before returning
    ret
.end: