mpir/mpn/x86_64/haswell/copyd.as

204 lines
4.7 KiB
ActionScript
Raw Normal View History

; Copyright 2016 Jens Nurmann and Alexander Kruppa
; This file is part of the MPIR Library.
; The MPIR Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Lesser General Public License as published
; by the Free Software Foundation; either version 2.1 of the License, or (at
; your option) any later version.
; The MPIR Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
; License for more details.
; You should have received a copy of the GNU Lesser General Public License
; along with the MPIR Library; see the file COPYING.LIB. If not, write
; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
; Boston, MA 02110-1301, USA.
; mpn_copyd(mp_ptr Op2, mp_srcptr Op1, mp_size_t Size1)
; Linux RDI RSI RDX
; Win7 RCX RDX R8
;
; Description:
; The function copies a given number of limb from source to destination (while
; moving high to low in memory) and hands back the size (in limb) of the
; destination.
;
; Result:
; - Op2[ 0..size-1 ] = Op1[ 0..size-1 ]
; - number of copied limb: range [ 0..max tCounter ]
;
; Caveats:
; - if size 0 is given the content of the destination will remain untouched!
; - if Op1=Op2 no copy is done!
;
; Comments:
; - AVX-based version implemented, tested & benched on 05.01.2016 by jn
; - did some experiments with AVX based version with following results
; - AVX can be faster in L1$-L3$ if destination is aligned on 32 byte
; - AVX is generally faster on small sized operands (<=100 limb) due too
; start-up overhead of "rep movsq" - however this could also be achieved by
; simple copy loop
; - startup overhead of "rep movsq" with negative direction is 200 cycles!!!
; - negative direction is unfavourable compared to positive "rep movsq" and
; to AVX.
%include 'yasm_mac.inc'
BITS 64
%ifdef USE_WIN64
%define Op2 RCX
%define Op1 RDX
%define Size1 R8
%define Limb R9
%define Offs R10
%else
%define Op2 RDI
%define Op1 RSI
%define Size1 RDX
%define Limb RCX
%define Offs R10
%endif
%define DLimb0 XMM0
%define QLimb0 YMM0
%define QLimb1 YMM1
%define QLimb2 YMM2
%define QLimb3 YMM3
align 32
GLOBAL_FUNC mpn_copyd
mov RAX, Size1
cmp Op1, Op2
je .Exit ; no copy required =>
or RAX, RAX
je .Exit ; Size=0 =>
lea Op1, [Op1+8*Size1-8]
lea Op2, [Op2+8*Size1-8]
; align the destination (Op2) to 32 byte
test Op2, 8
jne .lCpyDecA32
mov Limb, [Op1]
mov [Op2], Limb
dec Size1
je .Exit
sub Op1, 8
sub Op2, 8
.lCpyDecA32:
test Op2, 16
jnz .lCpyDecAVX
mov Limb, [Op1]
mov [Op2], Limb
dec Size1
je .Exit
mov Limb, [Op1-8]
mov [Op2-8], Limb
dec Size1
je .Exit
sub Op1, 16
sub Op2, 16
.lCpyDecAVX:
mov Offs, 128
jmp .lCpyDecAVXCheck
; main loop (prefetching disabled; unloaded cache)
; - 0.30 cycles / limb in L1$
; - 0.60 cycles / limb in L2$
; - 0.70-0.90 cycles / limb in L3$
align 16
.lCpyDecAVXLoop:
vmovdqu QLimb0, [Op1-24]
vmovdqu QLimb1, [Op1-56]
vmovdqu QLimb2, [Op1-88]
vmovdqu QLimb3, [Op1-120]
vmovdqa [Op2-24], QLimb0
vmovdqa [Op2-56], QLimb1
vmovdqa [Op2-88], QLimb2
vmovdqa [Op2-120], QLimb3
sub Op1, Offs
sub Op2, Offs
.lCpyDecAVXCheck:
sub Size1, 16
jnc .lCpyDecAVXLoop
add Size1, 16
je .Exit ; AVX copied operand fully =>
; copy remaining max. 15 limb
test Size1, 8
je .lCpyDecFour
vmovdqu QLimb0, [Op1-24]
vmovdqu QLimb1, [Op1-56]
vmovdqa [Op2-24], QLimb0
vmovdqa [Op2-56], QLimb1
sub Op1, 64
sub Op2, 64
.lCpyDecFour:
test Size1, 4
je .lCpyDecTwo
vmovdqu QLimb0, [Op1-24]
vmovdqa [Op2-24], QLimb0
sub Op1, 32
sub Op2, 32
.lCpyDecTwo:
test Size1, 2
je .lCpyDecOne
%if 1
; Avoid SSE2 instruction due to stall on Haswell
mov Limb, [Op1]
mov [Op2], Limb
mov Limb, [Op1-8]
mov [Op2-8], Limb
%else
movdqu DLimb0, [Op1-8]
movdqa [Op2-8], DLimb0
%endif
sub Op1, 16
sub Op2, 16
.lCpyDecOne:
test Size1, 1
je .Exit
mov Limb, [Op1]
mov [Op2], Limb
.Exit:
vzeroupper
ret
.end: