mpir/mpn/x86_64/amd64/copyi.as

111 lines
3.4 KiB
ActionScript
Raw Normal View History

; Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
;
; This file is part of the GNU MP Library.
;
; The GNU MP Library is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public License as
; published by the Free Software Foundation; either version 2.1 of the
; License, or (at your option) any later version.
;
; The GNU MP Library is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public
; License along with the GNU MP Library; see the file COPYING.LIB. If
; not, write to the Free Software Foundation, Inc., 59 Temple Place -
; Suite 330, Boston, MA 02111-1307, USA.
;
; Provided by Brian Gladman AMD64 using the Microsoft VC++ v8 64-bit
; compiler and the YASM assembler.
; AMD64 mpn_copyi -- incrementing copy limb vector
;
; Calling interface:
;
; void mpn_copyi(
; mp_ptr dst, rdi
; mp_srcptr src, rsi
; mp_size_t size rdx
; )
%include '../yasm_mac.inc'
%define UNROLL_THRESHOLD 16
%define d_ptr rdi
%define s_ptr rsi
%define s_len rdx
%define s_lend edx
BITS 64
GLOBAL_FUNC mpn_copyi
movsxd s_len,s_lend
or s_len,s_len ; none to move?
jz label1
mov rax,s_ptr ; find relative alignment of
xor rax,d_ptr ; source and destination (min
mov r9,s_ptr ; 8-byte alignment assumed)
lea s_ptr,[s_ptr+s_len*8]
lea d_ptr,[d_ptr+s_len*8]
neg s_len
cmp s_len,byte -UNROLL_THRESHOLD
jbe label2 ; if many limbs to move
label0:
mov rax,[s_ptr+s_len*8] ; short move via rax
mov [d_ptr+s_len*8],rax
inc s_len
jnz label0 ; avoid single byte ret that
label1:
rep ret ; interferes with branch prediction
label2:
test al,8
jnz label7 ; not 16 byte aligned
test r9,8 ; see if src is on 16 byte
jz label3 ; boundary
mov rax,[s_ptr+s_len*8] ; if not do a one limb copy
mov [d_ptr+s_len*8],rax
inc s_len
label3:
lea s_len,[s_len+3] ; now 16 byte aligned
label4:
prefetchnta [s_ptr+s_len*8-24+3*64] ; should this be +4*64 ??
movdqa xmm0,[s_ptr+s_len*8-24] ; move 32 bytes at a time
movntdq [d_ptr+s_len*8-24],xmm0
movdqa xmm0,[s_ptr+s_len*8-8]
movntdq [d_ptr+s_len*8-8],xmm0
add s_len,4
jl label4
sfence
test s_len,2
jnz label5
movdqa xmm0,[s_ptr+s_len*8-24] ; move 16 bytes if necessary
movdqa [d_ptr+s_len*8-24],xmm0
add s_len,2
label5:
test s_len,1
jnz label6
movq xmm0,[s_ptr+s_len*8-24] ; move 8 bytes if necessary
movq [d_ptr+s_len*8-24],xmm0
label6:
ret
label7:
lea s_len,[s_len+1] ; move 8 bytes at a time
label8:
movq xmm0,[s_ptr+s_len*8-8]
movq xmm1,[s_ptr+s_len*8]
movq [d_ptr+s_len*8-8],xmm0
movq [d_ptr+s_len*8],xmm1
add s_len,2
jl label8
test s_len,1
jnz label9
movq xmm0,[s_ptr-8]
movq [d_ptr-8],xmm0
label9:
ret