mpir/mpn/x86_64/amd64/copyd.as

;  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
;
;  This file is part of the GNU MP Library.
;
;  The GNU MP Library is free software; you can redistribute it and/or
;  modify it under the terms of the GNU Lesser General Public License as
;  published by the Free Software Foundation; either version 2.1 of the
;  License, or (at your option) any later version.
;
;  The GNU MP Library is distributed in the hope that it will be useful,
;  but WITHOUT ANY WARRANTY; without even the implied warranty of
;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;  Lesser General Public License for more details.
;
;  You should have received a copy of the GNU Lesser General Public
;  License along with the GNU MP Library; see the file COPYING.LIB.  If
;  not, write to the Free Software Foundation, Inc., 59 Temple Place -
;  Suite 330, Boston, MA 02111-1307, USA.
;
;  Provided by Brian Gladman AMD64 using the Microsoft VC++ v8 64-bit
;  compiler and the YASM assembler.

; AMD64 mpn_copyd -- decrementing copy limb vector
;
;  Calling interface (WIN64):
;
; void mpn_copyd(
;     mp_ptr dst,    rcx
;     mp_srcptr src, rdx
;     mp_size_t size  r8
; )
;
;  Calling interface (linux):
;
; void mpn_copyd(
;     mp_ptr dst,    rdi
;     mp_srcptr src, rsi
;     mp_size_t size rdx
; )
;
;  This is an SEH Leaf Function (no unwind support needed)

%include '../yasm_mac.inc'

; Limb count at or above which __gmpn_copyd leaves the simple
; one-limb-per-iteration loop and uses the unrolled copy paths.
%define UNROLL_THRESHOLD 16

; Map the three arguments onto the registers of the selected calling
; convention.  s_lend is the 32-bit view of the s_len register.
%ifdef _WIN64_ABI
%define d_ptr   rcx
%define s_ptr   rdx
%define s_len   r8
%define s_lend  r8d
%else
%define d_ptr   rdi
%define s_ptr   rsi
%define s_len   rdx
%define s_lend  edx
%endif

    bits    64
    section .text

    G_EXPORT __gmpn_copyd

; DLL builds additionally emit an export directive for the symbol.
%ifdef DLL
    export  __gmpn_copyd
%endif

;-----------------------------------------------------------------------
;  mpn_copyd: copy s_len limbs (8 bytes each) from s_ptr to d_ptr,
;  starting with the highest-addressed limb and working downwards.
;
;  In:    d_ptr = destination, s_ptr = source, s_len = limb count
;  Out:   nothing
;  Uses:  rax, xmm0, xmm1, flags; s_len is consumed as the limb index
;
;  Paths:
;    - fewer than UNROLL_THRESHOLD limbs: one limb per pass through rax
;    - source and destination agree in address bit 3: 32 bytes per
;      pass using 16-byte aligned loads and non-temporal stores
;    - otherwise: two limbs per pass using 8-byte movq moves
;
;  The 16-byte path assumes both pointers are 8-byte aligned; this is
;  not checked here.
;  NOTE(review): only the low 32 bits of the count are used (they are
;  sign-extended on entry) -- confirm this matches the width of
;  mp_size_t on every target.
;-----------------------------------------------------------------------
G_LABEL __gmpn_copyd
    movsxd  s_len, s_lend               ; count = sign-extended low 32 bits
    cmp     s_len, byte UNROLL_THRESHOLD
    jge     .long_copy                  ; many limbs: use an unrolled path

    ; Short copy: s_len becomes the index of the limb to move and
    ; counts down to zero.
    dec     s_len
    jl      .short_done                 ; count was zero or negative
.short_loop:
    mov     rax, [s_ptr + s_len*8]
    mov     [d_ptr + s_len*8], rax
    dec     s_len
    jge     .short_loop
.short_done:
    rep     ret                         ; two-byte return: a single-byte ret
                                        ; here interferes with branch
                                        ; prediction

.long_copy:
    ; Compare address bit 3 of source and destination.  If it differs
    ; the two cannot both sit on a 16-byte boundary at the same index.
    mov     rax, s_ptr
    xor     rax, d_ptr
    test    al, 8
    jnz     .unaligned_setup

    ; If the end of the source (s_ptr + s_len*8) is not on a 16-byte
    ; boundary, move the top limb by itself so that it is.
    lea     rax, [s_ptr + s_len*8]
    test    al, 8
    jz      .aligned_setup
    dec     s_len
    mov     rax, [rax - 8]
    mov     [d_ptr + s_len*8], rax

.aligned_setup:
    lea     s_len, [s_len - 4]          ; index of the lowest limb in the
                                        ; top group of four
.aligned_loop:
    ; Prefetch distance is 3*64 bytes below the current block; the
    ; original author queried whether it should be 4*64.
    prefetchnta [s_ptr + s_len*8 + 16 - 3*64]
    movdqa  xmm0, [s_ptr + s_len*8 + 16] ; upper 16 bytes of the group
    movntdq [d_ptr + s_len*8 + 16], xmm0
    movdqa  xmm0, [s_ptr + s_len*8]      ; lower 16 bytes of the group
    movntdq [d_ptr + s_len*8], xmm0
    sub     s_len, 4
    jge     .aligned_loop
    sfence                              ; order the non-temporal stores

    ; Here s_len is in -4..-1 and s_len+4 limbs remain (0..3); the low
    ; two bits of s_len hold that remainder.
    test    s_len, 2
    jz      .tail_one
    movdqa  xmm0, [s_ptr + s_len*8 + 16] ; two of the remaining limbs
    movdqa  [d_ptr + s_len*8 + 16], xmm0
.tail_one:
    test    s_len, 1
    jz      .aligned_done
    movq    xmm0, [s_ptr]               ; final limb, at index 0
    movq    [d_ptr], xmm0
.aligned_done:
    ret

.unaligned_setup:
    lea     s_len, [s_len - 2]          ; index of the lower limb in the
                                        ; top pair
.unaligned_loop:
    ; Both limbs are loaded before either is stored.
    movq    xmm0, [s_ptr + s_len*8 + 8]
    movq    xmm1, [s_ptr + s_len*8]
    movq    [d_ptr + s_len*8 + 8], xmm0
    movq    [d_ptr + s_len*8], xmm1
    sub     s_len, 2
    jge     .unaligned_loop

    ; Here s_len is -2 or -1; bit 0 set means one limb is left.
    test    s_len, 1
    jz      .unaligned_done
    movq    xmm0, [s_ptr]               ; final limb, at index 0
    movq    [d_ptr], xmm0
.unaligned_done:
    ret

    end
Roughly speaking mpir should now build on an AMD64. At the present moment the config.guess doesn't distinguish a Core 2 from an AMD64 and so the same code is probably built on both. One must build yasm (included in the yasm directory) before building GMP, if building on an x86_64 machine. Note: make test and make tune do not currently build. 2008-05-26 18:11:40 -04:00			`; Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.`
			`;`
			`; This file is part of the GNU MP Library.`
			`;`
			`; The GNU MP Library is free software; you can redistribute it and/or`
			`; modify it under the terms of the GNU Lesser General Public License as`
			`; published by the Free Software Foundation; either version 2.1 of the`
			`; License, or (at your option) any later version.`
			`;`
			`; The GNU MP Library is distributed in the hope that it will be useful,`
			`; but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`; Lesser General Public License for more details.`
			`;`
			`; You should have received a copy of the GNU Lesser General Public`
			`; License along with the GNU MP Library; see the file COPYING.LIB. If`
			`; not, write to the Free Software Foundation, Inc., 59 Temple Place -`
			`; Suite 330, Boston, MA 02111-1307, USA.`
			`;`
			`; Provided by Brian Gladman AMD64 using the Microsoft VC++ v8 64-bit`
			`; compiler and the YASM assembler.`

			`; AMD64 mpn_copyd -- decrementing copy limb vector`
			`;`
			`; Calling interface (WIN64):`
			`;`
			`; void mpn_copyd(`
			`; mp_ptr dst, rcx`
			`; mp_srcptr src, rdx`
			`; mp_size_t size r8`
			`; )`
			`;`
			`; Calling interface (linux):`
			`;`
			`; void mpn_copyd(`
			`; mp_ptr dst, rdi`
			`; mp_srcptr src, rsi`
			`; mp_size_t size rdx`
			`; )`
			`;`
			`; This is an SEH Leaf Function (no unwind support needed)`

Put macros instances in all yasm assembly files for global symbol prefixes. 2008-06-15 18:00:33 -04:00			`%include '../yasm_mac.inc'`

Roughly speaking mpir should now build on an AMD64. At the present moment the config.guess doesn't distinguish a Core 2 from an AMD64 and so the same code is probably built on both. One must build yasm (included in the yasm directory) before building GMP, if building on an x86_64 machine. Note: make test and make tune do not currently build. 2008-05-26 18:11:40 -04:00			`%define UNROLL_THRESHOLD 16`

			`%ifdef _WIN64_ABI`
			`%define d_ptr rcx`
			`%define s_ptr rdx`
			`%define s_len r8`
			`%define s_lend r8d`

			`%else`
			`%define d_ptr rdi`
			`%define s_ptr rsi`
			`%define s_len rdx`
			`%define s_lend edx`
			`%endif`

			`bits 64`
			`section .text`

Put macros instances in all yasm assembly files for global symbol prefixes. 2008-06-15 18:00:33 -04:00			`G_EXPORT __gmpn_copyd`
Roughly speaking mpir should now build on an AMD64. At the present moment the config.guess doesn't distinguish a Core 2 from an AMD64 and so the same code is probably built on both. One must build yasm (included in the yasm directory) before building GMP, if building on an x86_64 machine. Note: make test and make tune do not currently build. 2008-05-26 18:11:40 -04:00
			`%ifdef DLL`
			`export __gmpn_copyd`
			`%endif`

Put macros instances in all yasm assembly files for global symbol prefixes. 2008-06-15 18:00:33 -04:00			`G_LABEL __gmpn_copyd`
Roughly speaking mpir should now build on an AMD64. At the present moment the config.guess doesn't distinguish a Core 2 from an AMD64 and so the same code is probably built on both. One must build yasm (included in the yasm directory) before building GMP, if building on an x86_64 machine. Note: make test and make tune do not currently build. 2008-05-26 18:11:40 -04:00			`movsxd s_len,s_lend`
			`cmp s_len,byte UNROLL_THRESHOLD`
			`jge .2 ; if many limbs to move`
			`dec s_len`
			`jl .1`
			`.0: mov rax,[s_ptr+s_len*8] ; short move via rax`
			`mov [d_ptr+s_len*8],rax`
			`dec s_len`
			`jge .0 ; avoid single byte ret that`
			`.1: rep ret ; interferes with branch prediction`

			`.2: mov rax,s_ptr ; find relative alignment of`
			`xor rax,d_ptr ; source and destination (min`
			`test al,8`
			`jnz .7 ; not 16 byte aligned`
			`lea rax,[s_ptr+s_len*8]`
			`test al,8 ; see if src is on 16 byte`
			`jz .3 ; boundary`
			`dec s_len`
			`mov rax,[rax-8] ; if not do a one limb copy`
			`mov [d_ptr+s_len*8],rax`
			`.3: lea s_len,[s_len-4] ; now 16 byte aligned`
			`.4: prefetchnta [s_ptr+s_len*8+16-3*64] ; should this be -4*64 ??`
			`movdqa xmm0,[s_ptr+s_len*8+16] ; move 32 bytes at a time`
			`movntdq [d_ptr+s_len*8+16],xmm0`
			`movdqa xmm0,[s_ptr+s_len*8]`
			`movntdq [d_ptr+s_len*8],xmm0`
			`sub s_len,4`
			`jge .4`
			`sfence`
			`test s_len,2`
			`jz .5`
			`movdqa xmm0,[s_ptr+s_len*8+16] ; move 16 bytes if necessary`
			`movdqa [d_ptr+s_len*8+16],xmm0`
			`.5: test s_len,1`
			`jz .6`
			`movq xmm0,[s_ptr] ; move 8 bytes if necessary`
			`movq [d_ptr],xmm0`
			`.6: ret`

			`.7: lea s_len,[s_len-2] ; move 8 bytes at a time`
			`.8: movq xmm0,[s_ptr+s_len*8+8]`
			`movq xmm1,[s_ptr+s_len*8]`
			`movq [d_ptr+s_len*8+8],xmm0`
			`movq [d_ptr+s_len*8],xmm1`
			`sub s_len,2`
			`jge .8`
			`test s_len,1`
			`jz .9`
			`movq xmm0,[s_ptr]`
			`movq [d_ptr],xmm0`
			`.9: ret`

			`end`