mpir/mpn/x86_64w/core2/sub_n.asm


; Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
;
; Copyright 2005, 2006 Pierrick Gaudry
;
; Copyright 2008 Brian Gladman
;
; This file is part of the MPIR Library.
;
; The MPIR Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Lesser General Public License as published
; by the Free Software Foundation; either version 2.1 of the License, or (at
; your option) any later version.
;
; The MPIR Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
; License for more details.
;
; You should have received a copy of the GNU Lesser General Public License
; along with the MPIR Library; see the file COPYING.LIB. If not, write
; to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
; Boston, MA 02110-1301, USA.
;
; AMD64 mpn_add_n/mpn_sub_n -- mpn add or subtract. This file provides the
; subtract entry points (mpn_sub_n and mpn_sub_nc) tuned for Core2.
;
; Calling interface:
;
;   mp_limb_t __gmpn_<op>_n(         <op> = add OR sub
;       mp_ptr dst,                  rcx
;       mp_srcptr src1,              rdx
;       mp_srcptr src2,              r8
;       mp_size_t len                r9
;   )
;
;   mp_limb_t __gmpn_<op>_nc(        <op> = add OR sub
;       mp_ptr dst,                  rcx
;       mp_srcptr src1,              rdx
;       mp_srcptr src2,              r8
;       mp_size_t len,               r9
;       mp_limb_t carry              [rsp+0x28]
;   )
;
; Calculate src1[size] plus (minus) src2[size] and store the result in
; dst[size]. The return value is the carry (borrow) bit propagated out of
; the top of the result (1 or 0). The _nc version accepts 1 or 0 as an
; initial carry (borrow) into the low limb of the calculation; note that
; values other than 1 or 0 here will lead to garbage results.
;
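; As an illustration only, a minimal C caller sketch; mpn_sub_n is the public
; name that maps onto the __gmpn_sub_n entry point above, and the operand
; values here are made up for the example:
;
;    #include <mpir.h>
;    mp_limb_t a[2] = { 5, 9 };            /* least significant limb first */
;    mp_limb_t b[2] = { 7, 3 };
;    mp_limb_t d[2];
;    mp_limb_t bw = mpn_sub_n(d, a, b, 2); /* d = a - b, bw = borrow out   */
;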
; This is an SEH Leaf Function (no unwind support needed)
%include "..\yasm_mac.inc"
CPU Core2
BITS 64
%define dst rcx ; destination pointer
%define sr1 rdx ; source 1 pointer
%define sr2 r8 ; source 2 pointer
%define len r9 ; number of limbs
%define cry [rsp+0x28] ; carry value
%define r_jmp r10 ; temporary for jump table entry
%define r_cnt r11 ; temporary for loop count
%define UNROLL_LOG2 4
%define UNROLL_COUNT (1 << UNROLL_LOG2)
%define UNROLL_MASK (UNROLL_COUNT - 1)
%define UNROLL_BYTES (8 * UNROLL_COUNT)
%define UNROLL_THRESHOLD 8
%if UNROLL_BYTES >= 256
%error unroll count is too large
%elif UNROLL_BYTES >= 128
%define off 128
%else
%define off 0
%endif
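; with the settings above UNROLL_BYTES is 128, so 'off' biases the three
; pointers by 128; every displacement in the unrolled body then fits in a
; signed byte, which the 'byte' overrides below depend on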
LEAF_PROC mpn_sub_nc
mov rax,[rsp+0x28]         ; pick up the initial borrow (fifth argument, cry)
jmp entry
LEAF_PROC mpn_sub_n
xor rax,rax
entry:
movsxd len,r9d             ; sign extend the limb count from 32 to 64 bits
cmp len,UNROLL_THRESHOLD
jae .2
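; fewer than UNROLL_THRESHOLD limbs: point all three operands at their ends
; and run a simple sbb loop with a negative index counting up towards zero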
lea sr1,[sr1+len*8]
lea sr2,[sr2+len*8]
lea dst,[dst+len*8]
neg len
shr rax,1                  ; shift the incoming borrow (low bit of rax) into CF
.1: mov rax,[sr1+len*8]
mov r10,[sr2+len*8]
sbb rax,r10
mov [dst+len*8],rax
inc len
jnz .1
mov rax,dword 0            ; mov rather than xor so CF is left intact for setc
setc al
ret
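; UNROLL_THRESHOLD or more limbs: remember whether the count is odd, round it
; down to an even number, work out how many passes the unrolled loop needs,
; and compute a jump into the unrolled body so that the first, partial pass
; mops up the limbs left over modulo UNROLL_COUNT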
.2: mov r_cnt,1
and r_cnt,len              ; isolate the low bit of len (set if a single limb is left over)
mov [rsp+0x08], r_cnt      ; save it in the shadow space for the fixup after the loop
and len,-2                 ; round len down to an even number of limbs
mov r_cnt,len
dec r_cnt
shr r_cnt,UNROLL_LOG2      ; one less than the number of passes round the unrolled loop
neg len
and len,UNROLL_MASK
lea r_jmp,[len*4]
neg len
lea sr1,[sr1+len*8+off]
lea sr2,[sr2+len*8+off]
lea dst,[dst+len*8+off]
shr rax,1                  ; shift the incoming borrow (low bit of rax) into CF
lea r_jmp,[r_jmp+r_jmp*2]
lea rax,[rel .3]
lea r_jmp,[r_jmp+rax]
jmp r_jmp
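; unrolled body: each pass handles UNROLL_COUNT limbs, two per chunk, with the
; borrow chained through sbb; the byte displacements (made possible by the
; 'off' bias) keep each limb's three instructions at 12 bytes of code, which
; is what the len*4*3 offset computed for the jump above relies on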
.3:
%define CHUNK_COUNT 2
%assign i 0
%rep UNROLL_COUNT / CHUNK_COUNT
%assign disp0 8 * i * CHUNK_COUNT - off
mov r_jmp,[byte sr1+disp0] ; len and r_jmp registers
mov len,[byte sr1+disp0+8] ; now not needed
sbb r_jmp,[byte sr2+disp0]
mov [byte dst+disp0],r_jmp
sbb len,[byte sr2+disp0+8]
mov [byte dst+disp0+8],len
%assign i i + 1
%endrep
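; dec and lea leave the carry flag untouched, so the borrow from the last sbb
; above survives into the next pass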
dec r_cnt
lea sr1,[sr1+UNROLL_BYTES]
lea sr2,[sr2+UNROLL_BYTES]
lea dst,[dst+UNROLL_BYTES]
jns .3
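; if the original length was odd (flag saved at [rsp+0x08] above) one limb is
; still outstanding; dec and js do not change CF, so the final sbb still sees
; the borrow from the loop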
mov rax,[rsp+0x08]
dec rax
js .5
mov len,[sr1-off]
sbb len,[sr2-off]
mov [dst-off],len
.5:
mov rax,dword 0            ; mov, not xor, to preserve CF for setc
setc al
ret
end