;  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
;
;  Copyright 2005, 2006 Pierrick Gaudry
;
;  Copyright 2008 Brian Gladman
;
;  This file is part of the MPIR Library.
;  The MPIR Library is free software; you can redistribute it and/or modify
;  it under the terms of the GNU Lesser General Public License as published
;  by the Free Software Foundation; either version 2.1 of the License, or (at
;  your option) any later version.
;  The MPIR Library is distributed in the hope that it will be useful, but
;  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
;  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
;  License for more details.
;  You should have received a copy of the GNU Lesser General Public License
;  along with the MPIR Library; see the file COPYING.LIB.  If not, write
;  to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
;  Boston, MA 02110-1301, USA.
;
;  AMD64 mpn_add_n/mpn_sub_n -- mpn add or subtract.
;
;  Calling interface (OP stands for the operation name):
;
;  mp_limb_t __gmpn_OP_n(       OP = add OR sub
;     mp_ptr dst,               rcx
;     mp_srcptr src1,           rdx
;     mp_srcptr src2,            r8
;     mp_size_t len              r9
;  )
;
;  mp_limb_t __gmpn_OP_nc(      OP = add OR sub
;     mp_ptr dst,               rcx
;     mp_srcptr src1,           rdx
;     mp_srcptr src2,            r8
;     mp_size_t len,             r9
;     mp_limb_t carry    [rsp+0x28]
;  )
;
;  Calculate src1[len] plus (minus) src2[len] and store the result in
;  dst[len].  The return value is the carry bit from the top of the result
;  (1 or 0).  The _nc version accepts 1 or 0 for an initial carry into the
;  low limb of the calculation.  Note that values other than 1 or 0 here
;  will lead to garbage results.
;
;  This is an SEH Leaf Function (no unwind support needed)
;
;  Register use in this file:
;    rcx, rdx, r8, r9   arguments (see the %defines below); all are modified
;    rax                incoming borrow on entry, scratch, then return value
;    r10, r11           scratch
;    [rsp+0x08]         first home slot, used to park the odd-limb flag
;
;  The borrow lives in CF from the first `shr rax,1` until the final
;  `setc`.  Every instruction in between that is not an `sbb` has been
;  chosen because it leaves CF alone (mov, lea, inc, dec) -- do not replace
;  them with flag-clobbering equivalents such as xor/add/sub.

%include "..\yasm_mac.inc"

    CPU  Core2
    BITS 64

%define dst         rcx             ; destination pointer
%define sr1         rdx             ; source 1 pointer
%define sr2         r8              ; source 2 pointer
%define len         r9              ; number of limbs
%define cry         [rsp+0x28]      ; carry value
%define r_jmp       r10             ; temporary for jump table entry
%define r_cnt       r11             ; temporary for loop count
%define sav_odd     [rsp+0x08]      ; home slot holding (limb count & 1)

%define UNROLL_LOG2         4
%define UNROLL_COUNT        (1 << UNROLL_LOG2)
%define UNROLL_MASK         (UNROLL_COUNT - 1)
%define UNROLL_BYTES        (8 * UNROLL_COUNT)
%define UNROLL_THRESHOLD    8

; The unrolled loop addresses every limb with an 8-bit displacement.  When
; one pass covers 128 bytes or more the pointers are biased by `off` so that
; all displacements land in the signed-byte range.

%if UNROLL_BYTES >= 256
%error unroll count is too large
%elif UNROLL_BYTES >= 128
%define off 128
%else
%define off 0
%endif

; mpn_sub_nc: same as mpn_sub_n but starts with the caller-supplied borrow.

    LEAF_PROC mpn_sub_nc
    mov     rax,cry                 ; rax = initial borrow (expected 0 or 1)
    jmp     entry

; mpn_sub_n: dst[] = sr1[] - sr2[] over len limbs, returns the final borrow.

    LEAF_PROC mpn_sub_n
    xor     eax,eax                 ; rax = 0: no initial borrow

entry:
    movsxd  len,r9d                 ; limb count = sign-extended low 32 bits of r9
    cmp     len,UNROLL_THRESHOLD
    jae     .big

; Fewer than UNROLL_THRESHOLD limbs: one limb per iteration.  The pointers
; are moved to the end of the operands and len counts up from -count to 0.

    lea     dst,[dst+len*8]
    lea     sr1,[sr1+len*8]
    lea     sr2,[sr2+len*8]
    neg     len
    shr     rax,1                   ; CF = initial borrow (must follow neg)
.small:
    mov     rax,[sr1+len*8]
    mov     r10,[sr2+len*8]
    sbb     rax,r10
    mov     [dst+len*8],rax
    inc     len                     ; inc leaves CF untouched
    jnz     .small
    mov     eax,0                   ; rax = 0 via mov so that CF survives
    setc    al                      ; return value = final borrow
    ret

; UNROLL_THRESHOLD limbs or more.  The even part of the count is handled by
; the unrolled loop, entered part-way through its first pass; a trailing odd
; limb is handled after the loop.

.big:
    mov     r_cnt,len
    and     r_cnt,1                 ; r_cnt = 1 when the limb count is odd
    mov     sav_odd,r_cnt           ; remember it until after the loop
    and     len,-2                  ; len = even part of the limb count
    mov     r_cnt,len
    dec     r_cnt
    shr     r_cnt,UNROLL_LOG2       ; r_cnt = (len - 1) / UNROLL_COUNT = passes - 1
    neg     len
    and     len,UNROLL_MASK         ; len = limbs to skip in the first pass
    lea     r_jmp,[len*4]
    neg     len                     ; len = -(limbs skipped)
    lea     dst,[dst+len*8+off]     ; pull pointers back by the skipped limbs
    lea     sr1,[sr1+len*8+off]     ;   and apply the displacement bias
    lea     sr2,[sr2+len*8+off]
    shr     rax,1                   ; CF = initial borrow; rax is free from here
    lea     r_jmp,[r_jmp+r_jmp*2]   ; r_jmp = 12 * skipped limbs (code bytes to skip)
    lea     rax,[rel .unroll_top]
    lea     r_jmp,[r_jmp+rax]
    jmp     r_jmp

; Each two-limb chunk below must assemble to exactly 24 bytes (12 per limb)
; for the computed entry point above to be right; the `byte` overrides force
; an 8-bit displacement on every operand, including a zero one.

.unroll_top:
%define CHUNK_COUNT  2
%assign i 0
%rep    UNROLL_COUNT / CHUNK_COUNT
%assign disp0 8 * i * CHUNK_COUNT - off
    mov     r_jmp,[byte sr1+disp0]      ; r_jmp and len are no longer needed
    mov     len,[byte sr1+disp0+8]      ; for their original roles: reuse them
    sbb     r_jmp,[byte sr2+disp0]
    mov     [byte dst+disp0],r_jmp
    sbb     len,[byte sr2+disp0+8]
    mov     [byte dst+disp0+8],len
%assign i i + 1
%endrep
    dec     r_cnt                   ; sets SF for jns below, keeps CF
    lea     sr1,[sr1+UNROLL_BYTES]
    lea     sr2,[sr2+UNROLL_BYTES]
    lea     dst,[dst+UNROLL_BYTES]
    jns     .unroll_top

    mov     rax,sav_odd             ; 1 if one limb is still outstanding
    dec     rax                     ; 0 -> negative, 1 -> zero; CF untouched
    js      .exit
    mov     len,[sr1-off]           ; final odd limb
    sbb     len,[sr2-off]
    mov     [dst-off],len
.exit:
    mov     eax,0                   ; rax = 0 via mov so that CF survives
    setc    al                      ; return value = final borrow
    ret

    end