;  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
;
;  Copyright 2005, 2006 Pierrick Gaudry
;
;  Copyright 2008 Brian Gladman, William Hart
;
;  This file is part of the MPIR Library.
;
;  The MPIR Library is free software; you can redistribute it and/or
;  modify it under the terms of the GNU Lesser General Public License as
;  published by the Free Software Foundation; either version 2.1 of the
;  License, or (at your option) any later version.
;
;  The MPIR Library is distributed in the hope that it will be useful,
;  but WITHOUT ANY WARRANTY; without even the implied warranty of
;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;  Lesser General Public License for more details.
;
;  You should have received a copy of the GNU Lesser General Public
;  License along with the MPIR Library; see the file COPYING.LIB.  If
;  not, write to the Free Software Foundation, Inc., 51 Franklin Street,
;  Fifth Floor, Boston, MA 02110-1301, USA.
;
;  Adapted by Brian Gladman for AMD64 using the Microsoft VC++ v8 64-bit
;  compiler and the YASM assembler.
;
;  AMD64 mpn_submul_1/mpn_submul_1c -- multiply a limb vector by a single
;  limb and subtract the product from a second limb vector.
;
;  Calling interface (argument registers as read by the code below):
;
;  mp_limb_t __gmpn_submul_1(
;     mp_ptr dst,               rdi
;     mp_srcptr src,            rsi
;     mp_size_t size,           rdx
;     mp_limb_t mult            rcx
;  )
;
;  mp_limb_t __gmpn_submul_1c(
;     mp_ptr dst,               rdi
;     mp_srcptr src,            rsi
;     mp_size_t size,           rdx
;     mp_limb_t mult,           rcx
;     mp_limb_t carry            r8
;  )
;
;  Calculate src[size] multiplied by mult[1] and add to / subtract from
;  dst[size] and return the carry or borrow from the top of the result.
;  The operation (add or sub) is the first macro parameter; this file
;  instantiates the macro with "sub" only.
;
;  NOTE(review): the exported symbol decoration (e.g. the "__g" prefix) is
;  produced by GLOBAL_FUNC in yasm_mac.inc, which is not visible here.
;
;  Register roles inside mac_sub:
;     rdi        dst pointer (biased for the loops)
;     rsi        src pointer (biased for the loops)
;     rdx        size on entry; afterwards the high half of each product
;     rax        current source limb / low half of each product / result
;     r9         multiplier on entry; block counter in the unrolled loop
;     rbp        multiplier while looping
;     rcx, rbx   running carry words
;     r8         carry-in (the "1c" entry); later the computed jump target
;     r10, r11   hold the caller's rbx and rbp until they are restored
;
;  size == 0 is not handled: "dec rdx" would wrap and the code would run
;  the unrolled path with a huge count.

%include '../yasm_mac.inc'

%define UNROLL_LOG2         4
%define UNROLL_COUNT        (1 << UNROLL_LOG2)
%define UNROLL_MASK         (UNROLL_COUNT - 1)
%define UNROLL_BYTES        (8 * UNROLL_COUNT)

; The unrolled loop is used when size - 1 > UNROLL_THRESHOLD (unsigned).
%ifdef PIC
%define UNROLL_THRESHOLD    9
%else
%define UNROLL_THRESHOLD    6
%endif

; "off" biases rsi/rdi so that every displacement used in the unrolled
; block (8 * k - off) can be written with the "byte" size override.
%if UNROLL_BYTES >= 256
%error unroll count is too large
%elif UNROLL_BYTES >= 128
%define off 128
%else
%define off 0
%endif

; mac_sub <op>, <name>, <name_with_carry>
;   %1  instruction applied to the destination limb (add or sub)
;   %2  entry point without carry-in
;   %3  entry point with carry-in in r8

%macro  mac_sub 3

        align   32
GLOBAL_FUNC %2
        mov     r9, rcx                 ; r9 = multiplier
        mov     r10, rbx                ; keep caller's rbx
        mov     r11, rbp                ; keep caller's rbp
        mov     rax, rsi                ; rax = src pointer
        xor     rcx, rcx                ; carry-in = 0
        dec     rdx                     ; rdx = size - 1
        jnz     %%5                     ; more than one limb

        ; exactly one limb (rbx / rbp were never modified on this path)
        mov     rax, [rax]              ; rax = src[0]
        mov     rcx, rdi                ; rcx = dst (mul overwrites rdx)
        mul     r9                      ; rdx:rax = src[0] * mult
        %1      [rcx], rax              ; dst[0] op= low half
        adc     rdx, 0                  ; carry / borrow into the high half
        mov     rax, rdx                ; return it
        ret

        align   16
GLOBAL_FUNC %3
        mov     r9, rcx                 ; r9 = multiplier
        mov     r10, rbx                ; keep caller's rbx
        mov     r11, rbp                ; keep caller's rbp
        mov     rax, rsi                ; rax = src pointer
        dec     rdx                     ; rdx = size - 1
        jnz     %%1                     ; more than one limb

        ; exactly one limb, with carry-in
        mov     rax, [rax]              ; rax = src[0]
        mov     rcx, rdi                ; rcx = dst (mul overwrites rdx)
        mul     r9                      ; rdx:rax = src[0] * mult
        add     rax, r8                 ; add the carry-in to the low half
        adc     rdx, 0                  ; propagate into the high half
        %1      [rcx], rax              ; dst[0] op= low half
        adc     rdx, 0                  ; carry / borrow into the high half
        mov     rax, rdx                ; return it
        ret

        align   16
%%1:
        mov     rcx, r8                 ; rcx = carry-in, then fall through

        align   32
%%5:                                    ; common path: rdx = size - 1 >= 1, rcx = carry
        mov     rbx, rdx                ; rbx = size - 1
        cmp     rdx, UNROLL_THRESHOLD
        mov     rbp, r9                 ; rbp = multiplier
        mov     rax, [rsi]              ; rax = src[0]
        ja      %%3                     ; many limbs: use the unrolled loop

        ; simple loop: rbx runs from -(size - 1) up to 0
        lea     rsi, [rsi+rbx*8+8]      ; one past the last source limb
        lea     rdi, [rdi+rbx*8]        ; last destination limb
        neg     rbx

%%2:
        mul     rbp                     ; rdx:rax = current src limb * mult
        add     rcx, rax                ; low half + carry word
        adc     rdx, 0                  ; propagate into the high half
        %1      [rdi+rbx*8], rcx        ; dst limb op= low word
        mov     rax, [rsi+rbx*8]        ; next source limb (mov keeps CF)
        adc     rdx, 0                  ; carry / borrow into the high half
        inc     rbx                     ; next limb; sets ZF for the jnz below
        mov     rcx, rdx                ; high half becomes the carry word
        jnz     %%2

        ; last limb (rax already holds it)
        mul     rbp
        mov     rbx, r10                ; restore caller's rbx
        mov     rbp, r11                ; restore caller's rbp
        add     rcx, rax
        adc     rdx, 0
        %1      [rdi], rcx
        adc     rdx, 0
        mov     rax, rdx                ; return carry value as a limb
        ret

        ; Unrolled path.  The block at %%4 handles UNROLL_COUNT limbs per
        ; pass and is entered part-way through on the first pass.  The
        ; entry offset is idx * 8 + idx * 16, i.e. the computed jump
        ; assumes each one-limb slot of the block occupies 24 bytes.
        align   32
%%3:
        sub     rbx, 2                  ; rbx = size - 3
        dec     rdx                     ; rdx = size - 2 (limbs stored by the block)
        shr     rbx, UNROLL_LOG2
        neg     rdx
        mov     r9, rbx                 ; r9 = passes through the block, minus one
        and     rdx, UNROLL_MASK        ; rdx = idx, the entry slot (0 .. UNROLL_MASK)
        mov     r8, rdx
        mov     rbx, rdx                ; rbx = idx, used for pointer bias and parity
        shl     r8, 3                   ; r8  = idx * 8
        shl     rdx, 4                  ; rdx = idx * 16
%ifdef PIC
        ; the call pushes the address of .unroll_here; .pic_calc adds it to
        ; the link-time distance between .unroll_here and the block start
        call    .pic_calc
.unroll_here:
..@unroll_here1:
%else
        ; RIP-relative addressing cannot carry base/index registers, so add
        ; the offsets first and fetch the block address separately (r8 is
        ; dead once the offsets are summed; flags are not live here)
        lea     rdx, [rdx+r8]           ; rdx = idx * 24
        lea     r8, [rel %%4]           ; r8  = start of the unrolled block
        add     rdx, r8                 ; rdx = entry address
%endif
        neg     rbx                     ; rbx = -idx
        mov     r8, rdx                 ; r8 = entry address (mul overwrites rdx)
        mul     rbp                     ; rdx:rax = src[0] * mult
        add     rcx, rax                ; rcx = low word including carry-in
        adc     rdx, 0                  ; rdx = high word
        test    bl, 1                   ; ZF clear when the entry slot is odd
        mov     rax, [rsi+8]            ; rax = src[1]
        lea     rsi, [rsi+rbx*8+off+16] ; bias so the entry slot loads src[2]
        lea     rdi, [rdi+rbx*8+off]    ; bias so the entry slot stores dst[0]
        mov     rbx, rdx                ; even entry: rcx = low, rbx = high
        cmovnz  rbx, rcx                ; odd entry: the two roles are swapped
        cmovnz  rcx, rdx
        jmp     r8

%ifdef PIC
.pic_calc:
        lea     rdx, [rdx+r8]           ; rdx = idx * 24
        add     rdx, ..@unroll_entry1 - ..@unroll_here1
        add     rdx, [rsp]              ; + return address (= .unroll_here)
        ret
%endif

        align   32
.unroll_entry1:
..@unroll_entry1:
%%4:
%define CHUNK_COUNT 2
%assign i 0
%rep    UNROLL_COUNT / CHUNK_COUNT
%assign disp0 8 * i * CHUNK_COUNT - off
%assign disp1 disp0 + 8

        ; even slot: rcx = word to store, rbx = pending high word
        mul     rbp                     ; rdx:rax = current src limb * mult
        %1      [byte rdi+disp0], rcx   ; dst limb op= rcx, CF = carry / borrow
        mov     rcx, 0                  ; clear without touching CF (xor would
                                        ; reset it); NOTE(review): presumably
                                        ; also relied on for its encoded length
        adc     rbx, rax                ; next word = high + low + CF
        adc     rcx, rdx                ; next pending high word
        mov     rax, [byte rsi+disp0]   ; next source limb

        ; odd slot: same step with rbx and rcx exchanged
        mul     rbp
        %1      [byte rdi+disp1], rbx
        mov     rbx, 0                  ; clear without touching CF
        adc     rcx, rax
        adc     rbx, rdx
        mov     rax, [byte rsi+disp1]
%assign i i + 1
%endrep

        dec     r9                      ; one pass done; SF set once r9 < 0
        lea     rsi, [rsi+UNROLL_BYTES] ; lea leaves the flags for jns intact
        lea     rdi, [rdi+UNROLL_BYTES]
        jns     %%4

        ; tail: store the pending word, then the final product
        mul     rbp                     ; rdx:rax = last src limb * mult
        %1      [rdi-off], rcx
        mov     rbp, r11                ; restore caller's rbp (mov keeps CF)
        adc     rax, rbx
        mov     rbx, r10                ; restore caller's rbx
        adc     rdx, 0
        %1      [rdi-off+8], rax
        adc     rdx, 0
        mov     rax, rdx                ; return carry / borrow limb
        ret
%endmacro

        BITS    64

        mac_sub sub, mpn_submul_1, mpn_submul_1c