mpir/mpn/x86_64/amd64/addmul_1.as
2008-07-23 18:37:20 +00:00

282 lines
6.6 KiB
ActionScript

; Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
;
; This file is part of the GNU MP Library.
;
; The GNU MP Library is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public License as
; published by the Free Software Foundation; either version 2.1 of the
; License, or (at your option) any later version.
;
; The GNU MP Library is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public
; License along with the GNU MP Library; see the file COPYING.LIB. If
; not, write to the Free Software Foundation, Inc., 59 Temple Place -
; Suite 330, Boston, MA 02111-1307, USA.
;
; Adapted by Brian Gladman AMD64 using the Microsoft VC++ v8 64-bit
; compiler and the YASM assembler.
; AMD64 mpn_add_n/mpn_sub_n -- mpn add or subtract.
;
; Calling interface:
;
; mp_limb_t __gmpn_<op>mul_1( <op> = add or sub
; mp_ptr dst, rdi
; mp_srcptr src, rsi
; mp_size_t size, rdx
; mp_limb_t mult rcx
; )
;
; mp_limb_t __gmpn_<op>mul_1c(
; mp_ptr dst, rdi
; mp_srcptr src, rsi
; mp_size_t size, rdx
; mp_limb_t mult, rcx
; mp_limb_t carry r8
; )
;
; Calculate src[size] multiplied by mult[1] and add to /subtract from dst[size] and
; return the carry or borrow from the top of the result
%include '../yasm_mac.inc'
%define UNROLL_LOG2 4
%define UNROLL_COUNT (1 << UNROLL_LOG2)
%define UNROLL_MASK (UNROLL_COUNT - 1)
%define UNROLL_BYTES (8 * UNROLL_COUNT)
%ifdef PIC
%define UNROLL_THRESHOLD 9
%else
%define UNROLL_THRESHOLD 6
%endif
%if UNROLL_BYTES >= 256
%error unroll count is too large
%elif UNROLL_BYTES >= 128
%define off 128
%else
%define off 0
%endif
%macro mac_sub 3
align 32
GLOBAL_FUNC %2
mov r9, rcx
mov r10, rbx
mov r11, rbp
mov rax, rsi
xor rcx, rcx ; carry = 0
dec rdx ; test for one limb only
jnz %%5 ; if more than one
mov rax,[rax] ; get limb value
mov rcx, rdi
mul r9 ; rax * mlt -> rdx (hi), rax (lo)
%1 [rcx],rax ; add/sub from destination
adc rdx, 0 ; add any carry into high word
mov rax,rdx ; and return the carry value
ret
align 16
GLOBAL_FUNC %3
mov r9, rcx
mov r10, rbx
mov r11, rbp
mov rax,rsi ; source pointer
dec rdx ; test for one limb
jnz %%1 ; if more than one
mov rax,[rax] ; get limb value
mov rcx, rdi
mul r9 ; rax * mlt -> rdx (hi), rax (lo)
add rax,r8 ; add in input carry
adc rdx, 0 ; propagate it into rdx
%1 [rcx],rax ; add or subtract rax from dest limb
adc rdx, 0 ; propagate carry into high word
mov rax,rdx
ret
align 16
%%1:
mov rcx, r8
align 32
%%5:
mov rbx, rdx
cmp rdx, UNROLL_THRESHOLD
mov rbp, r9
mov rax,[rsi] ; first limb of source
ja %%3 ; unroll for many limbs
lea rsi,[rsi+rbx*8+8] ; next source limb
lea rdi,[rdi+rbx*8] ; current dst limb
neg rbx
; simple loop
%%2:
mul rbp ; multiply current src limb -> rxx, rax
add rcx,rax ; add in carry
adc rdx, 0 ; propagate carry into rdx
%1 [rdi+rbx*8],rcx ; add or subtract rax from dest limb
mov rax,[rsi+rbx*8] ; get next source limb
adc rdx, 0 ; add carry or borrow into high word
inc rbx ; go to next limb
mov rcx,rdx ; high word -> carry
jnz %%2
mul rbp ; one more limb to do
mov rbx, r10
mov rbp, r11
add rcx,rax
adc rdx, 0
%1 [rdi],rcx
adc rdx, 0
mov rax,rdx ; return carry value as a limb
ret
align 32
%%3:
sub rbx, 2
dec rdx
shr rbx,UNROLL_LOG2
neg rdx
mov r9, rbx
and rdx,UNROLL_MASK
mov r8,rdx
mov rbx,rdx ; cry_hi and jmp_val are temporary
shl r8,3 ; values for calculating the jump
shl rdx,4 ; offset into the unrolled code
%ifdef PIC
call .pic_calc
.unroll_here:
..@unroll_here1:
%else
lea rdx,[rel %%4 + rdx + r8]
%endif
neg rbx
mov r8, rdx
mul rbp
add rcx,rax ; initial carry, becomes low carry
adc rdx, 0
test bl,1
mov rax,[rsi+8] ; src second limb
lea rsi,[rsi+rbx*8+off+16]
lea rdi,[rdi+rbx*8+off]
mov rbx,rdx
cmovnz rbx,rcx ; high, low carry other way around
cmovnz rcx,rdx
jmp r8
%ifdef PIC
.pic_calc:
lea rdx, [rdx+r8]
add rdx, ..@unroll_entry1 - ..@unroll_here1
add rdx, [rsp]
ret
%endif
align 32
.unroll_entry1:
..@unroll_entry1:
%%4:
%define CHUNK_COUNT 2
%assign i 0
%rep UNROLL_COUNT / CHUNK_COUNT
%assign disp0 8 * i * CHUNK_COUNT - off
%assign disp1 disp0 + 8
mul rbp
%1 [byte rdi+disp0],rcx
mov rcx, 0 ; len = 0
adc rbx,rax
adc rcx,rdx
mov rax,[byte rsi+disp0]
mul rbp
%1 [byte rdi+disp1],rbx
mov rbx,0 ; len = 0
adc rcx,rax
adc rbx,rdx
mov rax,[byte rsi+disp1]
%assign i i + 1
%endrep
dec r9
lea rsi,[rsi+UNROLL_BYTES]
lea rdi,[rdi+UNROLL_BYTES]
jns %%4
mul rbp
%1 [rdi-off],rcx
mov rbp, r11
adc rax,rbx
mov rbx, r10
adc rdx,0
%1 [rdi-off+8],rax
adc rdx,0
mov rax,rdx
ret
%endmacro
BITS 64
mac_sub add,mpn_addmul_1,mpn_addmul_1c