mpir/mpn/x86_64/amd64/sqr_basecase.as

; Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
;
; This file is part of the GNU MP Library.
;
; The GNU MP Library is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public License as
; published by the Free Software Foundation; either version 2.1 of the
; License, or (at your option) any later version.
;
; The GNU MP Library is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public
; License along with the GNU MP Library; see the file COPYING.LIB. If
; not, write to the Free Software Foundation, Inc., 59 Temple Place -
; Suite 330, Boston, MA 02111-1307, USA.
;
; Adapted by Brian Gladman for AMD64 using the Microsoft VC++ v8
; 64-bit compiler and the YASM assembler.
; AMD64 mpn_sqr_basecase -- square an mpn number.
;
; Calling interface:
;
;                               win64   sysv
; void mpn_sqr_basecase(
;     mp_ptr    dst,             rcx    rdi
;     mp_srcptr src,             rdx    rsi
;     mp_size_t size             r8     rdx
; )
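;
; Method, as an illustrative C-style sketch (not part of the build;
; the products and the shift are multi-precision, carried limb by
; limb):
;
;   for (i = 0; i < n; i++)              /* cross products        */
;       for (j = i + 1; j < n; j++)
;           dst[i+j] += src[i] * src[j];
;   dst <<= 1;                           /* double them           */
;   for (i = 0; i < n; i++)              /* add diagonal squares  */
;       dst[2*i] += src[i] * src[i];
;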
; The unroll count here is set very large so that tuneup won't run
; out of unrolled loop code when it tries ridiculously large
; crossover values for Karatsuba squaring. Tuneup was observed to
; require a value of over 128 here; though 150 seems OK, we set it
; even higher.
%include '../yasm_mac.inc'
%define UNROLL_COUNT 300
%if UNROLL_COUNT > 31
%define DWORD_OFFSETS
%else
%undef DWORD_OFFSETS
%endif
%ifdef DWORD_OFFSETS
%define ADR_BIAS 0
%elif UNROLL_COUNT > 15
%define ADR_BIAS (UNROLL_COUNT - 15) * 8
%else
%define ADR_BIAS 0
%endif
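; With byte displacements (UNROLL_COUNT <= 31) the unrolled code can
; only reach limbs within [-128, +127] of rsi/rdi, so for counts of
; 16..31 both pointers are biased down by ADR_BIAS and the
; displacements biased up to compensate, keeping them in range.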
%define r_ptr r10
%define x_ptr r9
%define x_len r8
%define v_ctr r12
%define v_jmp r13
bits 64
section .text
G_EXPORT __gmpn_sqr_basecase
G_LABEL __gmpn_sqr_basecase
movsxd x_len,edx
cmp x_len,2
je sqr_2
ja sqr_3_plus
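; size == 1: dst[1],dst[0] = src[0]^2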
mov rax,[rsi]
mul rax
mov [rdi+8],rdx
mov [rdi],rax
ret
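; size == 2: with x = x1*2^64 + x0,
;   x^2 = x1^2*2^128 + 2*x0*x1*2^64 + x0^2
; so store the two diagonal squares, then add the doubled cross
; product x0*x1 (top carry collected in rcx) into dst[1..3].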
sqr_2:
mov r_ptr,rdi
mov x_ptr,rsi
mov r8,[x_ptr]
mov r9,[x_ptr+8]
mov rax,r8
mul r8
mov [r_ptr],rax
mov [r_ptr+8],rdx
mov rax,r9
mul r9
mov [r_ptr+16],rax
mov [r_ptr+24],rdx
xor rcx,rcx
mov rax,r8
mul r9
add rax,rax
adc rdx,rdx
adc rcx,rcx
add [r_ptr+8],rax
adc [r_ptr+16],rdx
adc [r_ptr+24],rcx
ret
sqr_3_plus:
push rbx
push rsi
push rdi
push rbp
push r12
push r13
mov r_ptr,rdi
mov x_ptr,rsi
cmp x_len,4
jae sqr_4_plus
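; size == 3: store the diagonal squares at the even limb positions,
; accumulate the cross products x0*x1, x0*x2, x1*x2 in
; rsi:rdi:rbp:rdx, double them, and add them into dst[1..5].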
mov rax,[x_ptr]
mul rax
mov [r_ptr],rax
mov rax,[x_ptr+8]
mov [r_ptr+8],rdx
mul rax
mov [r_ptr+16],rax
mov rax,[x_ptr+16]
mov [r_ptr+24],rdx
mul rax
mov [r_ptr+32],rax
mov rax,[x_ptr]
mov [r_ptr+40],rdx
mul qword [x_ptr+8]
mov rsi,rax
mov rax,[x_ptr]
mov rdi,rdx
mul qword [x_ptr+16]
add rdi,rax
mov rbp,dword 0
mov rax,[x_ptr+8]
adc rbp,rdx
mul qword [x_ptr+16]
xor x_ptr,x_ptr
add rbp,rax
adc rdx,dword 0
adc rdx,dword 0
add rsi,rsi
adc rdi,rdi
mov rax,[r_ptr+8]
adc rbp,rbp
adc rdx,rdx
adc x_ptr,dword 0
add rsi,rax
mov rax,[r_ptr+16]
adc rdi,rax
mov rax,[r_ptr+24]
mov [r_ptr+8],rsi
adc rbp,rax
mov rax,[r_ptr+32]
mov [r_ptr+16],rdi
adc rdx,rax
mov rax,[r_ptr+40]
mov [r_ptr+24],rbp
adc rax,x_ptr
mov [r_ptr+32],rdx
mov [r_ptr+40],rax
jmp sqr_exit
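; size >= 4: three passes over dst, as in the sketch above.
;   1. cross products: the first row dst[1..n] = src[1..n-1]*src[0]
;      is done by the small loop below; the remaining, successively
;      shorter rows are added by jumping into the unrolled code.
;   2. dst is shifted left one bit to double the cross products.
;   3. the diagonal squares src[i]^2 are added in at dst[2i].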
sqr_4_plus:
mov rcx,x_len
lea rdi,[r_ptr+rcx*8]
lea rsi,[x_ptr+rcx*8]
mov rbp,[x_ptr]
mov rbx,dword 0
dec rcx
neg rcx
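; row 0: dst[i] = src[i]*src[0] for i = 1..n-1, carry out to dst[n]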
.0: mov rax,[rsi+rcx*8]
mul rbp
add rax,rbx
mov [rdi+rcx*8],rax
mov rbx,dword 0
adc rbx,rdx
inc rcx
jnz .0
mov rcx,x_len
mov [rdi],rbx
sub rcx,4
jz L_corner
neg rcx
%if ADR_BIAS != 0
sub rdi,ADR_BIAS
sub rsi,ADR_BIAS
%endif
mov rdx,rcx
%ifdef DWORD_OFFSETS
%define CODE_BYTES_PER_LIMB 31 ; must be odd
%define dsiz dword
shl rcx,5
sub rcx,rdx
%ifdef PIC
call .pic_calc
%else
lea v_jmp,[rel .3]
%endif
..@unroll_here1:
lea rcx,[rcx+(UNROLL_COUNT - 2) * CODE_BYTES_PER_LIMB]
%else
%define CODE_BYTES_PER_LIMB 25 ; must be odd
%define dsiz byte
shl rcx,3
lea rcx,[rcx+rcx*2]
%ifdef PIC
call .pic_calc
%else
lea v_jmp,[rel .3]
%endif
..@unroll_here1:
lea rcx,[rcx+rdx+(UNROLL_COUNT - 2) * CODE_BYTES_PER_LIMB]
%endif
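; each limb of the unrolled body below assembles to exactly
; CODE_BYTES_PER_LIMB bytes, so rcx (offset) plus v_jmp (base) gives
; an entry address that executes exactly the right number of
; products for the current row.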
lea rcx,[rcx+v_jmp]
.2: lea v_jmp,[rcx+CODE_BYTES_PER_LIMB]
mov rbp,[rsi+rdx*8-24+ADR_BIAS]
mov rax,[rsi+rdx*8-16+ADR_BIAS]
mov v_ctr,rdx
mul rbp
test cl,1
mov rbx,rdx
mov rcx,rax
%if (UNROLL_COUNT % 2)
cmovnz rbx,rax
cmovnz rcx,rdx
%else
cmovz rbx,rax
cmovz rcx,rdx
%endif
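; CODE_BYTES_PER_LIMB is odd and ..@unroll_entry1 is 2-byte aligned,
; so bit 0 of the entry address gives the parity of the entry limb;
; the cmovs place the first product in rbx/rcx or rcx/rbx to match
; the alternating roles the two registers play in the unrolled body.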
xor rdx,rdx
lea rdi,[rdi+8]
jmp v_jmp
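; PIC: compute the address of ..@unroll_entry1 from the return
; address (which is ..@unroll_here1) rather than using an absolute
; 64-bit relocation.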
.pic_calc:
mov v_jmp, ..@unroll_entry1 - ..@unroll_here1
add v_jmp, [rsp]
ret
align 2
.3:
..@unroll_entry1:
%assign i UNROLL_COUNT
%rep UNROLL_COUNT
%define disp_src ADR_BIAS - 8 * i
%ifndef DWORD_OFFSETS
%if disp_src < -120 || disp_src >= 128
%error source displacement too large
%endif
%endif
%if (i % 2) = 0 ; 25 bytes of code per limb
nop
mov rax,[dsiz rsi + disp_src]
adc rbx,rdx
mul rbp
add [dsiz rdi + disp_src - 8],rcx
mov rcx,dword 0
adc rbx,rax
%else
nop
mov rax,[dsiz rsi + disp_src]
adc rcx,rdx
mul rbp
add [dsiz rdi + disp_src - 8],rbx
%if i != 1
mov rbx,dword 0
%endif
adc rcx,rax
%endif
%assign i i - 1
%endrep
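; end of a row: fold the two remaining carry limbs into dst, pick up
; the entry address for the next (one limb shorter) row from v_jmp,
; and loop until the row counter in v_ctr reaches zero.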
adc rdx,dword 0
add [rdi-8+ADR_BIAS],rcx
mov rcx,v_jmp
adc rdx,dword 0
mov [rdi+ADR_BIAS],rdx
mov rdx,v_ctr
inc rdx
jnz .2
%if ADR_BIAS != 0
add rsi,ADR_BIAS
add rdi,ADR_BIAS
%endif
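; corner: three source limbs remain; form the final cross products
; src[n-3]*src[n-2], src[n-3]*src[n-1] and src[n-2]*src[n-1]
; directly.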
L_corner:
mov rbp,[rsi-24]
mov rax,[rsi-16]
mov rcx,rax
mul rbp
add [rdi-8],rax
mov rax,[rsi-8]
adc rdx,dword 0
mov rbx,rdx
mov rsi,rax
mul rbp
add rax,rbx
adc rdx,dword 0
add [rdi],rax
mov rax,rsi
adc rdx,dword 0
mov rbx,rdx
mul rcx
add rax,rbx
mov [rdi+8],rax
adc rdx,dword 0
mov [rdi+16],rdx
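; all cross products are now in dst[1..2n-2]; shift them left one
; bit to double them, the bit shifted out of dst[2n-2] becoming
; dst[2n-1] via setc. dst[0] is first written by the diagonal pass.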
mov rax,x_len ; start of shift
mov rdi,r_ptr
xor rcx,rcx
lea r11,[rax+rax]
lea rdi,[rdi+r11*8]
not rax
lea rax,[rax+2]
.0: lea r11,[rax+rax]
rcl qword [rdi+r11*8-8],1
rcl qword [rdi+r11*8],1
inc rax
jnz .0
setc al
mov rsi,x_ptr
mov [rdi-8],rax
mov rcx,x_len
mov rax,[rsi]
mul rax
lea rsi,[rsi+rcx*8]
neg rcx
lea r11,[rcx+rcx]
mov [rdi+r11*8],rax
inc rcx
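; add src[i]^2 into dst[2i] and dst[2i+1] for i = 1..n-1, the high
; half of each square carried into the next pair via rbx/rdx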
.1: lea r11,[rcx+rcx]
mov rax,[rsi+rcx*8]
mov rbx,rdx
mul rax
add [rdi+r11*8-8],rbx
adc [rdi+r11*8],rax
adc rdx,dword 0
inc rcx
jnz .1
add [rdi-8],rdx
sqr_exit:
pop r13
pop r12
pop rbp
pop rdi
pop rsi
pop rbx
ret
end