; Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
;
; This file is part of the GNU MP Library.
;
; The GNU MP Library is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public License as
; published by the Free Software Foundation; either version 2.1 of the
; License, or (at your option) any later version.
;
; The GNU MP Library is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public
; License along with the GNU MP Library; see the file COPYING.LIB.  If
; not, write to the Free Software Foundation, Inc., 59 Temple Place -
; Suite 330, Boston, MA 02111-1307, USA.
;
; Adapted for AMD64 by Brian Gladman using the Microsoft VC++ v8 64-bit
; compiler and the YASM assembler.
;
; AMD64 mpn_sqr_basecase -- square an mpn number.
;
; Calling interface:
;
;   void mpn_sqr_basecase(       win64    sysv
;       mp_ptr    dst,            rcx      rdi
;       mp_srcptr src,            rdx      rsi
;       mp_size_t size            r8       rdx
;   )
;
; The unroll count here is set very large so that tuneup won't run out
; of unrolled loop when it tries ridiculously large crossover values
; with Karatsuba squaring.  Tuneup was observed to require a value here
; of over 128.  Though 150 seems OK, we set it even higher.
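;
; Usage note (a sketch, not part of this file): the destination must have
; room for 2*size limbs, and dst and src must not overlap.  Within GMP this
; routine is internal and is normally reached through the higher-level
; squaring entry points below the Karatsuba threshold; a direct call would
; look like the following (the data values are illustrative only):
;
;   mp_limb_t src[4] = { 1, 2, 3, 4 }, dst[8];
;   mpn_sqr_basecase(dst, src, 4);   /* dst[0..7] = square of src[0..3] */
;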
%define UNROLL_COUNT 300

%if UNROLL_COUNT > 31
%define DWORD_OFFSETS
%else
%undef  DWORD_OFFSETS
%endif

%ifdef DWORD_OFFSETS
%define ADR_BIAS    0
%elif UNROLL_COUNT > 15
%define ADR_BIAS    (UNROLL_COUNT - 15) * 8
%else
%define ADR_BIAS    0
%endif

%define r_ptr   r10
%define x_ptr   r9
%define x_len   r8
%define v_ctr   r12
%define v_jmp   r13

    bits    64
    section .text

    global  __gmpn_sqr_basecase:function

__gmpn_sqr_basecase:
    movsxd  x_len,edx
    cmp     x_len,2
    je      sqr_2
    ja      sqr_3_plus

; size == 1
    mov     rax,[rsi]
    mul     rax
    mov     [rdi+8],rdx
    mov     [rdi],rax
    ret

; size == 2
sqr_2:
    mov     r_ptr,rdi
    mov     x_ptr,rsi
    mov     r8,[x_ptr]
    mov     r9,[x_ptr+8]
    mov     rax,r8
    mul     r8
    mov     [r_ptr],rax
    mov     [r_ptr+8],rdx
    mov     rax,r9
    mul     r9
    mov     [r_ptr+16],rax
    mov     [r_ptr+24],rdx
    xor     rcx,rcx
    mov     rax,r8
    mul     r9
    add     rax,rax
    adc     rdx,rdx
    adc     rcx,rcx
    add     [r_ptr+8],rax
    adc     [r_ptr+16],rdx
    adc     [r_ptr+24],rcx
    ret

; size >= 3
sqr_3_plus:
    push    rbx
    push    rsi
    push    rdi
    push    rbp
    push    r12
    push    r13
    mov     r_ptr,rdi
    mov     x_ptr,rsi
    cmp     x_len,4
    jae     sqr_4_plus

; size == 3: diagonal squares first, then the doubled cross products
    mov     rax,[x_ptr]
    mul     rax
    mov     [r_ptr],rax
    mov     rax,[x_ptr+8]
    mov     [r_ptr+8],rdx
    mul     rax
    mov     [r_ptr+16],rax
    mov     rax,[x_ptr+16]
    mov     [r_ptr+24],rdx
    mul     rax
    mov     [r_ptr+32],rax
    mov     rax,[x_ptr]
    mov     [r_ptr+40],rdx
    mul     qword [x_ptr+8]
    mov     rsi,rax
    mov     rax,[x_ptr]
    mov     rdi,rdx
    mul     qword [x_ptr+16]
    add     rdi,rax
    mov     rbp,dword 0
    mov     rax,[x_ptr+8]
    adc     rbp,rdx
    mul     qword [x_ptr+16]
    xor     x_ptr,x_ptr
    add     rbp,rax
    adc     rdx,dword 0
    adc     rdx,dword 0
    add     rsi,rsi
    adc     rdi,rdi
    mov     rax,[r_ptr+8]
    adc     rbp,rbp
    adc     rdx,rdx
    adc     x_ptr,dword 0
    add     rsi,rax
    mov     rax,[r_ptr+16]
    adc     rdi,rax
    mov     rax,[r_ptr+24]
    mov     [r_ptr+8],rsi
    adc     rbp,rax
    mov     rax,[r_ptr+32]
    mov     [r_ptr+16],rdi
    adc     rdx,rax
    mov     rax,[r_ptr+40]
    mov     [r_ptr+24],rbp
    adc     rax,x_ptr
    mov     [r_ptr+32],rdx
    mov     [r_ptr+40],rax
    jmp     sqr_exit

; size >= 4: first row of cross products, src[0] * src[1 .. size-1]
sqr_4_plus:
    mov     rcx,x_len
    lea     rdi,[r_ptr+rcx*8]
    lea     rsi,[x_ptr+rcx*8]
    mov     rbp,[x_ptr]
    mov     rbx,dword 0
    dec     rcx
    neg     rcx
.0:
    mov     rax,[rsi+rcx*8]
    mul     rbp
    add     rax,rbx
    mov     [rdi+rcx*8],rax
    mov     rbx,dword 0
    adc     rbx,rdx
    inc     rcx
    jnz     .0
    mov     rcx,x_len
    mov     [rdi],rbx
    sub     rcx,4
    jz      L_corner
    neg     rcx
%if ADR_BIAS != 0
    sub     rdi,ADR_BIAS
    sub     rsi,ADR_BIAS
%endif
    mov     rdx,rcx

; The remaining cross-product rows use a computed jump into the unrolled
; code at ..@unroll_entry1, so each limb of unrolled code must occupy
; exactly CODE_BYTES_PER_LIMB bytes for the jump to land correctly.

%ifdef DWORD_OFFSETS
%define CODE_BYTES_PER_LIMB  31     ; must be odd
%define dsiz    dword
    shl     rcx,5
    sub     rcx,rdx
%ifdef PIC
    call    .pic_calc
%else
    lea     v_jmp,[rel .3]
%endif
..@unroll_here1:
    lea     rcx,[rcx+(UNROLL_COUNT - 2) * CODE_BYTES_PER_LIMB]
%else
%define CODE_BYTES_PER_LIMB  25     ; must be odd
%define dsiz    byte
    shl     rcx,3
    lea     rcx,[rcx+rcx*2]
%ifdef PIC
    call    .pic_calc
%else
    lea     v_jmp,[rel .3]
%endif
..@unroll_here1:
    lea     rcx,[rcx+rdx+(UNROLL_COUNT - 2) * CODE_BYTES_PER_LIMB]
%endif
    lea     rcx,[rcx+v_jmp]

; one pass per remaining source limb: multiply it by the limbs above it
; and accumulate into dst, entering the unrolled code at the point that
; leaves just the right number of products to compute
.2:
    lea     v_jmp,[rcx+CODE_BYTES_PER_LIMB]
    mov     rbp,[rsi+rdx*8-24+ADR_BIAS]
    mov     rax,[rsi+rdx*8-16+ADR_BIAS]
    mov     v_ctr,rdx
    mul     rbp
    test    cl,1
    mov     rbx,rdx
    mov     rcx,rax
%if (UNROLL_COUNT % 2)
    cmovnz  rbx,rax                 ; swap rbx/rcx so the carry registers
    cmovnz  rcx,rdx                 ; match the parity of the entry point
%else
    cmovz   rbx,rax
    cmovz   rcx,rdx
%endif
    xor     rdx,rdx
    lea     rdi,[rdi+8]
    jmp     v_jmp

%ifdef PIC
.pic_calc:
    mov     v_jmp, ..@unroll_entry1 - ..@unroll_here1
    add     v_jmp, [rsp]
    ret
%endif

    align   2
.3:
..@unroll_entry1:

%assign i UNROLL_COUNT
%rep UNROLL_COUNT
%define disp_src    ADR_BIAS - 8 * i
%ifndef DWORD_OFFSETS
%if disp_src < -120 || disp_src >= 128
%error source displacement too large
%endif
%endif

%if (i % 2) = 0                     ; 25 or 31 bytes of code per limb; the nop pads the count
    nop
    mov     rax,[dsiz rsi + disp_src]
    adc     rbx,rdx
    mul     rbp
    add     [dsiz rdi + disp_src - 8],rcx
    mov     rcx,dword 0
    adc     rbx,rax
%else
    nop
    mov     rax,[dsiz rsi + disp_src]
    adc     rcx,rdx
    mul     rbp
    add     [dsiz rdi + disp_src - 8],rbx
%if i != 1
    mov     rbx,dword 0
%endif
    adc     rcx,rax
%endif
%assign i i - 1
%endrep

    adc     rdx,dword 0
    add     [rdi-8+ADR_BIAS],rcx
    mov     rcx,v_jmp
    adc     rdx,dword 0
    mov     [rdi+ADR_BIAS],rdx
    mov     rdx,v_ctr
    inc     rdx
    jnz     .2
%if ADR_BIAS != 0
    add     rsi,ADR_BIAS
    add     rdi,ADR_BIAS
%endif

; the last three cross products: src[n-3]*src[n-2], src[n-3]*src[n-1]
; and src[n-2]*src[n-1]
L_corner:
    mov     rbp,[rsi-24]
    mov     rax,[rsi-16]
    mov     rcx,rax
    mul     rbp
    add     [rdi-8],rax
    mov     rax,[rsi-8]
    adc     rdx,dword 0
    mov     rbx,rdx
    mov     rsi,rax
    mul     rbp
    add     rax,rbx
    adc     rdx,dword 0
    add     [rdi],rax
    mov     rax,rsi
    adc     rdx,dword 0
    mov     rbx,rdx
    mul     rcx
    add     rax,rbx
    mov     [rdi+8],rax
    adc     rdx,dword 0
    mov     [rdi+16],rdx

; double the cross products by shifting them left one bit
    mov     rax,x_len               ; start of shift
    mov     rdi,r_ptr
    xor     rcx,rcx
    lea     r11,[rax+rax]
    lea     rdi,[rdi+r11*8]
    not     rax
    lea     rax,[rax+2]
.0:
    lea     r11,[rax+rax]
    rcl     qword [rdi+r11*8-8],1
    rcl     qword [rdi+r11*8],1
    inc     rax
    jnz     .0
    setc    al
    mov     rsi,x_ptr
    mov     [rdi-8],rax

; add the diagonal squares src[i]^2
    mov     rcx,x_len
    mov     rax,[rsi]
    mul     rax
    lea     rsi,[rsi+rcx*8]
    neg     rcx
    lea     r11,[rcx+rcx]
    mov     [rdi+r11*8],rax
    inc     rcx
.1:
    lea     r11,[rcx+rcx]
    mov     rax,[rsi+rcx*8]
    mov     rbx,rdx
    mul     rax
    add     [rdi+r11*8-8],rbx
    adc     [rdi+r11*8],rax
    adc     rdx,dword 0
    inc     rcx
    jnz     .1
    add     [rdi-8],rdx

sqr_exit:
    pop     r13
    pop     r12
    pop     rbp
    pop     rdi
    pop     rsi
    pop     rbx
    ret

    end
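; For reference, the overall scheme used above is the usual basecase
; squaring: accumulate the cross products src[i]*src[j] (i < j) into the
; upper part of dst, double them with a one-bit left shift, then add the
; diagonal squares src[i]^2.  The C sketch below only illustrates that
; scheme; it is not the GMP routine, it assumes 64-bit limbs and a
; compiler with unsigned __int128, and the function name is illustrative.
;
;   #include <stdint.h>
;
;   typedef uint64_t limb_t;
;
;   /* dst has room for 2*n limbs, n >= 1, dst and src do not overlap */
;   static void sqr_basecase_sketch(limb_t *dst, const limb_t *src, int n)
;   {
;       int i, j;
;       for (i = 0; i < 2 * n; i++)
;           dst[i] = 0;
;
;       /* cross products src[i]*src[j], i < j, into dst[1 .. 2n-2] */
;       for (i = 0; i < n; i++) {
;           unsigned __int128 cy = 0;
;           for (j = i + 1; j < n; j++) {
;               unsigned __int128 t = (unsigned __int128)src[i] * src[j]
;                                     + dst[i + j] + cy;
;               dst[i + j] = (limb_t)t;
;               cy = t >> 64;
;           }
;           dst[i + n] = (limb_t)cy;
;       }
;
;       /* double the cross products: shift the area left one bit */
;       limb_t bit = 0;
;       for (i = 1; i < 2 * n - 1; i++) {
;           limb_t hi = dst[i] >> 63;
;           dst[i] = (dst[i] << 1) | bit;
;           bit = hi;
;       }
;       dst[2 * n - 1] = bit;
;
;       /* add the diagonal squares src[i]^2 */
;       unsigned __int128 t = 0;
;       for (i = 0; i < n; i++) {
;           t += (unsigned __int128)src[i] * src[i] + dst[2 * i];
;           dst[2 * i] = (limb_t)t;
;           t >>= 64;
;           t += dst[2 * i + 1];
;           dst[2 * i + 1] = (limb_t)t;
;           t >>= 64;
;       }
;   }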