; mpir/mpn/x86_64i/amd64/sqr_basecase.asm
;
; Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
;
; This file is part of the GNU MP Library.
;
; The GNU MP Library is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public License as
; published by the Free Software Foundation; either version 2.1 of the
; License, or (at your option) any later version.
;
; The GNU MP Library is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public
; License along with the GNU MP Library; see the file COPYING.LIB. If
; not, write to the Free Software Foundation, Inc., 59 Temple Place -
; Suite 330, Boston, MA 02111-1307, USA.
;
; Adapted for AMD64 by Brian Gladman, using the Microsoft VC++ v8 64-bit
; compiler and the YASM assembler.
; AMD64 mpn_sqr_basecase -- square an mpn number.
;
; Calling interface:
;
;   void mpn_sqr_basecase(
;       mp_ptr dst,        rcx
;       mp_srcptr src,     rdx
;       mp_size_t size     r8
;   )
;
; This is an SEH Frame Function with a leaf prologue
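;
; dst must provide room for 2 * size limbs and must not overlap src (the
; usual mpn convention).  Sizes 1, 2 and 3 are handled as special cases.
; For larger sizes the off-diagonal products src[i] * src[j] (i < j) are
; accumulated into dst first, that region is then doubled by shifting it
; left one bit, and finally the diagonal squares src[i]^2 are added in.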
%define _SEH_
%define DWORD_OFFSETS
%define UNROLL_COUNT 40
%ifndef DWORD_OFFSETS
%if UNROLL_COUNT > 31
%error Unroll count is too large for byte offsets
%endif
%endif
%if UNROLL_COUNT > 15
%define off (UNROLL_COUNT - 15) * 8
%else
%define off 0
%endif
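; 'off' biases rsi and rdi while the unrolled loop runs so that the per-limb
; displacements (off - 8 * i) stay within range of a byte offset when
; DWORD_OFFSETS is not defined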
%define r_ptr r10
%define x_ptr r9
%define x_len r8
%define v_ctr r12
%define v_jmp r13
bits 64
section .text
global __gmpn_sqr_basecase
%ifdef DLL
export __gmpn_sqr_basecase
%endif
__gmpn_sqr_basecase:
movsxd x_len,r8d
cmp x_len,2
je sqr_2
ja sqr_3_plus
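; x_len == 1: a single limb square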
mov rax,[rdx]
mul rax
mov [rcx+8],rdx
mov [rcx],rax
ret
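; x_len == 2: two limb squares plus the doubled cross product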
sqr_2:
mov r_ptr,rcx
mov x_ptr,rdx
mov r8,[x_ptr]
mov r9,[x_ptr+8]
mov rax,r8
mul r8
mov [r_ptr],rax
mov [r_ptr+8],rdx
mov rax,r9
mul r9
mov [r_ptr+16],rax
mov [r_ptr+24],rdx
xor rcx,rcx
mov rax,r8
mul r9
add rax,rax
adc rdx,rdx
adc rcx,rcx
add [r_ptr+8],rax
adc [r_ptr+16],rdx
adc [r_ptr+24],rcx
ret
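; x_len >= 3: save the non-volatile registers, with SEH unwind data when
; _SEH_ is defined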
%ifdef _SEH_
PROC_FRAME sqr_3_plus
push_reg rbx
push_reg rsi
push_reg rdi
push_reg rbp
push_reg r12
push_reg r13
alloc_stack 8 ; align to 16 byte boundary
END_PROLOGUE
%else
sqr_3_plus:
push rbx
push rsi
push rdi
push rbp
push r12
push r13
%endif
mov r_ptr,rcx
mov x_ptr,rdx
cmp x_len,4
jae sqr_4_plus
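; x_len == 3: each square src[k]^2 goes into dst[2k] and dst[2k+1], then the
; three cross products are formed, doubled and added into dst[1...5]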
mov rax,[x_ptr]
mul rax
mov [r_ptr],rax
mov rax,[x_ptr+8]
mov [r_ptr+8],rdx
mul rax
mov [r_ptr+16],rax
mov rax,[x_ptr+16]
mov [r_ptr+24],rdx
mul rax
mov [r_ptr+32],rax
mov rax,[x_ptr]
mov [r_ptr+40],rdx
mul qword [x_ptr+8]
mov rsi,rax
mov rax,[x_ptr]
mov rdi,rdx
mul qword [x_ptr+16]
add rdi,rax
mov rbp,dword 0
mov rax,[x_ptr+8]
adc rbp,rdx
mul qword [x_ptr+16]
xor x_ptr,x_ptr
add rbp,rax
adc rdx,dword 0
adc rdx,dword 0
add rsi,rsi
adc rdi,rdi
mov rax,[r_ptr+8]
adc rbp,rbp
adc rdx,rdx
adc x_ptr,dword 0
add rsi,rax
mov rax,[r_ptr+16]
adc rdi,rax
mov rax,[r_ptr+24]
mov [r_ptr+8],rsi
adc rbp,rax
mov rax,[r_ptr+32]
mov [r_ptr+16],rdi
adc rdx,rax
mov rax,[r_ptr+40]
mov [r_ptr+24],rbp
adc rax,x_ptr
mov [r_ptr+32],rdx
mov [r_ptr+40],rax
jmp sqr_exit
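; x_len >= 4: general case
; first row: dst[1...x_len] = src[0] * src[1...x_len-1]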
sqr_4_plus:
mov rcx,x_len
lea rdi,[r_ptr+rcx*8]
lea rsi,[x_ptr+rcx*8]
mov rbp,[x_ptr]
mov rbx,dword 0
dec rcx
neg rcx
.0: mov rax,[rsi+rcx*8]
mul rbp
add rax,rbx
mov [rdi+rcx*8],rax
mov rbx,dword 0
adc rbx,rdx
inc rcx
jnz .0
mov rcx,x_len
mov [rdi],rbx
sub rcx,4
jz L_corner
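; remaining rows: each pass of the outer loop at .2 adds one row of cross
; products src[v] * src[v+1...] into dst with an unrolled addmul; the entry
; point into the unrolled code at .3 is reached by a computed jump, each
; limb's code being padded with a nop to exactly CODE_BYTES_PER_LIMB bytes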
neg rcx
%if off != 0
sub rdi,off
sub rsi,off
%endif
mov rdx,rcx
%ifdef DWORD_OFFSETS
%define CODE_BYTES_PER_LIMB 31 ; must be odd
%define dsiz dword
shl rcx,5
sub rcx,rdx
lea v_jmp,[rel .3]
lea rcx,[rcx+(UNROLL_COUNT - 2) * CODE_BYTES_PER_LIMB]
%else
%define CODE_BYTES_PER_LIMB 25 ; must be odd
%define dsiz byte
shl rcx,3
lea rcx,[rcx+rcx*2]
lea v_jmp,[rel .3]
lea rcx,[rcx+rdx+(UNROLL_COUNT - 2) * CODE_BYTES_PER_LIMB]
%endif
lea rcx,[rcx+v_jmp]
.2: lea v_jmp,[rcx+CODE_BYTES_PER_LIMB]
mov rbp,[rsi+rdx*8-24+off]
mov rax,[rsi+rdx*8-16+off]
mov v_ctr,rdx
mul rbp
test cl,1
mov rbx,rdx
mov rcx,rax
%if (UNROLL_COUNT % 2)
cmovnz rbx,rax
cmovnz rcx,rdx
%else
cmovz rbx,rax
cmovz rcx,rdx
%endif
xor rdx,rdx
lea rdi,[rdi+8]
jmp v_jmp
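; CODE_BYTES_PER_LIMB is odd, so successive entry points into .3 alternate
; in address parity; test cl,1 above uses that parity to choose which of
; rbx/rcx holds the low and which the high word of the first product, and
; the align 2 below keeps .3 itself at an even address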
align 2
.3:
%assign i UNROLL_COUNT
%rep UNROLL_COUNT
%define disp_src off - 8 * i
%ifndef DWORD_OFFSETS
%if disp_src < -120 || disp_src >= 128
%error source displacement too large
%endif
%endif
%if (i % 2) = 0 ; CODE_BYTES_PER_LIMB bytes of code per limb
nop
mov rax,[dsiz rsi + disp_src]
adc rbx,rdx
mul rbp
add [dsiz rdi + disp_src - 8],rcx
mov rcx,dword 0
adc rbx,rax
%else
nop
mov rax,[dsiz rsi + disp_src]
adc rcx,rdx
mul rbp
add [dsiz rdi + disp_src - 8],rbx
%if i != 1
mov rbx,dword 0
%endif
adc rcx,rax
%endif
%assign i i - 1
%endrep
adc rdx,dword 0
add [rdi-8+off],rcx
mov rcx,v_jmp
adc rdx,dword 0
mov [rdi+off],rdx
mov rdx,v_ctr
inc rdx
jnz .2
%if off != 0
add rsi,off
add rdi,off
%endif
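; corner: the cross products of the last three source limbs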
L_corner:
mov rbp,[rsi-24]
mov rax,[rsi-16]
mov rcx,rax
mul rbp
add [rdi-8],rax
mov rax,[rsi-8]
adc rdx,dword 0
mov rbx,rdx
mov rsi,rax
mul rbp
add rax,rbx
adc rdx,dword 0
add [rdi],rax
mov rax,rsi
adc rdx,dword 0
mov rbx,rdx
mul rcx
add rax,rbx
mov [rdi+8],rax
adc rdx,dword 0
mov [rdi+16],rdx
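; the off-diagonal products are now doubled by shifting dst[1...2*x_len-2]
; left one bit, two limbs per pass, with the final carry going into the
; top limb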
mov rax,x_len ; start of shift
mov rdi,r_ptr
xor rcx,rcx
lea r11,[rax+rax]
lea rdi,[rdi+r11*8]
not rax
lea rax,[rax+2]
.0: lea r11,[rax+rax]
rcl qword [rdi+r11*8-8],1
rcl qword [rdi+r11*8],1
inc rax
jnz .0
setc al
mov rsi,x_ptr
mov [rdi-8],rax
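; finally the diagonal squares src[i]^2 are added into dst[2i], dst[2i+1]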
mov rcx,x_len
mov rax,[rsi]
mul rax
lea rsi,[rsi+rcx*8]
neg rcx
lea r11,[rcx+rcx]
mov [rdi+r11*8],rax
inc rcx
.1: lea r11,[rcx+rcx]
mov rax,[rsi+rcx*8]
mov rbx,rdx
mul rax
add [rdi+r11*8-8],rbx
adc [rdi+r11*8],rax
adc rdx,dword 0
inc rcx
jnz .1
add [rdi-8],rdx
sqr_exit:
%ifdef _SEH_
add rsp, 8
pop r13
pop r12
pop rbp
pop rdi
pop rsi
pop rbx
ret
ENDPROC_FRAME
%else
pop r13
pop r12
pop rbp
pop rdi
pop rsi
pop rbx
ret
%endif
end