; mpir/mpn/x86w/p6/sqr_basecase.old.asm
; (521 lines, 12 KiB, NASM)

; Intel P6 mpn_sqr_basecase -- square an mpn number.
;
; Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
;
; This file is part of the GNU MP Library.
;
; The GNU MP Library is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public License as
; published by the Free Software Foundation; either version 2.1 of the
; License, or (at your option) any later version.
;
; The GNU MP Library is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public
; License along with the GNU MP Library; see the file COPYING.LIB. If
; not, write to the Free Software Foundation, Inc., 59 Temple Place -
; Suite 330, Boston, MA 02111-1307, USA.
;
; P6: approx 4.0 cycles per cross product, or 7.75 cycles per triangular
; product (measured on the speed difference between 20 and 40 limbs,
; which is the Karatsuba recursing range).
;
; These are the same as in mpn/x86/k6/sqr_basecase.asm, see that file for
; a description. The only difference here is that UNROLL_COUNT can go up
; to 64 (not 63) making SQR_KARATSUBA_THRESHOLD_MAX 67.
;
; void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
;
; The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
; lot of function call overheads are avoided, especially when the given size
; is small.
;
; The code size might look a bit excessive, but not all of it is executed so
; it won't all get into the code cache. The 1x1, 2x2 and 3x3 special cases
; clearly apply only to those sizes; mid sizes like 10x10 only need part of
; the unrolled addmul; and big sizes like 40x40 that do use the full
; unrolling will at least be making good use of it, because 40x40 will take
; something like 7000 cycles.
%include "..\\x86i.inc"

; Tuning parameters.  Sizes below SQR_KARATSUBA_THRESHOLD limbs use this
; basecase; larger sizes are expected to recurse via Karatsuba.
; UNROLL_COUNT is the number of 15-byte addmul chunks emitted below and
; bounds the largest size the unrolled code can handle (threshold-3,
; since the two bottom-right corner addmuls are done separately).
%define SQR_KARATSUBA_THRESHOLD 10
%define SQR_KARATSUBA_THRESHOLD_MAX 67
%ifdef SQR_KARATSUBA_THRESHOLD_OVERRIDE
%define SQR_KARATSUBA_THRESHOLD SQR_KARATSUBA_THRESHOLD_OVERRIDE
%endif
%define UNROLL_COUNT SQR_KARATSUBA_THRESHOLD-3

; Incoming stack arguments, as offsets from esp at entry (FR_def is the
; frame-offset helper from x86i.inc).
FR_def PARAM_SIZE,12
FR_def PARAM_SRC,8
FR_def PARAM_DST,4
section .text
global ___gmpn_sqr_basecase
align 32

; void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size)
;
; Entry dispatch: size==2 goes to Ltwo_limbs, size>2 to Lthree_or_more,
; and size==1 falls straight through to the inline 1x1 case below.
___gmpn_sqr_basecase:
mov edx,[PARAM_SIZE]
mov eax,[PARAM_SRC]
cmp edx,2
mov ecx,[PARAM_DST]
je Ltwo_limbs
mov eax,[eax] ; src[0], used by both the 1-limb and 3-limb cases
ja Lthree_or_more
; one limb only
; eax src limb
; ebx
; ecx dst
; edx
mul eax ; edx:eax = src[0]^2
mov [ecx],eax ; dst[0]
mov [4+ecx],edx ; dst[1]
ret
; eax src
; ebx
; ecx dst
; edx
; Callee-saved register save slots, below esp after "sub esp,frame".
FR_def SAVE_ESI, -4
FR_def SAVE_EBX, -8
FR_def SAVE_EDI,-12
FR_def SAVE_EBP,-16
%define STACK_SPACE 16
%define frame 16

; 2x2 square: dst[0..3] = src[0..1]^2.
; Computes the three products src[0]^2, src[1]^2 and src[0]*src[1],
; then adds the cross product in twice (once per "add/adc" pass below)
; instead of doubling it, which avoids a separate shift.
Ltwo_limbs:
sub esp,frame
mov [SAVE_ESI],esi
mov esi,eax ; esi = src
mov eax,[eax]
mul eax ; src[0]^2
mov [ecx],eax ; dst[0]
mov eax,[4+esi]
mov [SAVE_EBX],ebx
mov ebx,edx ; dst[1]
mul eax ; src[1]^2
mov [SAVE_EDI],edi
mov edi,eax ; dst[2]
mov eax,[esi]
mov [SAVE_EBP],ebp
mov ebp,edx ; dst[3]
mul dword [4+esi] ; src[0]*src[1]
; First pass: add the cross product into the diagonal limbs.
add ebx,eax
mov esi,[SAVE_ESI]
adc edi,edx
adc ebp,0
; Second pass: add the cross product again (equivalent to 2*src[0]*src[1]).
add eax,ebx
mov ebx,[SAVE_EBX]
adc edx,edi
mov edi,[SAVE_EDI]
adc ebp,0 ; cannot carry out of the top limb
mov [4+ecx],eax
mov [12+ecx],ebp
mov ebp,[SAVE_EBP]
mov [8+ecx],edx
add esp,frame
ret
; eax src low limb
; ebx
; ecx dst
; edx size
%define frame 0

; size >= 3.  esi was not yet saved, so push it here; sizes >= 4 go to
; the general code, size == 3 falls through to the straight-line 3x3 case.
Lthree_or_more:
FR_push esi
cmp edx,4
mov esi,[PARAM_SRC]
jae Lfour_or_more
; three limbs
;
; Strategy: store the three squares src[i]^2 at dst[2i..2i+1], collect
; the three cross products src[0]*src[1], src[0]*src[2], src[1]*src[2]
; in ebx/edi/ebp/edx, double them with an adc chain, then add them into
; dst[1..5].
;
; eax src low limb
; ebx
; ecx dst
; edx
; esi src
; edi
; ebp
%undef SAVE_EBP
%undef SAVE_EDI
%undef SAVE_EBX
FR_push ebp,SAVE_EBP
FR_push edi,SAVE_EDI
mul eax ; src[0] ^ 2
mov [ecx],eax
mov [4+ecx],edx
mov eax,[4+esi]
xor ebp,ebp
mul eax ; src[1] ^ 2
mov [8+ecx],eax
mov [12+ecx],edx
mov eax,[8+esi]
FR_push ebx,SAVE_EBX
mul eax ; src[2] ^ 2
mov [16+ecx],eax
mov [20+ecx],edx
mov eax,[esi]
mul dword [4+esi] ; src[0] * src[1]
mov ebx,eax
mov edi,edx
mov eax,[esi]
mul dword [8+esi] ; src[0] * src[2]
add edi,eax
mov ebp,edx
adc ebp,0
mov eax,[4+esi]
mul dword [8+esi] ; src[1] * src[2]
xor esi,esi ; esi no longer needed as src; becomes dst[5] accumulator
add ebp,eax
; eax
; ebx dst[1]
; ecx dst
; edx dst[4]
; esi zero,will be dst[5]
; edi dst[2]
; ebp dst[3]
adc edx,0
; Double the cross-product limbs ebx:edi:ebp:edx, carry into esi.
add ebx,ebx
adc edi,edi
adc ebp,ebp
adc edx,edx
mov eax,[4+ecx]
adc esi,0
; Add the doubled cross products into the squares already at dst[1..5].
add eax,ebx
mov [4+ecx],eax
mov eax,[8+ecx]
adc eax,edi
mov ebx,[12+ecx]
adc ebx,ebp
mov edi,[16+ecx]
mov [8+ecx],eax
mov ebp,[SAVE_EBP]
mov [12+ecx],ebx
mov ebx,[SAVE_EBX]
adc edi,edx
mov eax,[20+ecx]
mov [16+ecx],edi
mov edi,[SAVE_EDI]
adc eax,esi ; no carry out of this
mov esi,[SAVE_ESI]
mov [20+ecx],eax
add esp,frame
ret
; eax src low limb
; ebx
; ecx
; edx size
; esi src
; edi
; ebp
; Stack locals for the unrolled addmul: the outer-loop counter and the
; computed jump address into the unrolled code.
%define VAR_COUNTER esp+frame-20
%define VAR_JMP esp+frame-24
%define STACK_SPACE 24
%define frame 4
; First multiply src[0]*src[1..size-1] and store at dst[1..size].
Lfour_or_more:
sub esp,STACK_SPACE-frame
%define frame STACK_SPACE
mov ecx,1
mov [SAVE_EDI],edi
mov edi,[PARAM_DST]
mov [SAVE_EBX],ebx
sub ecx,edx ; -(size-1)
mov [SAVE_EBP],ebp
mov ebx,0 ; initial carry
lea esi,[esi+edx*4] ; &src[size]
mov ebp,eax ; multiplier
lea edi,[-4+edi+edx*4] ; &dst[size-1]
; This loop runs at just over 6 c/l.
;
; Plain mul_1 loop: dst[1..size-1] = src[1..size-1] * src[0], with the
; final high limb stored to dst[size] afterwards.  ecx counts up through
; negative values so the loop ends on ecx==0.
;
; eax scratch
; ebx carry
; ecx counter,limbs,negative,-(size-1) to -1
; edx scratch
; esi &src[size]
; edi &dst[size-1]
; ebp multiplier
Lmul_1:
mov eax,ebp
mul dword [esi+ecx*4]
add eax,ebx
mov ebx,0
adc ebx,edx ; ebx = carry into next limb
mov [4+edi+ecx*4],eax
inc ecx
jnz Lmul_1
mov [4+edi],ebx ; dst[size] = final high limb
; Addmul src[n]*src[n+1..size-1] at dst[2*n-1...],for each n=1..size-2.
;
; The last two addmuls,which are the bottom right corner of the product
; triangle,are left to the end. These are src[size-3]*src[size-2,size-1]
; and src[size-2]*src[size-1]. If size is 4 then it's only these corner
; cases that need to be done.
;
; The unrolled code is the same as mpn_addmul_1(),see that routine for some
; comments.
;
; VAR_COUNTER is the outer loop,running from -(size-4) to -1,inclusive.
;
; VAR_JMP is the computed jump into the unrolled code,stepped by one code
; chunk each outer loop.
;
; This is also hard-coded in the address calculation below.
;
; With &src[size] and &dst[size-1] pointers,the displacements in the
; unrolled code fit in a byte for UNROLL_COUNT values up to 32,but above
; that an offset must be added to them.
;
; eax
; ebx carry
; ecx
; edx
; esi &src[size]
; edi &dst[size-1]
; ebp
; Each unrolled chunk below assembles to exactly 15 bytes; the computed
; jump below depends on this, so the chunk code must not be changed
; without re-measuring.
%define CODE_BYTES_PER_LIMB 15
%if UNROLL_COUNT > 32
%define OFFSET UNROLL_COUNT-32
%else
%define OFFSET 0
%endif
mov ecx,[PARAM_SIZE]
sub ecx,4
jz Lcorner ; size==4: only the corner products remain
mov edx,ecx ; edx = size-4 = number of outer iterations
; Compute the initial jump target: skip (UNROLL_COUNT - (size-3)) chunks.
; -(size-4)*16 + (size-4) = -(size-4)*15 = -(size-4)*CODE_BYTES_PER_LIMB.
neg ecx
shl ecx,4
%if OFFSET != 0
sub esi,OFFSET
%endif
%ifdef PIC
; PIC form: get the current address with call/pop and add displacements.
call Lhere
Lhere:
add ecx,[esp]
add ecx,Lunroll_inner_end-Lhere-2*CODE_BYTES_PER_LIMB
add ecx,edx
add esp,4
%else
lea ecx,[Lunroll_inner_end-2*CODE_BYTES_PER_LIMB+ecx+edx]
%endif
neg edx ; outer counter, -(size-4) up to -1
%if OFFSET != 0
sub edi,OFFSET
%endif
; The calculated jump mustn't be before the start of the available
; code. This is the limit that UNROLL_COUNT puts on the src operand
; size,but checked here using the jump address directly.
%ifdef ASSERT
mov eax,Lunroll_inner_start
cmp ecx,eax
jae Lunroll_outer_top
jmp exit
%endif
; eax
; ebx high limb to store
; ecx VAR_JMP
; edx VAR_COUNTER,limbs,negative
; esi &src[size],constant
; edi dst ptr,second highest limb of last addmul
; ebp
align 16
Lunroll_outer_top:
mov ebp,[-12+OFFSET+esi+edx*4] ; multiplier
mov [VAR_COUNTER],edx
mov eax,[-8+OFFSET+esi+edx*4] ; first limb of multiplicand
mul ebp
; The unrolled chunks alternate the roles of ecx (low carry) and ebx
; (high carry).  Which role each register starts in depends on whether
; an odd or even number of chunks will run, detected from the low bit
; of the jump address (the chunk size, 15, is odd).  cmovX swaps the
; initial assignment when needed.
%if UNROLL_COUNT % 2 == 1
%define cmovX cmovz
%else
%define cmovX cmovnz
%endif
test cl,1
mov ebx,edx ; high carry
lea edi,[4+edi]
mov edx,ecx ; jump
mov ecx,eax ; low carry
lea edx,[CODE_BYTES_PER_LIMB+edx] ; step one chunk later for next pass
cmovX ecx,ebx
cmovX ebx,eax
mov [VAR_JMP],edx
jmp edx
; Must be on an even address here so the low bit of the jump address
; will indicate which way around ecx/ebx should start.
; eax scratch
; ebx carry high
; ecx carry low
; edx scratch
; esi src pointer
; edi dst pointer
; ebp multiplier
;
; 15 code bytes each limb
; ecx/ebx reversed on each chunk
align 2
Lunroll_inner_start:
; Emit UNROLL_COUNT addmul chunks, highest displacement first.  Each
; chunk is the mpn_addmul_1 inner step dst[j] += src[j]*ebp + carry,
; with ecx/ebx alternating as the carry register so no extra moves are
; needed between chunks.  The "byte" keyword forces 8-bit displacements
; so each chunk is exactly CODE_BYTES_PER_LIMB (15) bytes.
%assign i UNROLL_COUNT
%rep UNROLL_COUNT
%assign disp OFFSET-4*i
%if i % 2 == 0
mov eax,[byte disp+esi]
mul ebp
add [byte disp+edi],ebx
adc ecx,eax
mov ebx,edx
adc ebx,0
%else
; this one comes out last
mov eax,[byte disp+esi]
mul ebp
add [byte disp+edi],ecx
adc ebx,eax
mov ecx,edx
adc ecx,0
%endif
%assign i i-1
%endrep
Lunroll_inner_end:
; Store the final carry pair of this addmul row, then advance the outer
; loop: restore the counter and jump address, and go round again until
; the counter reaches zero.
add [OFFSET+edi],ebx
mov edx,[VAR_COUNTER]
adc ecx,0
mov [OFFSET+4+edi],ecx
mov ecx,[VAR_JMP]
inc edx
jnz Lunroll_outer_top
%if OFFSET != 0
add esi,OFFSET
add edi,OFFSET
%endif
; eax
; ebx
; ecx
; edx
; esi &src[size]
; edi &dst[2*size-5]
; ebp
align 16
; Bottom-right corner of the product triangle: the three remaining
; cross products src[size-3]*src[size-2], src[size-3]*src[size-1] and
; src[size-2]*src[size-1], added into dst[2*size-5..2*size-2].
; esi is &src[size], so [-12+esi] = src[size-3] etc.
Lcorner:
mov eax,[-12+esi]
mul dword [-8+esi] ; src[size-3]*src[size-2]
add [edi],eax
mov eax,[-12+esi]
mov ebx,0
adc ebx,edx ; carry into next limb
mul dword [-4+esi] ; src[size-3]*src[size-1]
add ebx,eax
mov eax,[-8+esi]
adc edx,0
add [4+edi],ebx
mov ebx,0
adc ebx,edx
mul dword [-4+esi] ; src[size-2]*src[size-1]
mov ecx,[PARAM_SIZE]
add eax,ebx
adc edx,0
mov [8+edi],eax ; dst[2*size-3]
mov [12+edi],edx ; dst[2*size-2]
mov edi,[PARAM_DST]
; Left shift of dst[1..2*size-2],the bit shifted out becomes dst[2*size-1].
; (The cross-product triangle computed so far must be doubled; the final
; diagonal of squares is then added in below.)
sub ecx,1 ; size-1
xor eax,eax ; ready for final adcl,and clear carry
mov edx,ecx
mov esi,[PARAM_SRC]
; eax
; ebx
; ecx counter,size-1 to 1
; edx size-1 (for later use)
; esi src (for later use)
; edi dst,incrementing
; ebp
; Two limbs shifted per iteration; the carry flag chains the shifted-out
; bit between rcl's and across iterations (dec/lea leave CF unchanged).
Llshift:
rcl dword [4+edi],1
rcl dword [8+edi],1
lea edi,[8+edi]
dec ecx
jnz Llshift
adc eax,eax ; pick up the last bit shifted out
mov [4+edi],eax ; dst most significant limb
mov eax,[esi] ; src[0]
lea esi,[4+esi+edx*4] ; &src[size]
sub ecx,edx ; -(size-1)
; Now add in the squares on the diagonal,src[0]^2,src[1]^2,...,
; src[size-1]^2. dst[0] hasn't yet been set at all yet,and just gets the
; low limb of src[0]^2.
mul eax
mov [edi+ecx*8],eax ; dst[0]
; eax scratch
; ebx scratch
; ecx counter,negative
; edx carry
; esi &src[size]
; edi dst[2*size-2]
; ebp
; Each iteration adds the high limb of the previous square (ebx) and the
; low limb of the current square (eax) into two consecutive dst limbs.
Ldiag:
mov eax,[esi+ecx*4]
mov ebx,edx ; high limb of previous square
mul eax
add [4+edi+ecx*8],ebx
adc [8+edi+ecx*8],eax
adc edx,0 ; propagate carry into the next square's high limb
inc ecx
jnz Ldiag
; Restore callee-saved registers and return.
mov esi,[SAVE_ESI]
mov ebx,[SAVE_EBX]
add [4+edi],edx ; dst most significant limb
mov edi,[SAVE_EDI]
mov ebp,[SAVE_EBP]
add esp,frame
ret
end