mpir/mpn/x86w/p6/sqr_basecase.asm
brgladman 9c467c6415
2008-07-04 10:39:15 +00:00

489 lines
11 KiB
NASM

; Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
;
; This file is part of the GNU MP Library.
;
; The GNU MP Library is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public License as
; published by the Free Software Foundation; either version 2.1 of the
; License, or (at your option) any later version.
;
; The GNU MP Library is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public
; License along with the GNU MP Library; see the file COPYING.LIB. If
; not, write to the Free Software Foundation, Inc., 59 Temple Place -
; Suite 330, Boston, MA 02111-1307, USA.
;
; Translation of AT&T syntax code by Brian Gladman
; Project-local include; its contents are not visible in this file.
%include "..\x86i.inc"
; Make the entry label visible to the linker (and list it as an export
; when DLL is defined).
global ___gmpn_sqr_basecase
%ifdef DLL
export ___gmpn_sqr_basecase
%endif
; Number of addmul chunks emitted by the %rep in the unrolled inner loop.
; NOTE(review): the first outer pass jumps size-3 chunks before
; Lunroll_inner_end, so the operand size must not exceed UNROLL_COUNT+3;
; nothing outside an ASSERT build checks this -- confirm against callers.
%define UNROLL_COUNT 64 ; seems to be the maximum required (unverified)
; Stack arguments, read as [PARAM_xxx]. These are single-line macros, so
; `frame` is substituted where they are used, not where they are defined:
; `frame` must always equal the number of bytes subtracted from esp since
; entry. With frame = 0, [esp+4] = dst, [esp+8] = src, [esp+12] = size.
%define PARAM_SIZE esp+frame+12
%define PARAM_SRC esp+frame+8
%define PARAM_DST esp+frame+4
%define frame 0
section .text
align 32
; ___gmpn_sqr_basecase -- square the limb vector at src, writing the
; double-length result to dst.
;
; Stack arguments (frame = 0 on entry): PARAM_DST, PARAM_SRC, PARAM_SIZE.
; Dispatch on size:
;   size == 2 -> Ltwo_limbs      (eax = src, ecx = dst)
;   size  > 2 -> Lthree_or_more  (eax = src[0], ecx = dst, edx = size)
;   otherwise -> single-limb case handled inline below
___gmpn_sqr_basecase:
        mov     ecx, [PARAM_DST]        ; ecx = dst
        mov     eax, [PARAM_SRC]        ; eax = src
        mov     edx, [PARAM_SIZE]       ; edx = size in limbs
        cmp     edx, 2
        je      Ltwo_limbs
        mov     eax, [eax]              ; eax = src[0]; mov keeps the cmp flags
        ja      Lthree_or_more

; One limb only: dst[1]:dst[0] = src[0]^2.
;   eax  src[0]
;   ecx  dst
        mul     eax                     ; edx:eax = src[0]^2
        mov     [4+ecx], edx            ; high limb
        mov     [ecx], eax              ; low limb
        ret
; Two limbs: dst[0..3] = (src[1]:src[0])^2.
;
; On entry:
;   eax  src
;   ecx  dst
;
; ebx, esi, edi and ebp are parked in a 16-byte frame carved out below the
; return address. SAVE_* (like PARAM_*) are relative to esp+frame, so they
; are valid once esp has been lowered by `frame` bytes.
%define SAVE_ESI esp+frame-4
%define SAVE_EBX esp+frame-8
%define SAVE_EDI esp+frame-12
%define SAVE_EBP esp+frame-16
%define frame 16
Ltwo_limbs:
        sub     esp, frame
        mov     [SAVE_EBP], ebp
        mov     [SAVE_EDI], edi
        mov     [SAVE_EBX], ebx
        mov     [SAVE_ESI], esi

        mov     esi, eax                ; esi = src
        mov     eax, [esi]
        mul     eax                     ; edx:eax = src[0]^2
        mov     [ecx], eax              ; dst[0] is final
        mov     ebx, edx                ; ebx = partial dst[1]

        mov     eax, [4+esi]
        mul     eax                     ; edx:eax = src[1]^2
        mov     edi, eax                ; edi = partial dst[2]
        mov     ebp, edx                ; ebp = partial dst[3]

        mov     eax, [esi]
        mul     dword [4+esi]           ; edx:eax = src[0]*src[1]

        ; The cross product is needed twice: add it once into ebp:edi:ebx,
        ; then add that running sum back onto edx:eax.
        add     ebx, eax
        adc     edi, edx
        adc     ebp, 0
        add     eax, ebx                ; eax = dst[1]
        adc     edx, edi                ; edx = dst[2]
        adc     ebp, 0                  ; ebp = dst[3]

        mov     [4+ecx], eax
        mov     [8+ecx], edx
        mov     [12+ecx], ebp

        mov     ebp, [SAVE_EBP]
        mov     edi, [SAVE_EDI]
        mov     ebx, [SAVE_EBX]
        mov     esi, [SAVE_ESI]
        add     esp, frame
        ret
; Three or more limbs.
;
; On entry:
;   eax  src[0]
;   ecx  dst
;   edx  size
;
; Opens the 16-byte save frame, saves esi, loads esi = src, then hands
; sizes >= 4 to Lfour_or_more. Size 3 is handled inline.
Lthree_or_more:
        sub     esp, frame
        mov     [SAVE_ESI], esi
        mov     esi, [PARAM_SRC]        ; esi = src
        cmp     edx, 4
        jae     Lfour_or_more

; Three limbs: dst[0..5] = src[0..2]^2.
;   eax  src[0]
;   ecx  dst
;   esi  src
        mov     [SAVE_EBX], ebx
        mov     [SAVE_EDI], edi
        mov     [SAVE_EBP], ebp

        ; Squares on the diagonal go straight into dst[0..5].
        mul     eax                     ; src[0]^2
        mov     [ecx], eax
        mov     [4+ecx], edx
        mov     eax, [4+esi]
        xor     ebp, ebp                ; ebp is reloaded from edx before its first use
        mul     eax                     ; src[1]^2
        mov     [8+ecx], eax
        mov     [12+ecx], edx
        mov     eax, [8+esi]
        mul     eax                     ; src[2]^2
        mov     [16+ecx], eax
        mov     [20+ecx], edx

        ; Off-diagonal products, summed into edx:ebp:edi:ebx.
        mov     eax, [esi]
        mul     dword [4+esi]           ; src[0]*src[1]
        mov     ebx, eax
        mov     edi, edx
        mov     eax, [esi]
        mul     dword [8+esi]           ; src[0]*src[2]
        add     edi, eax
        mov     ebp, edx
        adc     ebp, 0
        mov     eax, [4+esi]
        mul     dword [8+esi]           ; src[1]*src[2]
        xor     esi, esi                ; esi = 0; must precede the add (xor clears CF)
        add     ebp, eax
        adc     edx, 0

        ; Double the off-diagonal sum:
        ;   ebx -> dst[1], edi -> dst[2], ebp -> dst[3], edx -> dst[4],
        ;   esi collects the bit shifted out and goes to dst[5].
        add     ebx, ebx
        adc     edi, edi
        adc     ebp, ebp
        adc     edx, edx
        adc     esi, 0

        ; Add the doubled sum onto the squares already held in dst[1..5].
        ; Only mov appears between the add/adc steps, so CF is carried through.
        mov     eax, [4+ecx]
        add     eax, ebx
        mov     [4+ecx], eax
        mov     eax, [8+ecx]
        adc     eax, edi
        mov     [8+ecx], eax
        mov     eax, [12+ecx]
        adc     eax, ebp
        mov     [12+ecx], eax
        mov     eax, [16+ecx]
        adc     eax, edx
        mov     [16+ecx], eax
        mov     eax, [20+ecx]
        adc     eax, esi                ; no carry out of this
        mov     [20+ecx], eax

        mov     ebp, [SAVE_EBP]
        mov     edi, [SAVE_EDI]
        mov     ebx, [SAVE_EBX]
        mov     esi, [SAVE_ESI]
        add     esp, frame
        ret
; Four or more limbs.
;
; On entry:
;   eax  src[0]
;   edx  size
;   esi  src
;   esp  already lowered by 16, with esi saved in SAVE_ESI
;
; First step: multiply src[0] by src[1..size-1] and store the result at
; dst[1..size].
;
; VAR_COUNTER and VAR_JMP are two extra stack slots below the register
; saves; the frame therefore grows from 16 to STACK_SPACE bytes.
%define VAR_COUNTER esp+frame-20
%define VAR_JMP esp+frame-24
%define STACK_SPACE 24
Lfour_or_more:
        sub     esp, STACK_SPACE-frame  ; extend the existing frame
%define frame STACK_SPACE
        mov     [SAVE_EBX], ebx
        mov     [SAVE_EDI], edi
        mov     [SAVE_EBP], ebp

        mov     ebp, eax                ; ebp = multiplier, src[0]
        mov     edi, [PARAM_DST]
        lea     esi, [esi+edx*4]        ; esi = &src[size]
        lea     edi, [-4+edi+edx*4]     ; edi = &dst[size-1]
        mov     ecx, 1
        sub     ecx, edx                ; ecx = -(size-1)
        mov     ebx, 0                  ; initial carry

; Loop registers:
;   eax  scratch
;   ebx  carry limb
;   ecx  negative limb counter, -(size-1) up to -1
;   edx  scratch
;   esi  &src[size]
;   edi  &dst[size-1]
;   ebp  multiplier
Lmul_1:
        mov     eax, [esi+ecx*4]
        mul     ebp                     ; edx:eax = src[size+ecx] * src[0]
        add     eax, ebx                ; add incoming carry limb
        mov     [4+edi+ecx*4], eax
        mov     ebx, 0                  ; mov, not xor: CF from the add is still needed
        adc     ebx, edx                ; next carry = high limb + CF
        inc     ecx
        jnz     Lmul_1

        mov     [4+edi], ebx            ; dst[size] = final carry limb
; Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
;
; The last two addmuls, which are the bottom right corner of the product
; triangle, are left to the end. These are src[size-3]*src[size-2,size-1]
; and src[size-2]*src[size-1]. If size is 4 then it's only these corner
; cases that need to be done.
;
; The unrolled code is the same as mpn_addmul_1(); see that routine for
; some comments.
;
; VAR_COUNTER is the outer loop counter, running from -(size-4) to -1,
; inclusive.
;
; VAR_JMP is the computed jump into the unrolled code, stepped by one code
; chunk each outer loop.
;
; This block only computes the initial values of both (ecx = jump address,
; edx = counter) and biases esi/edi by -OFFSET; it then falls through into
; Lunroll_outer_top.
;
; On entry:
;   ebx  carry
;   esi  &src[size]
;   edi  &dst[size-1]

; Size in bytes of one unrolled chunk. This value is also hard-coded in
; the address calculation below: "shl ecx,4" followed by adding edx forms
; -(size-4)*(16-1).
%define CODE_BYTES_PER_LIMB 15

; With &src[size] and &dst[size-1] pointers, the displacements in the
; unrolled code fit in a byte for UNROLL_COUNT values up to 32, but above
; that an offset must be subtracted from the pointers and added back into
; the displacements.
%if UNROLL_COUNT > 32
%define OFFSET 4*(UNROLL_COUNT-32)
%else
%define OFFSET 0
%endif

        mov     ecx,[PARAM_SIZE]
        sub     ecx,4
        jz      Lcorner                 ; size == 4: only the corner remains
        mov     edx,ecx                 ; edx = size-4
        neg     ecx
        shl     ecx,4                   ; ecx = -(size-4)*16
%if OFFSET != 0
        sub     esi,OFFSET
%endif
%ifdef PIC
        ; Position-independent form: fetch the run-time address of Lhere
        ; from the return address pushed by the call, then add the
        ; link-time distance to the target.
        call    Lhere
Lhere:
        add     ecx,[esp]
        add     ecx,Lunroll_inner_end-Lhere-(2*CODE_BYTES_PER_LIMB)
        add     ecx,edx
        add     esp,4                   ; discard the pushed return address
%else
        ; ecx = Lunroll_inner_end - 2*15 - (size-4)*15
        lea     ecx,[Lunroll_inner_end-2*CODE_BYTES_PER_LIMB+ecx+edx]
%endif
        neg     edx                     ; edx = -(size-4), initial VAR_COUNTER
%if OFFSET != 0
        sub     edi,OFFSET
%endif

; The calculated jump mustn't be before the start of the available
; code. This is the limit that UNROLL_COUNT puts on the src operand
; size, but checked here using the jump address directly.
%ifdef ASSERT
        mov     eax,Lunroll_inner_start
        cmp     ecx,eax
        jae     Lunroll_outer_top
        ud2                             ; assertion failed: raise invalid-opcode trap
%endif
; Outer loop of the unrolled addmul. Each pass multiplies one source limb
; (ebp) by the limbs above it and adds the products into dst, entering the
; unrolled chunk sequence one chunk later than the previous pass.
;
; On entry to each pass:
; eax
; ebx high limb to store
; ecx VAR_JMP (jump address used by the previous pass)
; edx VAR_COUNTER,limbs,negative
; esi &src[size] (less OFFSET),constant
; edi dst ptr,second highest limb of last addmul
; ebp
;
; cmovX is the conditional move that swaps the two carry registers when the
; first chunk to be executed expects them the other way around; which
; condition that is depends on whether UNROLL_COUNT is odd or even.
%if UNROLL_COUNT % 2 == 1
%define cmovX cmovz
%else
%define cmovX cmovnz
%endif
align 16
Lunroll_outer_top:
mov ebp,[-12+OFFSET+esi+edx*4] ; multiplier
mov [VAR_COUNTER],edx
mov eax,[-8+OFFSET+esi+edx*4] ; first limb of multiplicand
mul ebp
; Low bit of the previous jump address selects the carry orientation. Only
; mov and lea sit between this test and the cmovX pair, so ZF survives.
test cl,1
mov ebx,edx ; high carry
lea edi,[4+edi] ; advance dst one limb per pass
mov edx,ecx ; jump
mov ecx,eax ; low carry
lea edx,[CODE_BYTES_PER_LIMB+edx] ; one chunk later than last time
cmovX ecx,ebx ; when taken: ecx = high carry ...
cmovX ebx,eax ; ... and ebx = low carry
mov [VAR_JMP],edx
jmp edx
; Must be on an even address here so the low bit of the jump address
; will indicate which way around ecx/ebx should start.
; eax scratch
; ebx carry high
; ecx carry low
; edx scratch
; esi src pointer
; edi dst pointer
; ebp multiplier
;
; 15 code bytes each limb
; ecx/ebx reversed on each chunk
;
; The %rep below emits UNROLL_COUNT chunks, i counting down from
; UNROLL_COUNT to 1. The "byte" keyword forces an 8-bit displacement on
; every memory operand so that all chunks have the same length, which the
; computed jump relies on.
align 2
Lunroll_inner_start:
%assign i UNROLL_COUNT
%rep UNROLL_COUNT
%assign disp_src OFFSET-4*i
%assign disp_dst disp_src
; Both displacements must lie in -128..127 (carried over from the m4
; source as a reminder; not checked by the assembler here):
; m4_assert(disp_src>=-128 && disp_src<128)
; m4_assert(disp_dst>=-128 && disp_dst<128)
mov eax,[byte disp_src+esi]
mul ebp
%if i % 2 == 0
; even i: ebx is added to dst, the new product accumulates in ebx:ecx
add [byte disp_dst+edi],ebx
adc ecx,eax
mov ebx,edx
adc ebx,0
%else
; odd i: same step with the roles of ecx and ebx exchanged
add [byte disp_dst+edi],ecx
adc ebx,eax
mov ecx,edx
adc ecx,0
%endif
%assign i i-1
%endrep
Lunroll_inner_end:
; The last chunk (i = 1) leaves its low limb in ebx and its high limb in
; ecx; flush both to dst.
add [OFFSET+edi],ebx
mov edx,[VAR_COUNTER]
adc ecx,0
mov [OFFSET+4+edi],ecx
mov ecx,[VAR_JMP]
inc edx
jnz Lunroll_outer_top
; Undo the -OFFSET bias applied to esi/edi before the loop.
%if OFFSET != 0
add esi,OFFSET
add edi,OFFSET
%endif
; Bottom right corner of the product triangle: the three cross products
; among src[size-3], src[size-2] and src[size-1]. Reached either by falling
; out of the unrolled loop or directly when size == 4.
; eax
; ebx
; ecx
; edx
; esi &src[size]
; edi &dst[2*size-5]
; ebp
align 16
Lcorner:
mov eax,[-12+esi]
mul dword [-8+esi] ; src[size-3] * src[size-2]
add [edi],eax
mov eax,[-12+esi]
mov ebx,0 ; mov rather than xor: CF from the add must reach the adc
adc ebx,edx
mul dword [-4+esi] ; src[size-3] * src[size-1]
add ebx,eax
mov eax,[-8+esi]
adc edx,0
add [4+edi],ebx
mov ebx,0 ; again mov, to keep CF
adc ebx,edx
mul dword [-4+esi] ; src[size-2] * src[size-1]
mov ecx,[PARAM_SIZE]
add eax,ebx
adc edx,0
mov [8+edi],eax ; these two limbs are stored, not accumulated
mov [12+edi],edx
mov edi,[PARAM_DST]
; Left shift of dst[1..2*size-2],the bit shifted out becomes dst[2*size-1].
; This doubles the sum of off-diagonal products built so far.
sub ecx,1 ; size-1
xor eax,eax ; ready for final adcl,and clear carry
mov edx,ecx
mov esi,[PARAM_SRC]
; eax
; ebx
; ecx counter,size-1 to 1
; edx size-1 (for later use)
; esi src (for later use)
; edi dst,incrementing
; ebp
;
; Two limbs are rotated through carry per iteration. lea and dec leave CF
; untouched, so the bit shifted out of one limb feeds the next one, also
; across loop iterations.
Llshift:
rcl dword [4+edi],1
rcl dword [8+edi],1
lea edi,[8+edi]
dec ecx
jnz Llshift
adc eax,eax ; eax = bit shifted out of the top limb
mov [4+edi],eax ; dst most significant limb
mov eax,[esi] ; src[0]
lea esi,[4+esi+edx*4] ; &src[size]
sub ecx,edx ; -(size-1)
; Now add in the squares on the diagonal,src[0]^2,src[1]^2,...,
; src[size-1]^2. dst[0] hasn't been set at all yet,and just gets the
; low limb of src[0]^2.
mul eax
mov [edi+ecx*8],eax ; dst[0]
; eax scratch
; ebx scratch
; ecx counter,negative
; edx carry
; esi &src[size]
; edi dst[2*size-2]
; ebp
;
; Each iteration adds the high limb of the previous square (held in ebx)
; and the low limb of the current square to the next two dst limbs; the
; new high limb plus carry stays in edx for the following iteration.
Ldiag:
mov eax,[esi+ecx*4]
mov ebx,edx
mul eax
add [4+edi+ecx*8],ebx
adc [8+edi+ecx*8],eax
adc edx,0
inc ecx
jnz Ldiag
; Restore the callee-saved registers; the mov instructions around the
; final add do not disturb it.
mov esi,[SAVE_ESI]
mov ebx,[SAVE_EBX]
add [4+edi],edx ; dst most significant limb
mov edi,[SAVE_EDI]
mov ebp,[SAVE_EBP]
add esp,frame
ret
end