; Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
;
; This file is part of the GNU MP Library.
;
; The GNU MP Library is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public License as
; published by the Free Software Foundation; either version 2.1 of the
; License, or (at your option) any later version.
;
; The GNU MP Library is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public
; License along with the GNU MP Library; see the file COPYING.LIB.  If
; not, write to the Free Software Foundation, Inc., 59 Temple Place -
; Suite 330, Boston, MA 02111-1307, USA.
;
; Translation of AT&T syntax code by Brian Gladman

%include "..\x86i.inc"

        global  ___gmpn_sqr_basecase

%ifdef  DLL
        export  ___gmpn_sqr_basecase
%endif

%define UNROLL_COUNT    64      ; seems to be maximum required (I hope!)

%define PARAM_SIZE      esp+frame+12
%define PARAM_SRC       esp+frame+8
%define PARAM_DST       esp+frame+4
%define frame           0

        section .text
        align   32

___gmpn_sqr_basecase:
        mov     edx,[PARAM_SIZE]
        mov     eax,[PARAM_SRC]
        cmp     edx,2
        mov     ecx,[PARAM_DST]
        je      Ltwo_limbs
        mov     eax,[eax]
        ja      Lthree_or_more

; one limb only
; eax   src limb
; ebx
; ecx   dst
; edx

        mul     eax
        mov     [ecx],eax
        mov     [4+ecx],edx
        ret

; eax   src
; ebx
; ecx   dst
; edx

%define SAVE_ESI        esp+frame-4
%define SAVE_EBX        esp+frame-8
%define SAVE_EDI        esp+frame-12
%define SAVE_EBP        esp+frame-16
%define frame           16

Ltwo_limbs:
        sub     esp,frame
        mov     [SAVE_ESI],esi
        mov     esi,eax
        mov     eax,[eax]
        mul     eax                     ; src[0]^2
        mov     [ecx],eax               ; dst[0]
        mov     eax,[4+esi]
        mov     [SAVE_EBX],ebx
        mov     ebx,edx                 ; dst[1]
        mul     eax                     ; src[1]^2
        mov     [SAVE_EDI],edi
        mov     edi,eax                 ; dst[2]
        mov     eax,[esi]
        mov     [SAVE_EBP],ebp
        mov     ebp,edx                 ; dst[3]
        mul     dword [4+esi]           ; src[0]*src[1]
        add     ebx,eax
        mov     esi,[SAVE_ESI]
        adc     edi,edx
        adc     ebp,0
        add     eax,ebx
        mov     ebx,[SAVE_EBX]
        adc     edx,edi
        mov     edi,[SAVE_EDI]
        adc     ebp,0
        mov     [4+ecx],eax
        mov     [12+ecx],ebp
        mov     ebp,[SAVE_EBP]
        mov     [8+ecx],edx
        add     esp,frame
        ret

; eax   src low limb
; ebx
; ecx   dst
; edx   size

Lthree_or_more:
        sub     esp,frame
        mov     [SAVE_ESI],esi
        cmp     edx,4
        mov     esi,[PARAM_SRC]
        jae     Lfour_or_more

; three limbs
; eax   src low limb
; ebx
; ecx   dst
; edx
; esi   src
; edi
; ebp

        mov     [SAVE_EBP],ebp
        mov     [SAVE_EDI],edi
        mul     eax                     ; src[0] ^ 2
        mov     [ecx],eax
        mov     [4+ecx],edx
        mov     eax,[4+esi]
        xor     ebp,ebp
        mul     eax                     ; src[1] ^ 2
        mov     [8+ecx],eax
        mov     [12+ecx],edx
        mov     eax,[8+esi]
        mov     [SAVE_EBX],ebx
        mul     eax                     ; src[2] ^ 2
        mov     [16+ecx],eax
        mov     [20+ecx],edx
        mov     eax,[esi]
        mul     dword [4+esi]           ; src[0] * src[1]
        mov     ebx,eax
        mov     edi,edx
        mov     eax,[esi]
        mul     dword [8+esi]           ; src[0] * src[2]
        add     edi,eax
        mov     ebp,edx
        adc     ebp,0
        mov     eax,[4+esi]
        mul     dword [8+esi]           ; src[1] * src[2]
        xor     esi,esi
        add     ebp,eax

; eax
; ebx   dst[1]
; ecx   dst
; edx   dst[4]
; esi   zero, will be dst[5]
; edi   dst[2]
; ebp   dst[3]

        adc     edx,0
        add     ebx,ebx
        adc     edi,edi
        adc     ebp,ebp
        adc     edx,edx
        mov     eax,[4+ecx]
        adc     esi,0
        add     eax,ebx
        mov     [4+ecx],eax
        mov     eax,[8+ecx]
        adc     eax,edi
        mov     ebx,[12+ecx]
        adc     ebx,ebp
        mov     edi,[16+ecx]
        mov     [8+ecx],eax
        mov     ebp,[SAVE_EBP]
        mov     [12+ecx],ebx
        mov     ebx,[SAVE_EBX]
        adc     edi,edx
        mov     eax,[20+ecx]
        mov     [16+ecx],edi
        mov     edi,[SAVE_EDI]
        adc     eax,esi                 ; no carry out of this
        mov     esi,[SAVE_ESI]
        mov     [20+ecx],eax
        add     esp,frame
        ret

; eax   src low limb
; ebx
; ecx
; edx   size
; esi   src
; edi
; ebp

; First multiply src[0]*src[1..size-1] and store at dst[1..size].
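
; In outline, with B = 2^32 the limb base, the full product is
;
;     (sum src[i]*B^i)^2  =  2 * sum(i<j) src[i]*src[j]*B^(i+j)
;                          +     sum(i)  src[i]^2 * B^(2*i)
;
; The cross products formed by the Lmul_1 loop below and by the addmul
; triangle after it are doubled later with a one-bit left shift, then the
; diagonal squares src[i]^2 are added in at the very end.
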
%define VAR_COUNTER     esp+frame-20
%define VAR_JMP         esp+frame-24
%define STACK_SPACE     24

Lfour_or_more:
        sub     esp,STACK_SPACE-frame
%define frame   STACK_SPACE
        mov     ecx,1
        mov     [SAVE_EDI],edi
        mov     edi,[PARAM_DST]
        mov     [SAVE_EBX],ebx
        sub     ecx,edx                 ; -(size-1)
        mov     [SAVE_EBP],ebp
        mov     ebx,0                   ; initial carry
        lea     esi,[esi+edx*4]         ; &src[size]
        mov     ebp,eax                 ; multiplier
        lea     edi,[-4+edi+edx*4]      ; &dst[size-1]

; This loop runs at just over 6 c/l.
; eax   scratch
; ebx   carry
; ecx   counter, limbs, negative, -(size-1) to -1
; edx   scratch
; esi   &src[size]
; edi   &dst[size-1]
; ebp   multiplier

Lmul_1:
        mov     eax,ebp
        mul     dword [esi+ecx*4]
        add     eax,ebx
        mov     ebx,0
        adc     ebx,edx
        mov     [4+edi+ecx*4],eax
        inc     ecx
        jnz     Lmul_1
        mov     [4+edi],ebx

; Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
;
; The last two addmuls, which are the bottom right corner of the product
; triangle, are left to the end.  These are src[size-3]*src[size-2,size-1]
; and src[size-2]*src[size-1].  If size is 4 then it's only these corner
; cases that need to be done.
;
; The unrolled code is the same as mpn_addmul_1(), see that routine for
; some comments.
;
; VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive.
;
; VAR_JMP is the computed jump into the unrolled code, stepped by one code
; chunk each outer loop.
;
; The chunk size (CODE_BYTES_PER_LIMB) is also hard-coded in the address
; calculation below.
;
; With &src[size] and &dst[size-1] pointers, the displacements in the
; unrolled code fit in a byte for UNROLL_COUNT values up to 32, but above
; that an offset must be added to them.

; eax
; ebx   carry
; ecx
; edx
; esi   &src[size]
; edi   &dst[size-1]
; ebp

%define CODE_BYTES_PER_LIMB     15

%if UNROLL_COUNT > 32
%define OFFSET  4*(UNROLL_COUNT-32)
%else
%define OFFSET  0
%endif

        mov     ecx,[PARAM_SIZE]
        sub     ecx,4
        jz      Lcorner
        mov     edx,ecx
        neg     ecx
        shl     ecx,4
%if OFFSET != 0
        sub     esi,OFFSET
%endif

%ifdef PIC
        call    Lhere
Lhere:
        add     ecx,[esp]
        add     ecx,Lunroll_inner_end-Lhere-(2*CODE_BYTES_PER_LIMB)
        add     ecx,edx
        add     esp,4
%else
        lea     ecx,[Lunroll_inner_end-2*CODE_BYTES_PER_LIMB+ecx+edx]
%endif

        neg     edx
%if OFFSET != 0
        sub     edi,OFFSET
%endif

; The calculated jump mustn't be before the start of the available
; code.  This is the limit that UNROLL_COUNT puts on the src operand
; size, but checked here using the jump address directly.
; ASSERT(ae, movl_text_address(Lunroll_inner_start,%eax) cmpl %eax,%ecx)

%ifdef ASSERT
        mov     eax,Lunroll_inner_start
        cmp     ecx,eax
        jae     Lunroll_outer_top
        jmp     exit
%endif

; eax
; ebx   high limb to store
; ecx   VAR_JMP
; edx   VAR_COUNTER, limbs, negative
; esi   &src[size], constant
; edi   dst ptr, second highest limb of last addmul
; ebp

%if UNROLL_COUNT % 2 == 1
%define cmovX   cmovz
%else
%define cmovX   cmovnz
%endif

        align   16
Lunroll_outer_top:
        mov     ebp,[-12+OFFSET+esi+edx*4] ; multiplier
        mov     [VAR_COUNTER],edx
        mov     eax,[-8+OFFSET+esi+edx*4]  ; first limb of multiplicand
        mul     ebp
        test    cl,1
        mov     ebx,edx                 ; high carry
        lea     edi,[4+edi]
        mov     edx,ecx                 ; jump
        mov     ecx,eax                 ; low carry
        lea     edx,[CODE_BYTES_PER_LIMB+edx]
        cmovX   ecx,ebx
        cmovX   ebx,eax
        mov     [VAR_JMP],edx
        jmp     edx

; Must be on an even address here so the low bit of the jump address
; will indicate which way around ecx/ebx should start.
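
; On the first outer pass VAR_COUNTER is -(size-4): the multiplier is
; src[1], the setup mul above has already handled src[2], and the jump
; taken works out to
;
;     Lunroll_inner_end - CODE_BYTES_PER_LIMB*(size-3)
;
; so only the last size-3 of the UNROLL_COUNT chunks below are executed.
; Each later pass has one limb fewer, hence VAR_JMP stepping forward by
; one 15-byte chunk per outer loop.
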
; eax   scratch
; ebx   carry high
; ecx   carry low
; edx   scratch
; esi   src pointer
; edi   dst pointer
; ebp   multiplier
;
; 15 code bytes each limb
; ecx/ebx reversed on each chunk

        align   2
Lunroll_inner_start:
%assign i UNROLL_COUNT
%rep UNROLL_COUNT
%assign disp_src OFFSET-4*i
%assign disp_dst disp_src
; m4_assert(disp_src>=-128 && disp_src<128)
; m4_assert(disp_dst>=-128 && disp_dst<128)
        mov     eax,[byte disp_src+esi]
        mul     ebp
%if i % 2 == 0
        add     [byte disp_dst+edi],ebx
        adc     ecx,eax
        mov     ebx,edx
        adc     ebx,0
%else
        add     [byte disp_dst+edi],ecx
        adc     ebx,eax
        mov     ecx,edx
        adc     ecx,0
%endif
%assign i i-1
%endrep
Lunroll_inner_end:

        add     [OFFSET+edi],ebx
        mov     edx,[VAR_COUNTER]
        adc     ecx,0
        mov     [OFFSET+4+edi],ecx
        mov     ecx,[VAR_JMP]
        inc     edx
        jnz     Lunroll_outer_top

%if OFFSET != 0
        add     esi,OFFSET
        add     edi,OFFSET
%endif

; eax
; ebx
; ecx
; edx
; esi   &src[size]
; edi   &dst[2*size-5]
; ebp

        align   16
Lcorner:
        mov     eax,[-12+esi]
        mul     dword [-8+esi]
        add     [edi],eax
        mov     eax,[-12+esi]
        mov     ebx,0
        adc     ebx,edx
        mul     dword [-4+esi]
        add     ebx,eax
        mov     eax,[-8+esi]
        adc     edx,0
        add     [4+edi],ebx
        mov     ebx,0
        adc     ebx,edx
        mul     dword [-4+esi]
        mov     ecx,[PARAM_SIZE]
        add     eax,ebx
        adc     edx,0
        mov     [8+edi],eax
        mov     [12+edi],edx
        mov     edi,[PARAM_DST]

; Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].

        sub     ecx,1                   ; size-1
        xor     eax,eax                 ; ready for final adc, and clear carry
        mov     edx,ecx
        mov     esi,[PARAM_SRC]

; eax
; ebx
; ecx   counter, size-1 to 1
; edx   size-1 (for later use)
; esi   src (for later use)
; edi   dst, incrementing
; ebp

Llshift:
        rcl     dword [4+edi],1
        rcl     dword [8+edi],1
        lea     edi,[8+edi]
        dec     ecx
        jnz     Llshift

        adc     eax,eax
        mov     [4+edi],eax             ; dst most significant limb
        mov     eax,[esi]               ; src[0]
        lea     esi,[4+esi+edx*4]       ; &src[size]
        sub     ecx,edx                 ; -(size-1)

; Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
; src[size-1]^2.  dst[0] hasn't been set at all yet, and just gets the
; low limb of src[0]^2.

        mul     eax
        mov     [edi+ecx*8],eax         ; dst[0]

; eax   scratch
; ebx   scratch
; ecx   counter, negative
; edx   carry
; esi   &src[size]
; edi   dst[2*size-2]
; ebp

Ldiag:
        mov     eax,[esi+ecx*4]
        mov     ebx,edx
        mul     eax
        add     [4+edi+ecx*8],ebx
        adc     [8+edi+ecx*8],eax
        adc     edx,0
        inc     ecx
        jnz     Ldiag

        mov     esi,[SAVE_ESI]
        mov     ebx,[SAVE_EBX]
        add     [4+edi],edx             ; dst most significant limb
        mov     edi,[SAVE_EDI]
        mov     ebp,[SAVE_EBP]
        add     esp,frame
        ret

end
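
; For reference, the Llshift/Ldiag finish amounts to
;
;     dst[2*size-1] = bit shifted out of the one-bit left shift of dst[1..2*size-2]
;     dst += src[0]^2 + src[1]^2*B^2 + ... + src[size-1]^2*B^(2*size-2)
;
; with dst[0] simply written rather than added, since the cross-product
; phase never touches it.  A single deferred carry word in edx suffices in
; Ldiag because the high limb of each square is at most B-2 and so can
; absorb the carry out of the preceding addition.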