mpir/mpn/x86w/p6/sqr_basecase.asm


;  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
; 
;  This file is part of the GNU MP Library.
; 
;  The GNU MP Library is free software; you can redistribute it and/or
;  modify it under the terms of the GNU Lesser General Public License as
;  published by the Free Software Foundation; either version 2.1 of the
;  License, or (at your option) any later version.
; 
;  The GNU MP Library is distributed in the hope that it will be useful,
;  but WITHOUT ANY WARRANTY; without even the implied warranty of
;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;  Lesser General Public License for more details.
; 
;  You should have received a copy of the GNU Lesser General Public
;  License along with the GNU MP Library; see the file COPYING.LIB.  If
;  not, write to the Free Software Foundation, Inc., 59 Temple Place -
;  Suite 330, Boston, MA 02111-1307, USA.
;
; Translation of AT&T syntax code by Brian Gladman 

%include "..\x86i.inc" 

	global	___gmpn_sqr_basecase

%ifdef	DLL
	export	___gmpn_sqr_basecase
%endif

%define UNROLL_COUNT	64	; seems to be maximum required (I hope!)

%define	PARAM_SIZE	esp+frame+12 
%define PARAM_SRC   esp+frame+8 
%define PARAM_DST   esp+frame+4 
%define frame		0 

	section	.text
	
	align	32
		
___gmpn_sqr_basecase:
    mov     edx,[PARAM_SIZE]
    mov     eax,[PARAM_SRC]
    cmp     edx,2
    mov     ecx,[PARAM_DST]
    je      Ltwo_limbs
    mov     eax,[eax]
    ja      Lthree_or_more

;  one limb only 
;  eax        src limb 
;  ebx 
;  ecx        dst 
;  edx 

    mul     eax
    mov     [ecx],eax
    mov     [4+ecx],edx
    ret

;  eax        src 
;  ebx 
;  ecx        dst 
;  edx 

%define	SAVE_ESI    esp+frame-4 
%define SAVE_EBX    esp+frame-8 
%define SAVE_EDI    esp+frame-12 
%define SAVE_EBP    esp+frame-16 
%define	frame		16

Ltwo_limbs: 
    sub     esp,frame
    mov     [SAVE_ESI],esi
    mov     esi,eax
    mov     eax,[eax]
    mul     eax				;  src[0]^2 
    mov     [ecx],eax		;  dst[0] 
    mov     eax,[4+esi]
    mov     [SAVE_EBX],ebx
    mov     ebx,edx			;  dst[1] 
    mul     eax				;  src[1]^2 
    mov     [SAVE_EDI],edi
    mov     edi,eax			;  dst[2] 
    mov     eax,[esi]
    mov     [SAVE_EBP],ebp
    mov     ebp,edx			;  dst[3] 
    mul     dword [4+esi]	;  src[0]*src[1] 
    add     ebx,eax
    mov     esi,[SAVE_ESI]
    adc     edi,edx
    adc     ebp,0
    add     eax,ebx
    mov     ebx,[SAVE_EBX]
    adc     edx,edi
    mov     edi,[SAVE_EDI]
    adc     ebp,0
    mov     [4+ecx],eax
    mov     [12+ecx],ebp
    mov     ebp,[SAVE_EBP]
    mov     [8+ecx],edx
    add     esp,frame
    ret

;  eax        src low limb 
;  ebx 
;  ecx        dst 
;  edx        size 

Lthree_or_more: 
	sub		esp,frame
	mov		[SAVE_ESI],esi
    cmp     edx,4
    mov     esi,[PARAM_SRC]
    jae     Lfour_or_more

;  three limbs 
;  eax        src low limb 
;  ebx 
;  ecx        dst 
;  edx 
;  esi        src 
;  edi 
;  ebp 

	mov		[SAVE_EBP],ebp
	mov		[SAVE_EDI],edi
    mul     eax				;  src[0] ^ 2 
    mov     [ecx],eax
    mov     [4+ecx],edx
    mov     eax,[4+esi]
    xor     ebp,ebp
    mul     eax				;  src[1] ^ 2 
    mov     [8+ecx],eax
    mov     [12+ecx],edx
    mov     eax,[8+esi]
	mov		[SAVE_EBX],ebx
    mul     eax				;  src[2] ^ 2 
    mov     [16+ecx],eax
    mov     [20+ecx],edx
    mov     eax,[esi]
    mul     dword [4+esi]	;  src[0] * src[1] 
    mov     ebx,eax
    mov     edi,edx
    mov     eax,[esi]
    mul     dword [8+esi]	;  src[0] * src[2] 
    add     edi,eax
    mov     ebp,edx
    adc     ebp,0
    mov     eax,[4+esi]
    mul     dword [8+esi]	;  src[1] * src[2] 
    xor     esi,esi
    add     ebp,eax

;  eax 
;  ebx        dst[1] 
;  ecx        dst 
;  edx        dst[4] 
;  esi        zero,will be dst[5] 
;  edi        dst[2] 
;  ebp        dst[3] 

    adc     edx,0
    add     ebx,ebx
    adc     edi,edi
    adc     ebp,ebp
    adc     edx,edx
    mov     eax,[4+ecx]
    adc     esi,0
    add     eax,ebx
    mov     [4+ecx],eax
    mov     eax,[8+ecx]
    adc     eax,edi
    mov     ebx,[12+ecx]
    adc     ebx,ebp
    mov     edi,[16+ecx]
    mov     [8+ecx],eax
    mov     ebp,[SAVE_EBP]
    mov     [12+ecx],ebx
    mov     ebx,[SAVE_EBX]
    adc     edi,edx
    mov     eax,[20+ecx]
    mov     [16+ecx],edi
    mov     edi,[SAVE_EDI]
    adc     eax,esi			;  no carry out of this 
    mov     esi,[SAVE_ESI]
    mov     [20+ecx],eax
    add     esp,frame
    ret

;  eax        src low limb 
;  ebx 
;  ecx 
;  edx        size 
;  esi        src 
;  edi 
;  ebp 
;  First multiply src[0]*src[1..size-1] and store at dst[1..size]. 

%define VAR_COUNTER	esp+frame-20 
%define VAR_JMP		esp+frame-24 
%define	STACK_SPACE 24 

Lfour_or_more: 
	sub     esp,STACK_SPACE-frame
%define       frame   STACK_SPACE 
    mov     ecx,1
    mov     [SAVE_EDI],edi
    mov     edi,[PARAM_DST]
    mov     [SAVE_EBX],ebx
    sub     ecx,edx				;  -(size-1) 
    mov     [SAVE_EBP],ebp
    mov     ebx,0				;  initial carry 
    lea     esi,[esi+edx*4]		;  &src[size] 
    mov     ebp,eax				;  multiplier 
    lea     edi,[-4+edi+edx*4]  ;  &dst[size-1] 

;  This loop runs at just over 6 c/l. 
;  eax        scratch 
;  ebx        carry 
;  ecx        counter,limbs,negative,-(size-1) to -1 
;  edx        scratch 
;  esi        &src[size] 
;  edi        &dst[size-1] 
;  ebp        multiplier 

Lmul_1: 
    mov     eax,ebp
    mul     dword [esi+ecx*4]
    add     eax,ebx
    mov     ebx,0
    adc     ebx,edx
    mov     [4+edi+ecx*4],eax
    inc     ecx
    jnz     Lmul_1
    mov     [4+edi],ebx

;  Addmul src[n]*src[n+1..size-1] at dst[2*n-1...],for each n=1..size-2. 
;  
;  The last two addmuls,which are the bottom right corner of the product 
;  triangle,are left to the end.  These are src[size-3]*src[size-2,size-1] 
;  and src[size-2]*src[size-1].  If size is 4 then it's only these corner 
;  cases that need to be done. 
;  
;  The unrolled code is the same as mpn_addmul_1(),see that routine for some 
;  comments. 
;  
;  VAR_COUNTER is the outer loop,running from -(size-4) to -1,inclusive. 
;  
;  VAR_JMP is the computed jump into the unrolled code,stepped by one code 
;  chunk each outer loop. 
;
;   This is also hard-coded in the address calculation below. 
;
;   With &src[size] and &dst[size-1] pointers,the displacements in the 
;   unrolled code fit in a byte for UNROLL_COUNT values up to 32,but above 
;   that an offset must be added to them. 
;
;  eax 
;  ebx        carry 
;  ecx 
;  edx 
;  esi        &src[size] 
;  edi        &dst[size-1] 
;  ebp 

%define	CODE_BYTES_PER_LIMB	15 
%if	UNROLL_COUNT > 32
%define	OFFSET	4*(UNROLL_COUNT-32)
%else
%define	OFFSET	0
%endif
    mov     ecx,[PARAM_SIZE]
    sub     ecx,4
    jz      Lcorner
    mov     edx,ecx
    neg     ecx
    shl     ecx,4
%if	OFFSET != 0
	sub		esi,OFFSET
%endif

%ifdef	PIC
    call    Lhere
Lhere:
    add     ecx,[esp]
    add     ecx,Lunroll_inner_end-Lhere-(2*CODE_BYTES_PER_LIMB)
    add     ecx,edx
    add		esp,4
%else
	lea     ecx,[Lunroll_inner_end-2*CODE_BYTES_PER_LIMB+ecx+edx]
%endif
	neg     edx
%if OFFSET != 0
	sub		edi,OFFSET
%endif

;  The calculated jump mustn't be before the start of the available 
;  code.  This is the limit that UNROLL_COUNT puts on the src operand 
;  size,but checked here using the jump address directly. 

; ASSERT(ae,movl_text_address( Lunroll_inner_start,%eax) cmpl %eax,%ecx) 

%ifdef	ASSERT
	mov		eax,Lunroll_inner_start
	cmp		ecx,eax
	jae		Lunroll_outer_top
	jmp		exit
%endif

;  eax 
;  ebx        high limb to store 
;  ecx        VAR_JMP 
;  edx        VAR_COUNTER,limbs,negative 
;  esi        &src[size],constant 
;  edi        dst ptr,second highest limb of last addmul 
;  ebp 

%if	UNROLL_COUNT % 2 == 1
%define	cmovX	cmovz
%else
%define	cmovX	cmovnz
%endif

	align	16
Lunroll_outer_top: 
    mov     ebp,[-12+OFFSET+esi+edx*4]   ;  multiplier 
    mov     [VAR_COUNTER],edx
    mov     eax,[-8+OFFSET+esi+edx*4]   ;  first limb of multiplicand 
    mul     ebp
    test    cl,1
    mov     ebx,edx    ;  high carry 
    lea     edi,[4+edi]
    mov     edx,ecx    ;  jump 
    mov     ecx,eax    ;  low carry 
    lea     edx,[CODE_BYTES_PER_LIMB+edx]
	cmovX	ecx,ebx
	cmovX	ebx,eax
    mov     [VAR_JMP],edx
    jmp     edx

;  Must be on an even address here so the low bit of the jump address 
;  will indicate which way around ecx/ebx should start. 

;  eax        scratch 
;  ebx        carry high 
;  ecx        carry low 
;  edx        scratch 
;  esi        src pointer 
;  edi        dst pointer 
;  ebp        multiplier 
;  
;  15 code bytes each limb 
;  ecx/ebx reversed on each chunk 

	align	2

Lunroll_inner_start: 

%assign	i	UNROLL_COUNT
%rep	UNROLL_COUNT
	%assign	disp_src	OFFSET-4*i
	%assign	disp_dst	disp_src 
;	m4_assert(disp_src>=-128 && disp_src<128)
;	m4_assert(disp_dst>=-128 && disp_dst<128)

	mov		eax,[byte disp_src+esi]
    mul     ebp
%if	i % 2 == 0
	add		[byte disp_dst+edi],ebx
	adc     ecx,eax
    mov     ebx,edx
    adc     ebx,0
%else
	add		[byte disp_dst+edi],ecx
	adc     ebx,eax
    mov     ecx,edx
    adc     ecx,0
%endif
%assign	i	i-1
%endrep

Lunroll_inner_end: 
    add     [OFFSET+edi],ebx
    mov     edx,[VAR_COUNTER]
    adc     ecx,0
    mov     [OFFSET+4+edi],ecx
    mov     ecx,[VAR_JMP]
    inc     edx
    jnz     Lunroll_outer_top

%if	OFFSET != 0
    add     esi,OFFSET
    add     edi,OFFSET
%endif

;  eax 
;  ebx 
;  ecx 
;  edx 
;  esi        &src[size] 
;  edi        &dst[2*size-5] 
;  ebp 

	align	16
Lcorner: 
    mov     eax,[-12+esi]
    mul     dword [-8+esi]
    add     [edi],eax
    mov     eax,[-12+esi]
    mov     ebx,0
    adc     ebx,edx
    mul     dword [-4+esi]
    add     ebx,eax
    mov     eax,[-8+esi]
    adc     edx,0
    add     [4+edi],ebx
    mov     ebx,0
    adc     ebx,edx
    mul     dword [-4+esi]
    mov     ecx,[PARAM_SIZE]
    add     eax,ebx
    adc     edx,0
    mov     [8+edi],eax
    mov     [12+edi],edx
    mov     edi,[PARAM_DST]

;  Left shift of dst[1..2*size-2],the bit shifted out becomes dst[2*size-1]. 

    sub     ecx,1				;  size-1 
    xor     eax,eax				;  ready for final adcl,and clear carry 
    mov     edx,ecx
    mov     esi,[PARAM_SRC]

;  eax 
;  ebx 
;  ecx        counter,size-1 to 1 
;  edx        size-1 (for later use) 
;  esi        src (for later use) 
;  edi        dst,incrementing 
;  ebp 

Llshift: 
    rcl     dword [4+edi],1
    rcl     dword [8+edi],1
    lea     edi,[8+edi]
    dec     ecx
    jnz     Llshift
    adc     eax,eax
    mov     [4+edi],eax			;  dst most significant limb 
    mov     eax,[esi]			;  src[0] 
    lea     esi,[4+esi+edx*4]   ;  &src[size] 
    sub     ecx,edx				;  -(size-1) 

;  Now add in the squares on the diagonal,src[0]^2,src[1]^2,...,
;  src[size-1]^2.  dst[0] hasn't yet been set at all yet,and just gets the 
;  low limb of src[0]^2. 

    mul     eax
    mov     [edi+ecx*8],eax     ;  dst[0] 

;  eax        scratch 
;  ebx        scratch 
;  ecx        counter,negative 
;  edx        carry 
;  esi        &src[size] 
;  edi        dst[2*size-2] 
;  ebp 

Ldiag: 
    mov     eax,[esi+ecx*4]
    mov     ebx,edx
    mul     eax
    add     [4+edi+ecx*8],ebx
    adc     [8+edi+ecx*8],eax
    adc     edx,0
    inc     ecx
    jnz     Ldiag
    mov     esi,[SAVE_ESI]
    mov     ebx,[SAVE_EBX]
    add     [4+edi],edx			;  dst most significant limb 
    mov     edi,[SAVE_EDI]
    mov     ebp,[SAVE_EBP]
    add     esp,frame
    ret

	end
Set native line endings for all .c, .h, as, .asm, .s, .in, .m4, .cc, am 2008-06-25 03:33:36 -04:00
			`; Copyright 1999, 2000, 2002 Free Software Foundation, Inc.`
			`;`
			`; This file is part of the GNU MP Library.`
			`;`
			`; The GNU MP Library is free software; you can redistribute it and/or`
			`; modify it under the terms of the GNU Lesser General Public License as`
			`; published by the Free Software Foundation; either version 2.1 of the`
			`; License, or (at your option) any later version.`
			`;`
			`; The GNU MP Library is distributed in the hope that it will be useful,`
			`; but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`; Lesser General Public License for more details.`
			`;`
			`; You should have received a copy of the GNU Lesser General Public`
			`; License along with the GNU MP Library; see the file COPYING.LIB. If`
			`; not, write to the Free Software Foundation, Inc., 59 Temple Place -`
			`; Suite 330, Boston, MA 02111-1307, USA.`
			`;`
			`; Translation of AT&T syntax code by Brian Gladman`

			`%include "..\x86i.inc"`

			`global ___gmpn_sqr_basecase`

			`%ifdef DLL`
			`export ___gmpn_sqr_basecase`
			`%endif`

			`%define UNROLL_COUNT 64 ; seems to be maximum required (I hope!)`

			`%define PARAM_SIZE esp+frame+12`
			`%define PARAM_SRC esp+frame+8`
			`%define PARAM_DST esp+frame+4`
			`%define frame 0`

			`section .text`

			`align 32`

			`___gmpn_sqr_basecase:`
			`mov edx,[PARAM_SIZE]`
			`mov eax,[PARAM_SRC]`
			`cmp edx,2`
			`mov ecx,[PARAM_DST]`
			`je Ltwo_limbs`
			`mov eax,[eax]`
			`ja Lthree_or_more`

			`; one limb only`
			`; eax src limb`
			`; ebx`
			`; ecx dst`
			`; edx`

			`mul eax`
			`mov [ecx],eax`
			`mov [4+ecx],edx`
			`ret`

			`; eax src`
			`; ebx`
			`; ecx dst`
			`; edx`

			`%define SAVE_ESI esp+frame-4`
			`%define SAVE_EBX esp+frame-8`
			`%define SAVE_EDI esp+frame-12`
			`%define SAVE_EBP esp+frame-16`
			`%define frame 16`

			`Ltwo_limbs:`
			`sub esp,frame`
			`mov [SAVE_ESI],esi`
			`mov esi,eax`
			`mov eax,[eax]`
			`mul eax ; src[0]^2`
			`mov [ecx],eax ; dst[0]`
			`mov eax,[4+esi]`
			`mov [SAVE_EBX],ebx`
			`mov ebx,edx ; dst[1]`
			`mul eax ; src[1]^2`
			`mov [SAVE_EDI],edi`
			`mov edi,eax ; dst[2]`
			`mov eax,[esi]`
			`mov [SAVE_EBP],ebp`
			`mov ebp,edx ; dst[3]`
			`mul dword [4+esi] ; src[0]*src[1]`
			`add ebx,eax`
			`mov esi,[SAVE_ESI]`
			`adc edi,edx`
			`adc ebp,0`
			`add eax,ebx`
			`mov ebx,[SAVE_EBX]`
			`adc edx,edi`
			`mov edi,[SAVE_EDI]`
			`adc ebp,0`
			`mov [4+ecx],eax`
			`mov [12+ecx],ebp`
			`mov ebp,[SAVE_EBP]`
			`mov [8+ecx],edx`
			`add esp,frame`
			`ret`

			`; eax src low limb`
			`; ebx`
			`; ecx dst`
			`; edx size`

			`Lthree_or_more:`
			`sub esp,frame`
			`mov [SAVE_ESI],esi`
			`cmp edx,4`
			`mov esi,[PARAM_SRC]`
			`jae Lfour_or_more`

			`; three limbs`
			`; eax src low limb`
			`; ebx`
			`; ecx dst`
			`; edx`
			`; esi src`
			`; edi`
			`; ebp`

			`mov [SAVE_EBP],ebp`
			`mov [SAVE_EDI],edi`
			`mul eax ; src[0] ^ 2`
			`mov [ecx],eax`
			`mov [4+ecx],edx`
			`mov eax,[4+esi]`
			`xor ebp,ebp`
			`mul eax ; src[1] ^ 2`
			`mov [8+ecx],eax`
			`mov [12+ecx],edx`
			`mov eax,[8+esi]`
			`mov [SAVE_EBX],ebx`
			`mul eax ; src[2] ^ 2`
			`mov [16+ecx],eax`
			`mov [20+ecx],edx`
			`mov eax,[esi]`
			`mul dword [4+esi] ; src[0] * src[1]`
			`mov ebx,eax`
			`mov edi,edx`
			`mov eax,[esi]`
			`mul dword [8+esi] ; src[0] * src[2]`
			`add edi,eax`
			`mov ebp,edx`
			`adc ebp,0`
			`mov eax,[4+esi]`
			`mul dword [8+esi] ; src[1] * src[2]`
			`xor esi,esi`
			`add ebp,eax`

			`; eax`
			`; ebx dst[1]`
			`; ecx dst`
			`; edx dst[4]`
			`; esi zero,will be dst[5]`
			`; edi dst[2]`
			`; ebp dst[3]`

			`adc edx,0`
			`add ebx,ebx`
			`adc edi,edi`
			`adc ebp,ebp`
			`adc edx,edx`
			`mov eax,[4+ecx]`
			`adc esi,0`
			`add eax,ebx`
			`mov [4+ecx],eax`
			`mov eax,[8+ecx]`
			`adc eax,edi`
			`mov ebx,[12+ecx]`
			`adc ebx,ebp`
			`mov edi,[16+ecx]`
			`mov [8+ecx],eax`
			`mov ebp,[SAVE_EBP]`
			`mov [12+ecx],ebx`
			`mov ebx,[SAVE_EBX]`
			`adc edi,edx`
			`mov eax,[20+ecx]`
			`mov [16+ecx],edi`
			`mov edi,[SAVE_EDI]`
			`adc eax,esi ; no carry out of this`
			`mov esi,[SAVE_ESI]`
			`mov [20+ecx],eax`
			`add esp,frame`
			`ret`

			`; eax src low limb`
			`; ebx`
			`; ecx`
			`; edx size`
			`; esi src`
			`; edi`
			`; ebp`
			`; First multiply src[0]*src[1..size-1] and store at dst[1..size].`

			`%define VAR_COUNTER esp+frame-20`
			`%define VAR_JMP esp+frame-24`
			`%define STACK_SPACE 24`

			`Lfour_or_more:`
			`sub esp,STACK_SPACE-frame`
			`%define frame STACK_SPACE`
			`mov ecx,1`
			`mov [SAVE_EDI],edi`
			`mov edi,[PARAM_DST]`
			`mov [SAVE_EBX],ebx`
			`sub ecx,edx ; -(size-1)`
			`mov [SAVE_EBP],ebp`
			`mov ebx,0 ; initial carry`
			`lea esi,[esi+edx*4] ; &src[size]`
			`mov ebp,eax ; multiplier`
			`lea edi,[-4+edi+edx*4] ; &dst[size-1]`

			`; This loop runs at just over 6 c/l.`
			`; eax scratch`
			`; ebx carry`
			`; ecx counter,limbs,negative,-(size-1) to -1`
			`; edx scratch`
			`; esi &src[size]`
			`; edi &dst[size-1]`
			`; ebp multiplier`

			`Lmul_1:`
			`mov eax,ebp`
			`mul dword [esi+ecx*4]`
			`add eax,ebx`
			`mov ebx,0`
			`adc ebx,edx`
			`mov [4+edi+ecx*4],eax`
			`inc ecx`
			`jnz Lmul_1`
			`mov [4+edi],ebx`

			`; Addmul src[n]src[n+1..size-1] at dst[2n-1...],for each n=1..size-2.`
			`;`
			`; The last two addmuls,which are the bottom right corner of the product`
			`; triangle,are left to the end. These are src[size-3]*src[size-2,size-1]`
			`; and src[size-2]*src[size-1]. If size is 4 then it's only these corner`
			`; cases that need to be done.`
			`;`
			`; The unrolled code is the same as mpn_addmul_1(),see that routine for some`
			`; comments.`
			`;`
			`; VAR_COUNTER is the outer loop,running from -(size-4) to -1,inclusive.`
			`;`
			`; VAR_JMP is the computed jump into the unrolled code,stepped by one code`
			`; chunk each outer loop.`
			`;`
			`; This is also hard-coded in the address calculation below.`
			`;`
			`; With &src[size] and &dst[size-1] pointers,the displacements in the`
			`; unrolled code fit in a byte for UNROLL_COUNT values up to 32,but above`
			`; that an offset must be added to them.`
			`;`
			`; eax`
			`; ebx carry`
			`; ecx`
			`; edx`
			`; esi &src[size]`
			`; edi &dst[size-1]`
			`; ebp`

			`%define CODE_BYTES_PER_LIMB 15`
			`%if UNROLL_COUNT > 32`
			`%define OFFSET 4*(UNROLL_COUNT-32)`
			`%else`
			`%define OFFSET 0`
			`%endif`
			`mov ecx,[PARAM_SIZE]`
			`sub ecx,4`
			`jz Lcorner`
			`mov edx,ecx`
			`neg ecx`
			`shl ecx,4`
			`%if OFFSET != 0`
			`sub esi,OFFSET`
			`%endif`

			`%ifdef PIC`
			`call Lhere`
			`Lhere:`
			`add ecx,[esp]`
			`add ecx,Lunroll_inner_end-Lhere-(2*CODE_BYTES_PER_LIMB)`
			`add ecx,edx`
			`add esp,4`
			`%else`
			`lea ecx,[Lunroll_inner_end-2*CODE_BYTES_PER_LIMB+ecx+edx]`
			`%endif`
			`neg edx`
			`%if OFFSET != 0`
			`sub edi,OFFSET`
			`%endif`

			`; The calculated jump mustn't be before the start of the available`
			`; code. This is the limit that UNROLL_COUNT puts on the src operand`
			`; size,but checked here using the jump address directly.`

			`; ASSERT(ae,movl_text_address( Lunroll_inner_start,%eax) cmpl %eax,%ecx)`

			`%ifdef ASSERT`
			`mov eax,Lunroll_inner_start`
			`cmp ecx,eax`
			`jae Lunroll_outer_top`
			`jmp exit`
			`%endif`

			`; eax`
			`; ebx high limb to store`
			`; ecx VAR_JMP`
			`; edx VAR_COUNTER,limbs,negative`
			`; esi &src[size],constant`
			`; edi dst ptr,second highest limb of last addmul`
			`; ebp`

			`%if UNROLL_COUNT % 2 == 1`
			`%define cmovX cmovz`
			`%else`
			`%define cmovX cmovnz`
			`%endif`

			`align 16`
			`Lunroll_outer_top:`
			`mov ebp,[-12+OFFSET+esi+edx*4] ; multiplier`
			`mov [VAR_COUNTER],edx`
			`mov eax,[-8+OFFSET+esi+edx*4] ; first limb of multiplicand`
			`mul ebp`
			`test cl,1`
			`mov ebx,edx ; high carry`
			`lea edi,[4+edi]`
			`mov edx,ecx ; jump`
			`mov ecx,eax ; low carry`
			`lea edx,[CODE_BYTES_PER_LIMB+edx]`
			`cmovX ecx,ebx`
			`cmovX ebx,eax`
			`mov [VAR_JMP],edx`
			`jmp edx`

			`; Must be on an even address here so the low bit of the jump address`
			`; will indicate which way around ecx/ebx should start.`

			`; eax scratch`
			`; ebx carry high`
			`; ecx carry low`
			`; edx scratch`
			`; esi src pointer`
			`; edi dst pointer`
			`; ebp multiplier`
			`;`
			`; 15 code bytes each limb`
			`; ecx/ebx reversed on each chunk`

			`align 2`

			`Lunroll_inner_start:`

			`%assign i UNROLL_COUNT`
			`%rep UNROLL_COUNT`
			`%assign disp_src OFFSET-4*i`
			`%assign disp_dst disp_src`
			`; m4_assert(disp_src>=-128 && disp_src<128)`
			`; m4_assert(disp_dst>=-128 && disp_dst<128)`

			`mov eax,[byte disp_src+esi]`
			`mul ebp`
			`%if i % 2 == 0`
			`add [byte disp_dst+edi],ebx`
			`adc ecx,eax`
			`mov ebx,edx`
			`adc ebx,0`
			`%else`
			`add [byte disp_dst+edi],ecx`
			`adc ebx,eax`
			`mov ecx,edx`
			`adc ecx,0`
			`%endif`
			`%assign i i-1`
			`%endrep`

			`Lunroll_inner_end:`
			`add [OFFSET+edi],ebx`
			`mov edx,[VAR_COUNTER]`
			`adc ecx,0`
			`mov [OFFSET+4+edi],ecx`
			`mov ecx,[VAR_JMP]`
			`inc edx`
			`jnz Lunroll_outer_top`

			`%if OFFSET != 0`
			`add esi,OFFSET`
			`add edi,OFFSET`
			`%endif`

			`; eax`
			`; ebx`
			`; ecx`
			`; edx`
			`; esi &src[size]`
			`; edi &dst[2*size-5]`
			`; ebp`

			`align 16`
			`Lcorner:`
			`mov eax,[-12+esi]`
			`mul dword [-8+esi]`
			`add [edi],eax`
			`mov eax,[-12+esi]`
			`mov ebx,0`
			`adc ebx,edx`
			`mul dword [-4+esi]`
			`add ebx,eax`
			`mov eax,[-8+esi]`
			`adc edx,0`
			`add [4+edi],ebx`
			`mov ebx,0`
			`adc ebx,edx`
			`mul dword [-4+esi]`
			`mov ecx,[PARAM_SIZE]`
			`add eax,ebx`
			`adc edx,0`
			`mov [8+edi],eax`
			`mov [12+edi],edx`
			`mov edi,[PARAM_DST]`

			`; Left shift of dst[1..2size-2],the bit shifted out becomes dst[2size-1].`

			`sub ecx,1 ; size-1`
			`xor eax,eax ; ready for final adcl,and clear carry`
			`mov edx,ecx`
			`mov esi,[PARAM_SRC]`

			`; eax`
			`; ebx`
			`; ecx counter,size-1 to 1`
			`; edx size-1 (for later use)`
			`; esi src (for later use)`
			`; edi dst,incrementing`
			`; ebp`

			`Llshift:`
			`rcl dword [4+edi],1`
			`rcl dword [8+edi],1`
			`lea edi,[8+edi]`
			`dec ecx`
			`jnz Llshift`
			`adc eax,eax`
			`mov [4+edi],eax ; dst most significant limb`
			`mov eax,[esi] ; src[0]`
			`lea esi,[4+esi+edx*4] ; &src[size]`
			`sub ecx,edx ; -(size-1)`

			`; Now add in the squares on the diagonal,src[0]^2,src[1]^2,...,`
			`; src[size-1]^2. dst[0] hasn't yet been set at all yet,and just gets the`
			`; low limb of src[0]^2.`

			`mul eax`
			`mov [edi+ecx*8],eax ; dst[0]`

			`; eax scratch`
			`; ebx scratch`
			`; ecx counter,negative`
			`; edx carry`
			`; esi &src[size]`
			`; edi dst[2*size-2]`
			`; ebp`

			`Ldiag:`
			`mov eax,[esi+ecx*4]`
			`mov ebx,edx`
			`mul eax`
			`add [4+edi+ecx*8],ebx`
			`adc [8+edi+ecx*8],eax`
			`adc edx,0`
			`inc ecx`
			`jnz Ldiag`
			`mov esi,[SAVE_ESI]`
			`mov ebx,[SAVE_EBX]`
			`add [4+edi],edx ; dst most significant limb`
			`mov edi,[SAVE_EDI]`
			`mov ebp,[SAVE_EBP]`
			`add esp,frame`
			`ret`

			`end`