mpir/mpn/x86w/p6/sqr_basecase.old.asm


;   Intel P6 mpn_sqr_basecase -- square an mpn number. 
;
;   Copyright 1999,2000,2002 Free Software Foundation,Inc. 
;   
;   This file is part of the GNU MP Library. 
;   
;   The GNU MP Library is free software; you can redistribute it and/or 
;   modify it under the terms of the GNU Lesser General Public License as 
;   published by the Free Software Foundation; either version 2.1 of the 
;   License,or (at your option) any later version. 
;   
;   The GNU MP Library is distributed in the hope that it will be useful,
;   but WITHOUT ANY WARRANTY; without even the implied warranty of 
;   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU 
;   Lesser General Public License for more details. 
;   
;   You should have received a copy of the GNU Lesser General Public 
;   License along with the GNU MP Library; see the file COPYING.LIB.  If 
;   not,write to the Free Software Foundation,Inc.,59 Temple Place - 
;   Suite 330,Boston,MA 02111-1307,USA. 
;
;  P6: approx 4.0 cycles per cross product,or 7.75 cycles per triangular 
;      product (measured on the speed difference between 20 and 40 limbs,
;      which is the Karatsuba recursing range). 
;
;   These are the same as in mpn/x86/k6/sqr_basecase.asm,see that file for 
;   a description.  The only difference here is that UNROLL_COUNT can go up 
;   to 64 (not 63) making SQR_KARATSUBA_THRESHOLD_MAX 67. 
;
;  void mpn_sqr_basecase (mp_ptr dst,mp_srcptr src,mp_size_t size); 
; 
;  The algorithm is basically the same as mpn/generic/sqr_basecase.c,but a 
;  lot of function call overheads are avoided,especially when the given size 
;  is small. 
; 
;  The code size might look a bit excessive,but not all of it is executed so 
;  it won't all get into the code cache.  The 1x1,2x2 and 3x3 special cases 
;  clearly apply only to those sizes; mid sizes like 10x10 only need part of 
;  the unrolled addmul; and big sizes like 40x40 that do use the full 
;  unrolling will at least be making good use of it,because 40x40 will take 
;  something like 7000 cycles. 

%include "..\\x86i.inc" 

;  Tuning parameters. 
;
;  SQR_KARATSUBA_THRESHOLD can be replaced at build time by defining 
;  SQR_KARATSUBA_THRESHOLD_OVERRIDE.  UNROLL_COUNT is the number of limb 
;  chunks emitted for the unrolled addmul code further down. 
;
;  %define substitutes text rather than a value,so the UNROLL_COUNT 
;  expression must be parenthesised.  Without the parentheses a use such as 
;  "UNROLL_COUNT % 2 == 1" expands to "10-3 % 2 == 1",which is evaluated as 
;  (10-(3 % 2)) == 1 and so picks the wrong cmovX variant whenever the 
;  threshold is even (including the default of 10). 

%define	SQR_KARATSUBA_THRESHOLD	10 
%define	SQR_KARATSUBA_THRESHOLD_MAX	67 

%ifdef	SQR_KARATSUBA_THRESHOLD_OVERRIDE
%define	SQR_KARATSUBA_THRESHOLD	SQR_KARATSUBA_THRESHOLD_OVERRIDE
%endif

;  Above the maximum the byte displacements used in the unrolled code can 
;  no longer be encoded,so refuse to build rather than emit bad code. 

%if	(SQR_KARATSUBA_THRESHOLD) > SQR_KARATSUBA_THRESHOLD_MAX
%error	SQR_KARATSUBA_THRESHOLD exceeds SQR_KARATSUBA_THRESHOLD_MAX
%endif

%define	UNROLL_COUNT	(SQR_KARATSUBA_THRESHOLD-3)

FR_def	PARAM_SIZE,12
FR_def	PARAM_SRC,8
FR_def	PARAM_DST,4

	section .text
    global  ___gmpn_sqr_basecase 

	align   32

;  void mpn_sqr_basecase (mp_ptr dst,mp_srcptr src,mp_size_t size); 
;
;  Entry point.  Loads the three arguments and dispatches on size: 
;    size == 2  -> Ltwo_limbs       (eax = src,ecx = dst) 
;    size  > 2  -> Lthree_or_more   (eax = src[0],ecx = dst,edx = size) 
;    otherwise  -> the one-limb code that follows 
;
;  The size comparison is unsigned (je/ja). 

___gmpn_sqr_basecase: 
    mov     ecx,[PARAM_DST]
    mov     eax,[PARAM_SRC]
    mov     edx,[PARAM_SIZE]
    cmp     edx,2
    je      Ltwo_limbs
    mov     eax,[eax]           ;  src[0]; mov leaves the cmp flags intact 
    ja      Lthree_or_more

;  One limb only: dst[0..1] = src[0]^2. 
;
;  eax src[0] 
;  ecx dst 

    mul     eax                 ;  edx:eax = src[0]^2 
    mov     [4+ecx],edx         ;  dst[1] = high half 
    mov     [ecx],eax           ;  dst[0] = low half 
    ret

;  Two-limb case: dst[0..3] = (src[0] + src[1]*2^32)^2,formed as 
;  src[0]^2 + src[1]^2 * 2^64 with the cross product src[0]*src[1] added in 
;  twice at limb position 1. 
;
;  On entry: 
;  eax src 
;  ebx 
;  ecx dst 
;  edx 
;
;  FR_def is defined in x86i.inc,which is not visible here; presumably it 
;  names a stack slot at the given offset relative to esp+frame -- confirm 
;  against the include file. 

FR_def	SAVE_ESI, -4
FR_def	SAVE_EBX, -8
FR_def	SAVE_EDI,-12
FR_def	SAVE_EBP,-16

%define STACK_SPACE	16 
%define	frame		16

Ltwo_limbs: 
    sub     esp,frame
    mov     [SAVE_ESI],esi
    mov     esi,eax			;  esi = src,eax is needed for mul 
    mov     eax,[eax]
    mul     eax				;  src[0]^2 
    mov     [ecx],eax		;  dst[0] 
    mov     eax,[4+esi]
    mov     [SAVE_EBX],ebx
    mov     ebx,edx			;  dst[1] 
    mul     eax				;  src[1]^2 
    mov     [SAVE_EDI],edi
    mov     edi,eax			;  dst[2] 
    mov     eax,[esi]
    mov     [SAVE_EBP],ebp
    mov     ebp,edx			;  dst[3] 
    mul     dword [4+esi]	;  src[0]*src[1] 
    add     ebx,eax			;  first copy of the cross product into ebp:edi:ebx 
    mov     esi,[SAVE_ESI]
    adc     edi,edx
    adc     ebp,0
    add     eax,ebx			;  second copy; eax becomes the final dst[1] 
    mov     ebx,[SAVE_EBX]
    adc     edx,edi			;  edx becomes the final dst[2] 
    mov     edi,[SAVE_EDI]
    adc     ebp,0			;  ebp becomes the final dst[3] 
    mov     [4+ecx],eax
    mov     [12+ecx],ebp
    mov     ebp,[SAVE_EBP]
    mov     [8+ecx],edx
    add     esp,frame
    ret

;  Sizes of three limbs and up.  Saves esi,reloads src into it and sends 
;  sizes of four or more to Lfour_or_more; size three is handled inline below. 
;
;  On entry: 
;  eax src low limb 
;  ebx 
;  ecx dst 
;  edx size 
;
;  FR_push is defined in x86i.inc,which is not visible here; presumably it 
;  pushes the register,adjusts "frame" and optionally names the slot -- 
;  confirm against the include file. 

%define       frame   0 

Lthree_or_more: 
    FR_push	esi
    cmp     edx,4
    mov     esi,[PARAM_SRC]
    jae     Lfour_or_more

;  three limbs 
;
;  The three squares are written straight to dst[0..5].  The three cross 
;  products are summed in edx:ebp:edi:ebx (limb positions 4..1),doubled 
;  with the bit shifted out caught in esi,and then added into dst[1..5]. 
;
;  eax src low limb 
;  ebx 
;  ecx dst 
;  edx 
;  esi src 
;  edi 
;  ebp 

%undef	SAVE_EBP
%undef	SAVE_EDI
%undef	SAVE_EBX

	FR_push	ebp,SAVE_EBP
	FR_push	edi,SAVE_EDI
    mul     eax				;  src[0] ^ 2 
    mov     [ecx],eax
    mov     [4+ecx],edx
    mov     eax,[4+esi]
    xor     ebp,ebp			;  NOTE(review): ebp is overwritten by "mov ebp,edx" below before it is read 
    mul     eax				;  src[1] ^ 2 
    mov     [8+ecx],eax
    mov     [12+ecx],edx
    mov     eax,[8+esi]
	FR_push	ebx,SAVE_EBX
    mul     eax				;  src[2] ^ 2 
    mov     [16+ecx],eax
    mov     [20+ecx],edx
    mov     eax,[esi]
    mul     dword [4+esi]	;  src[0] * src[1] 
    mov     ebx,eax			;  low half  -> position 1 
    mov     edi,edx			;  high half -> position 2 
    mov     eax,[esi]
    mul     dword [8+esi]	;  src[0] * src[2] 
    add     edi,eax
    mov     ebp,edx
    adc     ebp,0			;  position 3,including the carry from position 2 
    mov     eax,[4+esi]
    mul     dword [8+esi]	;  src[1] * src[2] 
    xor     esi,esi			;  src pointer no longer needed; the add below sets the flags afresh 
    add     ebp,eax

;  eax 
;  ebx dst[1] 
;  ecx dst 
;  edx dst[4] 
;  esi zero,will be dst[5] 
;  edi dst[2] 
;  ebp dst[3] 

    adc     edx,0			;  finish the cross-product sum 
    add     ebx,ebx			;  double the sum: shift left one bit through the carry 
    adc     edi,edi
    adc     ebp,ebp
    adc     edx,edx
    mov     eax,[4+ecx]
    adc     esi,0			;  bit shifted out of position 4 
    add     eax,ebx			;  add the doubled cross products into the squares at dst[1..5] 
    mov     [4+ecx],eax
    mov     eax,[8+ecx]
    adc     eax,edi
    mov     ebx,[12+ecx]
    adc     ebx,ebp
    mov     edi,[16+ecx]
    mov     [8+ecx],eax
    mov     ebp,[SAVE_EBP]
    mov     [12+ecx],ebx
    mov     ebx,[SAVE_EBX]
    adc     edi,edx
    mov     eax,[20+ecx]
    mov     [16+ecx],edi
    mov     edi,[SAVE_EDI]
    adc     eax,esi			;  no carry out of this 
    mov     esi,[SAVE_ESI]
    mov     [20+ecx],eax
    add     esp,frame
    ret

;  Four limbs or more.  Reached from Lthree_or_more after esi was pushed. 
;
;  On entry: 
;  eax src low limb 
;  ebx 
;  ecx 
;  edx size 
;  esi src 
;  edi 
;  ebp 
;
;  VAR_COUNTER and VAR_JMP are two extra stack slots below the register save 
;  area.  "frame" inside them is expanded at the point of use,i.e. after it 
;  has been redefined to STACK_SPACE below. 

%define	VAR_COUNTER esp+frame-20 
%define VAR_JMP		esp+frame-24 
%define STACK_SPACE 24 
%define frame		4 

;  First multiply src[0]*src[1..size-1] and store at dst[1..size]. 

Lfour_or_more: 
    sub     esp,STACK_SPACE-frame
%define	frame   STACK_SPACE 
    mov     ecx,1
    mov     [SAVE_EDI],edi
    mov     edi,[PARAM_DST]
    mov     [SAVE_EBX],ebx
    sub     ecx,edx				;  -(size-1) 
    mov     [SAVE_EBP],ebp
    mov     ebx,0				;  initial carry 
    lea     esi,[esi+edx*4]		;  &src[size] 
    mov     ebp,eax				;  multiplier 
    lea     edi,[-4+edi+edx*4]	;  &dst[size-1] 

;  This loop runs at just over 6 c/l. 
;
;  Both pointers are biased to the end of their operands so that the single 
;  negative counter in ecx indexes them and ends the loop when it reaches 0. 
;
;  eax scratch 
;  ebx carry 
;  ecx counter,limbs,negative,-(size-1) to -1 
;  edx scratch 
;  esi &src[size] 
;  edi &dst[size-1] 
;  ebp multiplier 

Lmul_1: 
    mov     eax,ebp
    mul     dword [esi+ecx*4]
    add     eax,ebx				;  add the carry limb from the previous product 
    mov     ebx,0				;  mov,not xor: the carry flag from the add must survive 
    adc     ebx,edx				;  new carry limb = high half + carry flag 
    mov     [4+edi+ecx*4],eax
    inc     ecx
    jnz     Lmul_1
    mov     [4+edi],ebx			;  dst[size] = final carry limb 

;  Addmul src[n]*src[n+1..size-1] at dst[2*n-1...],for each n=1..size-2. 
; 
;  The last two addmuls,which are the bottom right corner of the product 
;  triangle,are left to the end.  These are src[size-3]*src[size-2,size-1] 
;  and src[size-2]*src[size-1].  If size is 4 then it's only these corner 
;  cases that need to be done. 
; 
;  The unrolled code is the same as mpn_addmul_1(),see that routine for some 
;  comments. 
; 
;  VAR_COUNTER is the outer loop,running from -(size-4) to -1,inclusive. 
; 
;  VAR_JMP is the computed jump into the unrolled code,stepped by one code 
;  chunk each outer loop. 
;
;  eax 
;  ebx carry 
;  ecx 
;  edx 
;  esi &src[size] 
;  edi &dst[size-1] 
;  ebp 

;  Size in bytes of one chunk of the unrolled code.  This is also hard-coded 
;  in the address calculation below: "shl ecx,4" followed by adding edx 
;  gives -(size-4)*16 + (size-4) = -(size-4)*15. 

%define	CODE_BYTES_PER_LIMB	15 

;  With &src[size] and &dst[size-1] pointers,the displacements in the 
;  unrolled code fit in a byte for UNROLL_COUNT values up to 32,but above 
;  that an offset must be added to them. 
;
;  OFFSET is a byte count,hence the multiplication by 4 (the limb size). 
;  The unrolled displacements are OFFSET-4*i for i=1..UNROLL_COUNT and have 
;  to lie in -128..127; at UNROLL_COUNT=64 this gives OFFSET=128 and a range 
;  of -128..124.  The expression is fully parenthesised because %define 
;  substitutes text. 

%if	(UNROLL_COUNT) > 32
%define	OFFSET	(((UNROLL_COUNT)-32)*4)
%else
%define	OFFSET	0
%endif

    mov     ecx,[PARAM_SIZE]
    sub     ecx,4
    jz      Lcorner             ;  size 4: only the corner products remain 
    mov     edx,ecx             ;  size-4 
    neg     ecx
    shl     ecx,4               ;  -(size-4)*16 
%if	OFFSET != 0
	sub  esi,OFFSET
%endif	
	
;  ecx = Lunroll_inner_end - 2*CODE_BYTES_PER_LIMB - (size-4)*15.  The outer 
;  loop adds a further CODE_BYTES_PER_LIMB before each jump. 

%ifdef	PIC
    call    Lhere               ;  pushes the address of Lhere 
Lhere: 
    add     ecx,[esp]
    add     ecx,Lunroll_inner_end-Lhere-2*CODE_BYTES_PER_LIMB
    add     ecx,edx
	add		esp,4               ;  discard the pushed address 
%else
	lea     ecx,[Lunroll_inner_end-2*CODE_BYTES_PER_LIMB+ecx+edx]
%endif
	neg     edx                 ;  -(size-4),initial VAR_COUNTER 

%if	OFFSET != 0
	sub  edi,OFFSET
%endif

;  The calculated jump mustn't be before the start of the available 
;  code.  This is the limit that UNROLL_COUNT puts on the src operand 
;  size,but checked here using the jump address directly. 
;
;  NOTE(review): "exit" is not defined in the part of the file visible here; 
;  confirm it exists before building with ASSERT defined. 

%ifdef	ASSERT
	mov		eax,Lunroll_inner_start
	cmp		ecx,eax
	jae		Lunroll_outer_top
	jmp		exit
%endif

;  Outer loop of the triangular addmul: one pass per multiplier limb.  Each 
;  pass forms the first product itself and then jumps into the unrolled 
;  chunks,entering one chunk later than the previous pass. 
;
;  eax 
;  ebx high limb to store 
;  ecx VAR_JMP 
;  edx VAR_COUNTER,limbs,negative 
;  esi &src[size],constant 
;  edi dst ptr,second highest limb of last addmul 
;  ebp 

	align   16
Lunroll_outer_top: 
    mov     ebp,[-12+OFFSET+esi+edx*4] ;  multiplier 
    mov     [VAR_COUNTER],edx
    mov     eax,[-8+OFFSET+esi+edx*4] ;  first limb of multiplicand 
    mul     ebp

;  The chunks alternate the roles of ecx and ebx,so the two halves of the 
;  first product have to be swapped for every other entry point.  Chunks are 
;  an odd number of bytes long and start on an even address,so the low bit 
;  of the code address tells which kind of chunk is being entered.  The bit 
;  is tested on ecx,one chunk before the actual jump target,and the sense of 
;  the test therefore depends on whether UNROLL_COUNT is odd or even. 
;
;  UNROLL_COUNT is parenthesised here because it may expand to an 
;  unparenthesised expression such as "10-3",in which case "10-3 % 2" would 
;  be evaluated as 10-(3 % 2) and select the wrong condition. 

%if	(UNROLL_COUNT) % 2 == 1
%define	cmovX	cmovz
%else
%define	cmovX	cmovnz
%endif

    test    cl,1
    mov     ebx,edx  ;  high carry 
    lea     edi,[4+edi]                 ;  lea,so the flags from test are kept 
    mov     edx,ecx  ;  jump 
    mov     ecx,eax  ;  low carry 
    lea     edx,[CODE_BYTES_PER_LIMB+edx]   ;  step the entry point on by one chunk 
	cmovX	ecx,ebx                     ;  swap: ecx = high half 
	cmovX	ebx,eax                     ;  swap: ebx = low half 
    mov     [VAR_JMP],edx
    jmp     edx

;  Unrolled addmul chunks,entered by computed jump from Lunroll_outer_top. 
;
;  Must be on an even address here so the low bit of the jump address 
;  will indicate which way around ecx/ebx should start. 
;
;  Each chunk multiplies one further src limb by ebp,adds the pending low 
;  limb into dst and folds the new product into the other carry register. 
;  The explicit "byte" keeps every displacement one byte long so that each 
;  chunk is exactly CODE_BYTES_PER_LIMB bytes,which the jump calculation 
;  depends on. 

;  eax scratch 
;  ebx carry high 
;  ecx carry low 
;  edx scratch 
;  esi src pointer 
;  edi dst pointer 
;  ebp multiplier 
;
;  15 code bytes each limb 
;  ecx/ebx reversed on each chunk 

	align   2
Lunroll_inner_start: 

%assign	i	UNROLL_COUNT
%rep	UNROLL_COUNT
%assign	disp	OFFSET-4*i
	%if	i % 2 == 0
	;  even i: ebx holds the limb to add into dst,ecx collects the next one 
	mov		eax,[byte disp+esi]
	mul		ebp
	add		[byte disp+edi],ebx
	adc		ecx,eax
	mov		ebx,edx
	adc		ebx,0
%else
	;  this one comes out last
	;  odd i: ecx holds the limb to add into dst,ebx collects the next one 
	mov		eax,[byte disp+esi]
	mul		ebp
	add		[byte disp+edi],ecx
	adc		ebx,eax
	mov		ecx,edx
	adc		ecx,0
%endif
%assign	i	i-1		
%endrep

;  The final chunk (i=1) leaves the pending low limb in ebx and the high 
;  limb in ecx.  Add/store them,reload the loop state and go round again. 

Lunroll_inner_end: 
    add     [OFFSET+edi],ebx
    mov     edx,[VAR_COUNTER]
    adc     ecx,0
    mov     [OFFSET+4+edi],ecx
    mov     ecx,[VAR_JMP]
    inc     edx
    jnz     Lunroll_outer_top
;  Undo the pointer bias applied before the loop so that Lcorner sees the 
;  same esi/edi convention as when it is reached directly for size 4. 
%if	OFFSET != 0
    add     esi,OFFSET
    add     edi,OFFSET
%endif

;  Corner of the product triangle: src[size-3]*src[size-2],
;  src[size-3]*src[size-1] and src[size-2]*src[size-1]. 
;
;  eax 
;  ebx 
;  ecx 
;  edx 
;  esi &src[size] 
;  edi &dst[2*size-5] 
;  ebp 

	align   16
Lcorner: 
    mov     eax,[-12+esi]
    mul     dword [-8+esi]      ;  src[size-3] * src[size-2] 
    add     [edi],eax
    mov     eax,[-12+esi]
    mov     ebx,0               ;  mov,not xor: keeps the carry flag for the adc 
    adc     ebx,edx
    mul     dword [-4+esi]      ;  src[size-3] * src[size-1] 
    add     ebx,eax
    mov     eax,[-8+esi]
    adc     edx,0
    add     [4+edi],ebx
    mov     ebx,0               ;  mov,not xor: keeps the carry flag for the adc 
    adc     ebx,edx
    mul     dword [-4+esi]      ;  src[size-2] * src[size-1] 
    mov     ecx,[PARAM_SIZE]
    add     eax,ebx
    adc     edx,0
    mov     [8+edi],eax         ;  these two limbs are stored,not added 
    mov     [12+edi],edx
    mov     edi,[PARAM_DST]

;  Left shift of dst[1..2*size-2],the bit shifted out becomes dst[2*size-1]. 
;  This doubles the sum of cross products. 

    sub     ecx,1			;  size-1 
    xor     eax,eax         ;  ready for the final adc,and clear carry 
    mov     edx,ecx
    mov     esi,[PARAM_SRC]

;  Two limbs are shifted per iteration.  dec leaves the carry flag alone,so 
;  the bit rotated out of one iteration feeds the first rcl of the next. 
;
;  eax 
;  ebx 
;  ecx counter,size-1 to 1 
;  edx size-1 (for later use) 
;  esi src (for later use) 
;  edi dst,incrementing 
;  ebp 

Llshift: 
    rcl     dword [4+edi],1
    rcl     dword [8+edi],1
    lea     edi,[8+edi]
    dec     ecx
    jnz     Llshift
    adc     eax,eax             ;  eax = bit shifted out of the top limb 
    mov     [4+edi],eax			;  dst most significant limb 
    mov     eax,[esi]			;  src[0] 
    lea     esi,[4+esi+edx*4]	;  &src[size] 
    sub     ecx,edx				;  -(size-1) 

;  Now add in the squares on the diagonal,src[0]^2,src[1]^2,...,
;  src[size-1]^2.  dst[0] hasn't been set at all yet,and just gets the 
;  low limb of src[0]^2. 

	mul     eax
    mov     [edi+ecx*8],eax   ;  dst[0] 

;  Each iteration adds the high limb of the previous square and the low limb 
;  of the current one into a pair of dst limbs; edx carries the current high 
;  limb plus carry into the next iteration. 
;
;  eax scratch 
;  ebx scratch 
;  ecx counter,negative 
;  edx carry 
;  esi &src[size] 
;  edi dst[2*size-2] 
;  ebp 

Ldiag: 
    mov     eax,[esi+ecx*4]
    mov     ebx,edx             ;  previous high limb,saved before mul overwrites edx 
    mul     eax
    add     [4+edi+ecx*8],ebx
    adc     [8+edi+ecx*8],eax
    adc     edx,0
    inc     ecx
    jnz     Ldiag
    mov     esi,[SAVE_ESI]
    mov     ebx,[SAVE_EBX]
    add     [4+edi],edx     ;  dst most significant limb 
    mov     edi,[SAVE_EDI]
    mov     ebp,[SAVE_EBP]
    add     esp,frame
    ret

	end
Set native line endings for all .c, .h, as, .asm, .s, .in, .m4, .cc, am 2008-06-25 03:33:36 -04:00
			`; Intel P6 mpn_sqr_basecase -- square an mpn number.`
			`;`
			`; Copyright 1999,2000,2002 Free Software Foundation,Inc.`
			`;`
			`; This file is part of the GNU MP Library.`
			`;`
			`; The GNU MP Library is free software; you can redistribute it and/or`
			`; modify it under the terms of the GNU Lesser General Public License as`
			`; published by the Free Software Foundation; either version 2.1 of the`
			`; License,or (at your option) any later version.`
			`;`
			`; The GNU MP Library is distributed in the hope that it will be useful,`
			`; but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`; Lesser General Public License for more details.`
			`;`
			`; You should have received a copy of the GNU Lesser General Public`
			`; License along with the GNU MP Library; see the file COPYING.LIB. If`
			`; not,write to the Free Software Foundation,Inc.,59 Temple Place -`
			`; Suite 330,Boston,MA 02111-1307,USA.`
			`;`
			`; P6: approx 4.0 cycles per cross product,or 7.75 cycles per triangular`
			`; product (measured on the speed difference between 20 and 40 limbs,`
			`; which is the Karatsuba recursing range).`
			`;`
			`; These are the same as in mpn/x86/k6/sqr_basecase.asm,see that file for`
			`; a description. The only difference here is that UNROLL_COUNT can go up`
			`; to 64 (not 63) making SQR_KARATSUBA_THRESHOLD_MAX 67.`
			`;`
			`; void mpn_sqr_basecase (mp_ptr dst,mp_srcptr src,mp_size_t size);`
			`;`
			`; The algorithm is basically the same as mpn/generic/sqr_basecase.c,but a`
			`; lot of function call overheads are avoided,especially when the given size`
			`; is small.`
			`;`
			`; The code size might look a bit excessive,but not all of it is executed so`
			`; it won't all get into the code cache. The 1x1,2x2 and 3x3 special cases`
			`; clearly apply only to those sizes; mid sizes like 10x10 only need part of`
			`; the unrolled addmul; and big sizes like 40x40 that do use the full`
			`; unrolling will least be making good use of it,because 40x40 will take`
			`; something like 7000 cycles.`

			`%include "..\\x86i.inc"`

			`%define SQR_KARATSUBA_THRESHOLD 10`
			`%define SQR_KARATSUBA_THRESHOLD_MAX 67`

			`%ifdef SQR_KARATSUBA_THRESHOLD_OVERRIDE`
			`%define SQR_KARATSUBA_THRESHOLD SQR_KARATSUBA_THRESHOLD_OVERRIDE`
			`%endif`

			`%define UNROLL_COUNT SQR_KARATSUBA_THRESHOLD-3`

			`FR_def PARAM_SIZE,12`
			`FR_def PARAM_SRC,8`
			`FR_def PARAM_DST,4`

			`section .text`
			`global ___gmpn_sqr_basecase`

			`align 32`

			`___gmpn_sqr_basecase:`
			`mov edx,[PARAM_SIZE]`
			`mov eax,[PARAM_SRC]`
			`cmp edx,2`
			`mov ecx,[PARAM_DST]`
			`je Ltwo_limbs`
			`mov eax,[eax]`
			`ja Lthree_or_more`

			`; one limb only`
			`; eax src limb`
			`; ebx`
			`; ecx dst`
			`; edx`

			`mul eax`
			`mov [ecx],eax`
			`mov [4+ecx],edx`
			`ret`

			`; eax src`
			`; ebx`
			`; ecx dst`
			`; edx`

			`FR_def SAVE_ESI, -4`
			`FR_def SAVE_EBX, -8`
			`FR_def SAVE_EDI,-12`
			`FR_def SAVE_EBP,-16`

			`%define STACK_SPACE 16`
			`%define frame 16`

			`Ltwo_limbs:`
			`sub esp,frame`
			`mov [SAVE_ESI],esi`
			`mov esi,eax`
			`mov eax,[eax]`
			`mul eax ; src[0]^2`
			`mov [ecx],eax ; dst[0]`
			`mov eax,[4+esi]`
			`mov [SAVE_EBX],ebx`
			`mov ebx,edx ; dst[1]`
			`mul eax ; src[1]^2`
			`mov [SAVE_EDI],edi`
			`mov edi,eax ; dst[2]`
			`mov eax,[esi]`
			`mov [SAVE_EBP],ebp`
			`mov ebp,edx ; dst[3]`
			`mul dword [4+esi] ; src[0]*src[1]`
			`add ebx,eax`
			`mov esi,[SAVE_ESI]`
			`adc edi,edx`
			`adc ebp,0`
			`add eax,ebx`
			`mov ebx,[SAVE_EBX]`
			`adc edx,edi`
			`mov edi,[SAVE_EDI]`
			`adc ebp,0`
			`mov [4+ecx],eax`
			`mov [12+ecx],ebp`
			`mov ebp,[SAVE_EBP]`
			`mov [8+ecx],edx`
			`add esp,frame`
			`ret`

			`; eax src low limb`
			`; ebx`
			`; ecx dst`
			`; edx size`

			`%define frame 0`

			`Lthree_or_more:`
			`FR_push esi`
			`cmp edx,4`
			`mov esi,[PARAM_SRC]`
			`jae Lfour_or_more`

			`; three limbs`
			`;`
			`; eax src low limb`
			`; ebx`
			`; ecx dst`
			`; edx`
			`; esi src`
			`; edi`
			`; ebp`

			`%undef SAVE_EBP`
			`%undef SAVE_EDI`
			`%undef SAVE_EBX`

			`FR_push ebp,SAVE_EBP`
			`FR_push edi,SAVE_EDI`
			`mul eax ; src[0] ^ 2`
			`mov [ecx],eax`
			`mov [4+ecx],edx`
			`mov eax,[4+esi]`
			`xor ebp,ebp`
			`mul eax ; src[1] ^ 2`
			`mov [8+ecx],eax`
			`mov [12+ecx],edx`
			`mov eax,[8+esi]`
			`FR_push ebx,SAVE_EBX`
			`mul eax ; src[2] ^ 2`
			`mov [16+ecx],eax`
			`mov [20+ecx],edx`
			`mov eax,[esi]`
			`mul dword [4+esi] ; src[0] * src[1]`
			`mov ebx,eax`
			`mov edi,edx`
			`mov eax,[esi]`
			`mul dword [8+esi] ; src[0] * src[2]`
			`add edi,eax`
			`mov ebp,edx`
			`adc ebp,0`
			`mov eax,[4+esi]`
			`mul dword [8+esi] ; src[1] * src[2]`
			`xor esi,esi`
			`add ebp,eax`

			`; eax`
			`; ebx dst[1]`
			`; ecx dst`
			`; edx dst[4]`
			`; esi zero,will be dst[5]`
			`; edi dst[2]`
			`; ebp dst[3]`

			`adc edx,0`
			`add ebx,ebx`
			`adc edi,edi`
			`adc ebp,ebp`
			`adc edx,edx`
			`mov eax,[4+ecx]`
			`adc esi,0`
			`add eax,ebx`
			`mov [4+ecx],eax`
			`mov eax,[8+ecx]`
			`adc eax,edi`
			`mov ebx,[12+ecx]`
			`adc ebx,ebp`
			`mov edi,[16+ecx]`
			`mov [8+ecx],eax`
			`mov ebp,[SAVE_EBP]`
			`mov [12+ecx],ebx`
			`mov ebx,[SAVE_EBX]`
			`adc edi,edx`
			`mov eax,[20+ecx]`
			`mov [16+ecx],edi`
			`mov edi,[SAVE_EDI]`
			`adc eax,esi ; no carry out of this`
			`mov esi,[SAVE_ESI]`
			`mov [20+ecx],eax`
			`add esp,frame`
			`ret`

			`; eax src low limb`
			`; ebx`
			`; ecx`
			`; edx size`
			`; esi src`
			`; edi`
			`; ebp`

			`%define VAR_COUNTER esp+frame-20`
			`%define VAR_JMP esp+frame-24`
			`%define STACK_SPACE 24`
			`%define frame 4`

			`; First multiply src[0]*src[1..size-1] and store at dst[1..size].`

			`Lfour_or_more:`
			`sub esp,STACK_SPACE-frame`
			`%define frame STACK_SPACE`
			`mov ecx,1`
			`mov [SAVE_EDI],edi`
			`mov edi,[PARAM_DST]`
			`mov [SAVE_EBX],ebx`
			`sub ecx,edx ; -(size-1)`
			`mov [SAVE_EBP],ebp`
			`mov ebx,0 ; initial carry`
			`lea esi,[esi+edx*4] ; &src[size]`
			`mov ebp,eax ; multiplier`
			`lea edi,[-4+edi+edx*4] ; &dst[size-1]`

			`; This loop runs at just over 6 c/l.`
			`;`
			`; eax scratch`
			`; ebx carry`
			`; ecx counter,limbs,negative,-(size-1) to -1`
			`; edx scratch`
			`; esi &src[size]`
			`; edi &dst[size-1]`
			`; ebp multiplier`

			`Lmul_1:`
			`mov eax,ebp`
			`mul dword [esi+ecx*4]`
			`add eax,ebx`
			`mov ebx,0`
			`adc ebx,edx`
			`mov [4+edi+ecx*4],eax`
			`inc ecx`
			`jnz Lmul_1`
			`mov [4+edi],ebx`

			`; Addmul src[n]src[n+1..size-1] at dst[2n-1...],for each n=1..size-2.`
			`;`
			`; The last two addmuls,which are the bottom right corner of the product`
			`; triangle,are left to the end. These are src[size-3]*src[size-2,size-1]`
			`; and src[size-2]*src[size-1]. If size is 4 then it's only these corner`
			`; cases that need to be done.`
			`;`
			`; The unrolled code is the same as mpn_addmul_1(),see that routine for some`
			`; comments.`
			`;`
			`; VAR_COUNTER is the outer loop,running from -(size-4) to -1,inclusive.`
			`;`
			`; VAR_JMP is the computed jump into the unrolled code,stepped by one code`
			`; chunk each outer loop.`
			`;`
			`; This is also hard-coded in the address calculation below.`
			`;`
			`; With &src[size] and &dst[size-1] pointers,the displacements in the`
			`; unrolled code fit in a byte for UNROLL_COUNT values up to 32,but above`
			`; that an offset must be added to them.`
			`;`
			`; eax`
			`; ebx carry`
			`; ecx`
			`; edx`
			`; esi &src[size]`
			`; edi &dst[size-1]`
			`; ebp`

			`%define CODE_BYTES_PER_LIMB 15`

			`%if UNROLL_COUNT > 32`
			`%define OFFSET UNROLL_COUNT-32`
			`%else`
			`%define OFFSET 0`
			`%endif`

			`mov ecx,[PARAM_SIZE]`
			`sub ecx,4`
			`jz Lcorner`
			`mov edx,ecx`
			`neg ecx`
			`shl ecx,4`
			`%if OFFSET != 0`
			`sub esi,OFFSET`
			`%endif`

			`%ifdef PIC`
			`call Lhere`
			`Lhere:`
			`add ecx,[esp]`
			`add ecx,Lunroll_inner_end-Lhere-2*CODE_BYTES_PER_LIMB`
			`add ecx,edx`
			`add esp,4`
			`%else`
			`lea ecx,[Lunroll_inner_end-2*CODE_BYTES_PER_LIMB+ecx+edx]`
			`%endif`
			`neg edx`

			`%if OFFSET != 0`
			`sub edi,OFFSET`
			`%endif`

			`; The calculated jump mustn't be before the start of the available`
			`; code. This is the limit that UNROLL_COUNT puts on the src operand`
			`; size,but checked here using the jump address directly.`

			`%ifdef ASSERT`
			`mov eax,Lunroll_inner_start`
			`cmp ecx,eax`
			`jae Lunroll_outer_top`
			`jmp exit`
			`%endif`

			`; eax`
			`; ebx high limb to store`
			`; ecx VAR_JMP`
			`; edx VAR_COUNTER,limbs,negative`
			`; esi &src[size],constant`
			`; edi dst ptr,second highest limb of last addmul`
			`; ebp`

			`align 16`
			`Lunroll_outer_top:`
			`mov ebp,[-12+OFFSET+esi+edx*4] ; multiplier`
			`mov [VAR_COUNTER],edx`
			`mov eax,[-8+OFFSET+esi+edx*4] ; first limb of multiplicand`
			`mul ebp`

			`%if UNROLL_COUNT % 2 == 1`
			`%define cmovX cmovz`
			`%else`
			`%define cmovX cmovnz`
			`%endif`

			`test cl,1`
			`mov ebx,edx ; high carry`
			`lea edi,[4+edi]`
			`mov edx,ecx ; jump`
			`mov ecx,eax ; low carry`
			`lea edx,[CODE_BYTES_PER_LIMB+edx]`
			`cmovX ecx,ebx`
			`cmovX ebx,eax`
			`mov [VAR_JMP],edx`
			`jmp edx`

			`; Must be on an even address here so the low bit of the jump address`
			`; will indicate which way around ecx/ebx should start.`

			`; eax scratch`
			`; ebx carry high`
			`; ecx carry low`
			`; edx scratch`
			`; esi src pointer`
			`; edi dst pointer`
			`; ebp multiplier`
			`;`
			`; 15 code bytes each limb`
			`; ecx/ebx reversed on each chunk`

			`align 2`
			`Lunroll_inner_start:`

			`%assign i UNROLL_COUNT`
			`%rep UNROLL_COUNT`
			`%assign disp OFFSET-4*i`
			`%if i % 2 == 0`
			`mov eax,[byte disp+esi]`
			`mul ebp`
			`add [byte disp+edi],ebx`
			`adc ecx,eax`
			`mov ebx,edx`
			`adc ebx,0`
			`%else`
			`; this one comes out last`
			`mov eax,[byte disp+esi]`
			`mul ebp`
			`add [byte disp+edi],ecx`
			`adc ebx,eax`
			`mov ecx,edx`
			`adc ecx,0`
			`%endif`
			`%assign i i-1`
			`%endrep`

			`Lunroll_inner_end:`
			`add [OFFSET+edi],ebx`
			`mov edx,[VAR_COUNTER]`
			`adc ecx,0`
			`mov [OFFSET+4+edi],ecx`
			`mov ecx,[VAR_JMP]`
			`inc edx`
			`jnz Lunroll_outer_top`
			`%if OFFSET != 0`
			`add esi,OFFSET`
			`add edi,OFFSET`
			`%endif`

			`; eax`
			`; ebx`
			`; ecx`
			`; edx`
			`; esi &src[size]`
			`; edi &dst[2*size-5]`
			`; ebp`

			`align 16`
			`Lcorner:`
			`mov eax,[-12+esi]`
			`mul dword [-8+esi]`
			`add [edi],eax`
			`mov eax,[-12+esi]`
			`mov ebx,0`
			`adc ebx,edx`
			`mul dword [-4+esi]`
			`add ebx,eax`
			`mov eax,[-8+esi]`
			`adc edx,0`
			`add [4+edi],ebx`
			`mov ebx,0`
			`adc ebx,edx`
			`mul dword [-4+esi]`
			`mov ecx,[PARAM_SIZE]`
			`add eax,ebx`
			`adc edx,0`
			`mov [8+edi],eax`
			`mov [12+edi],edx`
			`mov edi,[PARAM_DST]`

			`; Left shift of dst[1..2size-2],the bit shifted out becomes dst[2size-1].`

			`sub ecx,1 ; size-1`
			`xor eax,eax ; ready for final adcl,and clear carry`
			`mov edx,ecx`
			`mov esi,[PARAM_SRC]`

			`; eax`
			`; ebx`
			`; ecx counter,size-1 to 1`
			`; edx size-1 (for later use)`
			`; esi src (for later use)`
			`; edi dst,incrementing`
			`; ebp`

			`Llshift:`
			`rcl dword [4+edi],1`
			`rcl dword [8+edi],1`
			`lea edi,[8+edi]`
			`dec ecx`
			`jnz Llshift`
			`adc eax,eax`
			`mov [4+edi],eax ; dst most significant limb`
			`mov eax,[esi] ; src[0]`
			`lea esi,[4+esi+edx*4] ; &src[size]`
			`sub ecx,edx ; -(size-1)`

			`; Now add in the squares on the diagonal,src[0]^2,src[1]^2,...,`
			`; src[size-1]^2. dst[0] hasn't yet been set at all yet,and just gets the`
			`; low limb of src[0]^2.`

			`mul eax`
			`mov [edi+ecx*8],eax ; dst[0]`

			`; eax scratch`
			`; ebx scratch`
			`; ecx counter,negative`
			`; edx carry`
			`; esi &src[size]`
			`; edi dst[2*size-2]`
			`; ebp`

			`Ldiag:`
			`mov eax,[esi+ecx*4]`
			`mov ebx,edx`
			`mul eax`
			`add [4+edi+ecx*8],ebx`
			`adc [8+edi+ecx*8],eax`
			`adc edx,0`
			`inc ecx`
			`jnz Ldiag`
			`mov esi,[SAVE_ESI]`
			`mov ebx,[SAVE_EBX]`
			`add [4+edi],edx ; dst most significant limb`
			`mov edi,[SAVE_EDI]`
			`mov ebp,[SAVE_EBP]`
			`add esp,frame`
			`ret`

			`end`