mpir/mpn/x86/pentium/mod_1.asm

dnl  Intel P5 mpn_mod_1 -- mpn by limb remainder.

dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 2.1 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public
dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
dnl  not, write to the Free Software Foundation, Inc., 51 Franklin Street,
dnl  Fifth Floor, Boston, MA 02110-1301, USA.

include(`../config.m4')


C P5: 28.0 cycles/limb


C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
C                       mp_limb_t carry);
C mp_limb_t mpn_preinv_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
C                             mp_limb_t inverse);
C
C This code is not unlike mpn/x86/p6/mod_1.asm, it does the same sort of
C multiply by inverse without on-the-fly shifts.  See that code for some
C general comments.
C
C Alternatives:
C
C P5 shldl is 4 cycles, so shifting on the fly would be at least 5 cycles
C slower, probably more depending what it did to register usage.  Using MMX
C on P55 would be better, but still at least 4 or 5 instructions and so 2 or
C 3 cycles.


dnl  These thresholds are the sizes where the multiply by inverse method is
dnl  used, rather than plain "divl"s.  Minimum value 2.
dnl
dnl  MUL_NORM_THRESHOLD is for an already normalized divisor (high bit set),
dnl  MUL_UNNORM_THRESHOLD for an unnormalized divisor.
dnl
dnl  With the divl loop at 44 c/l and the inverse at 28 c/l with about 70
dnl  cycles to setup, the threshold should be about ceil(70/16)==5, which is
dnl  what happens in practice.
dnl
dnl  An unnormalized divisor gets an extra 40 cycles at the end for the
dnl  final (r*2^n)%(d*2^n) and shift.  This increases the threshold by about
dnl  40/16=3.
dnl
dnl  PIC adds between 4 and 7 cycles (not sure why it varies), but this
dnl  doesn't change the thresholds.
dnl
dnl  The entry sequence code that chooses between MUL_NORM_THRESHOLD and
dnl  MUL_UNNORM_THRESHOLD is a bit horrible, but it adds only 2 cycles
dnl  (branch free) and ensures the choice between div or mul is optimal.

deflit(MUL_NORM_THRESHOLD,   ifdef(`PIC',5,5))
deflit(MUL_UNNORM_THRESHOLD, ifdef(`PIC',8,8))

deflit(MUL_NORM_DELTA, eval(MUL_NORM_THRESHOLD - MUL_UNNORM_THRESHOLD))


defframe(PARAM_INVERSE, 16)   dnl mpn_preinv_mod_1
defframe(PARAM_CARRY,   16)   dnl mpn_mod_1c
defframe(PARAM_DIVISOR, 12)
defframe(PARAM_SIZE,     8)
defframe(PARAM_SRC,      4)

dnl  re-using parameter space
define(VAR_NORM,    `PARAM_DIVISOR')
define(VAR_INVERSE, `PARAM_SIZE')

	TEXT

	ALIGN(8)
PROLOGUE(mpn_preinv_mod_1)
deflit(`FRAME',0)

	pushl	%ebp	FRAME_pushl()
	pushl	%esi	FRAME_pushl()

	movl	PARAM_SRC, %esi
	movl	PARAM_SIZE, %edx

	pushl	%edi	FRAME_pushl()
	pushl	%ebx	FRAME_pushl()

	movl	PARAM_DIVISOR, %ebp
	movl	PARAM_INVERSE, %eax

	movl	-4(%esi,%edx,4), %edi	C src high limb
	leal	-8(%esi,%edx,4), %esi	C &src[size-2]

	movl	$0, VAR_NORM
	decl	%edx

	jnz	L(start_preinv)

	subl	%ebp, %edi		C src-divisor
	popl	%ebx

	sbbl	%ecx, %ecx		C -1 if underflow
	movl	%edi, %eax		C src-divisor

	andl	%ebp, %ecx		C d if underflow
	popl	%edi

	addl	%ecx, %eax		C remainder, with possible addback
	popl	%esi

	popl	%ebp

	ret

EPILOGUE()


	ALIGN(8)
PROLOGUE(mpn_mod_1c)
deflit(`FRAME',0)

	movl	PARAM_DIVISOR, %eax
	movl	PARAM_SIZE, %ecx

	sarl	$31, %eax			C d highbit
	movl	PARAM_CARRY, %edx

	orl	%ecx, %ecx
	jz	L(done_edx)			C result==carry if size==0

	andl	$MUL_NORM_DELTA, %eax
	pushl	%ebp		FRAME_pushl()

	addl	$MUL_UNNORM_THRESHOLD, %eax	C norm or unnorm thresh
	pushl	%esi		FRAME_pushl()

	movl	PARAM_SRC, %esi
	movl	PARAM_DIVISOR, %ebp

	cmpl	%eax, %ecx
	jb	L(divide_top)

	movl	%edx, %eax		C carry as pretend src high limb
	leal	1(%ecx), %edx		C size+1

	cmpl	$0x1000000, %ebp
	jmp	L(mul_by_inverse_1c)

EPILOGUE()


	ALIGN(8)
PROLOGUE(mpn_mod_1)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	pushl	%ebp		FRAME_pushl()

	orl	%ecx, %ecx
	jz	L(done_zero)

	movl	PARAM_SRC, %eax
	movl	PARAM_DIVISOR, %ebp

	sarl	$31, %ebp		C -1 if divisor normalized
	movl	-4(%eax,%ecx,4), %eax	C src high limb

	movl	PARAM_DIVISOR, %edx
	pushl	%esi		FRAME_pushl()

	andl	$MUL_NORM_DELTA, %ebp
	cmpl	%edx, %eax		C carry flag if high<divisor

	sbbl	%edx, %edx		C -1 if high<divisor
	addl	$MUL_UNNORM_THRESHOLD, %ebp C norm or unnorm thresh

	addl	%edx, %ecx		C size-1 if high<divisor
	jz	L(done_eax)

	cmpl	%ebp, %ecx
	movl	PARAM_DIVISOR, %ebp

	movl	PARAM_SRC, %esi
	jae	L(mul_by_inverse)

	andl	%eax, %edx		C high as initial carry if high<divisor


L(divide_top):
	C eax	scratch (quotient)
	C ebx
	C ecx	counter, limbs, decrementing
	C edx	scratch (remainder)
	C esi	src
	C edi
	C ebp	divisor

	movl	-4(%esi,%ecx,4), %eax

	divl	%ebp

	decl	%ecx
	jnz	L(divide_top)


	popl	%esi
	popl	%ebp

L(done_edx):
	movl	%edx, %eax

	ret


L(done_zero):
	xorl	%eax, %eax
	popl	%ebp

	ret


C -----------------------------------------------------------------------------
C
C The divisor is normalized using the same code as the pentium
C count_leading_zeros in longlong.h.  Going through the GOT for PIC costs a
C couple of cycles, but is more or less unavoidable.


	ALIGN(8)
L(mul_by_inverse):
	C eax	src high limb
	C ebx
	C ecx	size or size-1
	C edx
	C esi	src
	C edi
	C ebp	divisor

	movl	PARAM_SIZE, %edx
	cmpl	$0x1000000, %ebp

L(mul_by_inverse_1c):
	sbbl	%ecx, %ecx
	cmpl	$0x10000, %ebp

	sbbl	$0, %ecx
	cmpl	$0x100, %ebp

	sbbl	$0, %ecx
	pushl	%edi		FRAME_pushl()

	pushl	%ebx		FRAME_pushl()
	movl	%ebp, %ebx		C d

ifdef(`PIC',`
	call	L(here)
L(here):
	popl	%edi
	leal	25(,%ecx,8), %ecx	C 0,-1,-2,-3 -> 25,17,9,1

	shrl	%cl, %ebx
	addl	$_GLOBAL_OFFSET_TABLE_+[.-L(here)], %edi

	C AGI
	movl	__clz_tab@GOT(%edi), %edi
	addl	$-34, %ecx

	C AGI
	movb	(%ebx,%edi), %bl

',`
	leal	25(,%ecx,8), %ecx	C 0,-1,-2,-3 -> 25,17,9,1

	shrl	%cl, %ebx
	addl	$-34, %ecx

	C AGI
	movb	__clz_tab(%ebx), %bl
')
	movl	%eax, %edi		C carry -> n1

	addl	%ebx, %ecx		C -34 + c + __clz_tab[d>>c] = -clz-1
	leal	-8(%esi,%edx,4), %esi	C &src[size-2]

	xorl	$-1, %ecx		C clz
	movl	$-1, %edx

	ASSERT(e,`pushl	%eax		C clz calculation same as bsrl
		bsrl	%ebp, %eax
		xorl	$31, %eax
		cmpl	%eax, %ecx
		popl	%eax')

	shll	%cl, %ebp		C d normalized
	movl	%ecx, VAR_NORM

	subl	%ebp, %edx		C (b-d)-1, so edx:eax = b*(b-d)-1
	movl	$-1, %eax

	divl	%ebp			C floor (b*(b-d)-1) / d

L(start_preinv):
	movl	%eax, VAR_INVERSE
	movl	%ebp, %eax		C d

	movl	%ecx, %edx		C fake high, will cancel


C For mpn_mod_1 and mpn_preinv_mod_1, the initial carry in %edi is the src
C high limb, and this may be greater than the divisor and may need one copy
C of the divisor subtracted (only one, because the divisor is normalized).
C This is accomplished by having the initial ecx:edi act as a fake previous
C n2:n10.  The initial edx:eax is d, acting as a fake (q1+1)*d which is
C subtracted from ecx:edi, with the usual addback if it produces an
C underflow.


L(inverse_top):
	C eax	scratch (n10, n1, q1, etc)
	C ebx	scratch (nadj, src limit)
	C ecx	old n2
	C edx	scratch
	C esi	src pointer, &src[size-2] to &src[0]
	C edi	old n10
	C ebp	d

	subl	%eax, %edi	   C low  n - (q1+1)*d
	movl	(%esi), %eax	   C new n10

	sbbl	%edx, %ecx	   C high n - (q1+1)*d, 0 or -1
	movl	%ebp, %ebx	   C d

	sarl	$31, %eax	   C -n1
	andl	%ebp, %ecx	   C d if underflow

	addl	%edi, %ecx	   C remainder -> n2, and possible addback
	ASSERT(b,`cmpl %ebp, %ecx')
	andl	%eax, %ebx	   C -n1 & d

	movl	(%esi), %edi	   C n10
	andl	$1, %eax	   C n1

	addl	%ecx, %eax         C n2+n1
	addl	%edi, %ebx         C nadj = n10 + (-n1 & d), ignoring overflow

	mull	VAR_INVERSE        C m*(n2+n1)

	addl	%eax, %ebx         C low(m*(n2+n1) + nadj), giving carry flag
	leal	1(%ecx), %eax      C 1+n2

	adcl	%edx, %eax         C 1 + high[n2<<32 + m*(n2+n1) + nadj] = q1+1
	movl	PARAM_SRC, %ebx

	sbbl	$0, %eax	   C use q1 if q1+1 overflows
	subl	$4, %esi	   C step src ptr

	mull	%ebp		   C (q1+1)*d

	cmpl	%ebx, %esi
	jae	L(inverse_top)


	C %edi (after subtract and addback) is the remainder modulo d*2^n
	C and must be reduced to 0<=r<d by calculating r*2^n mod d*2^n and
	C right shifting by n.
	C
	C If d was already normalized on entry so that n==0 then nothing is
	C needed here.  This is always the case for preinv_mod_1.  For mod_1
	C or mod_1c the chance of n==0 is low, but about 40 cycles can be
	C saved.

	subl	%eax, %edi	   C low  n - (q1+1)*d
	movl	%ecx, %ebx	   C n2

	sbbl	%edx, %ebx	   C high n - (q1+1)*d, 0 or -1
	xorl	%esi, %esi	   C next n2

	andl	%ebp, %ebx	   C d if underflow
	movl	VAR_NORM, %ecx

	addl	%ebx, %edi	   C remainder, with possible addback
	orl	%ecx, %ecx

	jz	L(done_mul_edi)


	C Here using %esi=n2 and %edi=n10, unlike the above

	shldl(	%cl, %edi, %esi)   C n2

	shll	%cl, %edi	   C n10

	movl	%edi, %eax	   C n10
	movl	%edi, %ebx	   C n10

	sarl	$31, %ebx          C -n1

	shrl	$31, %eax          C n1
	andl	%ebp, %ebx         C -n1 & d

	addl	%esi, %eax	   C n2+n1
	addl	%edi, %ebx         C nadj = n10 + (-n1 & d), ignoring overflow

	mull	VAR_INVERSE        C m*(n2+n1)

	addl	%eax, %ebx         C m*(n2+n1) + nadj, low giving carry flag
	leal	1(%esi), %eax      C 1+n2

	adcl	%edx, %eax         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1

	sbbl	$0, %eax	   C use q1 if q1+1 overflows

	mull	%ebp		   C (q1+1)*d

	subl	%eax, %edi	   C low  n - (q1+1)*d
	popl	%ebx

	sbbl	%edx, %esi	   C high n - (q1+1)*d, 0 or -1
	movl	%edi, %eax

	andl	%ebp, %esi	   C d if underflow
	popl	%edi

	addl	%esi, %eax	   C addback if underflow
	popl	%esi

	shrl	%cl, %eax	   C denorm remainder
	popl	%ebp

	ret


L(done_mul_edi):
	movl	%edi, %eax
	popl	%ebx

	popl	%edi
L(done_eax):
	popl	%esi

	popl	%ebp

	ret

EPILOGUE()
Basic GMP files with a new core2 directory and amd_64 directory with Martin's and Gaudry's patches. Removed directories for no longer supported architectures. 2008-04-17 17:03:07 -04:00			`dnl Intel P5 mpn_mod_1 -- mpn by limb remainder.`

			`dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.`
			`dnl`
			`dnl This file is part of the GNU MP Library.`
			`dnl`
			`dnl The GNU MP Library is free software; you can redistribute it and/or`
			`dnl modify it under the terms of the GNU Lesser General Public License as`
			`dnl published by the Free Software Foundation; either version 2.1 of the`
			`dnl License, or (at your option) any later version.`
			`dnl`
			`dnl The GNU MP Library is distributed in the hope that it will be useful,`
			`dnl but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`dnl Lesser General Public License for more details.`
			`dnl`
			`dnl You should have received a copy of the GNU Lesser General Public`
			`dnl License along with the GNU MP Library; see the file COPYING.LIB. If`
			`dnl not, write to the Free Software Foundation, Inc., 51 Franklin Street,`
			`dnl Fifth Floor, Boston, MA 02110-1301, USA.`

			include(`../config.m4')


			`C P5: 28.0 cycles/limb`


			`C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);`
			`C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,`
			`C mp_limb_t carry);`
			`C mp_limb_t mpn_preinv_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor,`
			`C mp_limb_t inverse);`
			`C`
			`C This code is not unlike mpn/x86/p6/mod_1.asm, it does the same sort of`
			`C multiply by inverse without on-the-fly shifts. See that code for some`
			`C general comments.`
			`C`
			`C Alternatives:`
			`C`
			`C P5 shldl is 4 cycles, so shifting on the fly would be at least 5 cycles`
			`C slower, probably more depending what it did to register usage. Using MMX`
			`C on P55 would be better, but still at least 4 or 5 instructions and so 2 or`
			`C 3 cycles.`


			`dnl These thresholds are the sizes where the multiply by inverse method is`
			`dnl used, rather than plain "divl"s. Minimum value 2.`
			`dnl`
			`dnl MUL_NORM_THRESHOLD is for an already normalized divisor (high bit set),`
			`dnl MUL_UNNORM_THRESHOLD for an unnormalized divisor.`
			`dnl`
			`dnl With the divl loop at 44 c/l and the inverse at 28 c/l with about 70`
			`dnl cycles to setup, the threshold should be about ceil(70/16)==5, which is`
			`dnl what happens in practice.`
			`dnl`
			`dnl An unnormalized divisor gets an extra 40 cycles at the end for the`
			`dnl final (r2^n)%(d2^n) and shift. This increases the threshold by about`
			`dnl 40/16=3.`
			`dnl`
			`dnl PIC adds between 4 and 7 cycles (not sure why it varies), but this`
			`dnl doesn't change the thresholds.`
			`dnl`
			`dnl The entry sequence code that chooses between MUL_NORM_THRESHOLD and`
			`dnl MUL_UNNORM_THRESHOLD is a bit horrible, but it adds only 2 cycles`
			`dnl (branch free) and ensures the choice between div or mul is optimal.`

			deflit(MUL_NORM_THRESHOLD, ifdef(`PIC',5,5))
			deflit(MUL_UNNORM_THRESHOLD, ifdef(`PIC',8,8))

			`deflit(MUL_NORM_DELTA, eval(MUL_NORM_THRESHOLD - MUL_UNNORM_THRESHOLD))`


			`defframe(PARAM_INVERSE, 16) dnl mpn_preinv_mod_1`
			`defframe(PARAM_CARRY, 16) dnl mpn_mod_1c`
			`defframe(PARAM_DIVISOR, 12)`
			`defframe(PARAM_SIZE, 8)`
			`defframe(PARAM_SRC, 4)`

			`dnl re-using parameter space`
			define(VAR_NORM, `PARAM_DIVISOR')
			define(VAR_INVERSE, `PARAM_SIZE')

			`TEXT`

			`ALIGN(8)`
			`PROLOGUE(mpn_preinv_mod_1)`
			deflit(`FRAME',0)

			`pushl %ebp FRAME_pushl()`
			`pushl %esi FRAME_pushl()`

			`movl PARAM_SRC, %esi`
			`movl PARAM_SIZE, %edx`

			`pushl %edi FRAME_pushl()`
			`pushl %ebx FRAME_pushl()`

			`movl PARAM_DIVISOR, %ebp`
			`movl PARAM_INVERSE, %eax`

			`movl -4(%esi,%edx,4), %edi C src high limb`
			`leal -8(%esi,%edx,4), %esi C &src[size-2]`

			`movl $0, VAR_NORM`
			`decl %edx`

			`jnz L(start_preinv)`

			`subl %ebp, %edi C src-divisor`
			`popl %ebx`

			`sbbl %ecx, %ecx C -1 if underflow`
			`movl %edi, %eax C src-divisor`

			`andl %ebp, %ecx C d if underflow`
			`popl %edi`

			`addl %ecx, %eax C remainder, with possible addback`
			`popl %esi`

			`popl %ebp`

			`ret`

			`EPILOGUE()`


			`ALIGN(8)`
			`PROLOGUE(mpn_mod_1c)`
			deflit(`FRAME',0)

			`movl PARAM_DIVISOR, %eax`
			`movl PARAM_SIZE, %ecx`

			`sarl $31, %eax C d highbit`
			`movl PARAM_CARRY, %edx`

			`orl %ecx, %ecx`
			`jz L(done_edx) C result==carry if size==0`

			`andl $MUL_NORM_DELTA, %eax`
			`pushl %ebp FRAME_pushl()`

			`addl $MUL_UNNORM_THRESHOLD, %eax C norm or unnorm thresh`
			`pushl %esi FRAME_pushl()`

			`movl PARAM_SRC, %esi`
			`movl PARAM_DIVISOR, %ebp`

			`cmpl %eax, %ecx`
			`jb L(divide_top)`

			`movl %edx, %eax C carry as pretend src high limb`
			`leal 1(%ecx), %edx C size+1`

			`cmpl $0x1000000, %ebp`
			`jmp L(mul_by_inverse_1c)`

			`EPILOGUE()`


			`ALIGN(8)`
			`PROLOGUE(mpn_mod_1)`
			deflit(`FRAME',0)

			`movl PARAM_SIZE, %ecx`
			`pushl %ebp FRAME_pushl()`

			`orl %ecx, %ecx`
			`jz L(done_zero)`

			`movl PARAM_SRC, %eax`
			`movl PARAM_DIVISOR, %ebp`

			`sarl $31, %ebp C -1 if divisor normalized`
			`movl -4(%eax,%ecx,4), %eax C src high limb`

			`movl PARAM_DIVISOR, %edx`
			`pushl %esi FRAME_pushl()`

			`andl $MUL_NORM_DELTA, %ebp`
			`cmpl %edx, %eax C carry flag if high<divisor`

			`sbbl %edx, %edx C -1 if high<divisor`
			`addl $MUL_UNNORM_THRESHOLD, %ebp C norm or unnorm thresh`

			`addl %edx, %ecx C size-1 if high<divisor`
			`jz L(done_eax)`

			`cmpl %ebp, %ecx`
			`movl PARAM_DIVISOR, %ebp`

			`movl PARAM_SRC, %esi`
			`jae L(mul_by_inverse)`

			`andl %eax, %edx C high as initial carry if high<divisor`


			`L(divide_top):`
			`C eax scratch (quotient)`
			`C ebx`
			`C ecx counter, limbs, decrementing`
			`C edx scratch (remainder)`
			`C esi src`
			`C edi`
			`C ebp divisor`

			`movl -4(%esi,%ecx,4), %eax`

			`divl %ebp`

			`decl %ecx`
			`jnz L(divide_top)`


			`popl %esi`
			`popl %ebp`

			`L(done_edx):`
			`movl %edx, %eax`

			`ret`


			`L(done_zero):`
			`xorl %eax, %eax`
			`popl %ebp`

			`ret`


			`C -----------------------------------------------------------------------------`
			`C`
			`C The divisor is normalized using the same code as the pentium`
			`C count_leading_zeros in longlong.h. Going through the GOT for PIC costs a`
			`C couple of cycles, but is more or less unavoidable.`


			`ALIGN(8)`
			`L(mul_by_inverse):`
			`C eax src high limb`
			`C ebx`
			`C ecx size or size-1`
			`C edx`
			`C esi src`
			`C edi`
			`C ebp divisor`

			`movl PARAM_SIZE, %edx`
			`cmpl $0x1000000, %ebp`

			`L(mul_by_inverse_1c):`
			`sbbl %ecx, %ecx`
			`cmpl $0x10000, %ebp`

			`sbbl $0, %ecx`
			`cmpl $0x100, %ebp`

			`sbbl $0, %ecx`
			`pushl %edi FRAME_pushl()`

			`pushl %ebx FRAME_pushl()`
			`movl %ebp, %ebx C d`

			ifdef(`PIC',`
			`call L(here)`
			`L(here):`
			`popl %edi`
			`leal 25(,%ecx,8), %ecx C 0,-1,-2,-3 -> 25,17,9,1`

			`shrl %cl, %ebx`
			`addl $_GLOBAL_OFFSET_TABLE_+[.-L(here)], %edi`

			`C AGI`
			`movl __clz_tab@GOT(%edi), %edi`
			`addl $-34, %ecx`

			`C AGI`
			`movb (%ebx,%edi), %bl`

			',`
			`leal 25(,%ecx,8), %ecx C 0,-1,-2,-3 -> 25,17,9,1`

			`shrl %cl, %ebx`
			`addl $-34, %ecx`

			`C AGI`
			`movb __clz_tab(%ebx), %bl`
			`')`
			`movl %eax, %edi C carry -> n1`

			`addl %ebx, %ecx C -34 + c + __clz_tab[d>>c] = -clz-1`
			`leal -8(%esi,%edx,4), %esi C &src[size-2]`

			`xorl $-1, %ecx C clz`
			`movl $-1, %edx`

			ASSERT(e,`pushl %eax C clz calculation same as bsrl
			`bsrl %ebp, %eax`
			`xorl $31, %eax`
			`cmpl %eax, %ecx`
			`popl %eax')`

			`shll %cl, %ebp C d normalized`
			`movl %ecx, VAR_NORM`

			`subl %ebp, %edx C (b-d)-1, so edx:eax = b*(b-d)-1`
			`movl $-1, %eax`

			`divl %ebp C floor (b*(b-d)-1) / d`

			`L(start_preinv):`
			`movl %eax, VAR_INVERSE`
			`movl %ebp, %eax C d`

			`movl %ecx, %edx C fake high, will cancel`


			`C For mpn_mod_1 and mpn_preinv_mod_1, the initial carry in %edi is the src`
			`C high limb, and this may be greater than the divisor and may need one copy`
			`C of the divisor subtracted (only one, because the divisor is normalized).`
			`C This is accomplished by having the initial ecx:edi act as a fake previous`
			`C n2:n10. The initial edx:eax is d, acting as a fake (q1+1)*d which is`
			`C subtracted from ecx:edi, with the usual addback if it produces an`
			`C underflow.`


			`L(inverse_top):`
			`C eax scratch (n10, n1, q1, etc)`
			`C ebx scratch (nadj, src limit)`
			`C ecx old n2`
			`C edx scratch`
			`C esi src pointer, &src[size-2] to &src[0]`
			`C edi old n10`
			`C ebp d`

			`subl %eax, %edi C low n - (q1+1)*d`
			`movl (%esi), %eax C new n10`

			`sbbl %edx, %ecx C high n - (q1+1)*d, 0 or -1`
			`movl %ebp, %ebx C d`

			`sarl $31, %eax C -n1`
			`andl %ebp, %ecx C d if underflow`

			`addl %edi, %ecx C remainder -> n2, and possible addback`
			ASSERT(b,`cmpl %ebp, %ecx')
			`andl %eax, %ebx C -n1 & d`

			`movl (%esi), %edi C n10`
			`andl $1, %eax C n1`

			`addl %ecx, %eax C n2+n1`
			`addl %edi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow`

			`mull VAR_INVERSE C m*(n2+n1)`

			`addl %eax, %ebx C low(m*(n2+n1) + nadj), giving carry flag`
			`leal 1(%ecx), %eax C 1+n2`

			`adcl %edx, %eax C 1 + high[n2<<32 + m*(n2+n1) + nadj] = q1+1`
			`movl PARAM_SRC, %ebx`

			`sbbl $0, %eax C use q1 if q1+1 overflows`
			`subl $4, %esi C step src ptr`

			`mull %ebp C (q1+1)*d`

			`cmpl %ebx, %esi`
			`jae L(inverse_top)`



			`C %edi (after subtract and addback) is the remainder modulo d*2^n`
			`C and must be reduced to 0<=r<d by calculating r2^n mod d2^n and`
			`C right shifting by n.`
			`C`
			`C If d was already normalized on entry so that n==0 then nothing is`
			`C needed here. This is always the case for preinv_mod_1. For mod_1`
			`C or mod_1c the chance of n==0 is low, but about 40 cycles can be`
			`C saved.`

			`subl %eax, %edi C low n - (q1+1)*d`
			`movl %ecx, %ebx C n2`

			`sbbl %edx, %ebx C high n - (q1+1)*d, 0 or -1`
			`xorl %esi, %esi C next n2`

			`andl %ebp, %ebx C d if underflow`
			`movl VAR_NORM, %ecx`

			`addl %ebx, %edi C remainder, with possible addback`
			`orl %ecx, %ecx`

			`jz L(done_mul_edi)`


			`C Here using %esi=n2 and %edi=n10, unlike the above`

			`shldl( %cl, %edi, %esi) C n2`

			`shll %cl, %edi C n10`

			`movl %edi, %eax C n10`
			`movl %edi, %ebx C n10`

			`sarl $31, %ebx C -n1`

			`shrl $31, %eax C n1`
			`andl %ebp, %ebx C -n1 & d`

			`addl %esi, %eax C n2+n1`
			`addl %edi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow`

			`mull VAR_INVERSE C m*(n2+n1)`

			`addl %eax, %ebx C m*(n2+n1) + nadj, low giving carry flag`
			`leal 1(%esi), %eax C 1+n2`

			`adcl %edx, %eax C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1`

			`sbbl $0, %eax C use q1 if q1+1 overflows`

			`mull %ebp C (q1+1)*d`

			`subl %eax, %edi C low n - (q1+1)*d`
			`popl %ebx`

			`sbbl %edx, %esi C high n - (q1+1)*d, 0 or -1`
			`movl %edi, %eax`

			`andl %ebp, %esi C d if underflow`
			`popl %edi`

			`addl %esi, %eax C addback if underflow`
			`popl %esi`

			`shrl %cl, %eax C denorm remainder`
			`popl %ebp`

			`ret`


			`L(done_mul_edi):`
			`movl %edi, %eax`
			`popl %ebx`

			`popl %edi`
			`L(done_eax):`
			`popl %esi`

			`popl %ebp`

			`ret`

			`EPILOGUE()`