mpir/mpn/x86/pentium4/sse2/divrem_1.asm

dnl  Intel Pentium-4 mpn_divrem_1 -- mpn by limb division.

dnl  Copyright 1999, 2000, 2001, 2002, 2003, 2004 Free Software Foundation,
dnl  Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 2.1 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public
dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
dnl  not, write to the Free Software Foundation, Inc., 51 Franklin Street,
dnl  Fifth Floor, Boston, MA 02110-1301, USA.

include(`../config.m4')


C P4: 32 cycles/limb integer part, 30 cycles/limb fraction part.


C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
C                         mp_srcptr src, mp_size_t size,
C                         mp_limb_t divisor);
C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
C                          mp_srcptr src, mp_size_t size,
C                          mp_limb_t divisor, mp_limb_t carry);
C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
C                                mp_srcptr src, mp_size_t size,
C                                mp_limb_t divisor, mp_limb_t inverse,
C                                unsigned shift);
C
C Algorithm:
C
C The method and nomenclature follow part 8 of "Division by Invariant
C Integers using Multiplication" by Granlund and Montgomery, reference in
C gmp.texi.
C
C "m" is written for what is m' in the paper, and "d" for d_norm, which
C won't cause any confusion since it's only the normalized divisor that's of
C any use in the code.  "b" is written for 2^N, the size of a limb, N being
C 32 here.
C
C The step "sdword dr = n - 2^N*d + (2^N-1-q1) * d" is instead done as
C "n-d - q1*d".  This rearrangement gives the same two-limb answer but lets
C us have just a psubq on the dependent chain.
C
C For reference, the way the k7 code uses "n-(q1+1)*d" would not suit here,
C detecting an overflow of q1+1 when q1=0xFFFFFFFF would cost too much.
C
C Notes:
C
C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high
C limb is less than the divisor.  mpn_divrem_1c doesn't check for a zero
C carry, since in normal circumstances that will be a very rare event.
C
C The test for skipping a division is branch free (once size>=1 is tested).
C The store to the destination high limb is 0 when a divide is skipped, or
C if it's not skipped then a copy of the src high limb is stored.  The
C latter is in case src==dst.
C
C There's a small bias towards expecting xsize==0, by having code for
C xsize==0 in a straight line and xsize!=0 under forward jumps.
C
C Enhancements:
C
C The loop measures 32 cycles, but the dependent chain would suggest it
C could be done with 30.  Not sure where to start looking for the extras.
C
C Alternatives:
C
C If the divisor is normalized (high bit set) then a division step can
C always be skipped, since the high destination limb is always 0 or 1 in
C that case.  It doesn't seem worth checking for this though, since it
C probably occurs infrequently.


dnl  MUL_THRESHOLD is the value of xsize+size at which the multiply by
dnl  inverse method is used, rather than plain "divl"s.  Minimum value 1.
dnl
dnl  The inverse takes about 80-90 cycles to calculate, but after that the
dnl  multiply is 32 c/l versus division at about 58 c/l.
dnl
dnl  At 4 limbs the div is a touch faster than the mul (and of course
dnl  simpler), so start the mul from 5 limbs.

deflit(MUL_THRESHOLD, 5)


defframe(PARAM_PREINV_SHIFT,   28)  dnl mpn_preinv_divrem_1
defframe(PARAM_PREINV_INVERSE, 24)  dnl mpn_preinv_divrem_1
defframe(PARAM_CARRY,  24)          dnl mpn_divrem_1c
defframe(PARAM_DIVISOR,20)
defframe(PARAM_SIZE,   16)
defframe(PARAM_SRC,    12)
defframe(PARAM_XSIZE,  8)
defframe(PARAM_DST,    4)

dnl  re-use parameter space
define(SAVE_ESI,`PARAM_SIZE')
define(SAVE_EBP,`PARAM_SRC')
define(SAVE_EDI,`PARAM_DIVISOR')
define(SAVE_EBX,`PARAM_DST')

	TEXT

	ALIGN(16)
PROLOGUE(mpn_preinv_divrem_1)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	xorl	%edx, %edx		C carry if can't skip a div

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi

	movl	%ebp, SAVE_EBP
	movl	PARAM_DIVISOR, %ebp

	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edi

	movl	-4(%esi,%ecx,4), %eax	C src high limb

	movl	%ebx, SAVE_EBX
	movl	PARAM_XSIZE, %ebx

	movd	PARAM_PREINV_INVERSE, %mm4

	movd	PARAM_PREINV_SHIFT, %mm7  C l
	cmpl	%ebp, %eax		C high cmp divisor

	cmovc(	%eax, %edx)		C high is carry if high<divisor
	movd	%edx, %mm0		C carry

	movd	%edx, %mm1		C carry
	movl	$0, %edx

	movd	%ebp, %mm5		C d
	cmovnc(	%eax, %edx)		C 0 if skip div, src high if not
					C (the latter in case src==dst)
	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]

	movl	%edx, (%edi,%ecx,4)	C dst high limb
	sbbl	$0, %ecx		C skip one division if high<divisor
	movl	$32, %eax

	subl	PARAM_PREINV_SHIFT, %eax
	psllq	%mm7, %mm5		C d normalized
	leal	(%edi,%ecx,4), %edi	C &dst[xsize+size-1]
	leal	-4(%esi,%ecx,4), %esi	C &src[size-1]

	movd	%eax, %mm6		C 32-l
	jmp	L(start_preinv)

EPILOGUE()


	ALIGN(16)
PROLOGUE(mpn_divrem_1c)
deflit(`FRAME',0)

	movl	PARAM_CARRY, %edx

	movl	PARAM_SIZE, %ecx

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi

	movl	%ebp, SAVE_EBP
	movl	PARAM_DIVISOR, %ebp

	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edi

	movl	%ebx, SAVE_EBX
	movl	PARAM_XSIZE, %ebx

	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]
	jmp	L(start_1c)

EPILOGUE()


	ALIGN(16)
PROLOGUE(mpn_divrem_1)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	xorl	%edx, %edx		C initial carry (if can't skip a div)

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi

	movl	%ebp, SAVE_EBP
	movl	PARAM_DIVISOR, %ebp

	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edi

	movl	%ebx, SAVE_EBX
	movl	PARAM_XSIZE, %ebx
	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]

	orl	%ecx, %ecx		C size
	jz	L(no_skip_div)		C if size==0
	movl	-4(%esi,%ecx,4), %eax	C src high limb

	cmpl	%ebp, %eax		C high cmp divisor

	cmovnc(	%eax, %edx)		C 0 if skip div, src high if not
	movl	%edx, (%edi,%ecx,4)	C dst high limb

	movl	$0, %edx
	cmovc(	%eax, %edx)		C high is carry if high<divisor

	sbbl	$0, %ecx		C size-1 if high<divisor
L(no_skip_div):


L(start_1c):
	C eax
	C ebx	xsize
	C ecx	size
	C edx	carry
	C esi	src
	C edi	&dst[xsize-1]
	C ebp	divisor

	leal	(%ebx,%ecx), %eax	C size+xsize
	leal	-4(%esi,%ecx,4), %esi	C &src[size-1]
	leal	(%edi,%ecx,4), %edi	C &dst[size+xsize-1]

	cmpl	$MUL_THRESHOLD, %eax
	jae	L(mul_by_inverse)


	orl	%ecx, %ecx
	jz	L(divide_no_integer)	C if size==0

L(divide_integer):
	C eax	scratch (quotient)
	C ebx	xsize
	C ecx	counter
	C edx	carry
	C esi	src, decrementing
	C edi	dst, decrementing
	C ebp	divisor

	movl	(%esi), %eax
	subl	$4, %esi

	divl	%ebp

	movl	%eax, (%edi)
	subl	$4, %edi

	subl	$1, %ecx
	jnz	L(divide_integer)


L(divide_no_integer):
	orl	%ebx, %ebx
	jnz	L(divide_fraction)	C if xsize!=0

L(divide_done):
	movl	SAVE_ESI, %esi
	movl	SAVE_EDI, %edi
	movl	SAVE_EBX, %ebx
	movl	SAVE_EBP, %ebp
	movl	%edx, %eax
	ret


L(divide_fraction):
	C eax	scratch (quotient)
	C ebx	counter
	C ecx
	C edx	carry
	C esi
	C edi	dst, decrementing
	C ebp	divisor

	movl	$0, %eax

	divl	%ebp

	movl	%eax, (%edi)
	subl	$4, %edi

	subl	$1, %ebx
	jnz	L(divide_fraction)

	jmp	L(divide_done)


C -----------------------------------------------------------------------------

L(mul_by_inverse):
	C eax
	C ebx	xsize
	C ecx	size
	C edx	carry
	C esi	&src[size-1]
	C edi	&dst[size+xsize-1]
	C ebp	divisor

	bsrl	%ebp, %eax		C 31-l
	movd	%edx, %mm0		C carry
	movd	%edx, %mm1		C carry
	movl	%ecx, %edx		C size
	movl	$31, %ecx

	C

	xorl	%eax, %ecx		C l = leading zeros on d
	addl	$1, %eax

	shll	%cl, %ebp		C d normalized
	movd	%ecx, %mm7		C l
	movl	%edx, %ecx		C size

	movd	%eax, %mm6		C 32-l
	movl	$-1, %edx
	movl	$-1, %eax

	C

	subl	%ebp, %edx		C (b-d)-1 so  edx:eax = b*(b-d)-1

	divl	%ebp			C floor (b*(b-d)-1 / d)
	movd	%ebp, %mm5		C d

	C

	movd	%eax, %mm4		C m


L(start_preinv):
	C eax	inverse
	C ebx	xsize
	C ecx	size
	C edx
	C esi	&src[size-1]
	C edi	&dst[size+xsize-1]
	C ebp
	C
	C mm0	carry
	C mm1	carry
	C mm2
	C mm4	m
	C mm5	d
	C mm6	31-l
	C mm7	l

	psllq	%mm7, %mm0		C n2 = carry << l, for size==0

	subl	$1, %ecx
	jb	L(integer_none)

	movd	(%esi), %mm0		C src high limb
	punpckldq %mm1, %mm0
	psrlq	%mm6, %mm0		C n2 = high (carry:srchigh << l)
	jz	L(integer_last)


C The dependent chain here consists of
C
C	2   paddd    n1+n2
C	8   pmuludq  m*(n1+n2)
C	2   paddq    n2:nadj + m*(n1+n2)
C	2   psrlq    q1
C	8   pmuludq  d*q1
C	2   psubq    (n-d)-q1*d
C	2   psrlq    high n-(q1+1)*d mask
C	2   pand     d masked
C	2   paddd    n2+d addback
C	--
C	30
C
C But it seems to run at 32 cycles, so presumably there's something else
C going on.

	ALIGN(16)
L(integer_top):
	C eax
	C ebx
	C ecx	counter, size-1 to 0
	C edx
	C esi	src, decrementing
	C edi	dst, decrementing
	C
	C mm0	n2
	C mm4	m
	C mm5	d
	C mm6	32-l
	C mm7	l

	ASSERT(b,`C n2<d
	 movd	%mm0, %eax
	 movd	%mm5, %edx
	 cmpl	%edx, %eax')

	movd	-4(%esi), %mm1		C next src limbs
	movd	(%esi), %mm2
	leal	-4(%esi), %esi

	punpckldq %mm2, %mm1
	psrlq	%mm6, %mm1		C n10

	movq	%mm1, %mm2		C n10
	movq	%mm1, %mm3		C n10
	psrad	$31, %mm1		C -n1
	pand	%mm5, %mm1		C -n1 & d
	paddd	%mm2, %mm1		C nadj = n10+(-n1&d), ignore overflow

	psrld	$31, %mm2		C n1
	paddd	%mm0, %mm2		C n2+n1
	punpckldq %mm0, %mm1		C n2:nadj

	pmuludq	%mm4, %mm2		C m*(n2+n1)

	C

	paddq	%mm2, %mm1		C n2:nadj + m*(n2+n1)
	pxor	%mm2, %mm2		C break dependency, saves 4 cycles
	pcmpeqd	%mm2, %mm2		C FF...FF
	psrlq	$63, %mm2		C 1

	psrlq	$32, %mm1		C q1 = high(n2:nadj + m*(n2+n1))

	paddd	%mm1, %mm2		C q1+1
	pmuludq	%mm5, %mm1		C q1*d

	punpckldq %mm0, %mm3		C n = n2:n10
	pxor	%mm0, %mm0

	psubq	%mm5, %mm3		C n - d

	C

	psubq	%mm1, %mm3		C n - (q1+1)*d

	por	%mm3, %mm0		C copy remainder -> new n2
	psrlq	$32, %mm3		C high n - (q1+1)*d, 0 or -1

	ASSERT(be,`C 0 or -1
	 movd	%mm3, %eax
	 addl	$1, %eax
	 cmpl	$1, %eax')

	paddd	%mm3, %mm2		C q
	pand	%mm5, %mm3		C mask & d

	paddd	%mm3, %mm0		C addback if necessary
	movd	%mm2, (%edi)
	leal	-4(%edi), %edi

	subl	$1, %ecx
	ja	L(integer_top)


L(integer_last):
	C eax
	C ebx	xsize
	C ecx
	C edx
	C esi	&src[0]
	C edi	&dst[xsize]
	C
	C mm0	n2
	C mm4	m
	C mm5	d
	C mm6
	C mm7	l

	ASSERT(b,`C n2<d
	 movd	%mm0, %eax
	 movd	%mm5, %edx
	 cmpl	%edx, %eax')

	movd	(%esi), %mm1		C src[0]
	psllq	%mm7, %mm1		C n10

	movq	%mm1, %mm2		C n10
	movq	%mm1, %mm3		C n10
	psrad	$31, %mm1		C -n1
	pand	%mm5, %mm1		C -n1 & d
	paddd	%mm2, %mm1		C nadj = n10+(-n1&d), ignore overflow

	psrld	$31, %mm2		C n1
	paddd	%mm0, %mm2		C n2+n1
	punpckldq %mm0, %mm1		C n2:nadj

	pmuludq	%mm4, %mm2		C m*(n2+n1)

	C

	paddq	%mm2, %mm1		C n2:nadj + m*(n2+n1)
	pcmpeqd	%mm2, %mm2		C FF...FF
	psrlq	$63, %mm2		C 1

	psrlq	$32, %mm1		C q1 = high(n2:nadj + m*(n2+n1))
	paddd	%mm1, %mm2		C q1

	pmuludq	%mm5, %mm1		C q1*d
	punpckldq %mm0, %mm3		C n
	psubq	%mm5, %mm3		C n - d
	pxor	%mm0, %mm0

	C

	psubq	%mm1, %mm3		C n - (q1+1)*d

	por	%mm3, %mm0		C remainder -> n2
	psrlq	$32, %mm3		C high n - (q1+1)*d, 0 or -1

	ASSERT(be,`C 0 or -1
	 movd	%mm3, %eax
	 addl	$1, %eax
	 cmpl	$1, %eax')

	paddd	%mm3, %mm2		C q
	pand	%mm5, %mm3		C mask & d

	paddd	%mm3, %mm0		C addback if necessary
	movd	%mm2, (%edi)
	leal	-4(%edi), %edi


L(integer_none):
	C eax
	C ebx	xsize

	orl	%ebx, %ebx
	jnz	L(fraction_some)	C if xsize!=0


L(fraction_done):
	movl	SAVE_EBP, %ebp
	psrld	%mm7, %mm0		C remainder

	movl	SAVE_EDI, %edi
	movd	%mm0, %eax

	movl	SAVE_ESI, %esi
	movl	SAVE_EBX, %ebx
	emms
	ret


C -----------------------------------------------------------------------------
C

L(fraction_some):
	C eax
	C ebx	xsize
	C ecx
	C edx
	C esi
	C edi	&dst[xsize-1]
	C ebp


L(fraction_top):
	C eax
	C ebx	counter, xsize iterations
	C ecx
	C edx
	C esi	src, decrementing
	C edi	dst, decrementing
	C
	C mm0	n2
	C mm4	m
	C mm5	d
	C mm6	32-l
	C mm7	l

	ASSERT(b,`C n2<d
	 movd	%mm0, %eax
	 movd	%mm5, %edx
	 cmpl	%edx, %eax')

	movq	%mm0, %mm1		C n2
	pmuludq	%mm4, %mm0		C m*n2

	pcmpeqd	%mm2, %mm2
	psrlq	$63, %mm2

	C

	psrlq	$32, %mm0		C high(m*n2)

	paddd	%mm1, %mm0		C q1 = high(n2:0 + m*n2)

	paddd	%mm0, %mm2		C q1+1
	pmuludq	%mm5, %mm0		C q1*d

	psllq	$32, %mm1		C n = n2:0
	psubq	%mm5, %mm1		C n - d

	C

	psubq	%mm0, %mm1		C r = n - (q1+1)*d
	pxor	%mm0, %mm0

	por	%mm1, %mm0		C r -> n2
	psrlq	$32, %mm1		C high n - (q1+1)*d, 0 or -1

	ASSERT(be,`C 0 or -1
	 movd	%mm1, %eax
	 addl	$1, %eax
	 cmpl	$1, %eax')

	paddd	%mm1, %mm2		C q
	pand	%mm5, %mm1		C mask & d

	paddd	%mm1, %mm0		C addback if necessary
	movd	%mm2, (%edi)
	leal	-4(%edi), %edi

	subl	$1, %ebx
	jne	L(fraction_top)


	jmp	L(fraction_done)

EPILOGUE()
Basic GMP files with a new core2 directory and amd_64 directory with Martin's and Gaudry's patches. Removed directories for no longer supported architectures. 2008-04-17 17:03:07 -04:00			`dnl Intel Pentium-4 mpn_divrem_1 -- mpn by limb division.`

			`dnl Copyright 1999, 2000, 2001, 2002, 2003, 2004 Free Software Foundation,`
			`dnl Inc.`
			`dnl`
			`dnl This file is part of the GNU MP Library.`
			`dnl`
			`dnl The GNU MP Library is free software; you can redistribute it and/or`
			`dnl modify it under the terms of the GNU Lesser General Public License as`
			`dnl published by the Free Software Foundation; either version 2.1 of the`
			`dnl License, or (at your option) any later version.`
			`dnl`
			`dnl The GNU MP Library is distributed in the hope that it will be useful,`
			`dnl but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`dnl Lesser General Public License for more details.`
			`dnl`
			`dnl You should have received a copy of the GNU Lesser General Public`
			`dnl License along with the GNU MP Library; see the file COPYING.LIB. If`
			`dnl not, write to the Free Software Foundation, Inc., 51 Franklin Street,`
			`dnl Fifth Floor, Boston, MA 02110-1301, USA.`

			include(`../config.m4')


			`C P4: 32 cycles/limb integer part, 30 cycles/limb fraction part.`


			`C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,`
			`C mp_srcptr src, mp_size_t size,`
			`C mp_limb_t divisor);`
			`C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,`
			`C mp_srcptr src, mp_size_t size,`
			`C mp_limb_t divisor, mp_limb_t carry);`
			`C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,`
			`C mp_srcptr src, mp_size_t size,`
			`C mp_limb_t divisor, mp_limb_t inverse,`
			`C unsigned shift);`
			`C`
			`C Algorithm:`
			`C`
			`C The method and nomenclature follow part 8 of "Division by Invariant`
			`C Integers using Multiplication" by Granlund and Montgomery, reference in`
			`C gmp.texi.`
			`C`
			`C "m" is written for what is m' in the paper, and "d" for d_norm, which`
			`C won't cause any confusion since it's only the normalized divisor that's of`
			`C any use in the code. "b" is written for 2^N, the size of a limb, N being`
			`C 32 here.`
			`C`
			`C The step "sdword dr = n - 2^Nd + (2^N-1-q1) d" is instead done as`
			`C "n-d - q1*d". This rearrangement gives the same two-limb answer but lets`
			`C us have just a psubq on the dependent chain.`
			`C`
			`C For reference, the way the k7 code uses "n-(q1+1)*d" would not suit here,`
			`C detecting an overflow of q1+1 when q1=0xFFFFFFFF would cost too much.`
			`C`
			`C Notes:`
			`C`
			`C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high`
			`C limb is less than the divisor. mpn_divrem_1c doesn't check for a zero`
			`C carry, since in normal circumstances that will be a very rare event.`
			`C`
			`C The test for skipping a division is branch free (once size>=1 is tested).`
			`C The store to the destination high limb is 0 when a divide is skipped, or`
			`C if it's not skipped then a copy of the src high limb is stored. The`
			`C latter is in case src==dst.`
			`C`
			`C There's a small bias towards expecting xsize==0, by having code for`
			`C xsize==0 in a straight line and xsize!=0 under forward jumps.`
			`C`
			`C Enhancements:`
			`C`
			`C The loop measures 32 cycles, but the dependent chain would suggest it`
			`C could be done with 30. Not sure where to start looking for the extras.`
			`C`
			`C Alternatives:`
			`C`
			`C If the divisor is normalized (high bit set) then a division step can`
			`C always be skipped, since the high destination limb is always 0 or 1 in`
			`C that case. It doesn't seem worth checking for this though, since it`
			`C probably occurs infrequently.`


			`dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by`
			`dnl inverse method is used, rather than plain "divl"s. Minimum value 1.`
			`dnl`
			`dnl The inverse takes about 80-90 cycles to calculate, but after that the`
			`dnl multiply is 32 c/l versus division at about 58 c/l.`
			`dnl`
			`dnl At 4 limbs the div is a touch faster than the mul (and of course`
			`dnl simpler), so start the mul from 5 limbs.`

			`deflit(MUL_THRESHOLD, 5)`


			`defframe(PARAM_PREINV_SHIFT, 28) dnl mpn_preinv_divrem_1`
			`defframe(PARAM_PREINV_INVERSE, 24) dnl mpn_preinv_divrem_1`
			`defframe(PARAM_CARRY, 24) dnl mpn_divrem_1c`
			`defframe(PARAM_DIVISOR,20)`
			`defframe(PARAM_SIZE, 16)`
			`defframe(PARAM_SRC, 12)`
			`defframe(PARAM_XSIZE, 8)`
			`defframe(PARAM_DST, 4)`

			`dnl re-use parameter space`
			define(SAVE_ESI,`PARAM_SIZE')
			define(SAVE_EBP,`PARAM_SRC')
			define(SAVE_EDI,`PARAM_DIVISOR')
			define(SAVE_EBX,`PARAM_DST')

			`TEXT`

			`ALIGN(16)`
			`PROLOGUE(mpn_preinv_divrem_1)`
			deflit(`FRAME',0)

			`movl PARAM_SIZE, %ecx`
			`xorl %edx, %edx C carry if can't skip a div`

			`movl %esi, SAVE_ESI`
			`movl PARAM_SRC, %esi`

			`movl %ebp, SAVE_EBP`
			`movl PARAM_DIVISOR, %ebp`

			`movl %edi, SAVE_EDI`
			`movl PARAM_DST, %edi`

			`movl -4(%esi,%ecx,4), %eax C src high limb`

			`movl %ebx, SAVE_EBX`
			`movl PARAM_XSIZE, %ebx`

			`movd PARAM_PREINV_INVERSE, %mm4`

			`movd PARAM_PREINV_SHIFT, %mm7 C l`
			`cmpl %ebp, %eax C high cmp divisor`

			`cmovc( %eax, %edx) C high is carry if high<divisor`
			`movd %edx, %mm0 C carry`

			`movd %edx, %mm1 C carry`
			`movl $0, %edx`

			`movd %ebp, %mm5 C d`
			`cmovnc( %eax, %edx) C 0 if skip div, src high if not`
			`C (the latter in case src==dst)`
			`leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]`

			`movl %edx, (%edi,%ecx,4) C dst high limb`
			`sbbl $0, %ecx C skip one division if high<divisor`
			`movl $32, %eax`

			`subl PARAM_PREINV_SHIFT, %eax`
			`psllq %mm7, %mm5 C d normalized`
			`leal (%edi,%ecx,4), %edi C &dst[xsize+size-1]`
			`leal -4(%esi,%ecx,4), %esi C &src[size-1]`

			`movd %eax, %mm6 C 32-l`
			`jmp L(start_preinv)`

			`EPILOGUE()`


			`ALIGN(16)`
			`PROLOGUE(mpn_divrem_1c)`
			deflit(`FRAME',0)

			`movl PARAM_CARRY, %edx`

			`movl PARAM_SIZE, %ecx`

			`movl %esi, SAVE_ESI`
			`movl PARAM_SRC, %esi`

			`movl %ebp, SAVE_EBP`
			`movl PARAM_DIVISOR, %ebp`

			`movl %edi, SAVE_EDI`
			`movl PARAM_DST, %edi`

			`movl %ebx, SAVE_EBX`
			`movl PARAM_XSIZE, %ebx`

			`leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]`
			`jmp L(start_1c)`

			`EPILOGUE()`


			`ALIGN(16)`
			`PROLOGUE(mpn_divrem_1)`
			deflit(`FRAME',0)

			`movl PARAM_SIZE, %ecx`
			`xorl %edx, %edx C initial carry (if can't skip a div)`

			`movl %esi, SAVE_ESI`
			`movl PARAM_SRC, %esi`

			`movl %ebp, SAVE_EBP`
			`movl PARAM_DIVISOR, %ebp`

			`movl %edi, SAVE_EDI`
			`movl PARAM_DST, %edi`

			`movl %ebx, SAVE_EBX`
			`movl PARAM_XSIZE, %ebx`
			`leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]`

			`orl %ecx, %ecx C size`
			`jz L(no_skip_div) C if size==0`
			`movl -4(%esi,%ecx,4), %eax C src high limb`

			`cmpl %ebp, %eax C high cmp divisor`

			`cmovnc( %eax, %edx) C 0 if skip div, src high if not`
			`movl %edx, (%edi,%ecx,4) C dst high limb`

			`movl $0, %edx`
			`cmovc( %eax, %edx) C high is carry if high<divisor`

			`sbbl $0, %ecx C size-1 if high<divisor`
			`L(no_skip_div):`


			`L(start_1c):`
			`C eax`
			`C ebx xsize`
			`C ecx size`
			`C edx carry`
			`C esi src`
			`C edi &dst[xsize-1]`
			`C ebp divisor`

			`leal (%ebx,%ecx), %eax C size+xsize`
			`leal -4(%esi,%ecx,4), %esi C &src[size-1]`
			`leal (%edi,%ecx,4), %edi C &dst[size+xsize-1]`

			`cmpl $MUL_THRESHOLD, %eax`
			`jae L(mul_by_inverse)`


			`orl %ecx, %ecx`
			`jz L(divide_no_integer) C if size==0`

			`L(divide_integer):`
			`C eax scratch (quotient)`
			`C ebx xsize`
			`C ecx counter`
			`C edx carry`
			`C esi src, decrementing`
			`C edi dst, decrementing`
			`C ebp divisor`

			`movl (%esi), %eax`
			`subl $4, %esi`

			`divl %ebp`

			`movl %eax, (%edi)`
			`subl $4, %edi`

			`subl $1, %ecx`
			`jnz L(divide_integer)`


			`L(divide_no_integer):`
			`orl %ebx, %ebx`
			`jnz L(divide_fraction) C if xsize!=0`

			`L(divide_done):`
			`movl SAVE_ESI, %esi`
			`movl SAVE_EDI, %edi`
			`movl SAVE_EBX, %ebx`
			`movl SAVE_EBP, %ebp`
			`movl %edx, %eax`
			`ret`


			`L(divide_fraction):`
			`C eax scratch (quotient)`
			`C ebx counter`
			`C ecx`
			`C edx carry`
			`C esi`
			`C edi dst, decrementing`
			`C ebp divisor`

			`movl $0, %eax`

			`divl %ebp`

			`movl %eax, (%edi)`
			`subl $4, %edi`

			`subl $1, %ebx`
			`jnz L(divide_fraction)`

			`jmp L(divide_done)`



			`C -----------------------------------------------------------------------------`

			`L(mul_by_inverse):`
			`C eax`
			`C ebx xsize`
			`C ecx size`
			`C edx carry`
			`C esi &src[size-1]`
			`C edi &dst[size+xsize-1]`
			`C ebp divisor`

			`bsrl %ebp, %eax C 31-l`
			`movd %edx, %mm0 C carry`
			`movd %edx, %mm1 C carry`
			`movl %ecx, %edx C size`
			`movl $31, %ecx`

			`C`

			`xorl %eax, %ecx C l = leading zeros on d`
			`addl $1, %eax`

			`shll %cl, %ebp C d normalized`
			`movd %ecx, %mm7 C l`
			`movl %edx, %ecx C size`

			`movd %eax, %mm6 C 32-l`
			`movl $-1, %edx`
			`movl $-1, %eax`

			`C`

			`subl %ebp, %edx C (b-d)-1 so edx:eax = b*(b-d)-1`

			`divl %ebp C floor (b*(b-d)-1 / d)`
			`movd %ebp, %mm5 C d`

			`C`

			`movd %eax, %mm4 C m`


			`L(start_preinv):`
			`C eax inverse`
			`C ebx xsize`
			`C ecx size`
			`C edx`
			`C esi &src[size-1]`
			`C edi &dst[size+xsize-1]`
			`C ebp`
			`C`
			`C mm0 carry`
			`C mm1 carry`
			`C mm2`
			`C mm4 m`
			`C mm5 d`
			`C mm6 31-l`
			`C mm7 l`

			`psllq %mm7, %mm0 C n2 = carry << l, for size==0`

			`subl $1, %ecx`
			`jb L(integer_none)`

			`movd (%esi), %mm0 C src high limb`
			`punpckldq %mm1, %mm0`
			`psrlq %mm6, %mm0 C n2 = high (carry:srchigh << l)`
			`jz L(integer_last)`


			`C The dependent chain here consists of`
			`C`
			`C 2 paddd n1+n2`
			`C 8 pmuludq m*(n1+n2)`
			`C 2 paddq n2:nadj + m*(n1+n2)`
			`C 2 psrlq q1`
			`C 8 pmuludq d*q1`
			`C 2 psubq (n-d)-q1*d`
			`C 2 psrlq high n-(q1+1)*d mask`
			`C 2 pand d masked`
			`C 2 paddd n2+d addback`
			`C --`
			`C 30`
			`C`
			`C But it seems to run at 32 cycles, so presumably there's something else`
			`C going on.`

			`ALIGN(16)`
			`L(integer_top):`
			`C eax`
			`C ebx`
			`C ecx counter, size-1 to 0`
			`C edx`
			`C esi src, decrementing`
			`C edi dst, decrementing`
			`C`
			`C mm0 n2`
			`C mm4 m`
			`C mm5 d`
			`C mm6 32-l`
			`C mm7 l`

			ASSERT(b,`C n2<d
			`movd %mm0, %eax`
			`movd %mm5, %edx`
			`cmpl %edx, %eax')`

			`movd -4(%esi), %mm1 C next src limbs`
			`movd (%esi), %mm2`
			`leal -4(%esi), %esi`

			`punpckldq %mm2, %mm1`
			`psrlq %mm6, %mm1 C n10`

			`movq %mm1, %mm2 C n10`
			`movq %mm1, %mm3 C n10`
			`psrad $31, %mm1 C -n1`
			`pand %mm5, %mm1 C -n1 & d`
			`paddd %mm2, %mm1 C nadj = n10+(-n1&d), ignore overflow`

			`psrld $31, %mm2 C n1`
			`paddd %mm0, %mm2 C n2+n1`
			`punpckldq %mm0, %mm1 C n2:nadj`

			`pmuludq %mm4, %mm2 C m*(n2+n1)`

			`C`

			`paddq %mm2, %mm1 C n2:nadj + m*(n2+n1)`
			`pxor %mm2, %mm2 C break dependency, saves 4 cycles`
			`pcmpeqd %mm2, %mm2 C FF...FF`
			`psrlq $63, %mm2 C 1`

			`psrlq $32, %mm1 C q1 = high(n2:nadj + m*(n2+n1))`

			`paddd %mm1, %mm2 C q1+1`
			`pmuludq %mm5, %mm1 C q1*d`

			`punpckldq %mm0, %mm3 C n = n2:n10`
			`pxor %mm0, %mm0`

			`psubq %mm5, %mm3 C n - d`

			`C`

			`psubq %mm1, %mm3 C n - (q1+1)*d`

			`por %mm3, %mm0 C copy remainder -> new n2`
			`psrlq $32, %mm3 C high n - (q1+1)*d, 0 or -1`

			ASSERT(be,`C 0 or -1
			`movd %mm3, %eax`
			`addl $1, %eax`
			`cmpl $1, %eax')`

			`paddd %mm3, %mm2 C q`
			`pand %mm5, %mm3 C mask & d`

			`paddd %mm3, %mm0 C addback if necessary`
			`movd %mm2, (%edi)`
			`leal -4(%edi), %edi`

			`subl $1, %ecx`
			`ja L(integer_top)`


			`L(integer_last):`
			`C eax`
			`C ebx xsize`
			`C ecx`
			`C edx`
			`C esi &src[0]`
			`C edi &dst[xsize]`
			`C`
			`C mm0 n2`
			`C mm4 m`
			`C mm5 d`
			`C mm6`
			`C mm7 l`

			ASSERT(b,`C n2<d
			`movd %mm0, %eax`
			`movd %mm5, %edx`
			`cmpl %edx, %eax')`

			`movd (%esi), %mm1 C src[0]`
			`psllq %mm7, %mm1 C n10`

			`movq %mm1, %mm2 C n10`
			`movq %mm1, %mm3 C n10`
			`psrad $31, %mm1 C -n1`
			`pand %mm5, %mm1 C -n1 & d`
			`paddd %mm2, %mm1 C nadj = n10+(-n1&d), ignore overflow`

			`psrld $31, %mm2 C n1`
			`paddd %mm0, %mm2 C n2+n1`
			`punpckldq %mm0, %mm1 C n2:nadj`

			`pmuludq %mm4, %mm2 C m*(n2+n1)`

			`C`

			`paddq %mm2, %mm1 C n2:nadj + m*(n2+n1)`
			`pcmpeqd %mm2, %mm2 C FF...FF`
			`psrlq $63, %mm2 C 1`

			`psrlq $32, %mm1 C q1 = high(n2:nadj + m*(n2+n1))`
			`paddd %mm1, %mm2 C q1`

			`pmuludq %mm5, %mm1 C q1*d`
			`punpckldq %mm0, %mm3 C n`
			`psubq %mm5, %mm3 C n - d`
			`pxor %mm0, %mm0`

			`C`

			`psubq %mm1, %mm3 C n - (q1+1)*d`

			`por %mm3, %mm0 C remainder -> n2`
			`psrlq $32, %mm3 C high n - (q1+1)*d, 0 or -1`

			ASSERT(be,`C 0 or -1
			`movd %mm3, %eax`
			`addl $1, %eax`
			`cmpl $1, %eax')`

			`paddd %mm3, %mm2 C q`
			`pand %mm5, %mm3 C mask & d`

			`paddd %mm3, %mm0 C addback if necessary`
			`movd %mm2, (%edi)`
			`leal -4(%edi), %edi`


			`L(integer_none):`
			`C eax`
			`C ebx xsize`

			`orl %ebx, %ebx`
			`jnz L(fraction_some) C if xsize!=0`


			`L(fraction_done):`
			`movl SAVE_EBP, %ebp`
			`psrld %mm7, %mm0 C remainder`

			`movl SAVE_EDI, %edi`
			`movd %mm0, %eax`

			`movl SAVE_ESI, %esi`
			`movl SAVE_EBX, %ebx`
			`emms`
			`ret`



			`C -----------------------------------------------------------------------------`
			`C`

			`L(fraction_some):`
			`C eax`
			`C ebx xsize`
			`C ecx`
			`C edx`
			`C esi`
			`C edi &dst[xsize-1]`
			`C ebp`


			`L(fraction_top):`
			`C eax`
			`C ebx counter, xsize iterations`
			`C ecx`
			`C edx`
			`C esi src, decrementing`
			`C edi dst, decrementing`
			`C`
			`C mm0 n2`
			`C mm4 m`
			`C mm5 d`
			`C mm6 32-l`
			`C mm7 l`

			ASSERT(b,`C n2<d
			`movd %mm0, %eax`
			`movd %mm5, %edx`
			`cmpl %edx, %eax')`

			`movq %mm0, %mm1 C n2`
			`pmuludq %mm4, %mm0 C m*n2`

			`pcmpeqd %mm2, %mm2`
			`psrlq $63, %mm2`

			`C`

			`psrlq $32, %mm0 C high(m*n2)`

			`paddd %mm1, %mm0 C q1 = high(n2:0 + m*n2)`

			`paddd %mm0, %mm2 C q1+1`
			`pmuludq %mm5, %mm0 C q1*d`

			`psllq $32, %mm1 C n = n2:0`
			`psubq %mm5, %mm1 C n - d`

			`C`

			`psubq %mm0, %mm1 C r = n - (q1+1)*d`
			`pxor %mm0, %mm0`

			`por %mm1, %mm0 C r -> n2`
			`psrlq $32, %mm1 C high n - (q1+1)*d, 0 or -1`

			ASSERT(be,`C 0 or -1
			`movd %mm1, %eax`
			`addl $1, %eax`
			`cmpl $1, %eax')`

			`paddd %mm1, %mm2 C q`
			`pand %mm5, %mm1 C mask & d`

			`paddd %mm1, %mm0 C addback if necessary`
			`movd %mm2, (%edi)`
			`leal -4(%edi), %edi`

			`subl $1, %ebx`
			`jne L(fraction_top)`


			`jmp L(fraction_done)`

			`EPILOGUE()`