dnl  Intel Pentium-4 mpn_mod_1 -- mpn by limb remainder.

dnl  Copyright 2001, 2002, 2003 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 2.1 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public
dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
dnl  not, write to the Free Software Foundation, Inc., 51 Franklin Street,
dnl  Fifth Floor, Boston, MA 02110-1301, USA.

include(`../config.m4')


dnl  P4: 31 cycles/limb.


C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
C                       mp_limb_t carry);
C mp_limb_t mpn_preinv_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
C                             mp_limb_t inverse);
C
C An idea was tried in the mul-by-inverse to process the last limb by a jump
C back to the top of the loop skipping the -4(%esi) fetch.  But that seemed
C to produce slightly strange timings, like 9 and 10 limb operations about
C the same speed.  The jump would be successively taken and not-taken, which
C in theory should predict ok, but perhaps isn't enjoyed by the chip.
C Duplicating the loop for the last limb seems to be a couple of cycles
C quicker too.
C
C Enhancements:
C
C The loop measures 31 cycles, but the dependent chain would suggest it
C could be done with 30.  Not sure where to start looking for the extra
C cycle.


dnl  MUL_THRESHOLD is the size at which the multiply by inverse method is
dnl  used, rather than plain "divl"s.  Minimum value 2.
dnl
dnl  The inverse takes about 80-90 cycles to calculate, but after that the
dnl  multiply is 31 c/l versus division at about 58 c/l.

deflit(MUL_THRESHOLD, 5)


defframe(PARAM_INVERSE,16)  dnl mpn_preinv_mod_1
defframe(PARAM_CARRY,  16)  dnl mpn_mod_1c
defframe(PARAM_DIVISOR,12)
defframe(PARAM_SIZE,    8)
defframe(PARAM_SRC,     4)

dnl  re-use parameter space
define(SAVE_ESI,`PARAM_SIZE')
define(SAVE_EBP,`PARAM_SRC')

	TEXT

	ALIGN(16)
PROLOGUE(mpn_preinv_mod_1)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	movl	%esi, SAVE_ESI
	movl	$32, %eax

	movd	%eax, %mm6			C l = 0, so 32-l = 32
	movl	PARAM_SRC, %esi
	movl	%ebp, SAVE_EBP

	movd	PARAM_DIVISOR, %mm5
	pxor	%mm7, %mm7			C l = 0

	movd	-4(%esi,%ecx,4), %mm0		C src high limb
	leal	-8(%esi,%ecx,4), %esi		C &src[size-2]

	movd	PARAM_INVERSE, %mm4
	subl	$2, %ecx			C size-2

	psubq	%mm5, %mm0			C high-divisor
	movq	%mm0, %mm2

	psrlq	$32, %mm0			C -1 if underflow

	pand	%mm5, %mm0			C divisor if underflow

	paddq	%mm2, %mm0			C addback if underflow
	jz	L(inverse_last)			C if size==2
	ja	L(inverse_top)			C if size>2


	C if size==1
	movl	SAVE_ESI, %esi
	movd	%mm0, %eax
	emms
	ret

EPILOGUE()


	ALIGN(16)
PROLOGUE(mpn_mod_1c)
deflit(`FRAME',0)
	movl	PARAM_SIZE, %ecx
	movl	%esi, SAVE_ESI

	movl	PARAM_SRC, %esi
	movl	%ebp, SAVE_EBP

	movl	PARAM_CARRY, %edx
	orl	%ecx, %ecx
	jz	L(divide_done)		C result==carry if size==0

	movl	PARAM_DIVISOR, %ebp
	jmp	L(start_1c)

EPILOGUE()


	ALIGN(16)
PROLOGUE(mpn_mod_1)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	movl	%esi, SAVE_ESI

	movl	PARAM_SRC, %esi
	movl	%ebp, SAVE_EBP

	movl	PARAM_DIVISOR, %ebp
	xorl	%edx, %edx		C result 0 if size==0

	orl	%ecx, %ecx
	jz	L(divide_done)
	movl	-4(%esi,%ecx,4), %eax	C src high limb

	leal	-1(%ecx), %edx
	cmpl	%ebp, %eax		C c if high<divisor

	cmovc(	%edx, %ecx)		C size-1 if high<divisor

	movl	$0, %edx		C initial carry
	cmovc(	%eax, %edx)		C src high limb if high<divisor

	orl	%ecx, %ecx
	jz	L(divide_done)		C if size==1 and skip div


L(start_1c):
	C eax
	C ebx
	C ecx	size
	C edx	carry
	C esi	src
	C edi
	C ebp	divisor

	leal	-4(%esi,%ecx,4), %esi	C &src[size-1]
	cmpl	$MUL_THRESHOLD, %ecx
	jae	L(mul_by_inverse)


L(divide_top):
	C eax
	C ebx
	C ecx	counter, limbs, decrementing
	C edx	remainder
	C esi	src, decrementing
	C edi
	C ebp	divisor

	movl	(%esi), %eax
	subl	$4, %esi

	divl	%ebp

	subl	$1, %ecx
	jnz	L(divide_top)


L(divide_done):
	movl	SAVE_ESI, %esi
	movl	SAVE_EBP, %ebp
	movl	%edx, %eax
	ret


C -----------------------------------------------------------------------------

L(mul_by_inverse):
	C eax
	C ebx
	C ecx	size
	C edx	carry
	C esi	src
	C edi
	C ebp	divisor

	bsrl	%ebp, %eax		C 31-l

	movd	%edx, %mm1		C carry
	movl	%ecx, %edx		C size
	movl	$31, %ecx

	C

	xorl	%eax, %ecx		C l = leading zeros on d
	addl	$1, %eax		C 32-l

	shll	%cl, %ebp		C normalize d
	movd	%ecx, %mm7		C l
	leal	-1(%edx), %ecx		C size-1

	movd	%eax, %mm6		C 32-l
	movl	$-1, %edx
	movl	$-1, %eax

	C

	subl	%ebp, %edx		C (b-d)-1 so edx:eax = b*(b-d)-1

	divl	%ebp			C floor (b*(b-d)-1 / d)

	movd	%ebp, %mm5		C d
	movd	(%esi), %mm0		C src high limb
	punpckldq %mm1, %mm0
	psrlq	%mm6, %mm0		C n2 = high (carry:srchigh << l)

	C

	movd	%eax, %mm4		C m


C The dependent chain here consists of
C
C	2   paddd    n1+n2
C	8   pmuludq  m*(n1+n2)
C	2   paddq    n2:nadj + m*(n1+n2)
C	2   psrlq    q1
C	8   pmuludq  d*q1
C	2   psubq    (n-d)-q1*d
C	2   psrlq    high mask
C	2   pand     d masked
C	2   paddd    n2+d addback
C	--
C	30
C
C But it seems to run at 31 cycles, so presumably there's something else
C going on.


	ALIGN(16)
L(inverse_top):
	C eax
	C ebx
	C ecx	counter, size-1 to 1
	C edx
	C esi	src, decrementing
	C edi
	C ebp
	C
	C mm0	n2
	C mm4	m
	C mm5	d
	C mm6	32-l
	C mm7	l

	ASSERT(b,`C n2<d
	 movd	%mm0, %eax
	 movd	%mm5, %edx
	 cmpl	%edx, %eax')

	movd	-4(%esi), %mm1		C next src limbs
	movd	(%esi), %mm2
	leal	-4(%esi), %esi

	punpckldq %mm2, %mm1
	psrlq	%mm6, %mm1		C n10

	movq	%mm1, %mm2		C n10
	movq	%mm1, %mm3		C n10
	psrad	$31, %mm1		C -n1
	pand	%mm5, %mm1		C -n1 & d
	paddd	%mm2, %mm1		C nadj = n10+(-n1&d), ignore overflow

	psrld	$31, %mm2		C n1
	paddd	%mm0, %mm2		C n2+n1
	punpckldq %mm0, %mm1		C n2:nadj

	pmuludq	%mm4, %mm2		C m*(n2+n1)

	paddq	%mm2, %mm1		C n2:nadj + m*(n2+n1)

	psrlq	$32, %mm1		C q1 = high(n2:nadj + m*(n2+n1))

	pmuludq	%mm5, %mm1		C q1*d
	punpckldq %mm0, %mm3		C n
	psubq	%mm5, %mm3		C n - d
	pxor	%mm0, %mm0

	psubq	%mm1, %mm3		C n - (q1+1)*d

	por	%mm3, %mm0		C remainder -> n2
	psrlq	$32, %mm3		C high n - (q1+1)*d, 0 or -1

	ASSERT(be,`C 0 or -1
	 movd	%mm3, %eax
	 addl	$1, %eax
	 cmpl	$1, %eax')

	pand	%mm5, %mm3		C mask & d

	paddd	%mm3, %mm0		C addback if necessary

	subl	$1, %ecx
	jnz	L(inverse_top)


	C Least significant limb.
	C Same code as the loop, but there's no -4(%esi) limb to fetch.

L(inverse_last):
	C eax
	C ebx
	C ecx
	C edx
	C esi	&src[0]
	C
	C mm0	n2
	C mm4	m
	C mm5	d
	C mm6	32-l
	C mm7	l

	movd	(%esi), %mm1		C src[0]
	psllq	%mm7, %mm1		C n10

	movq	%mm1, %mm2		C n10
	movq	%mm1, %mm3		C n10
	psrad	$31, %mm1		C -n1
	pand	%mm5, %mm1		C -n1 & d
	paddd	%mm2, %mm1		C nadj = n10+(-n1&d), ignore overflow

	psrld	$31, %mm2		C n1
	paddd	%mm0, %mm2		C n2+n1
	punpckldq %mm0, %mm1		C n2:nadj

	pmuludq	%mm4, %mm2		C m*(n2+n1)

	paddq	%mm2, %mm1		C n2:nadj + m*(n2+n1)

	psrlq	$32, %mm1		C q1 = high(n2:nadj + m*(n2+n1))

	pmuludq	%mm5, %mm1		C q1*d
	punpckldq %mm0, %mm3		C n
	psubq	%mm5, %mm3		C n - d
	pxor	%mm0, %mm0

	psubq	%mm1, %mm3		C n - (q1+1)*d

	por	%mm3, %mm0		C remainder -> n2
	psrlq	$32, %mm3		C high n - (q1+1)*d, 0 or -1

	ASSERT(be,`C 0 or -1
	 movd	%mm3, %eax
	 addl	$1, %eax
	 cmpl	$1, %eax')

	movl	SAVE_EBP, %ebp
	pand	%mm5, %mm3		C mask & d

	movl	SAVE_ESI, %esi
	paddd	%mm3, %mm0		C addback if necessary

	psrld	%mm7, %mm0

	movd	%mm0, %eax

	emms
	ret

EPILOGUE()