mpir/mpn/generic/tdiv_q.c

/* mpn_tdiv_qr -- Divide the numerator (np,nn) by the denominator (dp,dn) and
   write the nn-dn+1 quotient limbs at qp and the dn remainder limbs at rp.  If
   qxn is non-zero, generate that many fraction limbs and append them after the
   other quotient limbs, and update the remainder accordningly.  The input
   operands are unaffected.

   Preconditions:
   1. The most significant limb of of the divisor must be non-zero.
   2. No argument overlap is permitted.  (??? relax this ???)
   3. nn >= dn, even if qxn is non-zero.  (??? relax this ???)

   The time complexity of this is O(qn*qn+M(dn,qn)), where M(m,n) is the time
   complexity of multiplication.

Copyright 1997, 2000, 2001, 2002, 2005 Free Software Foundation, Inc.
Copyright 2009, 2010 William Hart

This file is part of the MPIR Library.

The MPIR Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or (at your
option) any later version.

The MPIR Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the MPIR Library; see the file COPYING.LIB.  If not, write to
the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA. */

#include "mpir.h"
#include "gmp-impl.h"
#include "longlong.h"

/* 
   dc_divappr_q requires the divisor be << GMP_NUMBMAX limbs
   the asymptotics of midmul are also only toom42 asymptotics,
   thus for performance reasons we limit the size of divisors
   used in dc_divapp_q
*/
#define DC_DIVAPPR_Q_LIMIT 32768

void
mpn_tdiv_q (mp_ptr qp, mp_srcptr np, mp_size_t nn, 
                                   mp_srcptr dp, mp_size_t dn)
{
  ASSERT (nn >= 0);
  ASSERT (dn >= 0);
  ASSERT (dn == 0 || dp[dn - 1] != 0);
  ASSERT (! MPN_OVERLAP_P (qp, nn - dn + 1, np, nn));
  ASSERT (! MPN_OVERLAP_P (qp, nn - dn + 1, dp, dn));

  switch (dn)
    {
    case 0:
      DIVIDE_BY_ZERO;

    case 1:
      {
  	mpn_divmod_1 (qp, np, nn, dp[0]);
	return;
      }

    case 2:
      {
	mp_ptr n2p, d2p;
	mp_limb_t qhl, cy;
	TMP_DECL;
	TMP_MARK;
	if ((dp[1] & GMP_NUMB_HIGHBIT) == 0)
	  {
	    int cnt;
	    mp_limb_t dtmp[2];
	    count_leading_zeros (cnt, dp[1]);
	    d2p = dtmp;
	    d2p[1] = (dp[1] << cnt) | (dp[0] >> (GMP_NUMB_BITS - cnt));
	    d2p[0] = (dp[0] << cnt);
	    n2p = (mp_ptr) TMP_ALLOC_LIMBS (nn + 1);
	    cy = mpn_lshift (n2p, np, nn, cnt);
	    n2p[nn] = cy;
	    qhl = mpn_divrem_2 (qp, 0L, n2p, nn + (cy != 0), d2p);
	    if (cy == 0)
	      qp[nn - 2] = qhl;	/* always store nn-2+1 quotient limbs */
	  }
	else
	  {
	    d2p = (mp_ptr) dp;
	    n2p = (mp_ptr) TMP_ALLOC_LIMBS (nn);
	    MPN_COPY (n2p, np, nn);
	    qhl = mpn_divrem_2 (qp, 0L, n2p, nn, d2p);
	    qp[nn - 2] = qhl;	/* always store nn-2+1 quotient limbs */
	  }
	TMP_FREE;
	return;
      }

    default:
      {
	int adjust;
	TMP_DECL;
	TMP_MARK;
	adjust = np[nn - 1] >= dp[dn - 1];	/* conservative tests for quotient size */
	if (nn + adjust >= 2 * dn)
	  {
	    mp_ptr n2p, d2p, q2p;
	    mp_limb_t cy;
	    int cnt;

	    qp[nn - dn] = 0;			  /* zero high quotient limb */
	    if ((dp[dn - 1] & GMP_NUMB_HIGHBIT) == 0) /* normalize divisor */
	      {
		count_leading_zeros (cnt, dp[dn - 1]);
		d2p = (mp_ptr) TMP_ALLOC_LIMBS (dn);
		mpn_lshift (d2p, dp, dn, cnt);
		n2p = (mp_ptr) TMP_ALLOC_LIMBS (nn + 1);
		cy = mpn_lshift (n2p, np, nn, cnt);
		n2p[nn] = cy;
		nn += adjust;
	      }
	    else
	      {
		cnt = 0;
		d2p = (mp_ptr) dp;
		n2p = (mp_ptr) TMP_ALLOC_LIMBS (nn + 1);
		MPN_COPY (n2p, np, nn);
		n2p[nn] = 0;
		nn += adjust;
	      }

	    if (dn < DIV_DC_THRESHOLD)
	      mpn_sb_divrem_mn (qp, n2p, nn, d2p, dn);
	    else
	      {
		/* Divide 2*dn / dn limbs as long as the limbs in np last.  */
		q2p = qp + nn - dn;
		n2p += nn - dn;
		do
		  {
		    q2p -= dn;  n2p -= dn;
		    mpn_dc_divrem_n (q2p, n2p, d2p, dn);
		    nn -= dn;
		  }
		while (nn >= 2 * dn);

		if (nn != dn)
		  {
		    mp_limb_t ql;
		    n2p -= nn - dn;

		    /* We have now dn < nn - dn < 2dn.  Make a recursive call,
		       since falling out to the code below isn't pretty.
		       Unfortunately, mpn_tdiv_qr returns nn-dn+1 quotient
		       limbs, which would overwrite one already generated
		       quotient limbs.  Preserve it with an ugly hack.  */
		    /* FIXME: This suggests that we should have an
		       mpn_tdiv_qr_internal that instead returns the most
		       significant quotient limb and move the meat of this
		       function there.  */
		    /* FIXME: Perhaps call mpn_sb_divrem_mn here for certain
		       operand ranges, to decrease overhead for small
		       operands?  */
		    ql = qp[nn - dn]; /* preserve quotient limb... */
		    mpn_tdiv_q (qp, n2p, nn, d2p, dn);
          qp[nn - dn] = ql; /* ...restore it again */
		  }
	      }

       TMP_FREE;
	    return;
	  }

	/* When we come here, the numerator/partial remainder is less
	   than twice the size of the denominator.  */

	  {
	    /* Problem:

	       Divide a numerator N with nn limbs by a denominator D with dn
	       limbs forming a quotient of qn=nn-dn+1 limbs.  When qn is small
	       compared to dn, conventional division algorithms perform poorly.
	       We want an algorithm that has an expected running time that is
	       dependent only on qn.

	       Algorithm (very informally stated):

	       1) Divide the 2 x qn most significant limbs from the numerator
		  by the qn most significant limbs from the denominator.  Call
		  the result qest.  This is either the correct quotient, but
		  might be 1 or 2 too large.  Compute the remainder from the
		  division.  (This step is implemented by a mpn_divrem call.)

	       2) Is the most significant limb from the remainder < p, where p
		  is the product of the most significant limb from the quotient
		  and the next(d)?  (Next(d) denotes the next ignored limb from
		  the denominator.)  If it is, decrement qest, and adjust the
		  remainder accordingly.

	       3) Is the remainder >= qest?  If it is, qest is the desired
		  quotient.  The algorithm terminates.

	       4) Subtract qest x next(d) from the remainder.  If there is
		  borrow out, decrement qest, and adjust the remainder
		  accordingly.

	       5) Skip one word from the denominator (i.e., let next(d) denote
		  the next less significant limb.  */

	    mp_size_t qn;
	    mp_ptr n2p, n3p, d2p;
	    mp_ptr tp;
	    mp_limb_t cy;
	    mp_size_t in, rn;
	    mp_limb_t quotient_too_large;
	    unsigned int cnt;

	    qn = nn - dn;
	    qp[qn] = 0;				/* zero high quotient limb */
	    qn += adjust;			/* qn cannot become bigger */

	    if (qn == 0)
	      {
		TMP_FREE;
		return;
	      }

	    in = dn - qn;		/* (at least partially) ignored # of limbs in ops */

	    /* Normalize denominator by shifting it to the left such that its
	       most significant bit is set.  Then shift the numerator the same
	       amount, to mathematically preserve quotient.  */
	    if ((dp[dn - 1] & GMP_NUMB_HIGHBIT) == 0)
	      {
		count_leading_zeros (cnt, dp[dn - 1]);
		
		d2p = (mp_ptr) TMP_ALLOC_LIMBS (qn);
		mpn_lshift (d2p, dp + in, qn, cnt);
		d2p[0] |= dp[in - 1] >> (GMP_NUMB_BITS - cnt);

		n2p = (mp_ptr) TMP_ALLOC_LIMBS (2 * qn + 1);
		n3p = (mp_ptr) TMP_ALLOC_LIMBS (2 * qn + 1);
		cy = mpn_lshift (n2p, np + nn - 2 * qn, 2 * qn, cnt);
		if (adjust)
		  {
		    n2p[2 * qn] = cy;
		    n2p++;
		  }
		else
		  {
		    n2p[0] |= np[nn - 2 * qn - 1] >> (GMP_NUMB_BITS - cnt);
		  }
	      }
	    else
	      {
		cnt = 0;
		d2p = (mp_ptr) dp + in;

		n2p = (mp_ptr) TMP_ALLOC_LIMBS (2 * qn + 1);
		n3p = (mp_ptr) TMP_ALLOC_LIMBS (2 * qn + 1);
		MPN_COPY (n2p, np + nn - 2 * qn, 2 * qn);
		if (adjust)
		  {
		    n2p[2 * qn] = 0;
		    n2p++;
		  }
	      }

	    /* Get an approximate quotient using the extracted operands.  */
	    if (qn == 1)
	      {
		mp_limb_t q0, r0;
		mp_limb_t gcc272bug_n1, gcc272bug_n0, gcc272bug_d0;
		/* Due to a gcc 2.7.2.3 reload pass bug, we have to use some
		   temps here.  This doesn't hurt code quality on any machines
		   so we do it unconditionally.  */
		gcc272bug_n1 = n2p[1];
		gcc272bug_n0 = n2p[0];
		gcc272bug_d0 = d2p[0];
		udiv_qrnnd (q0, r0, gcc272bug_n1, gcc272bug_n0,
			    gcc272bug_d0);
		qp[0] = q0;
	      }
	    else if (qn == 2)
       {
         MPN_COPY(n3p, n2p, 2 * qn);
	      mpn_divrem_2 (qp, 0L, n3p, 4L, d2p);	      
       }
       else if (qn < DIV_DC_THRESHOLD)
       {
	        mp_limb_t dip[2], cy;
           MPN_COPY(n3p, n2p, 2 * qn);
           mpn_invert(dip, d2p + qn - 2, 2);
           //mpn_sb_divrem_mn(qp, n3p, 2*qn, d2p, qn);
           cy = mpn_sb_divappr_q (qp, n3p, 2 * qn, d2p, qn, dip);
           if (cy) mpn_sub_1(qp, qp, qn, 1);
       }
	    else             
       {
           mp_limb_t dip[2], cy;
           MPN_COPY(n3p, n2p, 2 * qn);
           
           // dc_divappr_q_n requires qn << GMP_NUMBMAX
           if (qn < DC_DIVAPPR_Q_LIMIT) 
           {
              mp_ptr tp = TMP_ALLOC_LIMBS (10*qn);
              mpn_invert(dip, d2p + qn - 2, 2);
              cy = mpn_dc_divappr_q_n (qp, n3p, d2p, qn, dip, tp);
              // quotient can overflow as it is approx.
              if (cy) mpn_sub_1(qp, qp, qn, 1); 
           } else
              mpn_dc_divrem_n(qp, n3p, d2p, qn);
       }
       
       /* At this point the quotient may be correct or up to 3 too large.
          It could be one too large from the approximate quotient and
          up to a further two too large from the fact that we didn't have
          all the limbs of the numerator and denominator.
       */

	    rn = qn;

       /* Each limb of q*d has a contribution from low limbs of product
          limbs of q*d, a contribution from high limbs and a contribution
          from carries, which does not exceed the number of limbs in q.

          We compute the low limb contribution which qp[qn-1]*d2p[1] 
          contributes to.          
       */
       {
          mp_limb_t lp[3] = {0L, 0L, 0L};
          mp_limb_t p[2];
          mp_limb_t d1, d2, dl, cy;
          long i;
          mp_limb_t max_carries;
        
try_again:
                
          max_carries = qn + in - 1;
          if (max_carries > qn) max_carries = qn; 
          
          for (i = qn - 1; i >= 1L; i--)
          {
             lp[2] += qp[i] * d2p[qn - i];
          }

          /* Now we compute the high limb contribution. We discard carries as
             they don't interest us.
          */

          for (i = qn - 1; i >= 0L; i--)
          {
             umul_ppmm(p[1], p[0], qp[i], d2p[qn - i - 1]);
             add_ssaaaa(lp[2], lp[1], lp[2], lp[1], p[1], p[0]);
          }

          /* If the product is too large, the quotient must be too large */
          while ((mp_limb_signed_t) (lp[2] - n2p[qn]) > 0L)
          {
             if (qp[0] == CNST_LIMB(0))
             {
                mpn_decr_u (qp, (mp_limb_t) 1);
                lp[0] = CNST_LIMB(0);
                lp[1] = CNST_LIMB(0);
                lp[2] = CNST_LIMB(0);
                goto try_again;
             }
             mpn_decr_u (qp, (mp_limb_t) 1);
             sub_ddmmss(lp[2], lp[1], lp[2], lp[1], CNST_LIMB(0), d2p[qn - 1]);
          }
          
          /* If carries can't take this up to the value of the limb in the
             numerator, we must have the right quotient
          */

          if (max_carries < n2p[qn] - lp[2])
          {
             TMP_FREE;
             return;
          }
          
          if (max_carries > in + qn - 2) max_carries--;

          /* That proved inconclusive, so we compute the next limb of the
             product and perform a similar comparison
          */

          if (in)
          {
             if (in < 2)
                dl = 0;
             else 
                dl = dp[in - 2];

             if (cnt) d1 = (dp[in - 1] << cnt) | (dl >> (GMP_NUMB_BITS - cnt));
             else d1 = dp[in - 1];
          } else d1 = 0;

          for (i = qn - 2; i >= 0L; i--)
          {
             umul_ppmm(p[1], p[0], qp[i], d2p[qn - i - 2]);
             lp[2] += mpn_add_n(lp, lp, p, 2);
          }

          umul_ppmm(p[1], p[0], qp[qn - 1], d1);
          lp[2] += mpn_add_n(lp, lp, p, 2);

          while ((mp_limb_signed_t) (lp[2] - n2p[qn]) > 0L)
          {
             if (qp[0] == CNST_LIMB(0))
             {
                mpn_decr_u (qp, (mp_limb_t) 1);
                lp[0] = CNST_LIMB(0);
                lp[1] = CNST_LIMB(0);
                lp[2] = CNST_LIMB(0);
                goto try_again;
             }
             mpn_decr_u (qp, (mp_limb_t) 1);
             sub_ddmmss(lp[2], lp[1], lp[2], lp[1], CNST_LIMB(0), d2p[qn - 1]);
             if (qn > 1) mpn_sub_1(lp, lp, 3, d2p[qn - 2]);
             else mpn_sub_1(lp, lp, 3, d1);
          }
          
          if ((mp_limb_signed_t) (n2p[qn] - lp[2]) > 0L)
          {
             if (n2p[qn] > lp[2] + 1)
             {
                TMP_FREE;
                return;
             }

             if (n2p[qn - 1] + max_carries > lp[1])
             {
                TMP_FREE;
                return;
             }
             
             goto skip;
          }

          /* Now the top limb is known to be correct, so check next limb 
             If the product is too large, the quotient must be too large 
          */
          
          while (lp[1] > n2p[qn - 1])
          {
             if (qp[0] == CNST_LIMB(0))
             {
                mpn_decr_u (qp, (mp_limb_t) 1);
                lp[0] = CNST_LIMB(0);
                lp[1] = CNST_LIMB(0);
                lp[2] = CNST_LIMB(0);
                goto try_again;
             }
             mpn_decr_u (qp, (mp_limb_t) 1);
             if (qn > 1) cy = mpn_sub_n(lp, lp, d2p + qn - 2, 2);
             else
             {
                sub_ddmmss(cy, lp[1], (mp_limb_t) 0, lp[1], (mp_limb_t) 0, d2p[0]);
                cy += mpn_sub_1(lp, lp, 2, d1);
             }

             if (cy)
             {
                if (max_carries < n2p[qn - 1] - lp[1])
                {
                   TMP_FREE;
                   return;
                } else
                   goto skip;
             }
          }
          
          /* If carries can't take this up to the value of the limb in the
             numerator, we must have the right quotient
          */

          if (max_carries < n2p[qn - 1] - lp[1])
          {
             TMP_FREE;
             return;
          }
          
skip:
          /* We have been unable to determine the quotient so multiply out
             and see whether it is correct or one too large.
             We should get here only rarely, i.e. optimise something else
          */
          {
          
          mp_size_t n2n = nn + 1;
          n2p = TMP_ALLOC_LIMBS(nn+1);
          n2p[nn] = 0L;
          n2p[nn - 1] = 0L;
          mpn_mul(n2p, dp, dn, qp, qn);
          MPN_NORMALIZE (n2p, n2n);
          if ((n2n > nn) || (mpn_cmp(n2p, np, nn) > 0))
          {
             mpn_decr_u (qp, (mp_limb_t) 1);
          }
          
          TMP_FREE;
          return;

          }
       }
     }
   }
  }
}
Added fast mpn_tdiv_q and test code and used it in mpz_tdiv_q. 2009-10-07 01:27:59 -04:00			`/* mpn_tdiv_qr -- Divide the numerator (np,nn) by the denominator (dp,dn) and`
			`write the nn-dn+1 quotient limbs at qp and the dn remainder limbs at rp. If`
			`qxn is non-zero, generate that many fraction limbs and append them after the`
			`other quotient limbs, and update the remainder accordningly. The input`
			`operands are unaffected.`

			`Preconditions:`
			`1. The most significant limb of of the divisor must be non-zero.`
			`2. No argument overlap is permitted. (??? relax this ???)`
			`3. nn >= dn, even if qxn is non-zero. (??? relax this ???)`

			`The time complexity of this is O(qn*qn+M(dn,qn)), where M(m,n) is the time`
			`complexity of multiplication.`

			`Copyright 1997, 2000, 2001, 2002, 2005 Free Software Foundation, Inc.`
Added divrem_1 and divrem_2 x86_64 assembly code from GMP, replaced divrem_euclidean_qr_1 with divrem_1. 2010-02-06 08:28:23 -05:00			`Copyright 2009, 2010 William Hart`
Added fast mpn_tdiv_q and test code and used it in mpz_tdiv_q. 2009-10-07 01:27:59 -04:00
Fixed a carry bug in mpn_tdiv_q and added the failing cases to tests/mpz/t-tdiv.c. 2009-10-13 00:52:00 -04:00			`This file is part of the MPIR Library.`
Added fast mpn_tdiv_q and test code and used it in mpz_tdiv_q. 2009-10-07 01:27:59 -04:00
Fixed a carry bug in mpn_tdiv_q and added the failing cases to tests/mpz/t-tdiv.c. 2009-10-13 00:52:00 -04:00			`The MPIR Library is free software; you can redistribute it and/or modify`
Added fast mpn_tdiv_q and test code and used it in mpz_tdiv_q. 2009-10-07 01:27:59 -04:00			`it under the terms of the GNU Lesser General Public License as published by`
			`the Free Software Foundation; either version 2.1 of the License, or (at your`
			`option) any later version.`

Fixed a carry bug in mpn_tdiv_q and added the failing cases to tests/mpz/t-tdiv.c. 2009-10-13 00:52:00 -04:00			`The MPIR Library is distributed in the hope that it will be useful, but`
Added fast mpn_tdiv_q and test code and used it in mpz_tdiv_q. 2009-10-07 01:27:59 -04:00			`WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY`
			`or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public`
			`License for more details.`

			`You should have received a copy of the GNU Lesser General Public License`
Fixed a carry bug in mpn_tdiv_q and added the failing cases to tests/mpz/t-tdiv.c. 2009-10-13 00:52:00 -04:00			`along with the MPIR Library; see the file COPYING.LIB. If not, write to`
Added fast mpn_tdiv_q and test code and used it in mpz_tdiv_q. 2009-10-07 01:27:59 -04:00			`the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,`
			`MA 02110-1301, USA. */`

			`#include "mpir.h"`
			`#include "gmp-impl.h"`
			`#include "longlong.h"`

Switch over to exact quotient when divisor gets large in mpn_tdiv_q. 2009-11-17 08:15:23 -05:00			`/*`
			`dc_divappr_q requires the divisor be << GMP_NUMBMAX limbs`
			`the asymptotics of midmul are also only toom42 asymptotics,`
			`thus for performance reasons we limit the size of divisors`
			`used in dc_divapp_q`
			`*/`
			`#define DC_DIVAPPR_Q_LIMIT 32768`
Added fast mpn_tdiv_q and test code and used it in mpz_tdiv_q. 2009-10-07 01:27:59 -04:00
			`void`
			`mpn_tdiv_q (mp_ptr qp, mp_srcptr np, mp_size_t nn,`
			`mp_srcptr dp, mp_size_t dn)`
			`{`
			`ASSERT (nn >= 0);`
			`ASSERT (dn >= 0);`
			`ASSERT (dn == 0 \|\| dp[dn - 1] != 0);`
			`ASSERT (! MPN_OVERLAP_P (qp, nn - dn + 1, np, nn));`
			`ASSERT (! MPN_OVERLAP_P (qp, nn - dn + 1, dp, dn));`

			`switch (dn)`
			`{`
			`case 0:`
			`DIVIDE_BY_ZERO;`

			`case 1:`
			`{`
Added divrem_1 and divrem_2 x86_64 assembly code from GMP, replaced divrem_euclidean_qr_1 with divrem_1. 2010-02-06 08:28:23 -05:00			`mpn_divmod_1 (qp, np, nn, dp[0]);`
Added fast mpn_tdiv_q and test code and used it in mpz_tdiv_q. 2009-10-07 01:27:59 -04:00			`return;`
			`}`

			`case 2:`
			`{`
			`mp_ptr n2p, d2p;`
			`mp_limb_t qhl, cy;`
			`TMP_DECL;`
			`TMP_MARK;`
			`if ((dp[1] & GMP_NUMB_HIGHBIT) == 0)`
			`{`
			`int cnt;`
			`mp_limb_t dtmp[2];`
			`count_leading_zeros (cnt, dp[1]);`
			`d2p = dtmp;`
			`d2p[1] = (dp[1] << cnt) \| (dp[0] >> (GMP_NUMB_BITS - cnt));`
			`d2p[0] = (dp[0] << cnt);`
			`n2p = (mp_ptr) TMP_ALLOC_LIMBS (nn + 1);`
			`cy = mpn_lshift (n2p, np, nn, cnt);`
			`n2p[nn] = cy;`
			`qhl = mpn_divrem_2 (qp, 0L, n2p, nn + (cy != 0), d2p);`
			`if (cy == 0)`
			`qp[nn - 2] = qhl; /* always store nn-2+1 quotient limbs */`
			`}`
			`else`
			`{`
			`d2p = (mp_ptr) dp;`
			`n2p = (mp_ptr) TMP_ALLOC_LIMBS (nn);`
			`MPN_COPY (n2p, np, nn);`
			`qhl = mpn_divrem_2 (qp, 0L, n2p, nn, d2p);`
			`qp[nn - 2] = qhl; /* always store nn-2+1 quotient limbs */`
			`}`
			`TMP_FREE;`
			`return;`
			`}`

			`default:`
			`{`
			`int adjust;`
			`TMP_DECL;`
			`TMP_MARK;`
			`adjust = np[nn - 1] >= dp[dn - 1]; /* conservative tests for quotient size */`
			`if (nn + adjust >= 2 * dn)`
			`{`
			`mp_ptr n2p, d2p, q2p;`
			`mp_limb_t cy;`
			`int cnt;`

			`qp[nn - dn] = 0; /* zero high quotient limb */`
			`if ((dp[dn - 1] & GMP_NUMB_HIGHBIT) == 0) /* normalize divisor */`
			`{`
			`count_leading_zeros (cnt, dp[dn - 1]);`
			`d2p = (mp_ptr) TMP_ALLOC_LIMBS (dn);`
			`mpn_lshift (d2p, dp, dn, cnt);`
			`n2p = (mp_ptr) TMP_ALLOC_LIMBS (nn + 1);`
			`cy = mpn_lshift (n2p, np, nn, cnt);`
			`n2p[nn] = cy;`
			`nn += adjust;`
			`}`
			`else`
			`{`
			`cnt = 0;`
			`d2p = (mp_ptr) dp;`
			`n2p = (mp_ptr) TMP_ALLOC_LIMBS (nn + 1);`
			`MPN_COPY (n2p, np, nn);`
			`n2p[nn] = 0;`
			`nn += adjust;`
			`}`

			`if (dn < DIV_DC_THRESHOLD)`
			`mpn_sb_divrem_mn (qp, n2p, nn, d2p, dn);`
			`else`
			`{`
			`/* Divide 2dn / dn limbs as long as the limbs in np last. /`
			`q2p = qp + nn - dn;`
			`n2p += nn - dn;`
			`do`
			`{`
			`q2p -= dn; n2p -= dn;`
			`mpn_dc_divrem_n (q2p, n2p, d2p, dn);`
			`nn -= dn;`
			`}`
			`while (nn >= 2 * dn);`

			`if (nn != dn)`
			`{`
			`mp_limb_t ql;`
			`n2p -= nn - dn;`

			`/* We have now dn < nn - dn < 2dn. Make a recursive call,`
			`since falling out to the code below isn't pretty.`
			`Unfortunately, mpn_tdiv_qr returns nn-dn+1 quotient`
			`limbs, which would overwrite one already generated`
			`quotient limbs. Preserve it with an ugly hack. */`
			`/* FIXME: This suggests that we should have an`
			`mpn_tdiv_qr_internal that instead returns the most`
			`significant quotient limb and move the meat of this`
			`function there. */`
			`/* FIXME: Perhaps call mpn_sb_divrem_mn here for certain`
			`operand ranges, to decrease overhead for small`
			`operands? */`
			`ql = qp[nn - dn]; /* preserve quotient limb... */`
			`mpn_tdiv_q (qp, n2p, nn, d2p, dn);`
Fixed a carry bug in mpn_tdiv_q and added the failing cases to tests/mpz/t-tdiv.c. 2009-10-13 00:52:00 -04:00			`qp[nn - dn] = ql; /* ...restore it again */`
Added fast mpn_tdiv_q and test code and used it in mpz_tdiv_q. 2009-10-07 01:27:59 -04:00			`}`
			`}`

			`TMP_FREE;`
			`return;`
			`}`

			`/* When we come here, the numerator/partial remainder is less`
			`than twice the size of the denominator. */`

			`{`
			`/* Problem:`

			`Divide a numerator N with nn limbs by a denominator D with dn`
			`limbs forming a quotient of qn=nn-dn+1 limbs. When qn is small`
			`compared to dn, conventional division algorithms perform poorly.`
			`We want an algorithm that has an expected running time that is`
			`dependent only on qn.`

			`Algorithm (very informally stated):`

			`1) Divide the 2 x qn most significant limbs from the numerator`
			`by the qn most significant limbs from the denominator. Call`
			`the result qest. This is either the correct quotient, but`
			`might be 1 or 2 too large. Compute the remainder from the`
			`division. (This step is implemented by a mpn_divrem call.)`

			`2) Is the most significant limb from the remainder < p, where p`
			`is the product of the most significant limb from the quotient`
			`and the next(d)? (Next(d) denotes the next ignored limb from`
			`the denominator.) If it is, decrement qest, and adjust the`
			`remainder accordingly.`

			`3) Is the remainder >= qest? If it is, qest is the desired`
			`quotient. The algorithm terminates.`

			`4) Subtract qest x next(d) from the remainder. If there is`
			`borrow out, decrement qest, and adjust the remainder`
			`accordingly.`

			`5) Skip one word from the denominator (i.e., let next(d) denote`
			`the next less significant limb. */`

			`mp_size_t qn;`
			`mp_ptr n2p, n3p, d2p;`
			`mp_ptr tp;`
			`mp_limb_t cy;`
			`mp_size_t in, rn;`
			`mp_limb_t quotient_too_large;`
			`unsigned int cnt;`

			`qn = nn - dn;`
			`qp[qn] = 0; /* zero high quotient limb */`
			`qn += adjust; /* qn cannot become bigger */`

			`if (qn == 0)`
			`{`
			`TMP_FREE;`
			`return;`
			`}`

			`in = dn - qn; /* (at least partially) ignored # of limbs in ops */`

			`/* Normalize denominator by shifting it to the left such that its`
			`most significant bit is set. Then shift the numerator the same`
			`amount, to mathematically preserve quotient. */`
			`if ((dp[dn - 1] & GMP_NUMB_HIGHBIT) == 0)`
			`{`
			`count_leading_zeros (cnt, dp[dn - 1]);`

			`d2p = (mp_ptr) TMP_ALLOC_LIMBS (qn);`
			`mpn_lshift (d2p, dp + in, qn, cnt);`
			`d2p[0] \|= dp[in - 1] >> (GMP_NUMB_BITS - cnt);`

			`n2p = (mp_ptr) TMP_ALLOC_LIMBS (2 * qn + 1);`
			`n3p = (mp_ptr) TMP_ALLOC_LIMBS (2 * qn + 1);`
			`cy = mpn_lshift (n2p, np + nn - 2 * qn, 2 * qn, cnt);`
			`if (adjust)`
			`{`
			`n2p[2 * qn] = cy;`
			`n2p++;`
			`}`
			`else`
			`{`
			`n2p[0] \|= np[nn - 2 * qn - 1] >> (GMP_NUMB_BITS - cnt);`
			`}`
			`}`
			`else`
			`{`
			`cnt = 0;`
			`d2p = (mp_ptr) dp + in;`

			`n2p = (mp_ptr) TMP_ALLOC_LIMBS (2 * qn + 1);`
			`n3p = (mp_ptr) TMP_ALLOC_LIMBS (2 * qn + 1);`
			`MPN_COPY (n2p, np + nn - 2 * qn, 2 * qn);`
			`if (adjust)`
			`{`
			`n2p[2 * qn] = 0;`
			`n2p++;`
			`}`
			`}`

			`/* Get an approximate quotient using the extracted operands. */`
			`if (qn == 1)`
			`{`
			`mp_limb_t q0, r0;`
			`mp_limb_t gcc272bug_n1, gcc272bug_n0, gcc272bug_d0;`
			`/* Due to a gcc 2.7.2.3 reload pass bug, we have to use some`
			`temps here. This doesn't hurt code quality on any machines`
			`so we do it unconditionally. */`
			`gcc272bug_n1 = n2p[1];`
			`gcc272bug_n0 = n2p[0];`
			`gcc272bug_d0 = d2p[0];`
			`udiv_qrnnd (q0, r0, gcc272bug_n1, gcc272bug_n0,`
			`gcc272bug_d0);`
			`qp[0] = q0;`
			`}`
			`else if (qn == 2)`
			`{`
			`MPN_COPY(n3p, n2p, 2 * qn);`
			`mpn_divrem_2 (qp, 0L, n3p, 4L, d2p);`
			`}`
			`else if (qn < DIV_DC_THRESHOLD)`
			`{`
Fixed a carry bug in mpn_tdiv_q and added the failing cases to tests/mpz/t-tdiv.c. 2009-10-13 00:52:00 -04:00			`mp_limb_t dip[2], cy;`
Added fast mpn_tdiv_q and test code and used it in mpz_tdiv_q. 2009-10-07 01:27:59 -04:00			`MPN_COPY(n3p, n2p, 2 * qn);`
			`mpn_invert(dip, d2p + qn - 2, 2);`
			`//mpn_sb_divrem_mn(qp, n3p, 2*qn, d2p, qn);`
Fixed a carry bug in mpn_tdiv_q and added the failing cases to tests/mpz/t-tdiv.c. 2009-10-13 00:52:00 -04:00			`cy = mpn_sb_divappr_q (qp, n3p, 2 * qn, d2p, qn, dip);`
			`if (cy) mpn_sub_1(qp, qp, qn, 1);`
Added fast mpn_tdiv_q and test code and used it in mpz_tdiv_q. 2009-10-07 01:27:59 -04:00			`}`
Extended the mpz/t-tdiv test and fixed a bug in mpn_tdiv_q. 2009-10-07 22:54:11 -04:00			`else`
Added fast mpn_tdiv_q and test code and used it in mpz_tdiv_q. 2009-10-07 01:27:59 -04:00			`{`
Fixed a major bug in mpn_tdiv_q and extended some tests to run longer. 2009-10-08 07:54:06 -04:00			`mp_limb_t dip[2], cy;`
Added fast mpn_tdiv_q and test code and used it in mpz_tdiv_q. 2009-10-07 01:27:59 -04:00			`MPN_COPY(n3p, n2p, 2 * qn);`
Switch over to exact quotient when divisor gets large in mpn_tdiv_q. 2009-11-17 08:15:23 -05:00
			`// dc_divappr_q_n requires qn << GMP_NUMBMAX`
			`if (qn < DC_DIVAPPR_Q_LIMIT)`
			`{`
			`mp_ptr tp = TMP_ALLOC_LIMBS (10*qn);`
			`mpn_invert(dip, d2p + qn - 2, 2);`
			`cy = mpn_dc_divappr_q_n (qp, n3p, d2p, qn, dip, tp);`
			`// quotient can overflow as it is approx.`
			`if (cy) mpn_sub_1(qp, qp, qn, 1);`
			`} else`
			`mpn_dc_divrem_n(qp, n3p, d2p, qn);`
Added fast mpn_tdiv_q and test code and used it in mpz_tdiv_q. 2009-10-07 01:27:59 -04:00			`}`
Fixed a major bug in mpn_tdiv_q and extended some tests to run longer. 2009-10-08 07:54:06 -04:00
Added fast mpn_tdiv_q and test code and used it in mpz_tdiv_q. 2009-10-07 01:27:59 -04:00			`/* At this point the quotient may be correct or up to 3 too large.`
			`It could be one too large from the approximate quotient and`
			`up to a further two too large from the fact that we didn't have`
			`all the limbs of the numerator and denominator.`
			`*/`

			`rn = qn;`

			`/* Each limb of q*d has a contribution from low limbs of product`
			`limbs of q*d, a contribution from high limbs and a contribution`
			`from carries, which does not exceed the number of limbs in q.`

			`We compute the low limb contribution which qp[qn-1]*d2p[1]`
			`contributes to.`
			`*/`
			`{`
			`mp_limb_t lp[3] = {0L, 0L, 0L};`
			`mp_limb_t p[2];`
			`mp_limb_t d1, d2, dl, cy;`
			`long i;`
Fixed a major bug in mpn_tdiv_q and extended some tests to run longer. 2009-10-08 07:54:06 -04:00			`mp_limb_t max_carries;`
Fixed some bugs in mpn_tdiv_q and added a try test for mpn_tdiv_q. 2009-10-11 02:46:10 -04:00
Fixed a major bug in mpn_tdiv_q and extended some tests to run longer. 2009-10-08 07:54:06 -04:00			`try_again:`

Fixed some bugs in mpn_tdiv_q and added a try test for mpn_tdiv_q. 2009-10-11 02:46:10 -04:00			`max_carries = qn + in - 1;`
Added fast mpn_tdiv_q and test code and used it in mpz_tdiv_q. 2009-10-07 01:27:59 -04:00			`if (max_carries > qn) max_carries = qn;`

			`for (i = qn - 1; i >= 1L; i--)`
			`{`
			`lp[2] += qp[i] * d2p[qn - i];`
			`}`

			`/* Now we compute the high limb contribution. We discard carries as`
			`they don't interest us.`
			`*/`

			`for (i = qn - 1; i >= 0L; i--)`
			`{`
			`umul_ppmm(p[1], p[0], qp[i], d2p[qn - i - 1]);`
			`add_ssaaaa(lp[2], lp[1], lp[2], lp[1], p[1], p[0]);`
			`}`

			`/* If the product is too large, the quotient must be too large */`
			`while ((mp_limb_signed_t) (lp[2] - n2p[qn]) > 0L)`
			`{`
Fixed a major bug in mpn_tdiv_q and extended some tests to run longer. 2009-10-08 07:54:06 -04:00			`if (qp[0] == CNST_LIMB(0))`
			`{`
			`mpn_decr_u (qp, (mp_limb_t) 1);`
			`lp[0] = CNST_LIMB(0);`
			`lp[1] = CNST_LIMB(0);`
			`lp[2] = CNST_LIMB(0);`
			`goto try_again;`
			`}`
Added fast mpn_tdiv_q and test code and used it in mpz_tdiv_q. 2009-10-07 01:27:59 -04:00			`mpn_decr_u (qp, (mp_limb_t) 1);`
			`sub_ddmmss(lp[2], lp[1], lp[2], lp[1], CNST_LIMB(0), d2p[qn - 1]);`
			`}`

			`/* If carries can't take this up to the value of the limb in the`
			`numerator, we must have the right quotient`
			`*/`

			`if (max_carries < n2p[qn] - lp[2])`
			`{`
			`TMP_FREE;`
			`return;`
			`}`
Fixed some bugs in mpn_tdiv_q and added a try test for mpn_tdiv_q. 2009-10-11 02:46:10 -04:00
			`if (max_carries > in + qn - 2) max_carries--;`
Added fast mpn_tdiv_q and test code and used it in mpz_tdiv_q. 2009-10-07 01:27:59 -04:00
			`/* That proved inconclusive, so we compute the next limb of the`
			`product and perform a similar comparison`
			`*/`

			`if (in)`
			`{`
			`if (in < 2)`
			`dl = 0;`
			`else`
			`dl = dp[in - 2];`

			`if (cnt) d1 = (dp[in - 1] << cnt) \| (dl >> (GMP_NUMB_BITS - cnt));`
			`else d1 = dp[in - 1];`
			`} else d1 = 0;`

			`for (i = qn - 2; i >= 0L; i--)`
			`{`
			`umul_ppmm(p[1], p[0], qp[i], d2p[qn - i - 2]);`
			`lp[2] += mpn_add_n(lp, lp, p, 2);`
			`}`

			`umul_ppmm(p[1], p[0], qp[qn - 1], d1);`
			`lp[2] += mpn_add_n(lp, lp, p, 2);`

			`while ((mp_limb_signed_t) (lp[2] - n2p[qn]) > 0L)`
			`{`
Fixed a major bug in mpn_tdiv_q and extended some tests to run longer. 2009-10-08 07:54:06 -04:00			`if (qp[0] == CNST_LIMB(0))`
			`{`
			`mpn_decr_u (qp, (mp_limb_t) 1);`
			`lp[0] = CNST_LIMB(0);`
			`lp[1] = CNST_LIMB(0);`
			`lp[2] = CNST_LIMB(0);`
			`goto try_again;`
			`}`
Added fast mpn_tdiv_q and test code and used it in mpz_tdiv_q. 2009-10-07 01:27:59 -04:00			`mpn_decr_u (qp, (mp_limb_t) 1);`
			`sub_ddmmss(lp[2], lp[1], lp[2], lp[1], CNST_LIMB(0), d2p[qn - 1]);`
Fixed some bugs in mpn_tdiv_q and added a try test for mpn_tdiv_q. 2009-10-11 02:46:10 -04:00			`if (qn > 1) mpn_sub_1(lp, lp, 3, d2p[qn - 2]);`
			`else mpn_sub_1(lp, lp, 3, d1);`
Added fast mpn_tdiv_q and test code and used it in mpz_tdiv_q. 2009-10-07 01:27:59 -04:00			`}`
Fixed some bugs in mpn_tdiv_q and added a try test for mpn_tdiv_q. 2009-10-11 02:46:10 -04:00
Added fast mpn_tdiv_q and test code and used it in mpz_tdiv_q. 2009-10-07 01:27:59 -04:00			`if ((mp_limb_signed_t) (n2p[qn] - lp[2]) > 0L)`
			`{`
			`if (n2p[qn] > lp[2] + 1)`
			`{`
			`TMP_FREE;`
			`return;`
			`}`

Fixed some bugs in mpn_tdiv_q and added a try test for mpn_tdiv_q. 2009-10-11 02:46:10 -04:00			`if (n2p[qn - 1] + max_carries > lp[1])`
Added fast mpn_tdiv_q and test code and used it in mpz_tdiv_q. 2009-10-07 01:27:59 -04:00			`{`
			`TMP_FREE;`
			`return;`
			`}`
Fixed a major bug in mpn_tdiv_q and extended some tests to run longer. 2009-10-08 07:54:06 -04:00
Added fast mpn_tdiv_q and test code and used it in mpz_tdiv_q. 2009-10-07 01:27:59 -04:00			`goto skip;`
			`}`
Fixed some bugs in mpn_tdiv_q and added a try test for mpn_tdiv_q. 2009-10-11 02:46:10 -04:00
Added fast mpn_tdiv_q and test code and used it in mpz_tdiv_q. 2009-10-07 01:27:59 -04:00			`/* Now the top limb is known to be correct, so check next limb`
			`If the product is too large, the quotient must be too large`
			`*/`

Fixed some bugs in mpn_tdiv_q and added a try test for mpn_tdiv_q. 2009-10-11 02:46:10 -04:00			`while (lp[1] > n2p[qn - 1])`
Added fast mpn_tdiv_q and test code and used it in mpz_tdiv_q. 2009-10-07 01:27:59 -04:00			`{`
Fixed some bugs in mpn_tdiv_q and added a try test for mpn_tdiv_q. 2009-10-11 02:46:10 -04:00			`if (qp[0] == CNST_LIMB(0))`
			`{`
			`mpn_decr_u (qp, (mp_limb_t) 1);`
			`lp[0] = CNST_LIMB(0);`
			`lp[1] = CNST_LIMB(0);`
			`lp[2] = CNST_LIMB(0);`
			`goto try_again;`
			`}`
Added fast mpn_tdiv_q and test code and used it in mpz_tdiv_q. 2009-10-07 01:27:59 -04:00			`mpn_decr_u (qp, (mp_limb_t) 1);`
Fixed some bugs in mpn_tdiv_q and added a try test for mpn_tdiv_q. 2009-10-11 02:46:10 -04:00			`if (qn > 1) cy = mpn_sub_n(lp, lp, d2p + qn - 2, 2);`
			`else`
			`{`
			`sub_ddmmss(cy, lp[1], (mp_limb_t) 0, lp[1], (mp_limb_t) 0, d2p[0]);`
			`cy += mpn_sub_1(lp, lp, 2, d1);`
			`}`
Added fast mpn_tdiv_q and test code and used it in mpz_tdiv_q. 2009-10-07 01:27:59 -04:00
			`if (cy)`
			`{`
Fixed a major bug in mpn_tdiv_q and extended some tests to run longer. 2009-10-08 07:54:06 -04:00			`if (max_carries < n2p[qn - 1] - lp[1])`
			`{`
			`TMP_FREE;`
			`return;`
			`} else`
			`goto skip;`
Added fast mpn_tdiv_q and test code and used it in mpz_tdiv_q. 2009-10-07 01:27:59 -04:00			`}`
			`}`
Fixed some bugs in mpn_tdiv_q and added a try test for mpn_tdiv_q. 2009-10-11 02:46:10 -04:00
Added fast mpn_tdiv_q and test code and used it in mpz_tdiv_q. 2009-10-07 01:27:59 -04:00			`/* If carries can't take this up to the value of the limb in the`
			`numerator, we must have the right quotient`
			`*/`

			`if (max_carries < n2p[qn - 1] - lp[1])`
			`{`
			`TMP_FREE;`
			`return;`
			`}`
Fixed some bugs in mpn_tdiv_q and added a try test for mpn_tdiv_q. 2009-10-11 02:46:10 -04:00
Added fast mpn_tdiv_q and test code and used it in mpz_tdiv_q. 2009-10-07 01:27:59 -04:00			`skip:`
			`/* We have been unable to determine the quotient so multiply out`
			`and see whether it is correct or one too large.`
			`We should get here only rarely, i.e. optimise something else`
			`*/`
			`{`

			`mp_size_t n2n = nn + 1;`
			`n2p = TMP_ALLOC_LIMBS(nn+1);`
			`n2p[nn] = 0L;`
			`n2p[nn - 1] = 0L;`
			`mpn_mul(n2p, dp, dn, qp, qn);`
			`MPN_NORMALIZE (n2p, n2n);`
Extended the mpz/t-tdiv test and fixed a bug in mpn_tdiv_q. 2009-10-07 22:54:11 -04:00			`if ((n2n > nn) \|\| (mpn_cmp(n2p, np, nn) > 0))`
Added fast mpn_tdiv_q and test code and used it in mpz_tdiv_q. 2009-10-07 01:27:59 -04:00			`{`
			`mpn_decr_u (qp, (mp_limb_t) 1);`
			`}`
Extended the mpz/t-tdiv test and fixed a bug in mpn_tdiv_q. 2009-10-07 22:54:11 -04:00
Added fast mpn_tdiv_q and test code and used it in mpz_tdiv_q. 2009-10-07 01:27:59 -04:00			`TMP_FREE;`
			`return;`

			`}`
			`}`
			`}`
			`}`
			`}`
			`}`