/* Schönhage's 1987 algorithm, reorganized into hgcd form */

#include <stdio.h>  /* for NULL */

#include "mpir.h"
#include "gmp-impl.h"
#include "longlong.h"

/* ******************************************************************
 * Here we are including the original GMP version of mpn_gcd
 * but we rename it as mpn_basic_gcd.  It needs to be available
 * for the ngcd algorithm to call in the base case.
 *
 * Preconditions [U = (up, usize) and V = (vp, vsize)]:
 *
 *  1.  V is odd.
 *  2.  numbits(U) >= numbits(V).
 *
 * Both U and V are destroyed by the operation.  The result is left at vp,
 * and its size is returned.
 *
 * Ken Weber (kweber@mat.ufrgs.br, kweber@mcs.kent.edu)
 *
 * Funding for this work has been partially provided by Conselho
 * Nacional de Desenvolvimento Cienti'fico e Tecnolo'gico (CNPq) do
 * Brazil, Grant 301314194-2, and was done while I was a visiting
 * researcher in the Instituto de Matema'tica at Universidade Federal
 * do Rio Grande do Sul (UFRGS).
 *
 * Refer to
 *   K. Weber, The accelerated integer GCD algorithm, ACM Transactions
 *   on Mathematical Software, v. 21 (March), 1995, pp. 111-122.
 *
 * *****************************************************************/

/* If MIN (usize, vsize) >= GCD_ACCEL_THRESHOLD, then the accelerated
   algorithm is used, otherwise the binary algorithm is used.  This may be
   adjusted for different architectures.  */
#ifndef GCD_ACCEL_THRESHOLD
#define GCD_ACCEL_THRESHOLD 5
#endif

/* When U and V differ in size by more than BMOD_THRESHOLD, the accelerated
   algorithm reduces using the bmod operation.  Otherwise, the k-ary
   reduction is used.  0 <= BMOD_THRESHOLD < GMP_NUMB_BITS.  */
enum
  {
    BMOD_THRESHOLD = GMP_NUMB_BITS / 2
  };


/* Use binary algorithm to compute V <-- GCD (V, U) for usize, vsize == 2.
   Both U and V must be odd.  */
static inline mp_size_t
gcd_2 (mp_ptr vp, mp_srcptr up)
{
  mp_limb_t u0, u1, v0, v1;
  mp_size_t vsize;

  u0 = up[0];
  u1 = up[1];
  v0 = vp[0];
  v1 = vp[1];

  while (u1 != v1 && u0 != v0)
    {
      unsigned long int r;
      if (u1 > v1)
        {
          u1 -= v1 + (u0 < v0);
          u0 = (u0 - v0) & GMP_NUMB_MASK;
          count_trailing_zeros (r, u0);
          u0 = ((u1 << (GMP_NUMB_BITS - r)) & GMP_NUMB_MASK) | (u0 >> r);
          u1 >>= r;
        }
      else  /* u1 < v1.  */
        {
          v1 -= u1 + (v0 < u0);
          v0 = (v0 - u0) & GMP_NUMB_MASK;
          count_trailing_zeros (r, v0);
          v0 = ((v1 << (GMP_NUMB_BITS - r)) & GMP_NUMB_MASK) | (v0 >> r);
          v1 >>= r;
        }
    }

  vp[0] = v0, vp[1] = v1, vsize = 1 + (v1 != 0);

  /* If U == V == GCD, done.  Otherwise, compute GCD (V, |U - V|).  */
  if (u1 == v1 && u0 == v0)
    return vsize;

  v0 = (u0 == v0) ? (u1 > v1) ? u1 - v1 : v1 - u1 : (u0 > v0) ? u0 - v0 : v0 - u0;
  vp[0] = mpn_gcd_1 (vp, vsize, v0);

  return 1;
}
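/* Illustrative note (added commentary, not from the original source):
   gcd_2 is the classical binary GCD specialized to two-limb operands.
   Each pass subtracts the smaller value from the larger and shifts out
   the trailing zero bits, which is valid because both operands are kept
   odd.  A single-limb trace of the same idea: gcd (21, 9): 21 - 9 = 12,
   shift -> 3; then 9 - 3 = 6, shift -> 3; the values meet at 3, which is
   the GCD.  */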
/* The function find_a finds 0 < N < 2^GMP_NUMB_BITS such that there exists
   0 < |D| < 2^GMP_NUMB_BITS, and N == D * C mod 2^(2*GMP_NUMB_BITS).
   In the reference article, D was computed along with N, but it is better
   to compute D separately as D <-- N / C mod 2^(GMP_NUMB_BITS + 1),
   treating the result as a twos' complement signed integer.

   Initialize N1 to C mod 2^(2*GMP_NUMB_BITS).  According to the reference
   article, N2 should be initialized to 2^(2*GMP_NUMB_BITS), but we use
   2^(2*GMP_NUMB_BITS) - N1 to start the calculations within double
   precision.  If N2 > N1 initially, the first iteration of the while loop
   will swap them.  In all other situations, N1 >= N2 is maintained.  */

#if HAVE_NATIVE_mpn_gcd_finda
#define find_a(cp)  mpn_gcd_finda (cp)

#else
static
#if ! defined (__i386__)
inline                          /* don't inline this for the x86 */
#endif
mp_limb_t
find_a (mp_srcptr cp)
{
  unsigned long int leading_zero_bits = 0;

  mp_limb_t n1_l = cp[0];       /* N1 == n1_h * 2^GMP_NUMB_BITS + n1_l.  */
  mp_limb_t n1_h = cp[1];

  mp_limb_t n2_l = (-n1_l & GMP_NUMB_MASK);  /* N2 == n2_h * 2^GMP_NUMB_BITS + n2_l.  */
  mp_limb_t n2_h = (~n1_h & GMP_NUMB_MASK);

  /* Main loop.  */
  while (n2_h != 0)             /* While N2 >= 2^GMP_NUMB_BITS.  */
    {
      /* N1 <-- N1 % N2.  */
      if (((GMP_NUMB_HIGHBIT >> leading_zero_bits) & n2_h) == 0)
        {
          unsigned long int i;
          count_leading_zeros (i, n2_h);
          i -= GMP_NAIL_BITS;
          i -= leading_zero_bits;
          leading_zero_bits += i;

          n2_h = ((n2_h << i) & GMP_NUMB_MASK) | (n2_l >> (GMP_NUMB_BITS - i));
          n2_l = (n2_l << i) & GMP_NUMB_MASK;

          do
            {
              if (n1_h > n2_h || (n1_h == n2_h && n1_l >= n2_l))
                {
                  n1_h -= n2_h + (n1_l < n2_l);
                  n1_l = (n1_l - n2_l) & GMP_NUMB_MASK;
                }
              n2_l = (n2_l >> 1) | ((n2_h << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK);
              n2_h >>= 1;
              i -= 1;
            }
          while (i != 0);
        }
      if (n1_h > n2_h || (n1_h == n2_h && n1_l >= n2_l))
        {
          n1_h -= n2_h + (n1_l < n2_l);
          n1_l = (n1_l - n2_l) & GMP_NUMB_MASK;
        }

      MP_LIMB_T_SWAP (n1_h, n2_h);
      MP_LIMB_T_SWAP (n1_l, n2_l);
    }

  return n2_l;
}
#endif


mp_size_t
mpn_basic_gcd (mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t vsize)
{
  mp_ptr orig_vp = vp;
  mp_size_t orig_vsize = vsize;
  int binary_gcd_ctr;           /* Number of times binary gcd will execute.  */
  mp_size_t scratch;
  mp_ptr tp;
  TMP_DECL;

  ASSERT (usize >= 1);
  ASSERT (vsize >= 1);
  ASSERT (usize >= vsize);
  ASSERT (vp[0] & 1);
  ASSERT (up[usize - 1] != 0);
  ASSERT (vp[vsize - 1] != 0);
#if WANT_ASSERT
  if (usize == vsize)
    {
      int uzeros, vzeros;
      count_leading_zeros (uzeros, up[usize - 1]);
      count_leading_zeros (vzeros, vp[vsize - 1]);
      ASSERT (uzeros <= vzeros);
    }
#endif
  ASSERT (! MPN_OVERLAP_P (up, usize, vp, vsize));
  ASSERT (MPN_SAME_OR_SEPARATE2_P (gp, vsize, up, usize));
  ASSERT (MPN_SAME_OR_SEPARATE2_P (gp, vsize, vp, vsize));

  TMP_MARK;

  /* Use accelerated algorithm if vsize is over GCD_ACCEL_THRESHOLD.
     Two EXTRA limbs for U and V are required for kary reduction.  */
  if (vsize >= GCD_ACCEL_THRESHOLD)
    {
      unsigned long int vbitsize, d;
      mp_ptr orig_up = up;
      mp_size_t orig_usize = usize;
      mp_ptr anchor_up = (mp_ptr) TMP_ALLOC ((usize + 2) * BYTES_PER_MP_LIMB);

      MPN_COPY (anchor_up, orig_up, usize);
      up = anchor_up;

      count_leading_zeros (d, up[usize - 1]);
      d -= GMP_NAIL_BITS;
      d = usize * GMP_NUMB_BITS - d;
      count_leading_zeros (vbitsize, vp[vsize - 1]);
      vbitsize -= GMP_NAIL_BITS;
      vbitsize = vsize * GMP_NUMB_BITS - vbitsize;
      ASSERT (d >= vbitsize);
      d = d - vbitsize + 1;

      /* Use bmod reduction to quickly discover whether V divides U.  */
      up[usize++] = 0;                          /* Insert leading zero.  */
      mpn_bdivmod (up, up, usize, vp, vsize, d);

      /* Now skip U/V mod 2^d and any low zero limbs.  */
      d /= GMP_NUMB_BITS, up += d, usize -= d;
      while (usize != 0 && up[0] == 0)
        up++, usize--;

      if (usize == 0)                           /* GCD == ORIG_V.  */
        goto done;

      vp = (mp_ptr) TMP_ALLOC ((vsize + 2) * BYTES_PER_MP_LIMB);
      MPN_COPY (vp, orig_vp, vsize);
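      /* Informal summary (added commentary): each pass of the loop below
         replaces U by a much smaller value and then swaps U and V.  When
         the operands differ in length by more than BMOD_THRESHOLD bits,
         the bmod step replaces U by (U - Q*V) / 2^d with Q = U/V mod 2^d;
         since V is odd this preserves GCD (U, V).  Otherwise the k-ary
         step is used: with C = V/U mod 2^(2*GMP_NUMB_BITS) and
         N = find_a (C) we have N*U == D*V mod 2^(2*GMP_NUMB_BITS) for a
         single-limb signed D, so after U <-- N*U and subtraction of B*V,
         where B == U/V mod 2^(GMP_NUMB_BITS + 1) recovers D, at least the
         two low limbs of U are zero and can be dropped.  The factor N may
         introduce spurious divisors, which is why the accelerated phase
         finishes by computing GCD (ORIG_V, GCD (ORIG_U, V)) below.  */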
      do                                        /* Main loop.  */
        {
          /* mpn_com_n can't be used here because anchor_up and up may
             partially overlap.  */
          if ((up[usize - 1] & GMP_NUMB_HIGHBIT) != 0)  /* U < 0; take twos' compl.  */
            {
              mp_size_t i;
              anchor_up[0] = -up[0] & GMP_NUMB_MASK;
              for (i = 1; i < usize; i++)
                anchor_up[i] = (~up[i] & GMP_NUMB_MASK);
              up = anchor_up;
            }

          MPN_NORMALIZE_NOT_ZERO (up, usize);

          if ((up[0] & 1) == 0)                 /* Result even; remove twos.  */
            {
              unsigned int r;
              count_trailing_zeros (r, up[0]);
              mpn_rshift (anchor_up, up, usize, r);
              usize -= (anchor_up[usize - 1] == 0);
            }
          else if (anchor_up != up)
            MPN_COPY_INCR (anchor_up, up, usize);

          MPN_PTR_SWAP (anchor_up, usize, vp, vsize);
          up = anchor_up;

          if (vsize <= 2)               /* Kary can't handle < 2 limbs and  */
            break;                      /* isn't efficient for == 2 limbs.  */

          d = vbitsize;
          count_leading_zeros (vbitsize, vp[vsize - 1]);
          vbitsize -= GMP_NAIL_BITS;
          vbitsize = vsize * GMP_NUMB_BITS - vbitsize;
          d = d - vbitsize + 1;

          if (d > BMOD_THRESHOLD)       /* Bmod reduction.  */
            {
              up[usize++] = 0;
              mpn_bdivmod (up, up, usize, vp, vsize, d);
              d /= GMP_NUMB_BITS, up += d, usize -= d;
            }
          else                          /* Kary reduction.  */
            {
              mp_limb_t bp[2], cp[2];

              /* C <-- V/U mod 2^(2*GMP_NUMB_BITS).  */
              {
                mp_limb_t u_inv, hi, lo;
                modlimb_invert (u_inv, up[0]);
                cp[0] = (vp[0] * u_inv) & GMP_NUMB_MASK;
                umul_ppmm (hi, lo, cp[0], up[0] << GMP_NAIL_BITS);
                lo >>= GMP_NAIL_BITS;
                cp[1] = (vp[1] - hi - cp[0] * up[1]) * u_inv & GMP_NUMB_MASK;
              }

              /* U <-- find_a (C) * U.  */
              up[usize] = mpn_mul_1 (up, up, usize, find_a (cp));
              usize++;

              /* B <-- A/C == U/V mod 2^(GMP_NUMB_BITS + 1).
                 bp[0] <-- U/V mod 2^GMP_NUMB_BITS and
                 bp[1] <-- ( (U - bp[0] * V)/2^GMP_NUMB_BITS ) / V mod 2

                 Like V/U above, but simplified because only the low bit
                 of bp[1] is wanted.  */
              {
                mp_limb_t v_inv, hi, lo;
                modlimb_invert (v_inv, vp[0]);
                bp[0] = (up[0] * v_inv) & GMP_NUMB_MASK;
                umul_ppmm (hi, lo, bp[0], vp[0] << GMP_NAIL_BITS);
                lo >>= GMP_NAIL_BITS;
                bp[1] = (up[1] + hi + (bp[0] & vp[1])) & 1;
              }

              up[usize++] = 0;
              if (bp[1] != 0)   /* B < 0: U <-- U + (-B) * V.  */
                {
                  mp_limb_t c = mpn_addmul_1 (up, vp, vsize, -bp[0] & GMP_NUMB_MASK);
                  mpn_add_1 (up + vsize, up + vsize, usize - vsize, c);
                }
              else              /* B >= 0: U <-- U - B * V.  */
                {
                  mp_limb_t b = mpn_submul_1 (up, vp, vsize, bp[0]);
                  mpn_sub_1 (up + vsize, up + vsize, usize - vsize, b);
                }

              up += 2, usize -= 2;      /* At least two low limbs are zero.  */
            }

          /* Must remove low zero limbs before complementing.  */
          while (usize != 0 && up[0] == 0)
            up++, usize--;
        }
      while (usize != 0);

      /* Compute GCD (ORIG_V, GCD (ORIG_U, V)).  Binary will execute twice.  */
      up = orig_up, usize = orig_usize;
      binary_gcd_ctr = 2;
    }
  else
    binary_gcd_ctr = 1;
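  /* Note on the scratch size below (added commentary): the finishing loop
     needs MPN_NGCD_LEHMER_ITCH (vsize) limbs for mpn_ngcd_lehmer, and when
     usize > vsize it also calls mpn_tdiv_qr, which writes a vsize-limb
     remainder at tp and a (usize - vsize + 1)-limb quotient at tp + vsize,
     i.e. usize + 1 limbs in total; hence the maximum of the two is
     allocated.  */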
  scratch = MPN_NGCD_LEHMER_ITCH (vsize);
  if (usize + 1 > scratch)
    scratch = usize + 1;
  tp = TMP_ALLOC_LIMBS (scratch);

  /* Finish up with the Lehmer/binary GCD.  Executes once or twice.  */
  for ( ; binary_gcd_ctr--; up = orig_vp, usize = orig_vsize)
    {
#if 1
      if (usize > vsize)
        {
          /* FIXME: Could use mpn_bdivmod.  */
          mp_size_t rsize;
          mpn_tdiv_qr (tp + vsize, tp, 0, up, usize, vp, vsize);
          rsize = vsize;
          MPN_NORMALIZE (tp, rsize);
          if (rsize == 0)
            continue;
          MPN_COPY (up, tp, vsize);
        }
      vsize = mpn_ngcd_lehmer (vp, up, vp, vsize, tp);
#else
      if (usize > 2)            /* First make U close to V in size.  */
        {
          unsigned long int vbitsize, d;
          count_leading_zeros (d, up[usize - 1]);
          d -= GMP_NAIL_BITS;
          d = usize * GMP_NUMB_BITS - d;
          count_leading_zeros (vbitsize, vp[vsize - 1]);
          vbitsize -= GMP_NAIL_BITS;
          vbitsize = vsize * GMP_NUMB_BITS - vbitsize;
          d = d - vbitsize - 1;

          if (d != -(unsigned long int) 1 && d > 2)
            {
              mpn_bdivmod (up, up, usize, vp, vsize, d);  /* Result > 0.  */
              d /= (unsigned long int) GMP_NUMB_BITS, up += d, usize -= d;
            }
        }

      /* Start binary GCD.  */
      do
        {
          mp_size_t zeros;

          /* Make sure U is odd.  */
          MPN_NORMALIZE (up, usize);
          while (up[0] == 0)
            up += 1, usize -= 1;
          if ((up[0] & 1) == 0)
            {
              unsigned int r;
              count_trailing_zeros (r, up[0]);
              mpn_rshift (up, up, usize, r);
              usize -= (up[usize - 1] == 0);
            }

          /* Keep usize >= vsize.  */
          if (usize < vsize)
            MPN_PTR_SWAP (up, usize, vp, vsize);

          if (usize <= 2)                       /* Double precision.  */
            {
              if (vsize == 1)
                vp[0] = mpn_gcd_1 (up, usize, vp[0]);
              else
                vsize = gcd_2 (vp, up);
              break;                            /* Binary GCD done.  */
            }

          /* Count number of low zero limbs of U - V.  */
          for (zeros = 0; up[zeros] == vp[zeros] && ++zeros != vsize; )
            continue;

          /* If U < V, swap U and V; in any case, subtract V from U.  */
          if (zeros == vsize)                   /* Subtract done.  */
            up += zeros, usize -= zeros;
          else if (usize == vsize)
            {
              mp_size_t size = vsize;
              do
                size--;
              while (up[size] == vp[size]);

              if (up[size] < vp[size])          /* usize == vsize.  */
                MP_PTR_SWAP (up, vp);
              up += zeros, usize = size + 1 - zeros;
              mpn_sub_n (up, up, vp + zeros, usize);
            }
          else
            {
              mp_size_t size = vsize - zeros;
              up += zeros, usize -= zeros;
              if (mpn_sub_n (up, up, vp + zeros, size))
                {
                  while (up[size] == 0)         /* Propagate borrow.  */
                    up[size++] = -(mp_limb_t) 1;
                  up[size] -= 1;
                }
            }
        }
      while (usize);                            /* End binary GCD.  */
#endif
    }

done:
  if (vp != gp)
    MPN_COPY_INCR (gp, vp, vsize);
  TMP_FREE;
  return vsize;
}

/* ******************************************************************
 * END of original GMP mpn_gcd
 * *****************************************************************/

/*
 * The remainder of this code is from Moller's patches.
 *
 * To make this code work with "make tune" we need to conditionally
 * exclude the Moller code when this file gets included inside of
 * gcd_bin.c in ../tune.
 */

#ifndef INSIDE_TUNE_GCD_BIN

/* For input of size n, matrix elements are of size at most
   ceil(n/2) - 1, but we need one limb extra.  */
void
mpn_ngcd_matrix_init (struct ngcd_matrix *M, mp_size_t n, mp_ptr p)
{
  mp_size_t s = (n + 1) / 2;
  M->alloc = s;
  M->n = 1;
  MPN_ZERO (p, 4 * s);
  M->p[0][0] = p;
  M->p[0][1] = p + s;
  M->p[1][0] = p + 2 * s;
  M->p[1][1] = p + 3 * s;
  M->tp = p + 4 * s;

  M->p[0][0][0] = M->p[1][1][0] = 1;
}

#define NHGCD_BASE_ITCH MPN_NGCD_STEP_ITCH

/* Reduces a,b until |a-b| fits in n/2 + 1 limbs.  Constructs matrix M
   with elements of size at most (n+1)/2 - 1.  Returns new size of a, b,
   or zero if no reduction is possible.  */
static mp_size_t
nhgcd_base (mp_ptr ap, mp_ptr bp, mp_size_t n,
            struct ngcd_matrix *M, mp_ptr tp)
{
  mp_size_t s = n/2 + 1;
  mp_size_t nn;

  ASSERT (n > s);
  ASSERT (ap[n-1] > 0 || bp[n-1] > 0);

  nn = mpn_ngcd_step (n, ap, bp, s, M, tp);
  if (!nn)
    return 0;

  for (;;)
    {
      n = nn;
      ASSERT (n > s);
      nn = mpn_ngcd_step (n, ap, bp, s, M, tp);
      if (!nn)
        return n;
    }
}

/* Size analysis for nhgcd:

   For the recursive calls, we have n1 <= ceil(n / 2).  Then the storage
   need is determined by the storage for the recursive call computing M1,
   and the ngcd_matrix_adjust and ngcd_matrix_mul calls that use M1 (after
   this, the storage needed for M1 can be recycled).

   Let S(r) denote the required storage.  For M1 we need
   5 * ceil(n1/2) = 5 * ceil(n/4), and for the ngcd_matrix_adjust call we
   need n + 2.  For the matrix multiplication we need
   4*n1 + 3*ceil(n1/2) + 3, so 3n + 3 will do.  In total,
   5 * ceil(n/4) + 3n + 3 <= 17 ceil(n/4) + 3.

   For the recursive call, we need S(n1) = S(ceil(n/2)).

   S(n) <= 17*ceil(n/4) + 3 + S(ceil(n/2))
        <= 17*(ceil(n/4) + ... + ceil(n/2^(1+k))) + 3k + S(ceil(n/2^k))
        <= 17*(2 ceil(n/4) + k) + 3k + S(n/2^k)
        <= 34 ceil(n/4) + 20k + S(n/2^k)
*/
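/* Added note: mpn_nhgcd_itch below mirrors this bound.  It counts the
   number k of halvings needed to get below NHGCD_THRESHOLD and then
   charges 35 * ceil(n/4) + 20*k limbs on top of the base-case scratch,
   slightly more than the 34 ceil(n/4) + 20k derived above, presumably to
   leave a margin for the rounding in the recursion.  */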
mp_size_t
mpn_nhgcd_itch (mp_size_t n)
{
  unsigned k;
  mp_size_t nn;

  /* Inefficient way to almost compute log_2 (n/NHGCD_BASE_THRESHOLD) */
  for (k = 0, nn = n; ABOVE_THRESHOLD (nn, NHGCD_THRESHOLD); nn = (nn + 1) / 2)
    k++;

  if (k == 0)
    return NHGCD_BASE_ITCH (n);

  return 35 * ((n + 3) / 4) + 20 * k + NHGCD_BASE_ITCH (NHGCD_THRESHOLD);
}

/* Reduces a,b until |a-b| fits in n/2 + 1 limbs.  Constructs matrix M
   with elements of size at most (n+1)/2 - 1.  Returns new size of a, b,
   or zero if no reduction is possible.  */
mp_size_t
mpn_nhgcd (mp_ptr ap, mp_ptr bp, mp_size_t n,
           struct ngcd_matrix *M, mp_ptr tp)
{
  mp_size_t s = n/2 + 1;
  mp_size_t n2 = (3*n)/4 + 1;

  mp_size_t p, nn;
  unsigned count;
  int success = 0;

  ASSERT (n > s);
  ASSERT ((ap[n-1] | bp[n-1]) > 0);

  ASSERT ((n+1)/2 - 1 < M->alloc);

  if (BELOW_THRESHOLD (n, NHGCD_THRESHOLD))
    return nhgcd_base (ap, bp, n, M, tp);

  p = n/2;
  nn = mpn_nhgcd (ap + p, bp + p, n - p, M, tp);
  if (nn > 0)
    {
      /* Needs 2*(p + M->n) <= 2*(floor(n/2) + ceil(n/2) - 1)
         = 2 (n - 1) */
      n = mpn_ngcd_matrix_adjust (M, p + nn, ap, bp, p, tp);
      success = 1;
    }

  count = 0;
  while (n > n2)
    {
      count++;
      /* Needs n + 1 storage */
      nn = mpn_ngcd_step (n, ap, bp, s, M, tp);
      if (!nn)
        return success ? n : 0;

      n = nn;
      success = 1;
    }

  if (n > s + 2)
    {
      struct ngcd_matrix M1;
      mp_size_t scratch;

      p = 2*s - n + 1;
      scratch = MPN_NGCD_MATRIX_INIT_ITCH (n - p);

      mpn_ngcd_matrix_init (&M1, n - p, tp);
      nn = mpn_nhgcd (ap + p, bp + p, n - p, &M1, tp + scratch);
      if (nn > 0)
        {
          /* Needs 2 (p + M->n) <= 2 (2*s - n2 + 1 + n2 - s - 1)
             = 2*s <= 2*(floor(n/2) + 1) <= n + 2. */
          n = mpn_ngcd_matrix_adjust (&M1, p + nn, ap, bp, p, tp + scratch);

          /* Needs M->n <= n2 - s - 1 < n/4 */
          mpn_ngcd_matrix_mul (M, &M1, tp + scratch);
          success = 1;
        }
    }

  /* FIXME: This really is the base case */
  for (count = 0;; count++)
    {
      /* Needs s + 3 < n */
      nn = mpn_ngcd_step (n, ap, bp, s, M, tp);
      if (!nn)
        return success ? n : 0;

      n = nn;
      success = 1;
    }
}
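/* Added overview (informal, derived from the code below): mpn_gcd
   dispatches on the operand size n.  Small operands go straight to
   mpn_lgcd or mpn_basic_gcd.  Larger operands are first reduced so that
   an == n, then repeatedly shortened by applying mpn_nhgcd to the high
   n - P_SIZE(n) limbs and propagating the resulting matrix with
   mpn_ngcd_matrix_adjust (falling back to mpn_ngcd_subdiv_step when no
   progress is made); what remains is finished by mpn_ngcd_lehmer or,
   after making B odd, by mpn_basic_gcd.  */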
#define EVEN_P(x) (((x) & 1) == 0)

#define LGCD_THRESHOLD 64

#define P_SIZE(n) (n/2)

mp_size_t
mpn_gcd (mp_ptr gp, mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n)
{
  mp_size_t init_scratch;
  mp_size_t scratch;
  mp_ptr tp;
  TMP_DECL;

  ASSERT (an >= n);

  if (BELOW_THRESHOLD (n, NGCD_THRESHOLD))
    {
      if (BELOW_THRESHOLD (n, LGCD_THRESHOLD))
        return mpn_lgcd (gp, ap, an, bp, n);

      return mpn_basic_gcd (gp, ap, an, bp, n);
    }

  init_scratch = MPN_NGCD_MATRIX_INIT_ITCH (n - P_SIZE(n));
  scratch = mpn_nhgcd_itch ((n+1)/2);

  /* Space needed for mpn_ngcd_matrix_adjust */
  if (scratch < 2*n)
    scratch = 2*n;
  if (scratch < MPN_NGCD_LEHMER_ITCH (n))       /* Space needed by Lehmer GCD */
    scratch = MPN_NGCD_LEHMER_ITCH (n);

  TMP_MARK;

  if (an + 1 > init_scratch + scratch)
    tp = TMP_ALLOC_LIMBS (an + 1);
  else
    tp = TMP_ALLOC_LIMBS (init_scratch + scratch);

  if (an > n)
    {
      mp_ptr rp = tp;
      mp_ptr qp = rp + n;

      mpn_tdiv_qr (qp, rp, 0, ap, an, bp, n);
      MPN_COPY (ap, rp, n);
      an = n;
      MPN_NORMALIZE (ap, an);
      if (an == 0)
        {
          MPN_COPY (gp, bp, n);
          TMP_FREE;
          return n;
        }
    }

  while (ABOVE_THRESHOLD (n, NGCD_THRESHOLD))
    {
      struct ngcd_matrix M;
      mp_size_t p = P_SIZE(n);
      mp_size_t nn;

      mpn_ngcd_matrix_init (&M, n - p, tp);
      nn = mpn_nhgcd (ap + p, bp + p, n - p, &M, tp + init_scratch);
      if (nn > 0)
        /* Needs 2*(p + M->n) <= 2*(floor(n/2) + ceil(n/2) - 1)
           = 2 (n - 1) */
        n = mpn_ngcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + init_scratch);
      else
        {
          mp_size_t gn;
          n = mpn_ngcd_subdiv_step (gp, &gn, ap, bp, n, tp);
          if (n == 0)
            {
              TMP_FREE;
              return gn;
            }
        }
    }

  ASSERT (ap[n-1] > 0 || bp[n-1] > 0);

  if (ap[n-1] < bp[n-1])
    MP_PTR_SWAP (ap, bp);

  if (BELOW_THRESHOLD (n, LGCD_THRESHOLD))
    {
      n = mpn_ngcd_lehmer (gp, ap, bp, n, tp);

      TMP_FREE;
      return n;
    }

  an = n;
  MPN_NORMALIZE (bp, n);

  if (n == 0)
    {
      MPN_COPY (gp, ap, an);
      TMP_FREE;
      return an;
    }

  if (EVEN_P (bp[0]))
    {
      /* Then a must be odd (since the calling convention implies that
         there's no common factor of 2).  */
      ASSERT (!EVEN_P (ap[0]));

      while (bp[0] == 0)
        {
          bp++;
          n--;
        }

      if (EVEN_P (bp[0]))
        {
          int count;
          count_trailing_zeros (count, bp[0]);
          ASSERT_NOCARRY (mpn_rshift (bp, bp, n, count));
          n -= (bp[n-1] == 0);
        }
    }

  TMP_FREE;
  return mpn_basic_gcd (gp, ap, an, bp, n);
}

#endif /* ! INSIDE_TUNE_GCD_BIN */