mpir/mpn/generic/gcd.c
2009-07-22 04:26:36 +00:00

772 lines
20 KiB
C

/* Schönhage's 1987 algorithm, reorganized into hgcd form */
#include <stdio.h> /* for NULL */
#include "mpir.h"
#include "gmp-impl.h"
#include "longlong.h"
/* ******************************************************************
* Here we are including the original GMP version of mpn_gcd
* but we rename it as mpn_basic_gcd. It needs to be available
* for the ngcd algorithm to call in the base case.
*
* Preconditions [U = (up, usize) and V = (vp, vsize)]:
*
* 1. V is odd.
* 2. numbits(U) >= numbits(V).
*
* Both U and V are destroyed by the operation. The result is left at vp,
* and its size is returned.
*
* Ken Weber (kweber@mat.ufrgs.br, kweber@mcs.kent.edu)
*
* Funding for this work has been partially provided by Conselho
* Nacional de Desenvolvimento Cienti'fico e Tecnolo'gico (CNPq) do
* Brazil, Grant 301314194-2, and was done while I was a visiting
* reseacher in the Instituto de Matema'tica at Universidade Federal
* do Rio Grande do Sul (UFRGS).
*
* Refer to K. Weber, The accelerated integer GCD algorithm, ACM
* Transactions on Mathematical Software, v. 21 (March), 1995,
* pp. 111-122.
*
* *****************************************************************/
/* If MIN (usize, vsize) >= GCD_ACCEL_THRESHOLD, then the accelerated
algorithm is used, otherwise the binary algorithm is used. This may be
adjusted for different architectures. */
#ifndef GCD_ACCEL_THRESHOLD
#define GCD_ACCEL_THRESHOLD 5
#endif
/* When U and V differ in size by more than BMOD_THRESHOLD, the accelerated
algorithm reduces using the bmod operation. Otherwise, the k-ary reduction
is used. 0 <= BMOD_THRESHOLD < GMP_NUMB_BITS. */
enum
{
BMOD_THRESHOLD = GMP_NUMB_BITS/2
};
/* Use binary algorithm to compute V <-- GCD (V, U) for usize, vsize == 2.
Both U and V must be odd. */
static inline mp_size_t
gcd_2 (mp_ptr vp, mp_srcptr up)
{
mp_limb_t u0, u1, v0, v1;
mp_size_t vsize;
u0 = up[0];
u1 = up[1];
v0 = vp[0];
v1 = vp[1];
while (u1 != v1 && u0 != v0)
{
unsigned long int r;
if (u1 > v1)
{
u1 -= v1 + (u0 < v0);
u0 = (u0 - v0) & GMP_NUMB_MASK;
count_trailing_zeros (r, u0);
u0 = ((u1 << (GMP_NUMB_BITS - r)) & GMP_NUMB_MASK) | (u0 >> r);
u1 >>= r;
}
else /* u1 < v1. */
{
v1 -= u1 + (v0 < u0);
v0 = (v0 - u0) & GMP_NUMB_MASK;
count_trailing_zeros (r, v0);
v0 = ((v1 << (GMP_NUMB_BITS - r)) & GMP_NUMB_MASK) | (v0 >> r);
v1 >>= r;
}
}
vp[0] = v0, vp[1] = v1, vsize = 1 + (v1 != 0);
/* If U == V == GCD, done. Otherwise, compute GCD (V, |U - V|). */
if (u1 == v1 && u0 == v0)
return vsize;
v0 = (u0 == v0) ? (u1 > v1) ? u1-v1 : v1-u1 : (u0 > v0) ? u0-v0 : v0-u0;
vp[0] = mpn_gcd_1 (vp, vsize, v0);
return 1;
}
/* The function find_a finds 0 < N < 2^GMP_NUMB_BITS such that there exists
0 < |D| < 2^GMP_NUMB_BITS, and N == D * C mod 2^(2*GMP_NUMB_BITS).
In the reference article, D was computed along with N, but it is better to
compute D separately as D <-- N / C mod 2^(GMP_NUMB_BITS + 1), treating
the result as a twos' complement signed integer.
Initialize N1 to C mod 2^(2*GMP_NUMB_BITS). According to the reference
article, N2 should be initialized to 2^(2*GMP_NUMB_BITS), but we use
2^(2*GMP_NUMB_BITS) - N1 to start the calculations within double
precision. If N2 > N1 initially, the first iteration of the while loop
will swap them. In all other situations, N1 >= N2 is maintained. */
#if HAVE_NATIVE_mpn_gcd_finda
#define find_a(cp) mpn_gcd_finda (cp)
#else
static
#if ! defined (__i386__)
inline /* don't inline this for the x86 */
#endif
mp_limb_t
find_a (mp_srcptr cp)
{
unsigned long int leading_zero_bits = 0;
mp_limb_t n1_l = cp[0]; /* N1 == n1_h * 2^GMP_NUMB_BITS + n1_l. */
mp_limb_t n1_h = cp[1];
mp_limb_t n2_l = (-n1_l & GMP_NUMB_MASK); /* N2 == n2_h * 2^GMP_NUMB_BITS + n2_l. */
mp_limb_t n2_h = (~n1_h & GMP_NUMB_MASK);
/* Main loop. */
while (n2_h != 0) /* While N2 >= 2^GMP_NUMB_BITS. */
{
/* N1 <-- N1 % N2. */
if (((GMP_NUMB_HIGHBIT >> leading_zero_bits) & n2_h) == 0)
{
unsigned long int i;
count_leading_zeros (i, n2_h);
i -= GMP_NAIL_BITS;
i -= leading_zero_bits;
leading_zero_bits += i;
n2_h = ((n2_h << i) & GMP_NUMB_MASK) | (n2_l >> (GMP_NUMB_BITS - i));
n2_l = (n2_l << i) & GMP_NUMB_MASK;
do
{
if (n1_h > n2_h || (n1_h == n2_h && n1_l >= n2_l))
{
n1_h -= n2_h + (n1_l < n2_l);
n1_l = (n1_l - n2_l) & GMP_NUMB_MASK;
}
n2_l = (n2_l >> 1) | ((n2_h << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK);
n2_h >>= 1;
i -= 1;
}
while (i != 0);
}
if (n1_h > n2_h || (n1_h == n2_h && n1_l >= n2_l))
{
n1_h -= n2_h + (n1_l < n2_l);
n1_l = (n1_l - n2_l) & GMP_NUMB_MASK;
}
MP_LIMB_T_SWAP (n1_h, n2_h);
MP_LIMB_T_SWAP (n1_l, n2_l);
}
return n2_l;
}
#endif
mp_size_t
mpn_basic_gcd (mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t vsize)
{
mp_ptr orig_vp = vp;
mp_size_t orig_vsize = vsize;
int binary_gcd_ctr; /* Number of times binary gcd will execute. */
mp_size_t scratch;
mp_ptr tp;
TMP_DECL;
ASSERT (usize >= 1);
ASSERT (vsize >= 1);
ASSERT (usize >= vsize);
ASSERT (vp[0] & 1);
ASSERT (up[usize - 1] != 0);
ASSERT (vp[vsize - 1] != 0);
#if WANT_ASSERT
if (usize == vsize)
{
int uzeros, vzeros;
count_leading_zeros (uzeros, up[usize - 1]);
count_leading_zeros (vzeros, vp[vsize - 1]);
ASSERT (uzeros <= vzeros);
}
#endif
ASSERT (! MPN_OVERLAP_P (up, usize, vp, vsize));
ASSERT (MPN_SAME_OR_SEPARATE2_P (gp, vsize, up, usize));
ASSERT (MPN_SAME_OR_SEPARATE2_P (gp, vsize, vp, vsize));
TMP_MARK;
/* Use accelerated algorithm if vsize is over GCD_ACCEL_THRESHOLD.
Two EXTRA limbs for U and V are required for kary reduction. */
if (vsize >= GCD_ACCEL_THRESHOLD)
{
unsigned long int vbitsize, d;
mp_ptr orig_up = up;
mp_size_t orig_usize = usize;
mp_ptr anchor_up = (mp_ptr) TMP_ALLOC ((usize + 2) * BYTES_PER_MP_LIMB);
MPN_COPY (anchor_up, orig_up, usize);
up = anchor_up;
count_leading_zeros (d, up[usize - 1]);
d -= GMP_NAIL_BITS;
d = usize * GMP_NUMB_BITS - d;
count_leading_zeros (vbitsize, vp[vsize - 1]);
vbitsize -= GMP_NAIL_BITS;
vbitsize = vsize * GMP_NUMB_BITS - vbitsize;
ASSERT (d >= vbitsize);
d = d - vbitsize + 1;
/* Use bmod reduction to quickly discover whether V divides U. */
up[usize++] = 0; /* Insert leading zero. */
mpn_bdivmod (up, up, usize, vp, vsize, d);
/* Now skip U/V mod 2^d and any low zero limbs. */
d /= GMP_NUMB_BITS, up += d, usize -= d;
while (usize != 0 && up[0] == 0)
up++, usize--;
if (usize == 0) /* GCD == ORIG_V. */
goto done;
vp = (mp_ptr) TMP_ALLOC ((vsize + 2) * BYTES_PER_MP_LIMB);
MPN_COPY (vp, orig_vp, vsize);
do /* Main loop. */
{
/* mpn_com_n can't be used here because anchor_up and up may
partially overlap */
if ((up[usize - 1] & GMP_NUMB_HIGHBIT) != 0) /* U < 0; take twos' compl. */
{
mp_size_t i;
anchor_up[0] = -up[0] & GMP_NUMB_MASK;
for (i = 1; i < usize; i++)
anchor_up[i] = (~up[i] & GMP_NUMB_MASK);
up = anchor_up;
}
MPN_NORMALIZE_NOT_ZERO (up, usize);
if ((up[0] & 1) == 0) /* Result even; remove twos. */
{
unsigned int r;
count_trailing_zeros (r, up[0]);
mpn_rshift (anchor_up, up, usize, r);
usize -= (anchor_up[usize - 1] == 0);
}
else if (anchor_up != up)
MPN_COPY_INCR (anchor_up, up, usize);
MPN_PTR_SWAP (anchor_up,usize, vp,vsize);
up = anchor_up;
if (vsize <= 2) /* Kary can't handle < 2 limbs and */
break; /* isn't efficient for == 2 limbs. */
d = vbitsize;
count_leading_zeros (vbitsize, vp[vsize - 1]);
vbitsize -= GMP_NAIL_BITS;
vbitsize = vsize * GMP_NUMB_BITS - vbitsize;
d = d - vbitsize + 1;
if (d > BMOD_THRESHOLD) /* Bmod reduction. */
{
up[usize++] = 0;
mpn_bdivmod (up, up, usize, vp, vsize, d);
d /= GMP_NUMB_BITS, up += d, usize -= d;
}
else /* Kary reduction. */
{
mp_limb_t bp[2], cp[2];
/* C <-- V/U mod 2^(2*GMP_NUMB_BITS). */
{
mp_limb_t u_inv, hi, lo;
modlimb_invert (u_inv, up[0]);
cp[0] = (vp[0] * u_inv) & GMP_NUMB_MASK;
umul_ppmm (hi, lo, cp[0], up[0] << GMP_NAIL_BITS);
lo >>= GMP_NAIL_BITS;
cp[1] = (vp[1] - hi - cp[0] * up[1]) * u_inv & GMP_NUMB_MASK;
}
/* U <-- find_a (C) * U. */
up[usize] = mpn_mul_1 (up, up, usize, find_a (cp));
usize++;
/* B <-- A/C == U/V mod 2^(GMP_NUMB_BITS + 1).
bp[0] <-- U/V mod 2^GMP_NUMB_BITS and
bp[1] <-- ( (U - bp[0] * V)/2^GMP_NUMB_BITS ) / V mod 2
Like V/U above, but simplified because only the low bit of
bp[1] is wanted. */
{
mp_limb_t v_inv, hi, lo;
modlimb_invert (v_inv, vp[0]);
bp[0] = (up[0] * v_inv) & GMP_NUMB_MASK;
umul_ppmm (hi, lo, bp[0], vp[0] << GMP_NAIL_BITS);
lo >>= GMP_NAIL_BITS;
bp[1] = (up[1] + hi + (bp[0] & vp[1])) & 1;
}
up[usize++] = 0;
if (bp[1] != 0) /* B < 0: U <-- U + (-B) * V. */
{
mp_limb_t c = mpn_addmul_1 (up, vp, vsize, -bp[0] & GMP_NUMB_MASK);
mpn_add_1 (up + vsize, up + vsize, usize - vsize, c);
}
else /* B >= 0: U <-- U - B * V. */
{
mp_limb_t b = mpn_submul_1 (up, vp, vsize, bp[0]);
mpn_sub_1 (up + vsize, up + vsize, usize - vsize, b);
}
up += 2, usize -= 2; /* At least two low limbs are zero. */
}
/* Must remove low zero limbs before complementing. */
while (usize != 0 && up[0] == 0)
up++, usize--;
}
while (usize != 0);
/* Compute GCD (ORIG_V, GCD (ORIG_U, V)). Binary will execute twice. */
up = orig_up, usize = orig_usize;
binary_gcd_ctr = 2;
}
else
binary_gcd_ctr = 1;
scratch = MPN_NGCD_LEHMER_ITCH (vsize);
if (usize + 1 > scratch)
scratch = usize + 1;
tp = TMP_ALLOC_LIMBS (scratch);
/* Finish up with the binary algorithm. Executes once or twice. */
for ( ; binary_gcd_ctr--; up = orig_vp, usize = orig_vsize)
{
#if 1
if (usize > vsize)
{
/* FIXME: Could use mpn_bdivmod. */
mp_size_t rsize;
mpn_tdiv_qr (tp + vsize, tp, 0, up, usize, vp, vsize);
rsize = vsize;
MPN_NORMALIZE (tp, rsize);
if (rsize == 0)
continue;
MPN_COPY (up, tp, vsize);
}
vsize = mpn_ngcd_lehmer (vp, up, vp, vsize, tp);
#else
if (usize > 2) /* First make U close to V in size. */
{
unsigned long int vbitsize, d;
count_leading_zeros (d, up[usize - 1]);
d -= GMP_NAIL_BITS;
d = usize * GMP_NUMB_BITS - d;
count_leading_zeros (vbitsize, vp[vsize - 1]);
vbitsize -= GMP_NAIL_BITS;
vbitsize = vsize * GMP_NUMB_BITS - vbitsize;
d = d - vbitsize - 1;
if (d != -(unsigned long int)1 && d > 2)
{
mpn_bdivmod (up, up, usize, vp, vsize, d); /* Result > 0. */
d /= (unsigned long int)GMP_NUMB_BITS, up += d, usize -= d;
}
}
/* Start binary GCD. */
do
{
mp_size_t zeros;
/* Make sure U is odd. */
MPN_NORMALIZE (up, usize);
while (up[0] == 0)
up += 1, usize -= 1;
if ((up[0] & 1) == 0)
{
unsigned int r;
count_trailing_zeros (r, up[0]);
mpn_rshift (up, up, usize, r);
usize -= (up[usize - 1] == 0);
}
/* Keep usize >= vsize. */
if (usize < vsize)
MPN_PTR_SWAP (up, usize, vp, vsize);
if (usize <= 2) /* Double precision. */
{
if (vsize == 1)
vp[0] = mpn_gcd_1 (up, usize, vp[0]);
else
vsize = gcd_2 (vp, up);
break; /* Binary GCD done. */
}
/* Count number of low zero limbs of U - V. */
for (zeros = 0; up[zeros] == vp[zeros] && ++zeros != vsize; )
continue;
/* If U < V, swap U and V; in any case, subtract V from U. */
if (zeros == vsize) /* Subtract done. */
up += zeros, usize -= zeros;
else if (usize == vsize)
{
mp_size_t size = vsize;
do
size--;
while (up[size] == vp[size]);
if (up[size] < vp[size]) /* usize == vsize. */
MP_PTR_SWAP (up, vp);
up += zeros, usize = size + 1 - zeros;
mpn_sub_n (up, up, vp + zeros, usize);
}
else
{
mp_size_t size = vsize - zeros;
up += zeros, usize -= zeros;
if (mpn_sub_n (up, up, vp + zeros, size))
{
while (up[size] == 0) /* Propagate borrow. */
up[size++] = -(mp_limb_t)1;
up[size] -= 1;
}
}
}
while (usize); /* End binary GCD. */
#endif
}
done:
if (vp != gp)
MPN_COPY_INCR (gp, vp, vsize);
TMP_FREE;
return vsize;
}
/* ******************************************************************
* END of original GMP mpn_gcd
* *****************************************************************/
/*
* The remainder of this code is from Moller's patches.
*
* To make this code work with "make tune" we need to conditionally
* exclude the Moller code when this file gets included inside of
* gcd_bin.c in ../tune.
*/
#ifndef INSIDE_TUNE_GCD_BIN
/* For input of size n, matrix elements are of size at most ceil(n/2)
- 1, but we need one limb extra. */
void
mpn_ngcd_matrix_init (struct ngcd_matrix *M, mp_size_t n, mp_ptr p)
{
mp_size_t s = (n+1)/2;
M->alloc = s;
M->n = 1;
MPN_ZERO (p, 4 * s);
M->p[0][0] = p;
M->p[0][1] = p + s;
M->p[1][0] = p + 2 * s;
M->p[1][1] = p + 3 * s;
M->tp = p + 4*s;
M->p[0][0][0] = M->p[1][1][0] = 1;
}
#define NHGCD_BASE_ITCH MPN_NGCD_STEP_ITCH
/* Reduces a,b until |a-b| fits in n/2 + 1 limbs. Constructs matrix M
with elements of size at most (n+1)/2 - 1. Returns new size of a,
b, or zero if no reduction is possible. */
static mp_size_t
nhgcd_base (mp_ptr ap, mp_ptr bp, mp_size_t n,
struct ngcd_matrix *M, mp_ptr tp)
{
mp_size_t s = n/2 + 1;
mp_size_t nn;
ASSERT (n > s);
ASSERT (ap[n-1] > 0 || bp[n-1] > 0);
nn = mpn_ngcd_step (n, ap, bp, s, M, tp);
if (!nn)
return 0;
for (;;)
{
n = nn;
ASSERT (n > s);
nn = mpn_ngcd_step (n, ap, bp, s, M, tp);
if (!nn )
return n;
}
}
/* Size analysis for nhgcd:
For the recursive calls, we have n1 <= ceil(n / 2). Then the
storage need is determined by the storage for the recursive call
computing M1, and ngcd_matrix_adjust and ngcd_matrix_mul calls that use M1
(after this, the storage needed for M1 can be recycled).
Let S(r) denote the required storage. For M1 we need 5 * ceil(n1/2)
= 5 * ceil(n/4), and for the ngcd_matrix_adjust call, we need n + 2. For the
matrix multiplication we need 4*n1 + 3*ceil(n1/2) + 3, so 3n + 3 will do.
In total, 5 * ceil(n/4) + 3n + 3 <= 17 ceil(n/4) + 3.
For the recursive call, we need S(n1) = S(ceil(n/2)).
S(n) <= 17*ceil(n/4) + 3 + S(ceil(n/2))
<= 17*(ceil(n/4) + ... + ceil(n/2^(1+k))) + 3k + S(ceil(n/2^k))
<= 17*(2 ceil(n/4) + k) + 3k + S(n/2^k)
<= 34 ceil(n/4) + 20k + S(n/2^k)
*/
mp_size_t
mpn_nhgcd_itch (mp_size_t n)
{
unsigned k;
mp_size_t nn;
/* Inefficient way to almost compute
log_2(n/NHGCD_BASE_THRESHOLD) */
for (k = 0, nn = n;
ABOVE_THRESHOLD (nn, NHGCD_THRESHOLD);
nn = (nn + 1) / 2)
k++;
if (k == 0)
return NHGCD_BASE_ITCH (n);
return 35 * ((n+3) / 4) + 20 * k
+ NHGCD_BASE_ITCH (NHGCD_THRESHOLD);
}
/* Reduces a,b until |a-b| fits in n/2 + 1 limbs. Constructs matrix M
with elements of size at most (n+1)/2 - 1. Returns new size of a,
b, or zero if no reduction is possible. */
mp_size_t
mpn_nhgcd (mp_ptr ap, mp_ptr bp, mp_size_t n,
struct ngcd_matrix *M, mp_ptr tp)
{
mp_size_t s = n/2 + 1;
mp_size_t n2 = (3*n)/4 + 1;
mp_size_t p, nn;
unsigned count;
int success = 0;
ASSERT (n > s);
ASSERT ((ap[n-1] | bp[n-1]) > 0);
ASSERT ((n+1)/2 - 1 < M->alloc);
if (BELOW_THRESHOLD (n, NHGCD_THRESHOLD))
return nhgcd_base (ap, bp, n, M, tp);
p = n/2;
nn = mpn_nhgcd (ap + p, bp + p, n - p, M, tp);
if (nn > 0)
{
/* Needs 2*(p + M->n) <= 2*(floor(n/2) + ceil(n/2) - 1)
= 2 (n - 1) */
n = mpn_ngcd_matrix_adjust (M, p + nn, ap, bp, p, tp);
success = 1;
}
count = 0;
while (n > n2)
{
count++;
/* Needs n + 1 storage */
nn = mpn_ngcd_step (n, ap, bp, s, M, tp);
if (!nn)
return success ? n : 0;
n = nn;
success = 1;
}
if (n > s + 2)
{
struct ngcd_matrix M1;
mp_size_t scratch;
p = 2*s - n + 1;
scratch = MPN_NGCD_MATRIX_INIT_ITCH (n-p);
mpn_ngcd_matrix_init(&M1, n - p, tp);
nn = mpn_nhgcd (ap + p, bp + p, n - p, &M1, tp + scratch);
if (nn > 0)
{
/* Needs 2 (p + M->n) <= 2 (2*s - n2 + 1 + n2 - s - 1)
= 2*s <= 2*(floor(n/2) + 1) <= n + 2. */
n = mpn_ngcd_matrix_adjust (&M1, p + nn, ap, bp, p, tp + scratch);
/* Needs M->n <= n2 - s - 1 < n/4 */
mpn_ngcd_matrix_mul (M, &M1, tp + scratch);
success = 1;
}
}
/* FIXME: This really is the base case */
for (count = 0;; count++)
{
/* Needs s+3 < n */
nn = mpn_ngcd_step (n, ap, bp, s, M, tp);
if (!nn)
return success ? n : 0;
n = nn;
success = 1;
}
}
#define EVEN_P(x) (((x) & 1) == 0)
#define LGCD_THRESHOLD 64
#define P_SIZE(n) (n/2)
mp_size_t
mpn_gcd (mp_ptr gp, mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n)
{
mp_size_t init_scratch;
mp_size_t scratch;
mp_ptr tp;
TMP_DECL;
ASSERT (an >= n);
if (BELOW_THRESHOLD (n, NGCD_THRESHOLD))
{
if (BELOW_THRESHOLD (n, LGCD_THRESHOLD))
return mpn_lgcd (gp, ap, an, bp, n);
return mpn_basic_gcd (gp, ap, an, bp, n);
}
init_scratch = MPN_NGCD_MATRIX_INIT_ITCH (n-P_SIZE(n));
scratch = mpn_nhgcd_itch ((n+1)/2);
/* Space needed for mpn_ngcd_matrix_adjust */
if (scratch < 2*n)
scratch = 2*n;
if (scratch < MPN_NGCD_LEHMER_ITCH(n)) /* Space needed by Lehmer GCD */
scratch = MPN_NGCD_LEHMER_ITCH(n);
TMP_MARK;
if (an + 1 > init_scratch + scratch)
tp = TMP_ALLOC_LIMBS (an + 1);
else
tp = TMP_ALLOC_LIMBS (init_scratch + scratch);
if (an > n)
{
mp_ptr rp = tp;
mp_ptr qp = rp + n;
mpn_tdiv_qr (qp, rp, 0, ap, an, bp, n);
MPN_COPY (ap, rp, n);
an = n;
MPN_NORMALIZE (ap, an);
if (an == 0)
{
MPN_COPY (gp, bp, n);
TMP_FREE;
return n;
}
}
while (ABOVE_THRESHOLD (n, NGCD_THRESHOLD))
{
struct ngcd_matrix M;
mp_size_t p = P_SIZE(n);
mp_size_t nn;
mpn_ngcd_matrix_init (&M, n - p, tp);
nn = mpn_nhgcd (ap + p, bp + p, n - p, &M, tp + init_scratch);
if (nn > 0)
/* Needs 2*(p + M->n) <= 2*(floor(n/2) + ceil(n/2) - 1)
= 2(n-1) */
n = mpn_ngcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + init_scratch);
else
{
mp_size_t gn;
n = mpn_ngcd_subdiv_step (gp, &gn, ap, bp, n, tp);
if (n == 0)
{
TMP_FREE;
return gn;
}
}
}
ASSERT (ap[n-1] > 0 || bp[n-1] > 0);
if (ap[n-1] < bp[n-1])
MP_PTR_SWAP (ap, bp);
if (BELOW_THRESHOLD (n, LGCD_THRESHOLD))
{
n = mpn_ngcd_lehmer (gp, ap, bp, n, tp);
TMP_FREE;
return n;
}
an = n;
MPN_NORMALIZE (bp, n);
if (n == 0)
{
MPN_COPY (gp, ap, an);
TMP_FREE;
return an;
}
if (EVEN_P (bp[0]))
{
/* Then a must be odd (since the calling convention implies that
there's no common factor of 2) */
ASSERT (!EVEN_P (ap[0]));
while (bp[0] == 0)
{
bp++;
n--;
}
if (EVEN_P(bp[0]))
{
int count;
count_trailing_zeros (count, bp[0]);
ASSERT_NOCARRY (mpn_rshift (bp, bp, n, count));
n -= (bp[n-1] == 0);
}
}
TMP_FREE;
return mpn_basic_gcd (gp, ap, an, bp, n);
}
#endif