mpir/mpn/generic/hgcd.c
/* hgcd.c.
THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
Copyright 2003, 2004 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or (at your
option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
License for more details.
You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library; see the file COPYING.LIB. If not, write to
the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
MA 02111-1307, USA. */
#include "mpir.h"
#include "gmp-impl.h"
#include "longlong.h"
/* For NULL */
#include <stddef.h>
/* Comparison of _normalized_ numbers. */
#define MPN_EQUAL_P(ap, asize, bp, bsize) \
((asize) == (bsize) && mpn_cmp ((ap), (bp), (asize)) == 0)
#define MPN_LEQ_P(ap, asize, bp, bsize) \
((asize) < (bsize) || ((asize) == (bsize) \
&& mpn_cmp ((ap), (bp), (asize)) <= 0))
#define MPN_LESS_P(ap, asize, bp, bsize) \
((asize) < (bsize) || ((asize) == (bsize) \
&& mpn_cmp ((ap), (bp), (asize)) < 0))
/* Extract one limb, shifting count bits left
________ ________
|___xh___||___xl___|
|____r____|
>count <
The count includes any nail bits, so it should work fine if
count is computed using count_leading_zeros.
*/
#define MPN_EXTRACT_LIMB(count, xh, xl) \
((((xh) << ((count) - GMP_NAIL_BITS)) & GMP_NUMB_MASK) | \
((xl) >> (GMP_LIMB_BITS - (count))))
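/* Illustrative example (not from the original source), assuming 32-bit
limbs and GMP_NAIL_BITS == 0: with count = 4, xh = 0x0ABCDEF0 and
xl = 0x12345678, MPN_EXTRACT_LIMB (4, xh, xl) gives
(xh << 4) | (xl >> 28) = 0xABCDEF01, i.e. the high limb shifted up by
count bits, with the top count bits of the low limb filling the gap. */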
/* Return -1 if a < x + y + z,
0 if a = x + y + z,
1 if a > x + y + z. */
static int
mpn_cmp_sum3 (mp_srcptr ap, mp_size_t an,
mp_srcptr xp, mp_size_t xn,
mp_srcptr yp, mp_size_t yn,
mp_srcptr zp, mp_size_t zn)
{
mp_limb_t cy;
/* Check that all limbs beyond an are zero. This should be slightly
cheaper than fully normalizing all the input numbers. */
while (xn > an)
if (xp[--xn] > 0) return -1;
while (yn > an)
if (yp[--yn] > 0) return -1;
while (zn > an)
if (zp[--zn] > 0) return -1;
/* Start by sorting so that xn >= yn >= zn. Six permutations, so we
can't get away with less than three comparisons, at least not for
the worst case. */
if (xn < yn)
MPN_SRCPTR_SWAP (xp, xn, yp, yn);
if (yn < zn)
MPN_SRCPTR_SWAP (yp, yn, zp, zn);
if (xn < yn)
MPN_SRCPTR_SWAP (xp, xn, yp, yn);
ASSERT (an >= xn && xn >= yn && yn >= zn);
/* Assume that a = x + y + z, and write the addition limb by limb.
(c[1], a[0]) = x[0] + y[0] + z[0] + c[0]
(c[2], a[1]) = x[1] + y[1] + z[1] + c[1]
(c[k+1], a[k]) = x[k] + y[k] + z[k] + c[k]
...
(c[n], a[n-1]) = x[n-1] + y[n-1] + z[n-1] + c[n-1]
where the start and stop conditions are that c[0] = c[n] = 0.
Then we can start at the high end, iterating
c[k] = (c[k+1], a[k]) - x[k] - y[k] - z[k]
If equality holds, then 0 <= c[k] <= 2 for all k (since for
example 0xf + 0xf + 0xf + 2 = 0x2f). If we find c[k] < 0, then we
know that a < x + y + z, and if we find c[k] > 2, then we know a
> x + y + z. */
cy = 0;
while (an > xn)
{
/* c[k] = (c[k+1], a[k]) */
if (cy > 0)
return 1;
cy = ap[--an];
}
#if GMP_NAIL_BITS >= 2
while (an > yn)
{
if (cy > 1)
return 1;
cy = (cy << GMP_NUMB_BITS) + ap[--an];
if (cy < xp[an])
return -1;
cy -= xp[an];
}
while (an > zn)
{
mp_limb_t s;
if (cy > 2)
return 1;
cy = (cy << GMP_NUMB_BITS ) + ap[--an];
s = xp[an] + yp[an];
if (cy < s)
return -1;
cy -= s;
}
while (an > 0)
{
mp_limb_t s;
if (cy > 2)
return 1;
cy = (cy << GMP_NUMB_BITS ) + ap[--an];
s = xp[an] + yp[an] + zp[an];
if (cy < s)
return -1;
cy -= s;
}
#else /* GMP_NAIL_BITS < 2 */
#if GMP_NAIL_BITS == 1
loselose
#endif
while (an > yn)
{
/* c[k] = (c[k+1], a[k]) - x[k] */
if (cy > 1)
return 1;
--an;
if (cy == 1)
{
if (ap[an] >= xp[an])
return 1;
cy = (ap[an] - xp[an]) & GMP_NUMB_MASK;
}
else
{
/* cy == 0 */
if (ap[an] < xp[an])
return -1;
else
cy = ap[an] - xp[an];
}
}
while (an > zn)
{
mp_limb_t sh, sl;
/* c[k] = (c[k+1], a[k]) - x[k] - y[k] */
if (cy > 2)
return 1;
--an;
sl = xp[an] + yp[an];
sh = (sl < xp[an]);
if (cy < sh || (cy == sh && ap[an] < sl))
return -1;
sl = ap[an] - sl; /* Monkey business */
sh = cy - sh - (sl > ap[an]);
if (sh > 0)
return 1;
cy = sl;
}
while (an > 0)
{
mp_limb_t sh, sl;
if (cy > 2)
return 1;
--an;
sl = xp[an] + yp[an];
sh = (sl < xp[an]);
sl += zp[an];
sh += sl < zp[an];
if (cy < sh || (cy == sh && ap[an] < sl))
return -1;
sl = ap[an] - sl; /* Monkey business */
sh = cy - sh - (sl > ap[an]);
if (sh > 0)
return 1;
cy = sl;
}
#endif /* GMP_NAIL_BITS < 2 */
return cy > 0;
}
/* Only the first row has v = 0, a = 1 * a + 0 * b */
static inline int
hgcd_start_row_p (const struct hgcd_row *r, mp_size_t n)
{
mp_size_t i;
mp_srcptr vp = r->uvp[1];
for (i = 0; i < n; i++)
if (vp[i] != 0)
return 0;
return 1;
}
/* Called when r[0, 1, 2] >= W^M, r[3] < W^M. Returns the number of
remainders that satisfy Jebelean's criterion, i.e. find the largest k
such that
r[k+1] >= max (-u[k+1], - v[k+1])
r[k] - r[k-1] >= max (u[k+1] - u[k], v[k+1] - v[k])
Return 0 on failure, i.e. if B or A mod B < W^M. Return 1 in case
r0 and r1 are correct, but we still make no progress because r0 =
A, r1 = B.
Otherwise return 2, 3 or 4, the number of r:s that are correct.
*/
static int
hgcd_jebelean (const struct hgcd *hgcd, mp_size_t M)
{
mp_size_t L;
unsigned bit;
ASSERT (hgcd->row[0].rsize > M);
ASSERT (hgcd->row[1].rsize > M);
ASSERT (hgcd->row[2].rsize > M);
ASSERT (hgcd->row[3].rsize <= M);
ASSERT (MPN_LESS_P (hgcd->row[1].rp, hgcd->row[1].rsize,
hgcd->row[0].rp, hgcd->row[0].rsize));
ASSERT (MPN_LESS_P (hgcd->row[2].rp, hgcd->row[2].rsize,
hgcd->row[1].rp, hgcd->row[1].rsize));
ASSERT (MPN_LESS_P (hgcd->row[3].rp, hgcd->row[3].rsize,
hgcd->row[2].rp, hgcd->row[2].rsize));
ASSERT (mpn_cmp (hgcd->row[0].uvp[1], hgcd->row[1].uvp[1], hgcd->size) <= 0);
ASSERT (mpn_cmp (hgcd->row[1].uvp[1], hgcd->row[2].uvp[1], hgcd->size) <= 0);
ASSERT (mpn_cmp (hgcd->row[2].uvp[1], hgcd->row[3].uvp[1], hgcd->size) <= 0);
/* The bound is really floor (N/2), which is <= M = ceil (N/2) */
L = hgcd->size;
ASSERT (L <= M);
ASSERT (L > 0);
ASSERT (hgcd->row[3].uvp[1][L - 1] != 0);
bit = hgcd->sign < 0;
/* Check r1 - r2 >= max (u2 - u1, v2 - v1) = {|u1| + |u2|, |v1| + |v2|}[bit] */
if (mpn_cmp_sum3 (hgcd->row[1].rp, hgcd->row[1].rsize,
hgcd->row[2].rp, hgcd->row[2].rsize,
hgcd->row[1].uvp[bit], L,
hgcd->row[2].uvp[bit], L) < 0)
return 2 - (hgcd_start_row_p (hgcd->row, hgcd->size));
/* Ok, r2 is correct */
/* Check r3 >= max (-u3, -v3) = (|u3|, |v3|)[bit] */
if (hgcd->row[3].rsize > L)
/* Condition satisfied */
;
else
{
mp_size_t size;
for (size = L; size > hgcd->row[3].rsize; size--)
{
if (hgcd->row[3].uvp[bit][size-1] != 0)
return 3;
}
if (mpn_cmp (hgcd->row[3].rp, hgcd->row[3].uvp[bit], size) < 0)
return 3;
}
/* Check r3 - r2 >= max(u3-u2, v3-v2) = {|u2| + |u3|, |v2| +|v3|}[1-bit] */
if (mpn_cmp_sum3 (hgcd->row[2].rp, hgcd->row[2].rsize,
hgcd->row[3].rp, hgcd->row[3].rsize,
hgcd->row[2].uvp[bit ^ 1], L,
hgcd->row[3].uvp[bit ^ 1], L) < 0)
return 3;
/* Ok, r3 is correct */
return 4;
}
/* Compute au + bv. u and v are single limbs, a and b are n limbs each.
Stores n+1 limbs in rp, and returns the (n+2)'nd limb. */
/* FIXME: With nails, we can instead return limb n+1, possibly including
one non-zero nail bit. */
static mp_limb_t
mpn_addmul2_n_1 (mp_ptr rp, mp_size_t n,
mp_srcptr ap, mp_limb_t u,
mp_srcptr bp, mp_limb_t v)
{
mp_limb_t h;
mp_limb_t cy;
h = mpn_mul_1 (rp, ap, n, u);
cy = mpn_addmul_1 (rp, bp, n, v);
h += cy;
#if GMP_NAIL_BITS == 0
rp[n] = h;
return (h < cy);
#else /* GMP_NAIL_BITS > 0 */
rp[n] = h & GMP_NUMB_MASK;
return h >> GMP_NUMB_BITS;
#endif /* GMP_NAIL_BITS > 0 */
}
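/* Illustrative example (values chosen here, not from the original source):
with n = 1, ap = {3}, u = 2, bp = {5}, v = 4 we get 2*3 + 4*5 = 26, so
rp = {26, 0} is stored and the returned limb is 0. */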
/* hgcd2 operations */
/* Computes P = R * S. No overlap allowed. */
static mp_size_t
hgcd2_mul (struct hgcd_row *P, mp_size_t alloc,
const struct hgcd2_row *R,
const struct hgcd_row *S, mp_size_t n)
{
int grow = 0;
mp_limb_t h = 0;
unsigned i;
unsigned j;
ASSERT (n < alloc);
for (i = 0; i < 2; i++)
for (j = 0; j < 2; j++)
{
/* Set P[i, j] = R[i, 0] S[0, j] + R[i,1] S[1, j]
= u_i s0j + v_i s1j */
mp_limb_t cy;
cy = mpn_addmul2_n_1 (P[i].uvp[j], n,
S[0].uvp[j], R[i].u,
S[1].uvp[j], R[i].v);
if (cy)
{
ASSERT (n + 2 <= alloc);
P[i].uvp[j][n+1] = cy;
grow = 1;
}
else
h |= P[i].uvp[j][n];
}
if (grow)
return n + 2;
else
/* Don't add redundant zeroes */
return n + (h != 0);
}
unsigned
mpn_hgcd_max_recursion (mp_size_t n)
{
int count;
count_leading_zeros (count, (mp_limb_t)
(1 + n / (HGCD_SCHOENHAGE_THRESHOLD - 5)));
return GMP_LIMB_BITS - count;
}
mp_size_t
mpn_hgcd_init_itch (mp_size_t size)
{
/* r0 <= a, r1, r2, r3 <= b, but for simplicity, we allocate asize +
1 for all of them. The sizes of the uv:s are limited to asize / 2,
but we allocate one extra limb. The quotients aren't included
here. */
return 4 * (size + 1) + 8 * ((size / 2) + 1);
}
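/* For example (illustrative only): with size = 100 this reserves
4*(100+1) + 8*(100/2 + 1) = 404 + 408 = 812 limbs, i.e. four remainders
of size + 1 limbs each plus eight u,v values of size/2 + 1 limbs each. */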
void
mpn_hgcd_init (struct hgcd *hgcd,
mp_size_t asize,
mp_limb_t *limbs)
{
unsigned i;
unsigned j;
mp_size_t alloc = (asize / 2) + 1;
hgcd->sign = 0;
for (i = 0; i < 4; i++)
{
hgcd->row[i].rp = limbs;
hgcd->row[i].rsize = asize + 1; limbs += asize + 1;
}
hgcd->alloc = alloc;
hgcd->size = alloc;
for (i = 0; i < 4; i++)
for (j = 0; j < 2; j++)
{
hgcd->row[i].uvp[j] = limbs;
limbs += alloc;
}
hgcd->qp[0] = hgcd->qp[1] = NULL;
}
/* FIXME: Could be a macro */
mp_size_t
mpn_hgcd_quotients_init_itch (mp_size_t asize)
{
return 2*(asize + 1);
}
void
mpn_hgcd_quotients_init (struct hgcd *hgcd,
mp_size_t asize,
mp_limb_t *limbs)
{
hgcd->qp[0] = limbs;
hgcd->qp[1] = limbs + asize + 1;
hgcd->qsize[0] = hgcd->qsize[1] = 0;
}
/* Helper functions for hgcd */
/* Sets (a, b, c, d) <-- (b, c, d, a) */
#define HGCD_SWAP4_LEFT(row) \
do { \
struct hgcd_row __hgcd_swap4_left_tmp; \
__hgcd_swap4_left_tmp = row[0]; \
row[0] = row[1]; \
row[1] = row[2]; \
row[2] = row[3]; \
row[3] = __hgcd_swap4_left_tmp; \
} while (0)
/* Sets (a, b, c, d) <-- (d, a, b, c) */
#define HGCD_SWAP4_RIGHT(row) \
do { \
struct hgcd_row __hgcd_swap4_right_tmp; \
__hgcd_swap4_right_tmp = row[3]; \
row[3] = row[2]; \
row[2] = row[1]; \
row[1] = row[0]; \
row[0] = __hgcd_swap4_right_tmp; \
} while (0)
/* Sets (a, b, c, d) <-- (c, d, a, b) */
#define HGCD_SWAP4_2(row) \
do { \
struct hgcd_row __hgcd_swap4_2_tmp; \
__hgcd_swap4_2_tmp = row[0]; \
row[0] = row[2]; \
row[2] = __hgcd_swap4_2_tmp; \
__hgcd_swap4_2_tmp = row[1]; \
row[1] = row[3]; \
row[3] = __hgcd_swap4_2_tmp; \
} while (0)
/* Sets (a, b, c) <-- (b, c, a) */
#define HGCD_SWAP3_LEFT(row) \
do { \
struct hgcd_row __hgcd_swap4_left_tmp; \
__hgcd_swap4_left_tmp = row[0]; \
row[0] = row[1]; \
row[1] = row[2]; \
row[2] = __hgcd_swap4_left_tmp; \
} while (0)
/* Computes P = R * S. No overlap allowed.
Temporary space is needed for two numbers smaller than the
resulting matrix elements, i.e. bounded by 2*L <= N. */
static mp_size_t
hgcd_mul (struct hgcd_row *P, mp_size_t alloc,
const struct hgcd_row *R, mp_size_t rsize,
const struct hgcd_row *S, mp_size_t ssize,
mp_ptr tp, mp_size_t talloc)
{
unsigned i;
unsigned j;
mp_size_t psize;
mp_limb_t h = 0;
int grow = 0;
MPN_NORMALIZE (R[1].uvp[1], rsize);
ASSERT (S[1].uvp[1][ssize - 1] != 0);
psize = rsize + ssize;
ASSERT (psize <= talloc);
if (rsize >= ssize)
{
for (i = 0; i < 2; i++)
for (j = 0; j < 2; j++)
{
/* Set P[i, j] = R[i, 0] S[0, j] + R[i,1] S[1, j] */
mp_limb_t cy;
mpn_mul (P[i].uvp[j], R[i].uvp[0], rsize, S[0].uvp[j], ssize);
mpn_mul (tp, R[i].uvp[1], rsize, S[1].uvp[j], ssize);
cy = mpn_add_n (P[i].uvp[j], P[i].uvp[j], tp, psize);
if (cy)
{
ASSERT (psize + 1 < alloc);
P[i].uvp[j][psize] = cy;
grow = 1;
}
else
h |= P[i].uvp[j][psize - 1];
}
}
else
{
for (i = 0; i < 2; i++)
for (j = 0; j < 2; j++)
{
/* Set P[i, j] = R[i, 0] S[0, j] + R[i,1] S[1, j] */
mp_limb_t cy;
mpn_mul (P[i].uvp[j], S[0].uvp[j], ssize, R[i].uvp[0], rsize);
mpn_mul (tp, S[1].uvp[j], ssize, R[i].uvp[1], rsize);
cy = mpn_add_n (P[i].uvp[j], P[i].uvp[j], tp, psize);
if (cy)
{
ASSERT (psize + 1 < alloc);
P[i].uvp[j][psize] = cy;
grow = 1;
}
else
h |= P[i].uvp[j][psize - 1];
}
}
if (grow)
return psize + 1;
else
return psize - (h == 0);
}
/* Computes R = W^k s->r + s->u A' - s->v B', which must be
non-negative. W denotes 2^(GMP_NUMB_BITS). Temporary space needed
is k + uvsize <= M + L = N.
Must have v > 0, v >= u. */
mp_size_t
mpn_hgcd_fix (mp_size_t k,
mp_ptr rp, mp_size_t ralloc,
int sign, mp_size_t uvsize,
const struct hgcd_row *s,
mp_srcptr ap,
mp_srcptr bp,
mp_ptr tp, mp_size_t talloc)
{
mp_size_t tsize;
mp_limb_t cy;
mp_size_t rsize;
mp_srcptr up;
mp_srcptr vp;
up = s->uvp[0]; vp = s->uvp[1];
MPN_NORMALIZE (vp, uvsize);
ASSERT (uvsize > 0);
if (sign < 0)
{
MP_SRCPTR_SWAP (up, vp);
MP_SRCPTR_SWAP (ap, bp);
}
tsize = k + uvsize;
ASSERT (k + s->rsize <= ralloc);
ASSERT (tsize <= talloc);
ASSERT (tsize <= ralloc);
ASSERT (rp != s->rp);
/* r = W^k s + u a */
if (uvsize <= k)
mpn_mul (rp, ap, k, up, uvsize);
else
mpn_mul (rp, up, uvsize, ap, k);
if (uvsize <= s->rsize)
{
cy = mpn_add (rp + k, s->rp, s->rsize, rp + k, uvsize);
rsize = k + s->rsize;
}
else
{
cy = mpn_add (rp + k, rp + k, uvsize, s->rp, s->rsize);
rsize = k + uvsize;
}
if (cy)
{
ASSERT (rsize < ralloc);
rp[rsize++] = cy;
}
/* r -= v b */
if (uvsize <= k)
mpn_mul (tp, bp, k, vp, uvsize);
else
mpn_mul (tp, vp, uvsize, bp, k);
ASSERT_NOCARRY (mpn_sub (rp, rp, rsize, tp, tsize));
MPN_NORMALIZE (rp, rsize);
return rsize;
}
/* Compute r2 = r0 - q r1, when q is a single limb */
static void
hgcd_update_r_1 (struct hgcd_row *r, mp_limb_t q)
{
mp_size_t size;
if (q == 1)
{
ASSERT_NOCARRY (mpn_sub (r[2].rp, r[0].rp, r[0].rsize, r[1].rp, r[1].rsize));
}
else
{
mp_size_t s = r[1].rsize;
mp_limb_t cy = mpn_mul_1 (r[2].rp, r[1].rp, s, q);
if (cy)
{
ASSERT (s < r[0].rsize);
r[2].rp[s++] = cy;
}
ASSERT_NOCARRY (mpn_sub (r[2].rp, r[0].rp, r[0].rsize, r[2].rp, s));
}
/* NOTE: In the usual case, we could take size = r[1].rsize, since
r[2] is expected to be smaller than r[1], but occasionally, we
call this with an incorrect quotient resulting in a larger
r[2]. */
size = r[0].rsize;
MPN_NORMALIZE (r[2].rp, size);
r[2].rsize = size;
}
/* Compute r2 = r0 - q r1 */
static void
hgcd_update_r (struct hgcd_row *r, mp_srcptr qp, mp_size_t qsize)
{
ASSERT (MPN_LESS_P (r[1].rp, r[1].rsize, r[0].rp, r[0].rsize));
ASSERT (qsize > 0);
if (qsize == 1)
{
hgcd_update_r_1 (r, qp[0]);
}
else
{
mp_srcptr r0p = r[0].rp;
mp_srcptr r1p = r[1].rp;
mp_ptr r2p = r[2].rp;
mp_size_t r0size = r[0].rsize;
mp_size_t r1size = r[1].rsize;
mp_size_t size = r1size + qsize;
ASSERT (size <= r0size + 1);
if (qsize <= r1size)
mpn_mul (r2p, r1p, r1size, qp, qsize);
else
mpn_mul (r2p, qp, qsize, r1p, r1size);
if (size > r0size)
{
ASSERT (size == r0size + 1);
size--;
ASSERT (r2p[size] == 0);
}
ASSERT_NOCARRY (mpn_sub (r2p, r0p, r0size, r2p, size));
MPN_NORMALIZE (r[2].rp, r0size);
r[2].rsize = r0size;
ASSERT (MPN_LESS_P (r2p, r0size, r1p, r1size));
}
}
/* Compute (u2, v2) = (u0, v0) + q (u1, v1)
Return the size of the largest u,v element.
Caller must ensure that usize + qsize <= available storage */
static mp_size_t
hgcd_update_uv_1 (struct hgcd_row *r, mp_size_t usize,
mp_limb_t q)
{
mp_limb_t hu, hv;
if (q == 1)
{
hu = mpn_add_n (r[2].uvp[0], r[0].uvp[0], r[1].uvp[0], usize);
hv = mpn_add_n (r[2].uvp[1], r[0].uvp[1], r[1].uvp[1], usize);
}
else
{
/* Too bad we don't have an addmul_1 with distinct source and
destination */
hu = mpn_mul_1 (r[2].uvp[0], r[1].uvp[0], usize, q);
hu += mpn_add_n (r[2].uvp[0], r[2].uvp[0], r[0].uvp[0], usize);
hv = mpn_mul_1 (r[2].uvp[1], r[1].uvp[1], usize, q);
hv += mpn_add_n (r[2].uvp[1], r[2].uvp[1], r[0].uvp[1], usize);
}
if (hu || hv)
{
r[2].uvp[0][usize] = hu;
r[2].uvp[1][usize] = hv;
usize++;
}
return usize;
}
/* Compute (u2, v2) = (u0, v0) + q (u1, v1)
Return the size of the largest u,v element.
Caller must ensure that usize + qsize <= available storage */
/* FIXME: Doesn't need the qp, qsize arguments */
static mp_size_t
hgcd_update_uv (struct hgcd_row *r, mp_size_t usize,
mp_srcptr qp, mp_size_t qsize)
{
unsigned i;
mp_size_t grow;
ASSERT (r[1].uvp[1][usize - 1] != 0);
/* Compute u2 = u0 + q u1 */
if (qsize == 1)
return hgcd_update_uv_1 (r, usize, qp[0]);
else
{
for (i = 0; i < 2; i++)
{
mp_srcptr u0p = r[0].uvp[i];
mp_srcptr u1p = r[1].uvp[i];
mp_ptr u2p = r[2].uvp[i];
if (qsize <= usize)
mpn_mul (u2p, u1p, usize, qp, qsize);
else
mpn_mul (u2p, qp, qsize, u1p, usize);
ASSERT_NOCARRY (mpn_add (u2p, u2p, usize + qsize, u0p, usize));
grow = qsize - ((u2p[usize + qsize - 1]) == 0);
}
}
usize += grow;
/* The values should be allocated with one limb margin */
ASSERT (mpn_cmp (r[1].uvp[0], r[2].uvp[0], usize) <= 0);
ASSERT (mpn_cmp (r[1].uvp[1], r[2].uvp[1], usize) <= 0);
ASSERT (r[2].uvp[1][usize - 1] != 0);
return usize;
}
/* Compute r0 = r2 + q r1, and the corresponding uv. Uses a
single-limb q. */
static void
hgcd_backup_1 (struct hgcd_row *r, mp_size_t usize, mp_limb_t q)
{
/* r0 = r2 + q r1
u0 = u2 - q u1
v0 = v2 - q v1 */
/* Just as for mpn_addmul_1, r0 is the same size as r1, or one
limb larger. */
mp_limb_t cy;
mp_size_t size;
if (q == 1)
{
ASSERT_NOCARRY (mpn_sub_n (r[0].uvp[0], r[2].uvp[0], r[1].uvp[0], usize));
ASSERT_NOCARRY (mpn_sub_n (r[0].uvp[1], r[2].uvp[1], r[1].uvp[1], usize));
cy = mpn_add (r[0].rp, r[1].rp, r[1].rsize, r[2].rp, r[2].rsize);
}
else
{
ASSERT_NOCARRY (mpn_mul_1 (r[0].uvp[0], r[1].uvp[0], usize, q));
ASSERT_NOCARRY (mpn_sub_n (r[0].uvp[0], r[2].uvp[0], r[0].uvp[0], usize));
ASSERT_NOCARRY (mpn_mul_1 (r[0].uvp[1], r[1].uvp[1], usize, q));
ASSERT_NOCARRY (mpn_sub_n (r[0].uvp[1], r[2].uvp[1], r[0].uvp[1], usize));
cy = mpn_mul_1 (r[0].rp, r[1].rp, r[1].rsize, q);
cy += mpn_add (r[0].rp, r[0].rp, r[1].rsize, r[2].rp, r[2].rsize);
}
size = r[1].rsize;
if (cy)
r[0].rp[size++] = cy;
r[0].rsize = size;
}
static void
hgcd_backup_r (struct hgcd_row *r,
mp_srcptr qp, mp_size_t qsize)
{
mp_ptr r0p = r[0].rp;
mp_srcptr r1p = r[1].rp;
mp_srcptr r2p = r[2].rp;
mp_size_t r0size;
mp_size_t r1size = r[1].rsize;
mp_size_t r2size = r[2].rsize;
/* r0 = r2 + q r1
Result must be of size r1size + qsize - 1, or one limb
larger. */
r0size = r1size + qsize;
if (r1size >= qsize)
mpn_mul (r0p, r1p, r1size, qp, qsize);
else
mpn_mul (r0p, qp, qsize, r1p, r1size);
ASSERT_NOCARRY (mpn_add (r0p, r0p, r0size, r2p, r2size));
r0size -= (r0p[r0size-1] == 0);
r[0].rsize = r0size;
}
static void
hgcd_backup_uv (struct hgcd_row *r, mp_size_t usize,
mp_srcptr qp, mp_size_t qsize)
{
/* (u0, v0) = (u2, v2) - q (u1, v1) */
mp_ptr u0p = r[0].uvp[0];
mp_ptr v0p = r[0].uvp[1];
mp_srcptr u1p = r[1].uvp[0];
mp_srcptr v1p = r[1].uvp[1];
mp_srcptr u2p = r[2].uvp[0];
mp_srcptr v2p = r[2].uvp[1];
mp_size_t size;
/* FIXME: Special case qsize = 1 and q = 1 */
/* (u0,v0) = (u2,v2) - q (u1, v1) */
/* We must have
usize >= #(q u1) >= qsize + #u1 - 1
which means that u1 must have at least
usize - #u1 >= qsize - 1
zero limbs at the high end, and similarly for v1. */
ASSERT (qsize <= usize);
size = usize - qsize + 1;
/* NOTE: Needs an extra limb for the u,v values */
if (qsize <= size)
{
mpn_mul (u0p, u1p, size, qp, qsize);
mpn_mul (v0p, v1p, size, qp, qsize);
}
else
{
mpn_mul (u0p, qp, qsize, u1p, size);
mpn_mul (v0p, qp, qsize, v1p, size);
}
/* qsize + size = usize + 1 */
ASSERT (u0p[usize] == 0);
ASSERT (v0p[usize] == 0);
ASSERT_NOCARRY (mpn_sub_n (u0p, u2p, u0p, usize));
ASSERT_NOCARRY (mpn_sub_n (v0p, v2p, v0p, usize));
}
static void
hgcd_backup (struct hgcd_row *r, mp_size_t usize,
mp_srcptr qp, mp_size_t qsize)
{
if (qsize == 1)
hgcd_backup_1 (r, usize, qp[0]);
else
{
hgcd_backup_r (r, qp, qsize);
hgcd_backup_uv (r, usize, qp, qsize);
}
}
/* Compute r0 = r2 + q r1, and the corresponding uv.
* q is recovered from the uv matrix, as follows.
*
* We have (u2, v2) = (u0, v0) + q (u1, v1).
*
* If v1 = 0, we fail, since (u1, v1; u2, v2) must be the identity
* matrix, and we return 0 for failure.
*
* If u1 = 0, then we must have (u1, v1; u2, v2) = (0,1; 1, q),
* so that q = v2 and (u0, v0) = (1, 0).
*
* Else, q = floor(v2 / v1), and (u0, v0) = (u2, v2) - q (u1, v1).
*
* In other words, we compute
*
* (q, v0) = divmod(v2, v1)
* u0 = u2 - q u1
*
* There's one exception though: if v2 mod v1 = 0, which can happen
* only when u1 = v1 = 1, then instead u0 = 0, v0 = 1 and q = v2 - 1.
*
* On success, returns qsize > 0.
*
*/
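/* Illustrative example (values chosen here, not from the original source):
with (u1, v1) = (2, 3) and (u2, v2) = (7, 11) we recover
q = floor(11/3) = 3, v0 = 11 - 3*3 = 2 and u0 = 7 - 3*2 = 1, and indeed
(u2, v2) = (u0, v0) + q (u1, v1) = (1 + 3*2, 2 + 3*3). */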
static mp_size_t
hgcd_backup_recover_q (struct hgcd_row *r, mp_size_t usize,
mp_ptr qp)
{
mp_size_t qsize;
mp_ptr u0p = r[0].uvp[0];
mp_ptr v0p = r[0].uvp[1];
mp_srcptr u1p = r[1].uvp[0];
mp_srcptr v1p = r[1].uvp[1];
mp_srcptr u2p = r[2].uvp[0];
mp_srcptr v2p = r[2].uvp[1];
mp_size_t u1size;
mp_size_t v1size;
mp_size_t v2size;
ASSERT (MPN_LEQ_P (r[2].rp, r[2].rsize, r[1].rp, r[1].rsize));
/* Derive q */
u1size = v1size = v2size = usize;
MPN_NORMALIZE (v1p, v1size);
if (!v1size)
return 0;
MPN_NORMALIZE (v2p, v2size);
MPN_NORMALIZE (u1p, u1size);
if (!u1size)
{
MPN_COPY (qp, v2p, v2size);
qsize = v2size;
MPN_ZERO (u0p, usize);
MPN_ZERO (v0p, usize);
u0p[0] = 1;
}
else if (u1size == 1 && v1size == 1 && u1p[0] == 1 && v1p[0] == 1)
{
ASSERT_NOCARRY (mpn_sub_1 (qp, v2p, v2size, 1));
qsize = v2size;
qsize -= (qp[qsize - 1] == 0);
MPN_ZERO (u0p, usize);
MPN_ZERO (v0p, usize);
v0p[0] = 1;
}
else
{
mpn_tdiv_qr (qp, v0p, 0, v2p, v2size, v1p, v1size);
qsize = v2size - v1size;
qsize += (qp[qsize] != 0);
if (v1size < usize)
MPN_ZERO (v0p + v1size, usize - v1size);
if (qsize == 1)
{
mp_limb_t q = qp[0];
if (q == 1)
{
ASSERT_NOCARRY (mpn_sub_n (u0p, u2p, u1p, usize));
}
else
{
ASSERT_NOCARRY (mpn_mul_1 (u0p, u1p, usize, q));
ASSERT_NOCARRY (mpn_sub_n (u0p, u2p, u0p, usize));
}
}
else
{
/* NOTE: Needs an extra limb for the u0 value */
mp_size_t u0size = qsize + u1size;
ASSERT (u0size <= usize + 1);
if (qsize <= u1size)
mpn_mul (u0p, u1p, u1size, qp, qsize);
else
mpn_mul (u0p, qp, qsize, u1p, u1size);
u0size -= (u0p[u0size-1] == 0);
ASSERT (u0size <= usize);
ASSERT_NOCARRY (mpn_sub (u0p, u2p, usize, u0p, u0size));
}
}
if (qsize == 1)
{
/* NOTE: Some duplication with hgcd_backup_1 */
mp_size_t size;
mp_limb_t cy;
mp_limb_t q = qp[0];
size = r[1].rsize;
if (q == 1)
cy = mpn_add (r[0].rp, r[1].rp, size, r[2].rp, r[2].rsize);
else
{
cy = mpn_mul_1 (r[0].rp, r[1].rp, size, q);
cy += mpn_add (r[0].rp, r[0].rp, size, r[2].rp, r[2].rsize);
}
if (cy)
r[0].rp[size++] = cy;
r[0].rsize = size;
}
else
hgcd_backup_r (r, qp, qsize);
return qsize;
}
/* Called after HGCD_SWAP4_RIGHT, to adjust the size field. Large
numbers in row 0 don't count, and are overwritten. */
static void
hgcd_normalize (struct hgcd *hgcd)
{
mp_size_t size = hgcd->size;
/* v3 should always be the largest element */
while (size > 0 && hgcd->row[3].uvp[1][size - 1] == 0)
{
size--;
/* Row 0 is about to be overwritten. We must zero out unused limbs */
hgcd->row[0].uvp[0][size] = 0;
hgcd->row[0].uvp[1][size] = 0;
ASSERT (hgcd->row[1].uvp[0][size] == 0);
ASSERT (hgcd->row[1].uvp[1][size] == 0);
ASSERT (hgcd->row[2].uvp[0][size] == 0);
ASSERT (hgcd->row[2].uvp[1][size] == 0);
ASSERT (hgcd->row[3].uvp[0][size] == 0);
}
hgcd->size = size;
}
int
mpn_hgcd2_lehmer_step (struct hgcd2 *hgcd,
mp_srcptr ap, mp_size_t asize,
mp_srcptr bp, mp_size_t bsize)
{
mp_limb_t ah;
mp_limb_t al;
mp_limb_t bh;
mp_limb_t bl;
ASSERT (asize >= bsize);
ASSERT (MPN_LEQ_P (bp, bsize, ap, asize));
if (bsize < 2)
return 0;
/* The case asize == 2 is needed to take care of values that are
between one and two *full* limbs in size. */
if (asize == 2 || (ap[asize-1] & GMP_NUMB_HIGHBIT))
{
if (bsize < asize)
return 0;
al = ap[asize - 2];
ah = ap[asize - 1];
ASSERT (asize == bsize);
bl = bp[asize - 2];
bh = bp[asize - 1];
}
else
{
unsigned shift;
if (bsize + 1 < asize)
return 0;
/* We want two *full* limbs */
ASSERT (asize > 2);
count_leading_zeros (shift, ap[asize-1]);
if (bsize == asize)
bh = MPN_EXTRACT_LIMB (shift, bp[asize - 1], bp[asize - 2]);
else
{
ASSERT (asize == bsize + 1);
bh = bp[asize - 2] >> (GMP_LIMB_BITS - shift);
}
bl = MPN_EXTRACT_LIMB (shift, bp[asize - 2], bp[asize - 3]);
al = MPN_EXTRACT_LIMB (shift, ap[asize - 2], ap[asize - 3]);
ah = MPN_EXTRACT_LIMB (shift, ap[asize - 1], ap[asize - 2]);
}
return mpn_hgcd2 (hgcd, ah, al, bh, bl);
}
static void
hgcd_start (struct hgcd *hgcd,
mp_srcptr ap, mp_size_t asize,
mp_srcptr bp, mp_size_t bsize)
{
MPN_COPY (hgcd->row[0].rp, ap, asize);
hgcd->row[0].rsize = asize;
MPN_COPY (hgcd->row[1].rp, bp, bsize);
hgcd->row[1].rsize = bsize;
hgcd->sign = 0;
if (hgcd->size != 0)
{
/* We must zero out the uv array */
unsigned i;
unsigned j;
for (i = 0; i < 4; i++)
for (j = 0; j < 2; j++)
MPN_ZERO (hgcd->row[i].uvp[j], hgcd->size);
}
hgcd->size = 1;
hgcd->row[0].uvp[0][0] = 1;
hgcd->row[1].uvp[1][0] = 1;
}
/* Performs one euclid step on r0, r1. Returns >= 0 if hgcd should be
terminated, -1 if we should go on */
static int
euclid_step (struct hgcd *hgcd, mp_size_t M)
{
mp_size_t asize;
mp_size_t qsize;
mp_ptr qp;
mp_size_t rsize;
mp_ptr rp;
asize = hgcd->row[0].rsize;
rsize = hgcd->row[1].rsize;
qsize = asize - rsize + 1;
rp = hgcd->row[2].rp;
/* If we need to return q, it's always as q1 */
qp = hgcd->qp[1];
mpn_tdiv_qr (qp, rp, 0, hgcd->row[0].rp, asize, hgcd->row[1].rp, rsize);
MPN_NORMALIZE (rp, rsize);
hgcd->row[2].rsize = rsize;
qsize -= (qp[qsize - 1] == 0);
/* Update u and v */
ASSERT (hgcd->size + qsize <= hgcd->alloc);
hgcd->size = hgcd_update_uv (hgcd->row, hgcd->size, qp, qsize);
ASSERT (hgcd->size < hgcd->alloc);
if (hgcd->row[2].rsize <= M)
{
HGCD_SWAP4_RIGHT (hgcd->row);
hgcd_normalize (hgcd);
hgcd->qsize[1] = qsize;
hgcd->qsize[0] = hgcd_backup_recover_q (hgcd->row, hgcd->size,
hgcd->qp[0]);
if (!hgcd->qsize[0])
return 0;
hgcd->sign = ~hgcd->sign;
return hgcd_jebelean (hgcd, M);
}
else
{
/* Keep this remainder */
hgcd->sign = ~hgcd->sign;
HGCD_SWAP4_LEFT (hgcd->row);
return -1;
}
}
/* Called when values have been computed in r[0] and r[1], and the
latter value is too large, and we know that it's not much too
large. Returns the updated size for the uv matrix. */
static mp_size_t
hgcd_adjust (struct hgcd_row *r, mp_size_t size, mp_limb_t *dp)
{
mp_limb_t c0;
mp_limb_t c1;
mp_limb_t d;
/* Compute the correct r1. We have r1' = r1 - d r0, and we always
have d = 1 or 2. */
ASSERT_NOCARRY (mpn_sub (r[1].rp, r[1].rp, r[1].rsize, r[0].rp, r[0].rsize));
MPN_NORMALIZE (r[1].rp, r[1].rsize);
if (MPN_LESS_P (r[1].rp, r[1].rsize, r[0].rp, r[0].rsize))
{
c0 = mpn_add_n (r[1].uvp[0], r[1].uvp[0], r[0].uvp[0], size);
c1 = mpn_add_n (r[1].uvp[1], r[1].uvp[1], r[0].uvp[1], size);
d = 1;
}
else
{
ASSERT_NOCARRY (mpn_sub (r[1].rp, r[1].rp, r[1].rsize, r[0].rp, r[0].rsize));
MPN_NORMALIZE (r[1].rp, r[1].rsize);
ASSERT (MPN_LESS_P (r[1].rp, r[1].rsize, r[0].rp, r[0].rsize));
c0 = mpn_addmul_1 (r[1].uvp[0], r[0].uvp[0], size, 2);
c1 = mpn_addmul_1 (r[1].uvp[1], r[0].uvp[1], size, 2);
d = 2;
}
/* FIXME: Can avoid branches */
if (c1 != 0)
{
r[1].uvp[0][size] = c0;
r[1].uvp[1][size] = c1;
size++;
}
else
{
ASSERT (c0 == 0);
}
if (dp)
*dp = d;
return size;
}
/* Reduce using Lehmer steps. Called by mpn_hgcd when r1 has been
reduced to approximately the right size. Also used by
mpn_hgcd_lehmer. */
static int
hgcd_final (struct hgcd *hgcd, mp_size_t M)
{
ASSERT (hgcd->row[0].rsize > M);
ASSERT (hgcd->row[1].rsize > M);
/* Can be equal when called by hgcd_lehmer. */
ASSERT (MPN_LEQ_P (hgcd->row[1].rp, hgcd->row[1].rsize,
hgcd->row[0].rp, hgcd->row[0].rsize));
for (;;)
{
mp_size_t L = hgcd->row[0].rsize;
struct hgcd2 R;
int res;
if (L <= M + 2
&& (L < M + 2 || (hgcd->row[0].rp[M+1] & GMP_NUMB_HIGHBIT) == 0))
break;
res = mpn_hgcd2_lehmer_step (&R,
hgcd->row[0].rp, hgcd->row[0].rsize,
hgcd->row[1].rp, hgcd->row[1].rsize);
if (res == 0)
{
/* We must divide to make progress */
res = euclid_step (hgcd, M);
if (res >= 0)
return res;
}
else if (res == 1)
{
mp_limb_t q;
/* The quotient that has been computed for r2 is at most 2
off. So adjust that, and avoid a full division. */
q = R.q[0];
/* Top two rows of R must be the identity matrix, followed
by a row (1, q). */
ASSERT (R.row[0].u == 1 && R.row[0].v == 0);
ASSERT (R.row[1].u == 0 && R.row[1].v == 1);
ASSERT (R.row[2].u == 1 && R.row[2].v == q);
hgcd_update_r_1 (hgcd->row, q);
hgcd->size = hgcd_update_uv_1 (hgcd->row, hgcd->size, q);
ASSERT (hgcd->size < hgcd->alloc);
if (MPN_LEQ_P (hgcd->row[1].rp, hgcd->row[1].rsize,
hgcd->row[2].rp, hgcd->row[2].rsize))
hgcd->size = hgcd_adjust (hgcd->row + 1, hgcd->size, NULL);
ASSERT (hgcd->size < hgcd->alloc);
hgcd->sign = ~hgcd->sign;
HGCD_SWAP4_LEFT (hgcd->row);
}
else
{
const struct hgcd2_row *s = R.row + (res - 2);
int sign = R.sign;
/* Max size after reduction, plus one */
mp_size_t ralloc = hgcd->row[1].rsize + 1;
if (res == 3)
sign = ~sign;
/* s[0] and s[1] correct. */
hgcd->row[2].rsize
= mpn_hgcd2_fix (hgcd->row[2].rp, ralloc,
sign,
s[0].u, hgcd->row[0].rp, hgcd->row[0].rsize,
s[0].v, hgcd->row[1].rp, hgcd->row[1].rsize);
hgcd->row[3].rsize
= mpn_hgcd2_fix (hgcd->row[3].rp, ralloc,
~sign,
s[1].u, hgcd->row[0].rp, hgcd->row[0].rsize,
s[1].v, hgcd->row[1].rp, hgcd->row[1].rsize);
hgcd->size = hgcd2_mul (hgcd->row + 2, hgcd->alloc,
s, hgcd->row, hgcd->size);
hgcd->sign ^= sign;
ASSERT (hgcd->row[2].rsize > M);
if (hgcd->row[3].rsize <= M)
{
/* Can happen only in the res == 4 case */
ASSERT (res == 4);
/* Backup two steps */
hgcd_backup_1 (hgcd->row + 1, hgcd->size, R.q[1]);
hgcd_backup_1 (hgcd->row, hgcd->size, R.q[0]);
hgcd->qsize[0] = hgcd->qsize[1] = 1;
hgcd->qp[0][0] = R.q[0];
hgcd->qp[1][0] = R.q[1];
return hgcd_jebelean (hgcd, M);
}
HGCD_SWAP4_2 (hgcd->row);
}
}
ASSERT (hgcd->row[1].rsize > M);
for (;;)
{
mp_size_t L = hgcd->row[0].rsize;
mp_size_t ralloc;
struct hgcd2 R;
int res;
/* We don't want hgcd2 to pick up any bits below r0p[M-1], so
don't tell mpn_hgcd2_lehmer_step about them. */
res = mpn_hgcd2_lehmer_step (&R,
hgcd->row[0].rp+M-1, hgcd->row[0].rsize-M+1,
hgcd->row[1].rp+M-1, hgcd->row[1].rsize-M+1);
if (res == 0)
{
/* We must divide to make progress */
res = euclid_step (hgcd, M);
if (res >= 0)
return res;
continue;
}
if (res == 1)
{
mp_limb_t q;
q = R.q[0];
/* The quotient that has been computed for r2 is at most 2
off. So adjust that, and avoid a full division. */
/* Top two rows of R must be the identity matrix, followed
by a row (1, q). */
ASSERT (R.row[0].u == 1 && R.row[0].v == 0);
ASSERT (R.row[1].u == 0 && R.row[1].v == 1);
ASSERT (R.row[2].u == 1 && R.row[2].v == q);
hgcd_update_r_1 (hgcd->row, q);
hgcd->size = hgcd_update_uv_1 (hgcd->row, hgcd->size, q);
ASSERT (hgcd->size < hgcd->alloc);
if (MPN_LEQ_P (hgcd->row[1].rp, hgcd->row[1].rsize,
hgcd->row[2].rp, hgcd->row[2].rsize))
hgcd->size = hgcd_adjust (hgcd->row + 1, hgcd->size, NULL);
ASSERT (hgcd->size < hgcd->alloc);
hgcd->sign = ~hgcd->sign;
HGCD_SWAP4_LEFT (hgcd->row);
continue;
}
/* Now r0 and r1 are always correct. */
/* Store new values in rows 2 and 3, to avoid overlap */
/* Max size after reduction, plus one */
ralloc = hgcd->row[1].rsize + 1;
hgcd->row[2].rsize
= mpn_hgcd2_fix (hgcd->row[2].rp, ralloc,
R.sign,
R.row[0].u, hgcd->row[0].rp, hgcd->row[0].rsize,
R.row[0].v, hgcd->row[1].rp, hgcd->row[1].rsize);
hgcd->row[3].rsize
= mpn_hgcd2_fix (hgcd->row[3].rp, ralloc,
~R.sign,
R.row[1].u, hgcd->row[0].rp, hgcd->row[0].rsize,
R.row[1].v, hgcd->row[1].rp, hgcd->row[1].rsize);
ASSERT (hgcd->row[2].rsize >= L - 1);
ASSERT (hgcd->row[3].rsize >= L - 2);
ASSERT (hgcd->row[2].rsize > M);
ASSERT (hgcd->row[3].rsize > M-1);
hgcd->size = hgcd2_mul (hgcd->row + 2, hgcd->alloc,
R.row, hgcd->row, hgcd->size);
hgcd->sign ^= R.sign;
if (hgcd->row[3].rsize <= M)
{
/* Backup two steps */
hgcd->qsize[1] = hgcd_backup_recover_q (hgcd->row + 1, hgcd->size,
hgcd->qp[1]);
ASSERT (hgcd->qsize[1] > 0);
hgcd->qsize[0] = hgcd_backup_recover_q (hgcd->row, hgcd->size,
hgcd->qp[0]);
ASSERT (hgcd->qsize[0] > 0);
return hgcd_jebelean (hgcd, M);
}
HGCD_SWAP4_2 (hgcd->row);
if (res == 2)
continue;
/* We already know the correct q for computing r2 */
hgcd->qp[0][0] = R.q[0];
hgcd->qsize[0] = 1;
ASSERT (hgcd->size < hgcd->alloc);
hgcd_update_r_1 (hgcd->row, R.q[0]);
hgcd->size = hgcd_update_uv_1 (hgcd->row, hgcd->size, R.q[0]);
ASSERT (hgcd->size < hgcd->alloc);
ASSERT (hgcd->row[2].rsize >= M - 2);
if (hgcd->row[2].rsize <= M)
{
/* Discard r3 */
HGCD_SWAP4_RIGHT (hgcd->row);
hgcd_normalize (hgcd);
hgcd->qp[1][0] = R.q[0];
hgcd->qsize[1] = 1;
hgcd->qsize[0] = hgcd_backup_recover_q (hgcd->row, hgcd->size,
hgcd->qp[0]);
if (!hgcd->qsize[0])
/* Can this happen? */
return 0;
hgcd->sign = ~hgcd->sign;
return hgcd_jebelean (hgcd, M);
}
if (res == 3)
{
/* Drop quotient for r3 */
hgcd->sign = ~hgcd->sign;
HGCD_SWAP4_LEFT (hgcd->row);
continue;
}
ASSERT (res == 4);
ASSERT (hgcd->row[2].rsize > M);
/* We already know the correct q for computing r3 */
ASSERT (hgcd->size < hgcd->alloc);
hgcd_update_r_1 (hgcd->row + 1, R.q[1]);
hgcd->size = hgcd_update_uv_1 (hgcd->row + 1, hgcd->size, R.q[1]);
ASSERT (hgcd->size < hgcd->alloc);
ASSERT (hgcd->row[3].rsize <= M + 1);
/* Apparently not true. Probably because we have leading zeros
when we call hgcd2. */
/* ASSERT (hgcd->row[3].rsize <= M || hgcd->row[3].rp[M] == 1); */
if (hgcd->row[3].rsize <= M)
{
hgcd->qp[0][0] = R.q[0]; hgcd->qsize[0] = 1;
hgcd->qp[1][0] = R.q[1]; hgcd->qsize[1] = 1;
return hgcd_jebelean (hgcd, M);
}
HGCD_SWAP4_2 (hgcd->row);
}
}
mp_size_t
mpn_hgcd_itch (mp_size_t asize)
{
/* Scratch space is needed for calling hgcd. We need space for the
results of all recursive calls. In addition, we need space for
calling hgcd_fix and hgcd_mul, for which N = asize limbs should
be enough. */
/* Limit on the recursion depth */
unsigned k = mpn_hgcd_max_recursion (asize);
return asize + mpn_hgcd_init_itch (asize + 6 * k) + 12 * k;
}
/* Repeatedly divides A by B, until the remainder fits in M =
ceil(asize / 2) limbs. Stores the cofactors, and the last two
quotients, in HGCD. On success, HGCD->row[0, 1, 2] correspond to
remainders that are larger than M limbs, while HGCD->row[3]
corresponds to a remainder that fits in M limbs.
Return 0 on failure (if B or A mod B fits in M limbs), otherwise
return one of 1 - 4 as specified for hgcd_jebelean. */
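/* A minimal calling sketch (an assumption about typical usage, based only
on the init/itch functions in this file; the caller's allocation strategy
may differ):
struct hgcd hgcd;
allocate mpn_hgcd_init_itch (asize) limbs at LIMBS
allocate mpn_hgcd_quotients_init_itch (asize) limbs at QLIMBS
allocate talloc = mpn_hgcd_itch (asize) limbs at TP
mpn_hgcd_init (&hgcd, asize, LIMBS);
mpn_hgcd_quotients_init (&hgcd, asize, QLIMBS);
res = mpn_hgcd (&hgcd, ap, asize, bp, bsize, TP, talloc);
where res = 0 means that B or A mod B already fits in M limbs, and
res = 1 .. 4 is interpreted as for hgcd_jebelean above. */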
int
mpn_hgcd (struct hgcd *hgcd,
mp_srcptr ap, mp_size_t asize,
mp_srcptr bp, mp_size_t bsize,
mp_ptr tp, mp_size_t talloc)
{
mp_size_t N = asize;
mp_size_t M = (N + 1)/2;
mp_size_t n;
mp_size_t m;
struct hgcd R;
mp_size_t itch;
ASSERT (M);
if (bsize <= M)
return 0;
ASSERT (asize >= 2);
/* Initialize, we keep r0 and r1 as the reduced numbers (so far). */
hgcd_start (hgcd, ap, asize, bp, bsize);
if (BELOW_THRESHOLD (N, HGCD_SCHOENHAGE_THRESHOLD))
return hgcd_final (hgcd, M);
/* Reduce the size to M + m + 1. Usually, only one hgcd call is
needed, but we may need multiple calls. When finished, the values
are stored in r0 (potentially large) and r1 (smaller size) */
n = N - M;
m = (n + 1)/2;
/* The second recursive call can use numbers of size up to n+3 */
itch = mpn_hgcd_init_itch (n+3);
ASSERT (itch <= talloc);
mpn_hgcd_init (&R, n+3, tp);
tp += itch; talloc -= itch;
/* Use same storage for quotients */
R.qp[0] = hgcd->qp[0];
R.qp[1] = hgcd->qp[1];
while (hgcd->row[1].rsize > M + m + 1)
{
/* Max size after reduction, plus one */
mp_size_t ralloc = hgcd->row[1].rsize + 1;
int res = mpn_hgcd (&R,
hgcd->row[0].rp + M, hgcd->row[0].rsize - M,
hgcd->row[1].rp + M, hgcd->row[1].rsize - M,
tp, talloc);
if (res == 0)
{
/* We must divide to make progress */
res = euclid_step (hgcd, M);
if (res >= 0)
return res;
}
else if (res <= 2)
{
/* The reason we use hgcd_adjust also when res == 2 is that
either r2 is correct, and we get it for free, or r2 is too
large; then we can correct it with a few bignum subtractions,
and we are *guaranteed* that the result is small enough that
we don't need another run through this loop. */
/* FIXME: For res == 1, the newly computed row[2] will be
the same as the old row[1], so we do some unnecessary
computations. */
/* Store new values in rows 2 and 3, to avoid overlap */
hgcd->row[2].rsize
= mpn_hgcd_fix (M, hgcd->row[2].rp, ralloc,
~R.sign, R.size, &R.row[1],
hgcd->row[0].rp, hgcd->row[1].rp,
tp, talloc);
hgcd->row[3].rsize
= mpn_hgcd_fix (M, hgcd->row[3].rp, ralloc,
R.sign, R.size, &R.row[2],
hgcd->row[0].rp, hgcd->row[1].rp,
tp, talloc);
ASSERT (hgcd->row[2].rsize > M);
ASSERT (hgcd->row[3].rsize > M);
/* Computes the uv matrix for the (possibly incorrect)
values r1, r2. The elements must be smaller than the
correct ones, since they correspond to a too small q. */
hgcd->size = hgcd_mul (hgcd->row + 2, hgcd->alloc,
R.row + 1, R.size,
hgcd->row, hgcd->size,
tp, talloc);
hgcd->sign ^= ~R.sign;
if (MPN_LESS_P (hgcd->row[3].rp, hgcd->row[3].rsize,
hgcd->row[2].rp, hgcd->row[2].rsize))
{
HGCD_SWAP4_2 (hgcd->row);
}
else
{
/* r2 was too large, i.e. q0 too small. In this case we
must have r2 % r1 <= r2 - r1 smaller than M + m + 1. */
mp_limb_t d;
hgcd->size = hgcd_adjust (hgcd->row + 2, hgcd->size, &d);
ASSERT (hgcd->row[3].rsize <= M + m + 1);
if (hgcd->row[3].rsize <= M)
{
/* Backup two steps. We already know one of the quotients. */
mp_limb_t cy;
hgcd->qp[1] = R.qp[0]; hgcd->qsize[1] = R.qsize[0];
/* Add in adjustment */
cy = mpn_add_1 (hgcd->qp[1], hgcd->qp[1], hgcd->qsize[1], d);
if (cy)
hgcd->qp[1][hgcd->qsize[1]++] = cy;
hgcd_backup (hgcd->row + 1, hgcd->size, hgcd->qp[1], hgcd->qsize[1]);
hgcd->qp[0] = R.qp[1];
hgcd->qsize[0] = hgcd_backup_recover_q (hgcd->row, hgcd->size,
hgcd->qp[0]);
if (!hgcd->qsize[0])
/* Can this happen? */
return 0;
return hgcd_jebelean (hgcd, M);
}
HGCD_SWAP4_2 (hgcd->row);
/* Loop always terminates here. */
break;
}
}
else if (res == 3)
{
/* Store new values in rows 2 and 3, to avoid overlap */
hgcd->row[2].rsize
= mpn_hgcd_fix (M, hgcd->row[2].rp, ralloc,
~R.sign, R.size, &R.row[1],
hgcd->row[0].rp, hgcd->row[1].rp,
tp, talloc);
hgcd->row[3].rsize
= mpn_hgcd_fix (M, hgcd->row[3].rp, ralloc,
R.sign, R.size, &R.row[2],
hgcd->row[0].rp, hgcd->row[1].rp,
tp, talloc);
ASSERT (hgcd->row[2].rsize > M);
ASSERT (hgcd->row[3].rsize > M);
hgcd->size = hgcd_mul (hgcd->row + 2, hgcd->alloc,
R.row + 1, R.size,
hgcd->row, hgcd->size,
tp, talloc);
hgcd->sign ^= ~R.sign;
HGCD_SWAP4_2 (hgcd->row);
}
else
{
ASSERT (res == 4);
/* All of r0, r1, r2 and r3 in R are correct.
Compute the new r2 and r3 */
/* Store new values in rows 2 and 3, to avoid overlap */
hgcd->row[2].rsize
= mpn_hgcd_fix (M, hgcd->row[2].rp, ralloc,
R.sign, R.size, &R.row[2],
hgcd->row[0].rp, hgcd->row[1].rp,
tp, talloc);
hgcd->row[3].rsize
= mpn_hgcd_fix (M, hgcd->row[3].rp, ralloc,
~R.sign, R.size, &R.row[3],
hgcd->row[0].rp, hgcd->row[1].rp,
tp, talloc);
ASSERT (hgcd->row[2].rsize > M);
ASSERT (hgcd->row[3].rsize <= M + m + 1);
hgcd->size = hgcd_mul (hgcd->row+2, hgcd->alloc,
R.row+2, R.size,
hgcd->row, hgcd->size,
tp, talloc);
hgcd->sign ^= R.sign;
if (hgcd->row[3].rsize <= M)
{
/* Backup two steps */
hgcd_backup (hgcd->row + 1, hgcd->size, R.qp[1], R.qsize[1]);
hgcd_backup (hgcd->row, hgcd->size, R.qp[0], R.qsize[0]);
hgcd->qp[0] = R.qp[0]; hgcd->qsize[0] = R.qsize[0];
hgcd->qp[1] = R.qp[1]; hgcd->qsize[1] = R.qsize[1];
return hgcd_jebelean (hgcd, M);
}
HGCD_SWAP4_2 (hgcd->row);
/* Always exit the loop. */
break;
}
}
ASSERT (hgcd->row[0].rsize >= hgcd->row[1].rsize);
ASSERT (hgcd->row[1].rsize > M);
ASSERT (hgcd->row[1].rsize <= M + m + 1);
if (hgcd->row[0].rsize > M + m + 1)
{
/* One euclid step to reduce size. */
int res = euclid_step (hgcd, M);
if (res >= 0)
return res;
}
ASSERT (hgcd->row[0].rsize >= hgcd->row[1].rsize);
ASSERT (hgcd->row[0].rsize <= M + m + 1);
ASSERT (hgcd->row[1].rsize > M);
/* Second phase, reduce size until we have one number of size > M
and one of size <= M+1 */
while (hgcd->row[1].rsize > M + 1)
{
mp_size_t k = 2*M - hgcd->row[0].rsize;
mp_size_t n1 = hgcd->row[0].rsize - k;
int res;
ASSERT (k + (n1 + 1)/2 == M);
ASSERT (n1 >= 2);
ASSERT (n1 <= 2*(m + 1));
ASSERT (n1 <= n + 3);
res = mpn_hgcd (&R,
hgcd->row[0].rp + k, hgcd->row[0].rsize - k,
hgcd->row[1].rp + k, hgcd->row[1].rsize - k,
tp, talloc);
if (res == 0)
{
/* The first remainder was small. Then there's a good chance
that the remainder A % B is also small. */
res = euclid_step (hgcd, M);
if (res >= 0)
return res;
continue;
}
if (res == 1)
{
/* Compute possibly incorrect r2 and corresponding u2, v2.
Incorrect matrix elements must be smaller than the
correct ones, since they correspond to a too small q. */
mp_limb_t d = 0; /* quotient adjustment from hgcd_adjust below; stays zero if no adjustment is needed */
/* Row 2 must be (1, q) */
ASSERT (R.qsize[0] + hgcd->size <= hgcd->alloc);
hgcd_update_r (hgcd->row, R.qp[0], R.qsize[0]);
hgcd->size = hgcd_update_uv (hgcd->row, hgcd->size,
R.qp[0], R.qsize[0]);
ASSERT (hgcd->size < hgcd->alloc);
if (!MPN_LESS_P (hgcd->row[3].rp, hgcd->row[3].rsize,
hgcd->row[2].rp, hgcd->row[2].rsize))
hgcd->size = hgcd_adjust (hgcd->row + 1, hgcd->size, &d);
if (hgcd->row[2].rsize <= M)
{
/* Discard r3, and back up one step */
mp_limb_t cy;
HGCD_SWAP4_RIGHT (hgcd->row);
hgcd_normalize (hgcd);
hgcd->qp[1] = R.qp[0];
hgcd->qsize[1] = R.qsize[0];
/* Add in adjustment */
cy = mpn_add_1 (hgcd->qp[1], hgcd->qp[1], hgcd->qsize[1], d);
if (cy)
hgcd->qp[1][hgcd->qsize[1]++] = cy;
hgcd->qp[0] = R.qp[1];
hgcd->qsize[0] = hgcd_backup_recover_q (hgcd->row, hgcd->size,
hgcd->qp[0]);
if (!hgcd->qsize[0])
/* Can this happen? */
return 0;
hgcd->sign = ~hgcd->sign;
return hgcd_jebelean (hgcd, M);
}
HGCD_SWAP4_LEFT (hgcd->row);
hgcd->sign = ~hgcd->sign;
continue;
}
/* Now r0 and r1 are always correct. */
/* It's possible that the first two "new" r:s are the same as the
old ones; in that case, skip recomputing them. */
if (!hgcd_start_row_p (&R.row[0], R.size))
{
/* Store new values in rows 2 and 3, to avoid overlap */
hgcd->row[2].rsize
= mpn_hgcd_fix (k, hgcd->row[2].rp, hgcd->row[0].rsize + 1,
R.sign, R.size, &R.row[0],
hgcd->row[0].rp, hgcd->row[1].rp,
tp, talloc);
hgcd->row[3].rsize
= mpn_hgcd_fix (k, hgcd->row[3].rp, hgcd->row[1].rsize + 1,
~R.sign, R.size, &R.row[1],
hgcd->row[0].rp, hgcd->row[1].rp,
tp, talloc);
ASSERT (hgcd->row[2].rsize > M);
ASSERT (hgcd->row[3].rsize > k);
hgcd->size = hgcd_mul (hgcd->row+2, hgcd->alloc,
R.row, R.size, hgcd->row, hgcd->size,
tp, talloc);
hgcd->sign ^= R.sign;
if (hgcd->row[3].rsize <= M)
{
/* Backup two steps */
hgcd->qsize[1] = hgcd_backup_recover_q (hgcd->row + 1, hgcd->size,
hgcd->qp[1]);
ASSERT (hgcd->qsize[1] > 0);
hgcd->qsize[0] = hgcd_backup_recover_q (hgcd->row, hgcd->size,
hgcd->qp[0]);
ASSERT (hgcd->qsize[0] > 0);
return hgcd_jebelean (hgcd, M);
}
HGCD_SWAP4_2 (hgcd->row);
if (res == 2)
continue;
}
ASSERT (res >= 3);
/* We already know the correct q */
ASSERT (R.qsize[0] + hgcd->size <= hgcd->alloc);
hgcd_update_r (hgcd->row, R.qp[0], R.qsize[0]);
hgcd->size = hgcd_update_uv (hgcd->row, hgcd->size,
R.qp[0], R.qsize[0]);
ASSERT (hgcd->size < hgcd->alloc);
ASSERT (hgcd->row[2].rsize > k);
if (hgcd->row[2].rsize <= M)
{
/* Discard r3 */
HGCD_SWAP4_RIGHT (hgcd->row);
hgcd_normalize (hgcd);
hgcd->qp[1] = R.qp[0];
hgcd->qsize[1] = R.qsize[0];
hgcd->qp[0] = R.qp[1];
hgcd->qsize[0] = hgcd_backup_recover_q (hgcd->row, hgcd->size,
hgcd->qp[0]);
if (!hgcd->qsize[0])
/* Can this happen? */
return 0;
hgcd->sign = ~hgcd->sign;
return hgcd_jebelean (hgcd, M);
}
if (res == 3)
{
/* Drop quotient for r3 */
hgcd->sign = ~hgcd->sign;
HGCD_SWAP4_LEFT (hgcd->row);
continue;
}
ASSERT (hgcd->row[2].rsize > M);
ASSERT (res == 4);
/* We already know the correct q */
ASSERT (R.qsize[1] + hgcd->size <= hgcd->alloc);
hgcd_update_r (hgcd->row + 1, R.qp[1], R.qsize[1]);
hgcd->size = hgcd_update_uv (hgcd->row + 1, hgcd->size,
R.qp[1], R.qsize[1]);
ASSERT (hgcd->size < hgcd->alloc);
ASSERT (hgcd->row[3].rsize <= M + 1);
if (hgcd->row[3].rsize <= M)
{
hgcd->qp[0] = R.qp[0]; hgcd->qsize[0] = R.qsize[0];
hgcd->qp[1] = R.qp[1]; hgcd->qsize[1] = R.qsize[1];
return hgcd_jebelean (hgcd, M);
}
HGCD_SWAP4_2 (hgcd->row);
}
return hgcd_final (hgcd, M);
}