/* mpir/mpn/generic/bgcd.c */

/* Stehlé and Zimmermann's binary recursive GCD algorithm. */
#include <stdio.h> /* for NULL */
#include <string.h>
/* #define WANT_ASSERT 0 */
#include "mpir.h"
#include "gmp-impl.h"
#include "longlong.h"
/* Notation: W = 2^GMP_NUMB_BITS */
#define ODD(x) ((x) & 1)
/* Extract one limb, shifting count bits right

       ________  ________
      |___xh___||___xl___|
            |____r____|
                  >count<
 */
#define EXTRACT_LIMB(count, xh, xl) \
  (((xl) >> (count)) | (((xh) << (GMP_NUMB_BITS - (count))) & GMP_NUMB_MASK))
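/* Illustration (using hypothetical 8-bit limbs, purely for exposition):
   with xh = 0xAB, xl = 0xCD and count = 4, EXTRACT_LIMB gives
   (0xCD >> 4) | ((0xAB << 4) & 0xFF) = 0x0C | 0xB0 = 0xBC, i.e. the
   limb starting 4 bits up in the double limb 0xABCD. */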
#define RSHIFT2(h, l, count) do { \
(l) = ((l)>>(count)) | (((h)<<(GMP_NUMB_BITS - count)) & GMP_NUMB_MASK); \
(h) >>= (count); \
} while (0)
#define MP_LIMB_SIGNED_T_SWAP(x, y) \
do { \
mp_limb_signed_t __mp_limb_signed_t_swap__tmp = (x); \
(x) = (y); \
(y) = __mp_limb_signed_t_swap__tmp; \
} while (0)
#define INT_SWAP(x, y) \
do { \
int __int_swap__tmp = (x); \
(x) = (y); \
(y) = __int_swap__tmp; \
} while (0)
/* Computes 2^(n * GMP_NUMB_BITS) - x. Returns 1 in case x was zero,
otherwise zero. This function is used when a subtraction yields a
borrow, to negate the difference. */
static mp_limb_t
negate_diff (mp_ptr xp, mp_size_t n)
{
mp_size_t i;
ASSERT (n > 0);
for (i = 0; i < n; i++)
{
if (xp[i] > 0)
{
xp[i] = ~xp[i] + 1;
i++;
if (i < n)
mpn_com_n (xp + i, xp + i, n - i);
return 0;
}
}
return 1;
}
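/* Example: for n = 2 and x = {3, 0} (value 3), the loop negates the
   first non-zero limb, giving xp[0] = W - 3, complements the rest,
   giving xp[1] = W - 1, and returns 0; the result (W-1)*W + (W-3) is
   exactly W^2 - 3 = 2^(2*GMP_NUMB_BITS) - 3, as documented above. */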
/* Computes r = u x + v y. Returns the most significant limb, and sets
   *cy to the carry out of that limb. rp and xp may be the same. */
static mp_limb_t
addmul2_1 (mp_ptr rp, mp_limb_t *cy, mp_srcptr xp, mp_srcptr yp, mp_size_t n,
mp_limb_t u, mp_limb_t v)
{
mp_limb_t h1;
mp_limb_t h2;
ASSERT (rp != yp);
h1 = mpn_mul_1 (rp, xp, n, u);
h2 = mpn_addmul_1 (rp, yp, n, v);
h1 += h2;
*cy = (h1 < h2);
return h1;
}
/* Computes r = u x - v y. rp and xp may be the same. Returns most
significant limb. Sets *rsign to reflect the sign of the result. */
static mp_limb_t
submul2_1 (mp_ptr rp, int *rsign, mp_srcptr xp, mp_srcptr yp, mp_size_t n,
mp_limb_t u, mp_limb_t v)
{
mp_limb_t hi;
mp_limb_t cy;
ASSERT (rp != yp);
hi = mpn_mul_1 (rp, xp, n, u);
cy = mpn_submul_1 (rp, yp, n, v);
if (cy <= hi)
{
hi -= cy;
*rsign = 0;
}
else
{
/* We need to negate the difference,
- ( (hi - cy) 2^k + r) = (cy - hi) 2^k - r = (cy - hi - 1) 2^k + 2^k - r */
*rsign = -1;
hi = cy - hi - 1 + negate_diff (rp, n);
}
return hi;
}
/* Returns the number of zero limbs at the least significant end of x,
   and sets *bits to the number of trailing zero bits of the first
   non-zero limb. If x == 0, returns n and sets *bits = 0. */
static mp_size_t
power_of_2 (unsigned *bits, mp_srcptr xp, mp_size_t n)
{
mp_size_t zlimbs;
for (zlimbs = 0; zlimbs < n && xp[zlimbs] == 0; zlimbs++)
;
if (zlimbs == n || ODD (xp[zlimbs]))
*bits = 0;
else
{
int count;
count_trailing_zeros (count, xp[zlimbs]);
*bits = count;
}
return zlimbs;
}
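/* Example: for n = 3 and x = {0, 8, 7}, the first non-zero limb is
   xp[1] = 8 = 2^3, so the function returns zlimbs = 1 and sets
   *bits = 3; the 2-adic valuation of x is thus GMP_NUMB_BITS + 3. */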
struct bgcd_matrix1
{
unsigned j;
/* Sign/magnitude representation. sign = 0 for positive, -1 for
   negative */
signed char sign[2][2];
mp_limb_t R[2][2];
};
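/* Storing signs as 0 (non-negative) or -1 (negative, all bits set)
   means that s0 ^ s1 is -1 exactly when the two signs differ, so
   tests of the form (s0 ^ s1) < 0 below select subtraction, and
   s ^= t composes sign flips. */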
/* Multiplies a,b by 2^(-2j) R. Changes the signs of the rows of R if
needed to make a, b >= 0. Needs n limbs of temporary storage.
Returns normalized length. */
static mp_size_t
bgcd_matrix1_apply (struct bgcd_matrix1 *m,
mp_ptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp)
{
/* Top two limbs */
mp_limb_t a0, a1, b0, b1;
int s0;
int s1;
int ts;
unsigned j;
/* Make a copy of a */
MPN_COPY (tp, ap, n);
s0 = m->sign[0][0];
s1 = m->sign[0][1];
if ( (s0 ^ s1) < 0)
{
a0 = submul2_1 (ap, &ts, ap, bp, n, m->R[0][0], m->R[0][1]);
a1 = 0;
ts ^= s0;
m->sign[0][0] ^= ts;
m->sign[0][1] ^= ts;
}
else
{
a0 = addmul2_1 (ap, &a1, ap, bp, n, m->R[0][0], m->R[0][1]);
m->sign[0][0] = m->sign[0][1] = 0;
}
s0 = m->sign[1][0];
s1 = m->sign[1][1];
if ( (s0 ^ s1) < 0)
{
b0 = submul2_1 (bp, &ts, bp, tp, n, m->R[1][1], m->R[1][0]);
b1 = 0;
ts ^= s1;
m->sign[1][0] ^= ts;
m->sign[1][1] ^= ts;
}
else
{
b0 = addmul2_1 (bp, &b1, bp, tp, n, m->R[1][1], m->R[1][0]);
m->sign[1][0] = m->sign[1][1] = 0;
}
/* Shift 2j bits right */
j = 2 * m->j;
if (j == GMP_NUMB_BITS)
{
ASSERT (n >= 2);
ASSERT (ap[0] == 0);
ASSERT (bp[0] == 0);
ASSERT (a1 == 0);
ASSERT (b1 == 0);
MPN_COPY_INCR (ap, ap+1, n-1);
MPN_COPY_INCR (bp, bp+1, n-1);
ap[n-1] = a0;
bp[n-1] = b0;
}
else if (j < GMP_NUMB_BITS)
{
/* This is the only case where n == 1 is allowed */
ASSERT (a1 == 0);
ASSERT (b1 == 0);
ASSERT_NOCARRY (mpn_rshift (ap, ap, n, j));
ASSERT_NOCARRY (mpn_rshift (bp, bp, n, j));
ap[n-1] |= (a0 << (GMP_NUMB_BITS - j)) & GMP_NUMB_MASK;
bp[n-1] |= (b0 << (GMP_NUMB_BITS - j)) & GMP_NUMB_MASK;
ASSERT ( (a0 >> j) == 0);
ASSERT ( (b0 >> j) == 0);
}
else
{
ASSERT (n >= 2);
j -= GMP_NUMB_BITS;
ASSERT (ap[0] == 0);
ASSERT (bp[0] == 0);
ASSERT_NOCARRY (mpn_rshift (ap, ap + 1, n - 1, j));
ASSERT_NOCARRY (mpn_rshift (bp, bp + 1, n - 1, j));
ap[n-2] |= (a0 << (GMP_NUMB_BITS - j)) & GMP_NUMB_MASK;
bp[n-2] |= (b0 << (GMP_NUMB_BITS - j)) & GMP_NUMB_MASK;
ap[n-1] = (a0 >> j) | (a1 << (GMP_NUMB_BITS - j));
bp[n-1] = (b0 >> j) | (b1 << (GMP_NUMB_BITS - j));
}
ASSERT (ODD (ap[0]));
ASSERT (!ODD (bp[0]));
return n - ( (ap[n-1] | bp[n-1]) == 0);
}
/* When hbgcd produces j bits of reduction, the matrix elements fit in
   floor (11 (j+1) / 8) bits.

   If the input to hbgcd is n limbs, and k bits of reduction are
   desired, then 2k <= n GMP_NUMB_BITS and j < k. The elements of the
   resulting matrix must then fit in

     floor (11 (j+1) / 8) <= 11 k / 8 <= 11 n * GMP_NUMB_BITS / 16

   bits. Rounding up to an integral number of limbs, we get the limit

     ceil (11 n / 16)

   Table for small values:

     n   maximum size of matrix entries (limbs)
     1   1
     2   2
     3   3
     4   3
     5   4
     6   5
     7   5
     8   6

   For the computations, we need one extra limb.
*/
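/* For example (assuming GMP_NUMB_BITS = 64): with n = 4 limbs we have
   k <= 128, so the entries fit in 11 * 128 / 8 = 176 bits, i.e. 3
   limbs, in agreement with ceil (11 * 4 / 16) = 3 in the table above. */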
void
mpn_bgcd_matrix_init (struct bgcd_matrix *m, mp_ptr limbs, mp_size_t alloc)
{
m->alloc = alloc;
m->j = 0;
m->n = 1;
memset (m->sign, 0, sizeof (m->sign));
m->R[0][0] = limbs;
m->R[0][1] = limbs + alloc;
m->R[1][0] = limbs + 2 * alloc;
m->R[1][1] = limbs + 3 * alloc;
MPN_ZERO (limbs, 4 * alloc);
m->R[0][0][0] = m->R[1][1][0] = 1;
}
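/* The matrix starts out as the identity with j = 0, representing the
   trivial reduction: throughout this file, (a'; b') = 2^(-2j) R (a; b)
   with gcd (a, b) = gcd (a', b') (see hbgcd2 and bgcd_matrix_apply
   below). The four entries share one contiguous block of 4 * alloc
   limbs. */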
/* Needs zlimbs + 1 + m->n <= m->alloc limbs of temporary storage */
static void
bgcd_matrix_mul_q (struct bgcd_matrix *m, mp_size_t zlimbs, unsigned zbits,
mp_ptr qp, int qsign, mp_ptr tp)
{
unsigned i;
mp_size_t n, nn;
mp_limb_t grow;
ASSERT (tp != qp);
ASSERT (qp[zlimbs] < ((mp_limb_t ) 1 << zbits));
/* To multiply a column (u ; v) by (0, 2^j ; 2^j, q), set
u = 2^j u + q v
v = 2^j v
and then swap u, v. We need temporary storage for t = q * v.
Size should be increased by zlimbs or zlimbs + 1.
*/
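/* Check: (0, 2^j ; 2^j, q) (u ; v) = (2^j v ; 2^j u + q v); the code
   computes u = 2^j u + q v and v = 2^j v and then swaps, which gives
   exactly that column. */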
grow = 0;
n = m->n;
nn = n + zlimbs;
ASSERT (nn + 1 <= m->alloc);
for (i = 0; i < 2; i++)
{
mp_ptr up = m->R[0][i];
mp_ptr vp = m->R[1][i];
int us = m->sign[0][i];
int vs = m->sign[1][i];
/* Set u <<= j */
if (zbits)
{
up[nn] = mpn_lshift (up + zlimbs, up, n, zbits);
}
else
{
MPN_COPY_DECR (up + zlimbs, up, n);
ASSERT (up[nn] == 0);
}
/* We could make some clever use of negate_diff, but it's easier to
clear the low limbs first. */
MPN_ZERO (up, zlimbs);
/* Set t = q * v, and u += t */
if (n > zlimbs)
mpn_mul (tp, vp, n, qp, zlimbs + 1);
else
mpn_mul (tp, qp, zlimbs + 1, vp, n);
if ( (us ^ vs ^ qsign) < 0)
{
mp_limb_t cy;
cy = mpn_sub_n (up, up, tp, nn + 1);
if (cy > 0)
{
ASSERT_NOCARRY (negate_diff (up, nn + 1));
us = ~us;
}
}
else
{
ASSERT_NOCARRY (mpn_add_n (up, up, tp, nn + 1));
}
grow |= up[nn];
/* Set v <<= j */
if (zbits)
{
vp[nn] = mpn_lshift (vp + zlimbs, vp, n, zbits);
grow |= vp[nn];
}
else
{
MPN_COPY_DECR (vp + zlimbs, vp, n);
ASSERT (vp[nn] == 0);
}
MPN_ZERO (vp, zlimbs);
MP_PTR_SWAP (m->R[0][i], m->R[1][i]);
m->sign[0][i] = vs;
m->sign[1][i] = us;
}
m->n = nn + (grow != 0);
m->j += zlimbs * GMP_NUMB_BITS + zbits;
}
/* Needs m->n limbs of temporary storage. */
static void
bgcd_matrix_mul_1 (struct bgcd_matrix *m, const struct bgcd_matrix1 *m1,
mp_ptr tp)
{
unsigned i;
mp_limb_t grow;
mp_limb_t r00 = m1->R[0][0];
mp_limb_t r01 = m1->R[0][1];
mp_limb_t r10 = m1->R[1][0];
mp_limb_t r11 = m1->R[1][1];
int s00 = m1->sign[0][0];
int s01 = m1->sign[0][1];
int s10 = m1->sign[1][0];
int s11 = m1->sign[1][1];
/* Carries, for the unlikely case that R grows by two limbs. */
mp_limb_t c[2][2];
mp_size_t n = m->n;
ASSERT (n + 1 <= m->alloc);
grow = 0;
for (i = 0; i < 2; i++)
{
mp_ptr up = m->R[0][i];
mp_ptr vp = m->R[1][i];
int us = m->sign[0][i];
int vs = m->sign[1][i];
int ts;
mp_limb_t uh, vh;
/* Make a copy of u */
MPN_COPY (tp, up, n);
if ( (s00 ^ s01 ^ us ^ vs) < 0)
{
uh = submul2_1 (up, &ts, up, vp, n, r00, r01);
c[0][i] = 0;
ts ^= s00 ^ us;
}
else
{
uh = addmul2_1 (up, &c[0][i], up, vp, n, r00, r01);
ts = s00 ^ us;
}
if ( (s10 ^ s11 ^ us ^ vs) < 0)
{
int s;
vh = submul2_1 (vp, &s, vp, tp, n, r11, r10);
c[1][i] = 0;
vs ^= s11 ^ s;
}
else
{
vh = addmul2_1 (vp, &c[1][i], vp, tp, n, r11, r10);
vs ^= s11;
}
up[n] = uh;
vp[n] = vh;
grow |= (uh | vh);
m->sign[0][i] = ts;
m->sign[1][i] = vs;
}
if ( (c[0][0] | c[0][1] | c[1][0] | c[1][1]) != 0)
{
/* Growth by two limbs. */
m->n += 2;
ASSERT (m->n + 1 <= m->alloc);
m->R[0][0][n+1] = c[0][0];
m->R[0][1][n+1] = c[0][1];
m->R[1][0][n+1] = c[1][0];
m->R[1][1][n+1] = c[1][1];
}
else
{
m->n += (grow != 0);
ASSERT (m->n + 1 <= m->alloc);
}
m->j += m1->j;
}
/* Computes r = u x + v y. Needs temporary space 2*(xn + un). The
least significant xn + un limbs are stored at rp. Returns carry.
rp == tp is allowed. */
static mp_limb_t
addmul2_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t xn,
mp_srcptr up, mp_srcptr vp, mp_size_t un, mp_ptr tp)
{
mp_size_t n = xn + un;
mp_ptr t1p = tp;
mp_ptr t2p = tp + n;
if (xn >= un)
{
mpn_mul (t1p, xp, xn, up, un);
mpn_mul (t2p, yp, xn, vp, un);
}
else
{
mpn_mul (t1p, up, un, xp, xn);
mpn_mul (t2p, vp, un, yp, xn);
}
return mpn_add_n (rp, t1p, t2p, n);
}
/* Computes r = u x - v y. Needs temporary space 2*(xn + un). Result,
xn + un limbs, is stored at rp. Returns -1 or zero depending on the
sign of the difference.
rp may be the same as either input or tp. */
static int
submul2_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t xn,
mp_srcptr up, mp_srcptr vp, mp_size_t un, mp_ptr tp)
{
mp_size_t n = xn + un;
mp_ptr t1p = tp;
mp_ptr t2p = tp + n;
if (xn >= un)
{
mpn_mul (t1p, xp, xn, up, un);
mpn_mul (t2p, yp, xn, vp, un);
}
else
{
mpn_mul (t1p, up, un, xp, xn);
mpn_mul (t2p, vp, un, yp, xn);
}
while (t1p[n-1] == t2p[n-1])
{
rp[--n] = 0;
if (n == 0)
return 0;
}
if (t1p[n-1] > t2p[n-1])
{
ASSERT_NOCARRY (mpn_sub_n (rp, t1p, t2p, n));
return 0;
}
else
{
ASSERT (t1p[n-1] < t2p[n-1]);
ASSERT_NOCARRY (mpn_sub_n (rp, t2p, t1p, n));
return -1;
}
}
/* Sets m = s * m. Needs temporary space 2 * (m->n + s->n) + m->n = 3 * m->n + 2 * s->n */
static void
bgcd_matrix_mul (struct bgcd_matrix *m, const struct bgcd_matrix *s,
mp_ptr tp)
{
unsigned i;
mp_limb_t grow = 0;
mp_ptr r00 = s->R[0][0];
mp_ptr r01 = s->R[0][1];
mp_ptr r10 = s->R[1][0];
mp_ptr r11 = s->R[1][1];
int s00 = s->sign[0][0];
int s01 = s->sign[0][1];
int s10 = s->sign[1][0];
int s11 = s->sign[1][1];
mp_size_t mn = m->n;
mp_size_t sn = s->n;
mp_size_t n = m->n + s->n;
/* Space for a copy of u */
mp_ptr cp = tp + 2 * n;
ASSERT (n + 1 <= m->alloc);
for (i = 0; i < 2; i++)
{
mp_ptr up = m->R[0][i];
mp_ptr vp = m->R[1][i];
int us = m->sign[0][i];
int vs = m->sign[1][i];
int ts;
/* Make a copy of u */
MPN_COPY (cp, up, mn);
/* Set u = r00 * u + r01 * v */
/* If r00 u and r01 v have different signs, we should subtract */
ts = us ^ s00;
if ( (s00 ^ us ^ s01 ^ vs) < 0)
{
ts ^= submul2_n (up, cp, vp, mn, r00, r01, sn, tp);
up[n] = 0;
}
else
{
up[n] = addmul2_n (up, cp, vp, mn, r00, r01, sn, tp);
grow |= up[n];
}
/* Set v = r10 (old) u + r11 v */
/* If r10 u and r11 v have different signs, we should subtract */
if ( (s10 ^ us ^ s11 ^ vs) < 0)
{
vs = us ^ s10 ^ submul2_n (vp, cp, vp, mn, r10, r11, sn, tp);
vp[n] = 0;
}
else
{
vp[n] = addmul2_n (vp, cp, vp, mn, r10, r11, sn, tp);
vs = us ^ s10;
grow |= vp[n];
}
m->sign[0][i] = ts;
m->sign[1][i] = vs;
}
if (grow > 0)
m->n = n + 1;
else
{
/* We don't need full normalization, stripping one leading zero
limb should be sufficient to get size below the bound. */
m->n = n - (m->R[0][0][n-1] == 0 && m->R[0][1][n-1] == 0
&& m->R[1][0][n-1] == 0 && m->R[1][1][n-1] == 0);
}
ASSERT (m->n + 1 <= m->alloc);
m->j += s->j;
}
/* Sets a = a + c * 2^(shift * GMP_NUMB_BITS),

              _________________________
             |____________________a____|
   +    |__________________c_|
                             <---- shift ---->

   Stores MAX(an, cn + shift) limbs of the result at ap, and returns
   carry.
*/
static mp_limb_t
bgcd_fix_add (mp_ptr ap, mp_size_t an, mp_srcptr cp, mp_size_t cn, mp_size_t shift)
{
if (cn + shift < an)
{
/* Can this really happen? */
ASSERT_ALWAYS (0 == "Can this happen?");
return mpn_add (ap + shift, ap + shift, an - shift, cp, cn);
}
else if (shift < an)
return mpn_add (ap + shift, cp, cn,
ap + shift, an - shift);
else
{
MPN_COPY (ap + shift, cp, cn);
if (shift > an)
MPN_ZERO (ap + an, shift - an);
return 0;
}
}
/* Sets a = a - c * 2^(shift * GMP_NUMB_BITS),

              _________________________
             |____________________a____|
   -    |__________________c_|
                             <---- shift ---->

   Stores the cn + shift limbs of the absolute value at ap, and
   returns 0 or -1 depending on the sign.
*/
static mp_size_t
bgcd_fix_sub (mp_ptr ap, mp_size_t an, mp_srcptr cp, mp_size_t cn, mp_size_t shift)
{
while (an > cn + shift)
{
if (ap[an-1] == 0)
an--;
else
{
ASSERT_NOCARRY (mpn_sub (ap + shift,
ap + shift, an - shift, cp, cn));
return 0;
}
}
while (cn > 0 && cn + shift > an)
{
if (cp[cn - 1] == 0)
ap[shift + --cn] = 0;
else
{
/* Result must be negative */
if (shift == 0)
ASSERT_NOCARRY (mpn_sub (ap, cp, cn, ap, an));
else
{
/* Negate the lower part of a */
if (shift <= an)
{
ASSERT_NOCARRY (mpn_sub (ap + shift, cp, cn, ap + shift, an - shift));
if (negate_diff (ap, shift) == 0)
MPN_DECR_U (ap + shift, cn, 1);
}
else
{
if (negate_diff (ap, an) == 0)
{
mp_size_t i;
for (i = an; i < shift; i++)
ap[i] = GMP_NUMB_MASK;
MPN_DECR_U (ap + shift, cn, 1);
}
else
MPN_ZERO (ap + an, shift - an);
}
}
return -1;
}
}
while (cn > 0)
{
if (ap[shift + cn - 1] == cp[cn - 1])
ap[shift + --cn] = 0;
else if (ap[shift + cn - 1] > cp[cn - 1])
{
/* Result must be positive */
ASSERT_NOCARRY (mpn_sub_n (ap + shift, ap + shift, cp, cn));
return 0;
}
else
{
/* Result must be negative */
ASSERT (ap[shift + cn - 1] < cp[cn - 1]);
ASSERT_NOCARRY (mpn_sub_n(ap + shift, cp, ap + shift, cn));
if (shift > 0 && negate_diff (ap, shift) == 0)
MPN_DECR_U (ap + shift, cn, 1);
return -1;
}
}
return 0;
}
/* Computes (C ; D) = 2^(-2j) R (A; B) =
2^(k * GMP_NUMB_BITS - 2j) R (a ; b) + (c ; d)
where a, b are stored at ap[k, ..., n-1], bp[k, ..., n-1],
and c, d are stored at ap[0, ..., l-1], bp[0, ..., l-1].
Results C, D are also stored at ap, bp. If necessary to get
non-negative values, flip the signs of the rows of the matrix R.
Sizes:
Input c,d are l limbs. The output C,D must always fit in n limbs
(and we do *not* have any temporary space beyond the n:th limb).
The temporaries R(a;b) must fit in n limbs. This is because
#(R(a;b)) <= #R + (n-k) GMP_NUMB_BITS + 1. From the bounds #R <=
11(j+1)/8 and k GMP_NUMB_BITS >= 2(j+1) we get
#(R(a;b)) <= n GMP_NUMB_BITS + (11 - 16)(j+1)/8 + 1
= n GMP_NUMB_BITS - (5j -3)/8 < n GMP_NUMB_BITS.
For shifted numbers 2^(k * GMP_NUMB_BITS - 2j) R (a ; b) we get
k * GMP_NUMB_BITS - 2j + #R + (n-k) GMP_NUMB_BITS + 1
<= n * GMP_NUMB_BITS + (11 j + 11 - 16j)/8 + 1
<= n * GMP_NUMB_BITS + (17 - 5j)/8
Taking the floor of this expression, we see that if j >= 2, n
limbs should be sufficient. A more careful study of j = 1 shows
that n limbs is sufficient for all j >= 1.
Temporary storage needed: 4*n.
*/
static mp_size_t
bgcd_matrix_apply (struct bgcd_matrix *m,
mp_ptr ap, mp_ptr bp, mp_size_t n,
mp_size_t k, mp_size_t l, mp_ptr tp)
{
mp_ptr r00 = m->R[0][0];
mp_ptr r01 = m->R[0][1];
mp_ptr r10 = m->R[1][0];
mp_ptr r11 = m->R[1][1];
int s00 = m->sign[0][0];
int s01 = m->sign[0][1];
int s10 = m->sign[1][0];
int s11 = m->sign[1][1];
mp_ptr cp = tp + 2*n;
mp_ptr dp = tp + 3*n;
mp_size_t nn;
mp_limb_t ch;
mp_limb_t dh;
int cs;
int ds;
unsigned shift;
unsigned shift_bits;
mp_size_t shift_limbs;
/* This function probably works for k == 1, but then it's better to
use hbgcd2 and bgcd_matrix1_apply. */
ASSERT (k >= 2);
ASSERT (l <= k);
ASSERT (2 * m->j < k * GMP_NUMB_BITS);
nn = n - k + m->n;
ASSERT (nn <= n);
/* There are three chances for nn to grow: addmul2_n, left shift,
and bgcd_fix_add. */
cs = s00;
/* Compute R * (a ; b), and store result at cp, dp */
if ( (s00 ^ s01) < 0)
{
cs ^= submul2_n (cp, ap + k, bp + k, n - k,
r00, r01, m->n, tp);
ch = 0;
}
else
ch = addmul2_n (cp, ap + k, bp + k, n - k,
r00, r01, m->n, tp);
ds = s10;
if ( (s10 ^ s11) < 0)
{
ds ^= submul2_n (dp, ap + k, bp + k, n - k,
r10, r11, m->n, tp);
dh = 0;
}
else
dh = addmul2_n (dp, ap + k, bp + k, n - k,
r10, r11, m->n, tp);
if ( (ch | dh) > 0)
{
ASSERT (nn < n);
cp[nn] = ch;
dp[nn] = dh;
nn++;
}
shift = k * GMP_NUMB_BITS - 2 * m->j;
shift_limbs = shift / GMP_NUMB_BITS;
shift_bits = shift % GMP_NUMB_BITS;
if (shift_bits > 0)
{
ch = mpn_lshift (cp, cp, nn, shift_bits);
dh = mpn_lshift (dp, dp, nn, shift_bits);
if ( (ch | dh) > 0)
{
ASSERT (nn < n);
cp[nn] = ch;
dp[nn] = dh;
nn++;
}
}
/* Update a */
ASSERT (nn + shift_limbs <= n);
if (cs >= 0)
ch = bgcd_fix_add (ap, l, cp, nn, shift_limbs);
else
{
int s = bgcd_fix_sub (ap, l, cp, nn, shift_limbs);
ch = 0;
/* Flip signs of first matrix row, if needed. */
m->sign[0][0] ^= s;
m->sign[0][1] ^= s;
}
/* Update b */
if (ds >= 0)
dh = bgcd_fix_add (bp, l, dp, nn, shift_limbs);
else
{
int s = bgcd_fix_sub (bp, l, dp, nn, shift_limbs);
dh = 0;
/* Flip signs of second matrix row, if needed. */
m->sign[1][0] ^= s;
m->sign[1][1] ^= s;
}
nn += shift_limbs;
if (nn < l)
{
/* Can this really happen? */
ASSERT_ALWAYS (0 == "Can this happen?");
nn = l;
}
if ( (ch | dh) > 0)
{
ASSERT (nn < n);
ap[nn] = ch;
bp[nn] = dh;
nn++;
}
return nn;
}
/* Input: b is odd, k < GMP_NUMB_BITS. (Common case is a odd, k > 0).
Returns q, such that -2^k <= q < 2^k and v (a + q b) > k, i.e. there
are at least k+1 zeros at the least significant end.
*/
static inline mp_limb_signed_t
bdiv_1 (mp_limb_t a, mp_limb_t b, unsigned k)
{
mp_limb_t b_inv;
mp_limb_signed_t q;
mp_limb_t bit;
#if 0
#define BDIV_TABLE_SIZE 3
#define BDIV_TABLE_MASK ((1 << (BDIV_TABLE_SIZE + 1)) - 1)
/* Table of -a b^{-1} mod 2^{k+1}, indexed by
[b_k b_{k-1} ... b_1 a_k a_{k-1} a_{k-2} ... a_0]
*/
static const char qtable[1L << (BDIV_TABLE_SIZE * 2 + 1)] =
{
/* binv = 1 */ 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
/* binv = 11 */ 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11,
/* binv = 13 */ 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13,
/* binv = 7 */ 0, 9, 2, 11, 4, 13, 6, 15, 8, 1, 10, 3, 12, 5, 14, 7,
/* binv = 9 */ 0, 7, 14, 5, 12, 3, 10, 1, 8, 15, 6, 13, 4, 11, 2, 9,
/* binv = 3 */ 0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3,
/* binv = 5 */ 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5,
/* binv = 15 */ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
};
ASSERT (ODD (b));
ASSERT (k < GMP_NUMB_BITS);
if (k <= BDIV_TABLE_SIZE)
{
bit = CNST_LIMB(1) << k;
q = qtable[ (((b-1) & BDIV_TABLE_MASK) << BDIV_TABLE_SIZE )
| (a & BDIV_TABLE_MASK) ] & (2*bit - 1);
}
else
#endif
{
mp_limb_t mask;
/* We need (k+1)-bit inverse */
b_inv = modlimb_invert_table[(b/2) & 0x7f];
if (k >= 8)
{
b_inv = 2 * b_inv - b_inv * b_inv * b;
if (k >= 16)
{
b_inv = 2 * b_inv - b_inv * b_inv * b;
if (GMP_NUMB_BITS > 32 && k >= 32)
{
b_inv = 2 * b_inv - b_inv * b_inv * b;
if (GMP_NUMB_BITS > 64 && k >= 64)
{
int inv_bits = 64;
do
{
b_inv = 2 * b_inv - b_inv * b_inv * b;
inv_bits *= 2;
}
while (inv_bits <= k);
}
}
}
}
bit = CNST_LIMB(1) << k;
mask = 2*bit - 1;
ASSERT ( (b_inv * b & mask) == 1);
q = (- a * b_inv) & mask;
}
if (q >= bit)
q -= 2*bit;
return q;
}
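/* Example: a = 3, b = 5, k = 2. The 3-bit inverse of 5 is 5
   (5*5 = 25 = 1 mod 8), so q = (-3 * 5) & 7 = 1, and indeed
   a + q b = 8 has v (a + q b) = 3 > k. */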
/* Input is two two-limb numbers a, b (typically the least significant
limbs of some larger numbers). The integer k is the desired number
of bits of reduction.
Returns an array R of single limbs, and an integer j < k. If c, d
are formed as

  / c \     -2j  / a \
  |   | =  2   R |   |
  \ d /          \ b /
then c is odd, d is even, and gcd(a,b) = gcd(c, d). R is
constructed by repeated bdiv, where iteration stops when either
v (d) >= k - j
or another iteration would cause R to overflow.
Returns 0 if no reduction is possible, otherwise j.
*/
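/* Note that R is built from elementary factors (0, 2^bits ; 2^bits, q),
   each of determinant -2^(2*bits), so det R = +-2^(2j); this is why
   dividing by 2^(2j) preserves the (odd) gcd, as claimed above. */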
/* two's complement version */
static unsigned
hbgcd2 (mp_limb_t ah, mp_limb_t al, mp_limb_t bh, mp_limb_t bl,
unsigned k,
struct bgcd_matrix1 *m)
{
mp_limb_signed_t r00, r01, r10, r11;
unsigned s;
unsigned j;
unsigned v;
ASSERT (ODD (al));
ASSERT (!ODD (bl));
/* Multiply by (0, 2^bits; 2^bits, q) */
#define MUL_Q(R00, R01, R10, R11) do { \
R00 = (R00 << v) + q * R10; \
R01 = (R01 << v) + q * R11; \
R10 <<= v; \
R11 <<= v; \
\
s += v; \
s += ((R00 ^ (R00 >> 1)) | (R01 ^ (R01 >> 1))) >> s; \
j += v; \
} while (0)
/* Add q b to a. We treat q as unsigned in the multiplication. If
   q < 0 and th * w + tl = (q + w) bl is the result of the unsigned
   multiplication, then what we want is

     (q + w - w) (bh w + bl) = q bh w + th w + tl - w bl
                             = (q bh + th - bl) w + tl   (mod w^2)
*/
#define ADDMUL2_Q(AH, AL, BH, BL) do { \
mp_limb_t th, tl; \
mp_limb_t mask = q >> (GMP_LIMB_BITS - 1); \
\
umul_ppmm (th, tl, BL, q); \
\
AL += tl; \
AH += th + (AL < tl) + q * BH - (mask & BL); \
} while (0)
/* NOTE: We can reduce at most GMP_NUMB_BITS - 1 bits. Reducing
   GMP_NUMB_BITS bits would be possible if b were preshifted, so that
   we had 2 * GMP_NUMB_BITS + 1 bits of b. But a reduction of
   GMP_NUMB_BITS bits, without R overflowing, is quite unlikely
   (probability on the order of 1%). */
if (k > GMP_NUMB_BITS)
k = GMP_NUMB_BITS;
if (bl == 0)
return 0;
count_trailing_zeros (v, bl);
if (v >= k)
return 0;
RSHIFT2 (bh, bl, v);
if (v == GMP_NUMB_BITS - 1)
{
/* Special case that the two's complement code can't handle */
mp_limb_signed_t q;
mp_limb_signed_t sign;
q = bdiv_1 (al, bl, v);
ASSERT (ODD (q));
m->R[0][0] = 0;
m->R[0][1] = m->R[1][0] = CNST_LIMB(1) << (GMP_NUMB_BITS - 1);
m->sign[0][0] = m->sign[0][1] = m->sign[1][0] = 0;
sign = q >> (GMP_LIMB_BITS - 1);
m->R[1][1] = (q + sign) ^ sign;
m->sign[1][1] = sign;
return (m->j = GMP_NUMB_BITS - 1);
}
r00 = r11 = 1;
r01 = r10 = 0;
j = 0;
s = 1;
/* We need 2*(k-j) bits of accuracy */
/* Loop while we need more than one limb of accuracy. */
for (;;)
{
mp_limb_signed_t q;
unsigned oldv;
/* Divide by b */
q = bdiv_1 (al, bl, v);
ASSERT (ODD (q));
MUL_Q (r00, r01, r10, r11);
ASSERT (s < GMP_NUMB_BITS);
ADDMUL2_Q(ah, al, bh, bl);
ASSERT ( (al & ((CNST_LIMB(1) << (v + 1)) - 1)) == 0);
if (al == 0)
goto split_swap;
oldv = v;
count_trailing_zeros (v, al);
ASSERT (v > oldv);
RSHIFT2 (ah, al, v);
v -= oldv;
if (j + v >= k || s + v >= GMP_NUMB_BITS)
goto split_swap;
else if (s + v == GMP_NUMB_BITS - 1)
goto final_swap;
else if (2*(k-j) <= GMP_NUMB_BITS)
goto middle;
/* Divide by a */
q = bdiv_1 (bl, al, v);
ASSERT (ODD (q));
MUL_Q (r10, r11, r00, r01);
ASSERT (s < GMP_NUMB_BITS);
ADDMUL2_Q(bh, bl, ah, al);
ASSERT ( (bl & ((CNST_LIMB(1) << (v + 1)) - 1)) == 0);
if (bl == 0)
goto split;
oldv = v;
count_trailing_zeros (v, bl);
RSHIFT2 (bh, bl, v);
v -= oldv;
if (j + v >= k || s + v >= GMP_NUMB_BITS)
goto split;
else if (s + v == GMP_NUMB_BITS - 1)
goto final;
else if (2*(k-j) <= GMP_NUMB_BITS)
break;
}
/* Now we can forget about ah and bh. On the other hand, we need to
check for an overflowing R */
for (;;)
{
mp_limb_signed_t q;
unsigned oldv;
/* Divide by b */
q = bdiv_1 (al, bl, v);
ASSERT (ODD (q));
MUL_Q (r00, r01, r10, r11);
ASSERT (s < GMP_NUMB_BITS);
/* Update a, b */
al += q*bl;
ASSERT ( (al & ((CNST_LIMB(1) << (v + 1)) - 1)) == 0);
if (al == 0)
goto split_swap;
oldv = v;
count_trailing_zeros (v, al);
al >>= v;
v -= oldv;
if (j + v >= k || s + v >= GMP_NUMB_BITS)
goto split_swap;
else if (s + v == GMP_NUMB_BITS - 1)
goto final_swap;
middle:
/* Divide by a */
q = bdiv_1 (bl, al, v);
ASSERT (ODD (q));
MUL_Q (r10, r11, r00, r01);
ASSERT (s < GMP_NUMB_BITS);
/* Update a, b */
bl += q*al;
ASSERT ( (bl & ((CNST_LIMB(1) << (v + 1)) - 1)) == 0);
if (bl == 0)
goto split;
oldv = v;
count_trailing_zeros (v, bl);
bl >>= v;
v -= oldv;
if (j + v >= k || s + v >= GMP_NUMB_BITS)
goto split;
else if (s + v == GMP_NUMB_BITS - 1)
break;
}
#undef MUL_Q
#undef ADDMUL2_Q
final:
if (0)
{
final_swap:
MP_LIMB_SIGNED_T_SWAP (r00, r10);
MP_LIMB_SIGNED_T_SWAP (r01, r11);
MP_LIMB_T_SWAP (al, bl);
}
if (j + v < k && s + v < GMP_NUMB_BITS)
{
/* One more quotient will fit in sign-magnitude representation,
but two's complement calculation may overflow. */
mp_limb_signed_t q;
mp_limb_signed_t t0, t1;
mp_limb_signed_t sign;
q = bdiv_1 (al, bl, v);
ASSERT (ODD (q));
t0 = r00 << v;
t1 = q * r10; /* Can't overflow, since q > -2^{-v} */
r00 = t0 + t1;
/* Correct sign given by the majority function */
sign = ((t0 & t1) | (r00 & t0) | (r00 & t1)) >> (GMP_LIMB_BITS - 1);
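/* If t0 + t1 does not overflow, its sign bit is already correct and
   agrees with the majority; if it does overflow, t0 and t1 share a
   sign bit, which the majority then reports. Either way, the majority
   of the three sign bits is the sign of the exact sum. */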
m->R[1][0] = (r00 + sign) ^ sign;
m->sign[1][0] = sign;
t0 = r01 << v;
t1 = q * r11;
r01 = t0 + t1;
sign = ((t0 & t1) | (r01 & t0) | (r01 & t1) ) >> (GMP_LIMB_BITS - 1);
m->R[1][1] = (r01 + sign) ^ sign;
m->sign[1][1] = sign;
sign = r10 >> (GMP_LIMB_BITS - 1);
m->R[0][0] = ( (r10 << v) + sign) ^ sign;
m->sign[0][0] = sign;
sign = r11 >> (GMP_LIMB_BITS - 1);
m->R[0][1] = ( (r11 << v) + sign) ^ sign;
m->sign[0][1] = sign;
j += v;
}
else
{
mp_limb_signed_t sign;
if (0)
{
split_swap:
MP_LIMB_SIGNED_T_SWAP (r00, r10);
MP_LIMB_SIGNED_T_SWAP (r01, r11);
}
split:
/* Split into signed/magnitude representation */
sign = r00 >> (GMP_LIMB_BITS - 1);
m->R[0][0] = (r00 + sign) ^sign;
m->sign[0][0] = sign;
sign = r01 >> (GMP_LIMB_BITS - 1);
m->R[0][1] = (r01 + sign) ^sign;
m->sign[0][1] = sign;
sign = r10 >> (GMP_LIMB_BITS - 1);
m->R[1][0] = (r10 + sign) ^sign;
m->sign[1][0] = sign;
sign = r11 >> (GMP_LIMB_BITS - 1);
m->R[1][1] = (r11 + sign) ^sign;
m->sign[1][1] = sign;
}
ASSERT (j > 0);
m->j = j;
return j;
}
/* Compute the inverse of -b,
x * b = -1 (mod 2^(n * GMP_NUMB_BITS))
using the iteration formula
x_{k+1} = 2 x_k + x_k^2 b
In fact, when x_k is IN limbs, the only thing we need is the middle
part, limb IN to 2 IN, of the product x_k^2 b. The lower limbs are
known already, and the higher limbs are not significant.
Needs 3*xn limbs of temporary storage.
*/
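/* Why the iteration doubles the precision: if x b = -1 (mod 2^m),
   write e = 1 + x b = 0 (mod 2^m); then
   1 + (2 x + x^2 b) b = 1 + 2 x b + (x b)^2 = (1 + x b)^2 = e^2
                       = 0 (mod 2^(2m)). */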
/* NOTE: Handles input b of fewer limbs than output x. We should
usually have 2 zlimbs <= n */
static void
invert_limbs (mp_ptr xp, mp_size_t xn, mp_srcptr bp, mp_size_t bn, mp_ptr tp)
{
mp_limb_t b0;
mp_limb_t x0;
ASSERT (bn >= 1);
if (bn > xn)
bn = xn;
ASSERT (!MPN_OVERLAP_P (xp, xn, bp, xn));
ASSERT (!MPN_OVERLAP_P (xp, xn, tp, 3*xn));
ASSERT (!MPN_OVERLAP_P (bp, bn, tp, 3*xn));
b0 = bp[0];
modlimb_invert (x0, -b0);
xp[0] = x0;
if (xn >= 2)
{
/* Iterate once, x1 = 2 x0 + x0^2 b, and keep only the two
least significant limbs. */
mp_limb_t sh, sl;
mp_limb_t t, dummy;
/* Compute sh, sl = x0^2 */
umul_ppmm (sh, sl, x0, x0);
/* Compute th, tl = x0^2 b (ignoring higher limbs) */
umul_ppmm (t, dummy, sl, b0); /* x0^2 b0 */
/* We always have a carry from the low limb which we don't
compute. */
xp[1] = t + sh * b0 + 1;
if (bn > 1)
xp[1] += sl * bp[1];
if (xn > 2)
{
int count;
int bitsize;
mp_size_t mask;
mp_size_t in;
/* We need at most xn + 1 limbs to compute the square,
but at most xn limbs to store it. */
mp_ptr sp = tp;
/* And this product can be at most 2*xn */
mp_ptr pp = tp + xn;
count_leading_zeros (count, (mp_limb_t) xn-1);
bitsize = GMP_LIMB_BITS - count;
ASSERT (bitsize >= 2);
mask = 1L << (bitsize - 2);
for (in = 2; in < xn; mask /= 2)
{
mp_limb_t nn;
ASSERT (mask != 0);
/* We know the in least significant words of the
inverse, and iterate
x = 2*x + x^2 * b
to double the size. */
mpn_sqr_n (sp, xp, in);
/* Drop the high limb if we don't need it. */
nn = 2 * in - ((-xn & mask) != 0);
ASSERT (nn <= xn);
/* FIXME: We don't need the nn most significant
limbs. And we don't need the least significant in
limbs either, as we already know them. */
mpn_mul (pp, sp, nn, bp, MIN (nn, bn));
/* For the addition of 2x_k, why shouldn't we add in the
most significant limb of x_k? */
mpn_add_1 (xp + in, pp + in, nn - in, 1);
in = nn;
}
}
}
}
/* For a, b odd, a of size n, b of size n - zlimbs, j = zlimbs *
   GMP_NUMB_BITS + zbits, compute q = - a b^-1 mod 2^(j+1), with
   -2^j < q < 2^j, and set

     r <-- 2^(-j) (a + q b)

   Stores the n - zlimbs least significant limbs of r at ap, and
   returns the most significant limb (which is always 0 or 1). Stores
   the quotient, zlimbs + 1 limbs, at qp (if zbits == 0 and zlimbs > 0,
   then the most significant limb of q is always zero).

   Needs 4 * zlimbs of temporary storage to compute the quotient, and
   n limbs to compute the remainder (but not at the same time).
*/
#define BDIVMOD_N_ITCH(n, zlimbs) (MAX (4*(zlimbs), (n)))
static mp_limb_t
bdivmod_n (mp_ptr qp, int *qsign,
int *rsign,
mp_ptr ap, mp_srcptr bp, mp_size_t n,
mp_size_t zlimbs, unsigned zbits,
mp_ptr tp)
{
mp_limb_signed_t q;
mp_limb_t hi;
mp_size_t scratch = BDIVMOD_N_ITCH (n, zlimbs);
ASSERT (ODD (ap[0]));
ASSERT (ODD (bp[0]));
ASSERT (!MPN_OVERLAP_P (qp, zlimbs + 1, ap, n));
ASSERT (!MPN_OVERLAP_P (qp, zlimbs + 1, bp, n - zlimbs));
ASSERT (!MPN_OVERLAP_P (ap, n, bp, n - zlimbs));
ASSERT (!MPN_OVERLAP_P (tp, scratch, ap, n));
ASSERT (!MPN_OVERLAP_P (tp, scratch, bp, n - zlimbs));
if (zlimbs == 0)
hi = 0;
else
{
/* First compute q such that a + q*b has zlimbs zero limbs at
the least significant end. */
invert_limbs (tp, zlimbs, bp, n - zlimbs, tp + zlimbs);
/* Let q = t * a. We only want the least significant
limbs. */
mpn_mul_n (tp + zlimbs, tp, ap, zlimbs);
MPN_COPY (qp, tp + zlimbs, zlimbs);
/* Compute q*b. We don't really need the zlimb least
significant limbs. */
if (zlimbs <= n - zlimbs)
mpn_mul (tp, bp, n - zlimbs, qp, zlimbs);
else
mpn_mul (tp, qp, zlimbs, bp, n - zlimbs);
/* We must get a carry when adding the lower limbs (since a odd
=> [ap, zlimbs] != 0). This is actually the only piece of
this code that depends on a being odd. */
MPN_COPY_INCR (ap, ap + zlimbs, n - zlimbs);
n -= zlimbs;
hi = mpn_add_n (ap, ap, tp + zlimbs, n);
hi += mpn_add_1 (ap, ap, n, 1);
}
/* Now we have zbits + 1 bits left to clear of ap[0] */
q = bdiv_1 (ap[0], bp[0], zbits);
if (q >= 0)
{
*qsign = 0;
*rsign = 0;
/* Can't overflow, since q < 2^(GMP_NUMB_BITS-1) and hi <= 1 */
hi += mpn_addmul_1 (ap, bp, n, q);
}
else
{
mp_limb_t cy;
*qsign = -1;
q = -q;
cy = mpn_submul_1 (ap, bp, n, q);
if (cy <= hi)
{
*rsign = 0;
hi -= cy;
}
else
{
*rsign = -1;
ASSERT_NOCARRY (negate_diff (ap, n));
hi = cy - hi - 1;
}
if (zlimbs > 0)
{
ASSERT_NOCARRY (negate_diff (qp, zlimbs));
q--;
}
}
qp[zlimbs] = q;
if (zbits > 0)
{
mpn_rshift (ap, ap, n, zbits);
ap[n-1] |= (hi << (GMP_NUMB_BITS - zbits)) & GMP_NUMB_MASK;
hi >>= zbits;
}
return hi;
}
/* Takes as input two n-limb numbers (typically the least
significant limbs of some larger numbers), a odd, b even, and an
integer k.
Returns an array R, an integer j, and two numbers a', b' such that

  / a' \     -2j  / a \
  |    | =  2   R |   |
  \ b' /          \ b /

where

  a' is odd,
  v (2^j a') < k,
  v (2^j b') >= k
*/
/* Needs temporary space for calling bgcd_matrix1_apply,
bgcd_matrix_mul_1, bdivmod_n, bgcd_matrix_mul_q, and storing q. The
most demanding is bdivmod_n, for which we need 5*n/2 limbs. */
#define HBGCD_N_BASE_ITCH(n) (5*(n)/2)
static mp_size_t
hbgcd_n_base (mp_ptr ap, mp_ptr bp, mp_size_t n, unsigned k,
struct bgcd_matrix *m, mp_ptr tp)
{
ASSERT (2 * k <= n * GMP_NUMB_BITS);
ASSERT (MPN_BGCD_MATRIX_ITCH (n) <= m->alloc);
/* Should be initialized already */
ASSERT (m->j == 0);
ASSERT (n >= 2);
ASSERT (ODD (ap[0]));
ASSERT (!ODD (bp[0]));
for (;;)
{
mp_size_t zlimbs;
unsigned zbits;
ASSERT (ODD (ap[0]));
ASSERT (!ODD (bp[0]));
zlimbs = power_of_2 (&zbits, bp, n);
if (zlimbs * GMP_NUMB_BITS + zbits + m->j >= k)
break;
if (zlimbs == 0)
{
/* FIXME: Some duplication with next case. */
struct bgcd_matrix1 m1;
/* Here, we may have n == 1, and then the values ap[1] and
bp[1] will not be used. */
hbgcd2 (ap[1], ap[0], bp[1], bp[0], k - m->j, &m1);
ASSERT (m1.j > 0);
/* Modifies the signs of the matrix elements. */
n = bgcd_matrix1_apply (&m1, ap, bp, n, tp);
bgcd_matrix_mul_1 (m, &m1, tp);
}
else
{
mp_size_t i;
mp_limb_t cy;
int qsign;
int rsign;
ASSERT (zlimbs <= (n-1) /2);
if (zbits)
ASSERT_NOCARRY (mpn_rshift (bp, bp + zlimbs, n - zlimbs, zbits));
else
MPN_COPY_INCR (bp, bp + zlimbs, n - zlimbs);
/* Needs zlimbs + 1 to store the quotient, and MAX (n, 4
zlimbs) for the computations. Since zlimbs <= (n-1)/2,
the total need is at most
(n + 1) / 2 + MAX (n, 2*(n-1)) = n/2 + 1/2 + 2*n - 2 <= 5*n/2
*/
cy = bdivmod_n (tp, &qsign, &rsign, ap, bp, n,
zlimbs, zbits, tp + zlimbs + 1);
/* Needs zlimbs + 1 + m->n <= 2*n limbs of temporary storage. */
bgcd_matrix_mul_q (m, zlimbs, zbits, tp, qsign, tp + zlimbs + 1);
if (rsign < 0)
{
m->sign[1][0] = ~m->sign[1][0];
m->sign[1][1] = ~m->sign[1][1];
}
n -= zlimbs;
if (cy > 0)
{
ap[n] = cy;
bp[n] = 0;
n++;
}
/* Swapping pointers would confuse our caller, so we have to swap
the contents. */
for (i = 0; i<n; i++)
MP_LIMB_T_SWAP (ap[i], bp[i]);
}
}
return n;
}
/* For input of size n, we need 11*n/2 + 3 limbs of temporary storage.
This is sufficient for the first recursive call and the division
step. For the final recursive call, we need 3*n/2 + 7 for the
matrix, and 4*(n-1) limbs for the work after the recursive call.
The recursive call needs 11*n/4 + 3 limbs of storage, so this works out
because
4*(n-1) - (11*n/4 + 3) = (16 - 11) n / 4 - 4 - 3 >= 3, for n >= 8.
Actually, approximately 5*n/4 limbs are unused by the recursive
call, so perhaps it is possible to rearrange the storage so that we
get by with only 17*n/4???
*/
mp_size_t
mpn_hbgcd_n_itch (mp_size_t n)
{
if (BELOW_THRESHOLD (n, HBGCD_THRESHOLD))
return HBGCD_N_BASE_ITCH (n);
else
return 11*n/2 + 3;
}
mp_size_t
mpn_hbgcd_n (mp_ptr ap, mp_ptr bp, mp_size_t n, unsigned k,
struct bgcd_matrix *m, mp_ptr tp)
{
/* Original size */
mp_size_t l = n;
if (BELOW_THRESHOLD (n, HBGCD_THRESHOLD))
return hbgcd_n_base (ap, bp, n, k, m, tp);
/* Should be initialized already */
ASSERT (m->j == 0);
ASSERT (n >= 2);
ASSERT (ODD (ap[0]));
ASSERT (!ODD (bp[0]));
/* FIXME: Should we allow smaller n? */
ASSERT (2 * k <= n * GMP_NUMB_BITS);
ASSERT (MPN_BGCD_MATRIX_ITCH (n) <= m->alloc);
{
/* First recursive call */
mp_size_t zlimbs;
unsigned zbits;
unsigned k1;
mp_size_t n1;
mp_size_t nn;
zlimbs = power_of_2 (&zbits, bp, n);
if (zlimbs * GMP_NUMB_BITS + zbits >= k)
goto done;
/* Split on a limb boundary */
n1 = (n+1)/2;
k1 = n1 * (GMP_NUMB_BITS / 2);
ASSERT (k1 >= (k+1)/2);
if (zlimbs * GMP_NUMB_BITS + zbits < k1)
{
ASSERT (n1 < n);
nn = mpn_hbgcd_n (ap, bp, n1, k1, m, tp);
ASSERT (m->j > 0);
/* Needs space 4 * l */
n = bgcd_matrix_apply (m, ap, bp, n, n1, nn, tp);
}
ASSERT (m->j < k1);
}
ASSERT (m->n <= l);
{
/* Division. This stage needs temporary storage of 5 * l / 2 limbs
(same analysis as the division case in hbgcd_n_base) */
mp_size_t zlimbs;
unsigned zbits;
mp_size_t i;
mp_limb_t cy;
int qsign;
int rsign;
zlimbs = power_of_2 (&zbits, bp, n);
if (zlimbs * GMP_NUMB_BITS + zbits + m->j >= k)
goto done;
ASSERT (zlimbs <= (n-1) /2);
if (zbits)
ASSERT_NOCARRY (mpn_rshift (bp, bp + zlimbs, n - zlimbs, zbits));
else
MPN_COPY_INCR (bp, bp + zlimbs, n - zlimbs);
cy = bdivmod_n (tp, &qsign, &rsign, ap, bp, n,
zlimbs, zbits, tp + zlimbs + 1);
bgcd_matrix_mul_q (m, zlimbs, zbits, tp, qsign, tp + zlimbs + 1);
if (rsign < 0)
{
m->sign[1][0] = ~m->sign[1][0];
m->sign[1][1] = ~m->sign[1][1];
}
n -= zlimbs;
if (cy > 0)
{
ap[n] = cy;
bp[n] = 0;
n++;
}
/* Swapping pointers would confuse our caller, so we have to swap
the contents. */
for (i = 0; i<n; i++)
MP_LIMB_T_SWAP (ap[i], bp[i]);
}
ASSERT (m->j >= (k+1)/2);
ASSERT (n <= 2 + (27 * l + 5) / 32);
ASSERT (n < l);
{
/* Second and final recursive call */
mp_size_t zlimbs;
unsigned zbits;
unsigned k1;
mp_size_t n1;
mp_size_t nn;
zlimbs = power_of_2 (&zbits, bp, n);
if (zlimbs * GMP_NUMB_BITS + zbits + m->j >= k)
goto done;
k1 = k - m->j;
ASSERT (k1 <= (k+1) / 2);
if (k1 <= GMP_NUMB_BITS)
{
struct bgcd_matrix1 m1;
hbgcd2 (ap[1], ap[0], bp[1], bp[0], k1, &m1);
ASSERT (m1.j > 0);
n = bgcd_matrix1_apply (&m1, ap, bp, n, tp);
bgcd_matrix_mul_1 (m, &m1, tp);
/* May need another one */
k1 = k - m->j;
if (hbgcd2 (ap[1], ap[0], bp[1], bp[0], k1, &m1) > 0)
{
n = bgcd_matrix1_apply (&m1, ap, bp, n, tp);
bgcd_matrix_mul_1 (m, &m1, tp);
}
}
else
{
struct bgcd_matrix m1;
/* Need at least 2 * k1 bits */
n1 = (2 * k1 + GMP_NUMB_BITS - 1) / GMP_NUMB_BITS;
ASSERT (n1 < n);
ASSERT (n1 <= l/2);
/* Each matrix element is of size at most
MPN_BGCD_MATRIX_ITCH (n1) <= (11*l/2 + 31) / 16
= 11 * l/32 + 31 / 16 < 3*l/8 + 31/16 - l/32 <= 3*l/8 + 27/16
Hence, 3*l/2 + 7 limbs should be enough for the entire matrix.
*/
mpn_bgcd_matrix_init (&m1, tp, MPN_BGCD_MATRIX_ITCH (n1));
ASSERT (4 * MPN_BGCD_MATRIX_ITCH (n1) <= 3*l/2 + 7);
tp += 3*l/2 + 7;
nn = mpn_hbgcd_n (ap, bp, n1, k1, &m1, tp);
ASSERT (m1.j > 0);
/* Needs space 4*n <= 4*(l - 1) */
n = bgcd_matrix_apply (&m1, ap, bp, n, n1, nn, tp);
/* Needs space 3 * (MPN_BGCD_MATRIX_ITCH (l) - 1) + 2 * (MPN_BGCD_MATRIX_ITCH(n1) - 1)
<= 3 * (11 * l + 15) / 16 + 2 * (11 * l/2 + 15) / 16
= 44 * l / 16 + (45 + 30)/16
< 11 * l / 4 + 5 < 3*l + 5
==> need <= 3*l + 4 <= 4 * (l - 1),
where l denotes the original input size. */
bgcd_matrix_mul (m, &m1, tp);
}
}
done:
ASSERT (m->j < k);
return n;
}
/* For a hbgcd step, we need
4*MPN_BGCD_MATRIX_ITCH(n/2) + MAX(hbgcd_n_itch(n/2), 4*n)
<= 3*n/2 + 7 + MAX (11*n/4 + 3, 4*n)
== (3/2 + 4) * n + 7 = 11*n/2 + 7
For a division step, we need
zlimbs + 1 + (MAX (4 * zlimbs, n)) <= n + 4*(n-1) < 5*n
For both kinds of steps, 11*n/2 + 7 is sufficient.
*/
#define BGCD_N_ITCH(n) (11*(n)/2 + 7)
static mp_size_t
bgcd_n (mp_ptr gp, mp_ptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp)
{
ASSERT (ODD (ap[0]));
if (ODD (bp[0]))
{
int cmp = mpn_cmp (ap, bp, n);
if (cmp == 0)
{
MPN_NORMALIZE (ap, n);
MPN_COPY (gp, ap, n);
return n;
}
else if (cmp > 0)
MP_PTR_SWAP (ap, bp);
ASSERT_NOCARRY (mpn_sub_n (bp, bp, ap, n));
}
while (ABOVE_THRESHOLD (n, BGCD_THRESHOLD))
{
struct bgcd_matrix m;
mp_size_t zlimbs;
unsigned zbits;
mp_size_t n1;
mp_size_t l;
mp_size_t alloc;
zlimbs = power_of_2 (&zbits, bp, n);
if (zlimbs == n)
{
MPN_NORMALIZE (ap, n);
MPN_COPY (gp, ap, n);
return n;
}
n1 = n/2;
alloc = MPN_BGCD_MATRIX_ITCH (n1);
ASSERT (4*alloc <= 3*n/2 + 7);
mpn_bgcd_matrix_init (&m, tp + 4 * n, alloc);
l = mpn_hbgcd_n (ap, bp, n1, n1 * GMP_NUMB_BITS / 2, &m, tp);
if (m.j > 0)
n = bgcd_matrix_apply (&m, ap, bp, n, n1, l, tp);
else
{
mp_limb_t cy;
int qsign;
int rsign;
if (zbits)
ASSERT_NOCARRY (mpn_rshift (bp, bp + zlimbs, n - zlimbs, zbits));
else
MPN_COPY_INCR (bp, bp + zlimbs, n - zlimbs);
cy = bdivmod_n (tp, &qsign, &rsign, ap, bp, n,
zlimbs, zbits, tp + zlimbs + 1);
n -= zlimbs;
if (cy > 0)
{
ap[n] = cy;
bp[n] = 0;
n++;
}
MP_PTR_SWAP (ap, bp);
}
}
/* Use old algorithm */
{
mp_size_t an;
mp_size_t bn;
an = bn = n;
MPN_NORMALIZE (ap, an);
MPN_NORMALIZE (bp, bn);
if (bn == 0)
{
MPN_COPY (gp, ap, an);
return an;
}
/* The *smaller* number must be odd */
if (an > bn || (an == bn && ap[an-1] > bp[an - 1]))
{
int bits;
while (bp[0] == 0)
{
bp++;
bn--;
}
count_trailing_zeros (bits, bp[0]);
if (bits > 0)
{
ASSERT_NOCARRY (mpn_rshift (bp, bp, bn, bits));
bn -= (bp[bn-1] == 0);
}
MPN_PTR_SWAP (ap, an, bp, bn);
}
return mpn_gcd (gp, bp, bn, ap, an);
}
}
/* Needs BDIVMOD_N_ITCH(an, an - bn) + an - bn + 1 for the initial division,
and BGCD_N_ITCH (bn) for the bgcd_n call. I.e. the max of
an - bn + 1 + MAX(4 * (an - bn), an) <= 5*an
11 * bn/2 + 7
*/
#define BGCD_ITCH(an, bn) MAX(5*(an), BGCD_N_ITCH(bn))
static mp_size_t
bgcd (mp_ptr gp, mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t bn, mp_ptr tp)
{
/* mpn_gcd uses different conventions than the rest of the code. */
ASSERT (ODD (bp[0]));
ASSERT (an >= bn);
if (an > bn)
{
mp_limb_t zlimbs;
unsigned zbits;
zlimbs = power_of_2 (&zbits, ap, an);
ASSERT (zlimbs < an);
if (zbits > 0)
ASSERT_NOCARRY (mpn_rshift (ap, ap + zlimbs, an - zlimbs, zbits));
else if (zlimbs > 0)
MPN_COPY_INCR (ap, ap + zlimbs, an - zlimbs);
an = an - zlimbs;
an -= (ap[an-1] == 0);
if (an < bn)
MPN_ZERO (ap + an, bn - an);
else if (an > bn)
{
mp_size_t qn = an - bn + 1;
mp_limb_t cy;
int qsign;
int rsign;
cy = bdivmod_n (tp, &qsign, &rsign, ap, bp, an,
an - bn, 0, tp + qn);
an = bn;
ASSERT (!ODD (ap[0]));
if (cy > 0)
{
ASSERT (cy == 1);
mpn_rshift1 (ap, ap, an);
ap[an-1] |= GMP_NUMB_HIGHBIT;
}
}
}
/* Now both numbers are of size bn, and b is still odd */
return bgcd_n (gp, bp, ap, bn, tp);
}
mp_size_t
mpn_bgcd (mp_ptr gp, mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t bn)
{
mp_ptr tp;
mp_size_t gn;
TMP_DECL;
TMP_MARK;
tp = TMP_ALLOC_LIMBS (BGCD_ITCH (an, bn));
gn = bgcd (gp, ap, an, bp, bn, tp);
TMP_FREE;
return gn;
}
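/* Illustrative use (a sketch, not part of the library): following the
   conventions asserted in bgcd above, the caller supplies an >= bn >= 1
   with b odd; a and b are clobbered, and the gcd is written to gp:

     mp_limb_t g[GN];   (GN is a placeholder for the caller's bound;
                         bn limbs always suffice, since gcd(a,b) <= b)
     mp_size_t gn = mpn_bgcd (g, a, an, b, bn);

   gn limbs of the gcd are then stored at g. */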