/* Stehle's and Zimmermann's binary algorithm */
|
|
#include <stdio.h> /* for NULL */
|
|
#include <string.h>
|
|
|
|
/* #define WANT_ASSERT 0 */
|
|
|
|
#include "mpir.h"
|
|
#include "gmp-impl.h"
|
|
#include "longlong.h"
|
|
|
|
/* Notation: W = 2^GMP_NUMB_BITS */
|
|
|
|
#define ODD(x) ((x) & 1)
|
|
|
|
/* Extract one limb, shifting count bits right

       ________ ________
      |___xh___||___xl___|
            |____r____|
                       >count<
 */
|
|
|
|
#define EXTRACT_LIMB(count, xh, xl) \
|
|
(((xl) >> (count) ) | (((xh) << (GMP_NUMB_BITS - count)) & GMP_NUMB_MASK))
|
|
|
|
#define RSHIFT2(h, l, count) do { \
|
|
(l) = ((l)>>(count)) | (((h)<<(GMP_NUMB_BITS - count)) & GMP_NUMB_MASK); \
|
|
(h) >>= (count); \
|
|
} while (0)
|
|
|
|
#define MP_LIMB_SIGNED_T_SWAP(x, y) \
|
|
do { \
|
|
mp_limb_signed_t __mp_limb_signed_t_swap__tmp = (x); \
|
|
(x) = (y); \
|
|
(y) = __mp_limb_signed_t_swap__tmp; \
|
|
} while (0)
|
|
|
|
#define INT_SWAP(x, y) \
|
|
do { \
|
|
int __int_swap__tmp = (x); \
|
|
(x) = (y); \
|
|
(y) = __int_swap__tmp; \
|
|
} while (0)
|
|
|
|
/* Computes 2^(n * GMP_NUMB_BITS) - x. Returns 1 in case x was zero,
|
|
otherwise zero. This function is used when a subtraction yields a
|
|
borrow, to negate the difference. */
|
|
static mp_limb_t
|
|
negate_diff (mp_ptr xp, mp_size_t n)
|
|
{
|
|
mp_size_t i;
|
|
|
|
ASSERT (n > 0);
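
/* Scan for the least significant non-zero limb; negate it, then
   complement all more significant limbs. If every limb is zero there
   is nothing to negate, and we return 1. */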
|
|
|
|
for (i = 0; i < n; i++)
|
|
{
|
|
if (xp[i] > 0)
|
|
{
|
|
xp[i] = ~xp[i] + 1;
|
|
i++;
|
|
if (i < n)
|
|
mpn_com_n (xp + i, xp + i, n - i);
|
|
|
|
return 0;
|
|
}
|
|
}
|
|
return 1;
|
|
}
|
|
|
|
/* Computes r = u x + v y. Returns most significant limb. rp and xp
|
|
may be the same. */
|
|
|
|
static mp_limb_t
|
|
addmul2_1 (mp_ptr rp, mp_limb_t *cy, mp_srcptr xp, mp_srcptr yp, mp_size_t n,
|
|
mp_limb_t u, mp_limb_t v)
|
|
{
|
|
mp_limb_t h1;
|
|
mp_limb_t h2;
|
|
|
|
ASSERT (rp != yp);
|
|
|
|
h1 = mpn_mul_1 (rp, xp, n, u);
|
|
h2 = mpn_addmul_1 (rp, yp, n, v);
|
|
|
|
h1 += h2;
|
|
*cy = (h1 < h2);
|
|
|
|
return h1;
|
|
}
|
|
|
|
/* Computes r = u x - v y. rp and xp may be the same. Returns most
|
|
significant limb. Sets *rsign to reflect the sign of the result. */
|
|
static mp_limb_t
|
|
submul2_1 (mp_ptr rp, int *rsign, mp_srcptr xp, mp_srcptr yp, mp_size_t n,
|
|
mp_limb_t u, mp_limb_t v)
|
|
{
|
|
mp_limb_t hi;
|
|
mp_limb_t cy;
|
|
|
|
ASSERT (rp != yp);
|
|
|
|
hi = mpn_mul_1 (rp, xp, n, u);
|
|
|
|
cy = mpn_submul_1 (rp, yp, n, v);
|
|
if (cy <= hi)
|
|
{
|
|
hi -= cy;
|
|
*rsign = 0;
|
|
}
|
|
else
|
|
{
|
|
/* We need to negate the difference,
|
|
|
|
- ( (hi - cy) 2^k + r) = (cy - hi) 2^k - r = (cy - hi - 1) 2^k + 2^k - r */
|
|
|
|
*rsign = -1;
|
|
hi = cy - hi - 1 + negate_diff (rp, n);
|
|
}
|
|
return hi;
|
|
}
|
|
|
|
/* Returns the number of zero limbs at the least significant end of x,
|
|
and the number of zero bits of the first non-zero limb. If x == 0,
|
|
returns n and sets *bits = 0. */
|
|
static mp_size_t
|
|
power_of_2 (unsigned *bits, mp_srcptr xp, mp_size_t n)
|
|
{
|
|
mp_size_t zlimbs;
|
|
|
|
for (zlimbs = 0; zlimbs < n && xp[zlimbs] == 0; zlimbs++)
|
|
;
|
|
|
|
if (zlimbs == n || ODD (xp[zlimbs]))
|
|
*bits = 0;
|
|
else
|
|
{
|
|
int count;
|
|
|
|
count_trailing_zeros (count, xp[zlimbs]);
|
|
*bits = count;
|
|
}
|
|
|
|
return zlimbs;
|
|
}
|
|
|
|
struct bgcd_matrix1
|
|
{
|
|
unsigned j;
|
|
|
|
/* Sign/magnitude representation. sign = 0 for positive, -1 for
   negative */
|
|
signed char sign[2][2];
|
|
mp_limb_t R[2][2];
|
|
};
|
|
|
|
/* Multiplies a,b by 2^(-2j) R. Changes the signs of the rows of R if
|
|
needed to make a, b >= 0. Needs n limbs of temporary storage.
|
|
Returns normalized length. */
|
|
static mp_size_t
|
|
bgcd_matrix1_apply (struct bgcd_matrix1 *m,
|
|
mp_ptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp)
|
|
{
|
|
/* Top two limbs */
|
|
mp_limb_t a0, a1, b0, b1;
|
|
int s0;
|
|
int s1;
|
|
int ts;
|
|
unsigned j;
|
|
|
|
/* Make a copy of a */
|
|
MPN_COPY (tp, ap, n);
|
|
|
|
s0 = m->sign[0][0];
|
|
s1 = m->sign[0][1];
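
/* Signs are stored as 0 (non-negative) or -1 (negative), so s0 ^ s1 is
   negative exactly when the two coefficients differ in sign, and the
   row must then be applied as a subtraction. */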
|
|
|
|
if ( (s0 ^ s1) < 0)
|
|
{
|
|
a0 = submul2_1 (ap, &ts, ap, bp, n, m->R[0][0], m->R[0][1]);
|
|
a1 = 0;
|
|
ts ^= s0;
|
|
m->sign[0][0] ^= ts;
|
|
m->sign[0][1] ^= ts;
|
|
}
|
|
else
|
|
{
|
|
a0 = addmul2_1 (ap, &a1, ap, bp, n, m->R[0][0], m->R[0][1]);
|
|
m->sign[0][0] = m->sign[0][1] = 0;
|
|
}
|
|
|
|
s0 = m->sign[1][0];
|
|
s1 = m->sign[1][1];
|
|
|
|
if ( (s0 ^ s1) < 0)
|
|
{
|
|
b0 = submul2_1 (bp, &ts, bp, tp, n, m->R[1][1], m->R[1][0]);
|
|
b1 = 0;
|
|
ts ^= s1;
|
|
m->sign[1][0] ^= ts;
|
|
m->sign[1][1] ^= ts;
|
|
}
|
|
else
|
|
{
|
|
b0 = addmul2_1 (bp, &b1, bp, tp, n, m->R[1][1], m->R[1][0]);
|
|
m->sign[1][0] = m->sign[1][1] = 0;
|
|
}
|
|
|
|
/* Shift 2j bits right */
|
|
j = 2 * m->j;
|
|
|
|
if (j == GMP_NUMB_BITS)
|
|
{
|
|
ASSERT (n >= 2);
|
|
ASSERT (ap[0] == 0);
|
|
ASSERT (bp[0] == 0);
|
|
ASSERT (a1 == 0);
|
|
ASSERT (b1 == 0);
|
|
|
|
MPN_COPY_INCR (ap, ap+1, n-1);
|
|
MPN_COPY_INCR (bp, bp+1, n-1);
|
|
|
|
ap[n-1] = a0;
|
|
bp[n-1] = b0;
|
|
}
|
|
else if (j < GMP_NUMB_BITS)
|
|
{
|
|
/* This is the only case where n == 1 is allowed */
|
|
ASSERT (a1 == 0);
|
|
ASSERT (b1 == 0);
|
|
ASSERT_NOCARRY (mpn_rshift (ap, ap, n, j));
|
|
ASSERT_NOCARRY (mpn_rshift (bp, bp, n, j));
|
|
|
|
ap[n-1] |= (a0 << (GMP_NUMB_BITS - j)) & GMP_NUMB_MASK;
|
|
bp[n-1] |= (b0 << (GMP_NUMB_BITS - j)) & GMP_NUMB_MASK;
|
|
|
|
ASSERT ( (a0 >> j) == 0);
|
|
ASSERT ( (b0 >> j) == 0);
|
|
}
|
|
else
|
|
{
|
|
ASSERT (n >= 2);
|
|
|
|
j -= GMP_NUMB_BITS;
|
|
ASSERT (ap[0] == 0);
|
|
ASSERT (bp[0] == 0);
|
|
|
|
ASSERT_NOCARRY (mpn_rshift (ap, ap + 1, n - 1, j));
|
|
ASSERT_NOCARRY (mpn_rshift (bp, bp + 1, n - 1, j));
|
|
|
|
ap[n-2] |= (a0 << (GMP_NUMB_BITS - j)) & GMP_NUMB_MASK;
|
|
bp[n-2] |= (b0 << (GMP_NUMB_BITS - j)) & GMP_NUMB_MASK;
|
|
|
|
ap[n-1] = (a0 >> j) | (a1 << (GMP_NUMB_BITS - j));
|
|
bp[n-1] = (b0 >> j) | (b1 << (GMP_NUMB_BITS - j));
|
|
}
|
|
|
|
ASSERT (ODD (ap[0]));
|
|
ASSERT (!ODD (bp[0]));
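
/* Strip the top limb if it has become zero in both a and b. */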
|
|
|
|
return n - ( (ap[n-1] | bp[n-1]) == 0);
|
|
}
|
|
|
|
/* When hbgcd is computed and results in j bits of reduction, then the
|
|
matrix elements will fit in floor (11 (j+1) / 8) bits.
|
|
|
|
If the input to hbgcd is n limbs, and k bits of desired reduction,
|
|
then 2k <= n GMP_NUMB_BITS and j < k. Then elements of the resulting
|
|
matrix must fit in
|
|
|
|
floor (11 (j+1) / 8) <= 11 k / 8 <= 11 n * GMP_NUMB_BITS / 16
|
|
|
|
Rounding up to an integral number of limbs, we get the limit
|
|
|
|
ceil (11 n / 16)
|
|
|
|
Table for small values:
|
|
|
|
n maximum size of matrix entries
|
|
|
|
1 1
|
|
2 2
|
|
3 3
|
|
4 3
|
|
5 4
|
|
6 5
|
|
7 5
|
|
8 6
|
|
|
|
For the computations, we need one extra limb.
|
|
*/
|
|
|
|
void
|
|
mpn_bgcd_matrix_init (struct bgcd_matrix *m, mp_ptr limbs, mp_size_t alloc)
|
|
{
|
|
m->alloc = alloc;
|
|
m->j = 0;
|
|
m->n = 1;
|
|
|
|
memset (m->sign, 0, sizeof (m->sign));
|
|
|
|
m->R[0][0] = limbs;
|
|
m->R[0][1] = limbs + alloc;
|
|
m->R[1][0] = limbs + 2 * alloc;
|
|
m->R[1][1] = limbs + 3 * alloc;
|
|
|
|
MPN_ZERO (limbs, 4 * alloc);
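
/* The matrix starts out as the 2x2 identity, with j = 0 and all signs
   non-negative. */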
|
|
m->R[0][0][0] = m->R[1][1][0] = 1;
|
|
}
|
|
|
|
/* Needs zlimbs + 1 + m->n <= m->alloc limbs of temporary storage */
|
|
static void
|
|
bgcd_matrix_mul_q (struct bgcd_matrix *m, mp_size_t zlimbs, unsigned zbits,
|
|
mp_ptr qp, int qsign, mp_ptr tp)
|
|
{
|
|
unsigned i;
|
|
mp_size_t n, nn;
|
|
mp_limb_t grow;
|
|
|
|
ASSERT (tp != qp);
|
|
ASSERT (qp[zlimbs] < ((mp_limb_t ) 1 << zbits));
|
|
|
|
/* To multiply a column (u ; v) by (0, 2^j ; 2^j, q), set
|
|
|
|
u = 2^j u + q v
|
|
v = 2^j v
|
|
|
|
and then swap u, v. We need temporary storage for t = q * v.
|
|
|
|
Size should be increased by zlimbs or zlimbs + 1.
|
|
*/
|
|
|
|
grow = 0;
|
|
|
|
n = m->n;
|
|
nn = n + zlimbs;
|
|
|
|
ASSERT (nn + 1 <= m->alloc);
|
|
|
|
for (i = 0; i < 2; i++)
|
|
{
|
|
mp_ptr up = m->R[0][i];
|
|
mp_ptr vp = m->R[1][i];
|
|
int us = m->sign[0][i];
|
|
int vs = m->sign[1][i];
|
|
|
|
/* Set u <<= j */
|
|
if (zbits)
|
|
{
|
|
up[nn] = mpn_lshift (up + zlimbs, up, n, zbits);
|
|
}
|
|
else
|
|
{
|
|
MPN_COPY_DECR (up + zlimbs, up, n);
|
|
ASSERT (up[nn] == 0);
|
|
}
|
|
|
|
/* We could make some clever use of negate_diff, but it's easier to
|
|
clear the low limbs first. */
|
|
MPN_ZERO (up, zlimbs);
|
|
|
|
/* Set t = q * v, and u += t */
|
|
|
|
if (n > zlimbs)
|
|
mpn_mul (tp, vp, n, qp, zlimbs + 1);
|
|
else
|
|
mpn_mul (tp, qp, zlimbs + 1, vp, n);
|
|
|
|
if ( (us ^ vs ^ qsign) < 0)
|
|
{
|
|
mp_limb_t cy;
|
|
|
|
cy = mpn_sub_n (up, up, tp, nn + 1);
|
|
if (cy > 0)
|
|
{
|
|
ASSERT_NOCARRY (negate_diff (up, nn + 1));
|
|
us = ~us;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
ASSERT_NOCARRY (mpn_add_n (up, up, tp, nn + 1));
|
|
}
|
|
grow |= up[nn];
|
|
|
|
/* Set v <<= j */
|
|
if (zbits)
|
|
{
|
|
vp[nn] = mpn_lshift (vp + zlimbs, vp, n, zbits);
|
|
grow |= vp[nn];
|
|
}
|
|
else
|
|
{
|
|
MPN_COPY_DECR (vp + zlimbs, vp, n);
|
|
ASSERT (vp[nn] == 0);
|
|
}
|
|
|
|
MPN_ZERO (vp, zlimbs);
|
|
|
|
MP_PTR_SWAP (m->R[0][i], m->R[1][i]);
|
|
m->sign[0][i] = vs;
|
|
m->sign[1][i] = us;
|
|
}
|
|
m->n = nn + (grow != 0);
|
|
m->j += zlimbs * GMP_NUMB_BITS + zbits;
|
|
}
|
|
|
|
/* Needs m->n limbs of temporary storage. */
|
|
static void
|
|
bgcd_matrix_mul_1 (struct bgcd_matrix *m, const struct bgcd_matrix1 *m1,
|
|
mp_ptr tp)
|
|
{
|
|
unsigned i;
|
|
mp_limb_t grow;
|
|
mp_limb_t r00 = m1->R[0][0];
|
|
mp_limb_t r01 = m1->R[0][1];
|
|
mp_limb_t r10 = m1->R[1][0];
|
|
mp_limb_t r11 = m1->R[1][1];
|
|
int s00 = m1->sign[0][0];
|
|
int s01 = m1->sign[0][1];
|
|
int s10 = m1->sign[1][0];
|
|
int s11 = m1->sign[1][1];
|
|
|
|
/* Carries, for the unlikely case that R grows by two limbs. */
|
|
mp_limb_t c[2][2];
|
|
|
|
mp_size_t n = m->n;
|
|
|
|
ASSERT (n + 1 <= m->alloc);
|
|
|
|
grow = 0;
|
|
|
|
for (i = 0; i < 2; i++)
|
|
{
|
|
mp_ptr up = m->R[0][i];
|
|
mp_ptr vp = m->R[1][i];
|
|
int us = m->sign[0][i];
|
|
int vs = m->sign[1][i];
|
|
int ts;
|
|
mp_limb_t uh, vh;
|
|
|
|
/* Make a copy of u */
|
|
MPN_COPY (tp, up, n);
|
|
|
|
if ( (s00 ^ s01 ^ us ^ vs) < 0)
|
|
{
|
|
uh = submul2_1 (up, &ts, up, vp, n, r00, r01);
|
|
c[0][i] = 0;
|
|
ts ^= s00 ^ us;
|
|
}
|
|
else
|
|
{
|
|
uh = addmul2_1 (up, &c[0][i], up, vp, n, r00, r01);
|
|
ts = s00 ^ us;
|
|
}
|
|
if ( (s10 ^ s11 ^ us ^ vs) < 0)
|
|
{
|
|
int s;
|
|
vh = submul2_1 (vp, &s, vp, tp, n, r11, r10);
|
|
c[1][i] = 0;
|
|
vs ^= s11 ^ s;
|
|
}
|
|
else
|
|
{
|
|
vh = addmul2_1 (vp, &c[1][i], vp, tp, n, r11, r10);
|
|
vs ^= s11;
|
|
}
|
|
|
|
up[n] = uh;
|
|
vp[n] = vh;
|
|
grow |= (uh | vh);
|
|
|
|
m->sign[0][i] = ts;
|
|
m->sign[1][i] = vs;
|
|
}
|
|
if ( (c[0][0] | c[0][1] | c[1][0] | c[1][1]) != 0)
|
|
{
|
|
/* Growth by two limbs. */
|
|
m->n += 2;
|
|
ASSERT (m->n + 1 <= m->alloc);
|
|
|
|
m->R[0][0][n+1] = c[0][0];
|
|
m->R[0][1][n+1] = c[0][1];
|
|
m->R[1][0][n+1] = c[1][0];
|
|
m->R[1][1][n+1] = c[1][1];
|
|
}
|
|
else
|
|
{
|
|
m->n += (grow != 0);
|
|
ASSERT (m->n + 1 <= m->alloc);
|
|
}
|
|
m->j += m1->j;
|
|
}
|
|
|
|
/* Computes r = u x + v y. Needs temporary space 2*(xn + un). The
|
|
least significant xn + un limbs are stored at rp. Returns carry.
|
|
|
|
rp == tp is allowed. */
|
|
static mp_limb_t
|
|
addmul2_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t xn,
|
|
mp_srcptr up, mp_srcptr vp, mp_size_t un, mp_ptr tp)
|
|
{
|
|
mp_size_t n = xn + un;
|
|
mp_ptr t1p = tp;
|
|
mp_ptr t2p = tp + n;
|
|
|
|
if (xn >= un)
|
|
{
|
|
mpn_mul (t1p, xp, xn, up, un);
|
|
mpn_mul (t2p, yp, xn, vp, un);
|
|
}
|
|
else
|
|
{
|
|
mpn_mul (t1p, up, un, xp, xn);
|
|
mpn_mul (t2p, vp, un, yp, xn);
|
|
}
|
|
|
|
return mpn_add_n (rp, t1p, t2p, n);
|
|
}
|
|
|
|
/* Computes r = u x - v y. Needs temporary space 2*(xn + un). Result,
|
|
xn + un limbs, is stored at rp. Returns -1 or zero depending on the
|
|
sign of the difference.
|
|
|
|
rp may be the same as either input or tp. */
|
|
static int
|
|
submul2_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t xn,
|
|
mp_srcptr up, mp_srcptr vp, mp_size_t un, mp_ptr tp)
|
|
{
|
|
mp_size_t n = xn + un;
|
|
mp_ptr t1p = tp;
|
|
mp_ptr t2p = tp + n;
|
|
|
|
if (xn >= un)
|
|
{
|
|
mpn_mul (t1p, xp, xn, up, un);
|
|
mpn_mul (t2p, yp, xn, vp, un);
|
|
}
|
|
else
|
|
{
|
|
mpn_mul (t1p, up, un, xp, xn);
|
|
mpn_mul (t2p, vp, un, yp, xn);
|
|
}
|
|
|
|
while (t1p[n-1] == t2p[n-1])
|
|
{
|
|
rp[--n] = 0;
|
|
if (n == 0)
|
|
return 0;
|
|
}
|
|
|
|
if (t1p[n-1] > t2p[n-1])
|
|
{
|
|
ASSERT_NOCARRY (mpn_sub_n (rp, t1p, t2p, n));
|
|
return 0;
|
|
}
|
|
else
|
|
{
|
|
ASSERT (t1p[n-1] < t2p[n-1]);
|
|
ASSERT_NOCARRY (mpn_sub_n (rp, t2p, t1p, n));
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
/* Sets m = s * m. Needs temporary space 2 * (m->n + s->n) + m->n = 3 * m->n + 2 * s->n */
|
|
static void
|
|
bgcd_matrix_mul (struct bgcd_matrix *m, const struct bgcd_matrix *s,
|
|
mp_ptr tp)
|
|
{
|
|
unsigned i;
|
|
mp_limb_t grow = 0;
|
|
mp_ptr r00 = s->R[0][0];
|
|
mp_ptr r01 = s->R[0][1];
|
|
mp_ptr r10 = s->R[1][0];
|
|
mp_ptr r11 = s->R[1][1];
|
|
int s00 = s->sign[0][0];
|
|
int s01 = s->sign[0][1];
|
|
int s10 = s->sign[1][0];
|
|
int s11 = s->sign[1][1];
|
|
|
|
mp_size_t mn = m->n;
|
|
mp_size_t sn = s->n;
|
|
mp_size_t n = m->n + s->n;
|
|
|
|
/* Space for a copy of u */
|
|
mp_ptr cp = tp + 2 * n;
|
|
|
|
ASSERT (n + 1 <= m->alloc);
|
|
|
|
for (i = 0; i < 2; i++)
|
|
{
|
|
mp_ptr up = m->R[0][i];
|
|
mp_ptr vp = m->R[1][i];
|
|
int us = m->sign[0][i];
|
|
int vs = m->sign[1][i];
|
|
int ts;
|
|
|
|
/* Make a copy of u */
|
|
MPN_COPY (cp, up, mn);
|
|
|
|
/* Set u = r00 * u + r01 * v */
|
|
|
|
/* If r00 u and r01 v have different signs, we should subtract */
|
|
ts = us ^ s00;
|
|
if ( (s00 ^ us ^ s01 ^ vs) < 0)
|
|
{
|
|
ts ^= submul2_n (up, cp, vp, mn, r00, r01, sn, tp);
|
|
up[n] = 0;
|
|
}
|
|
else
|
|
{
|
|
up[n] = addmul2_n (up, cp, vp, mn, r00, r01, sn, tp);
|
|
grow |= up[n];
|
|
}
|
|
|
|
/* Set v = r10 (old) u + r11 v */
|
|
|
|
/* If r10 u and r11 v have different signs, we should subtract */
|
|
if ( (s10 ^ us ^ s11 ^ vs) < 0)
|
|
{
|
|
vs = us ^ s10 ^ submul2_n (vp, cp, vp, mn, r10, r11, sn, tp);
|
|
vp[n] = 0;
|
|
}
|
|
else
|
|
{
|
|
vp[n] = addmul2_n (vp, cp, vp, mn, r10, r11, sn, tp);
|
|
vs = us ^ s10;
|
|
grow |= vp[n];
|
|
}
|
|
|
|
m->sign[0][i] = ts;
|
|
m->sign[1][i] = vs;
|
|
}
|
|
|
|
if (grow > 0)
|
|
m->n = n + 1;
|
|
else
|
|
{
|
|
/* We don't need full normalization, stripping one leading zero
|
|
limb should be sufficient to get size below the bound. */
|
|
|
|
m->n = n - (m->R[0][0][n-1] == 0 && m->R[0][1][n-1] == 0
|
|
&& m->R[1][0][n-1] == 0 && m->R[1][1][n-1] == 0);
|
|
}
|
|
ASSERT (m->n + 1 <= m->alloc);
|
|
|
|
m->j += s->j;
|
|
}
|
|
|
|
/* Sets a = a + c * 2^(shift * GMP_NUMB_BITS),

        ________________________
      _|____________________a___|
   +  |__________________c_|
                            -->    <-- shift

   Stores MAX(an, cn + shift) limbs of the result at ap, and returns
   carry.
*/
|
|
|
|
static mp_limb_t
|
|
bgcd_fix_add (mp_ptr ap, mp_size_t an, mp_srcptr cp, mp_size_t cn, mp_size_t shift)
|
|
{
|
|
if (cn + shift < an)
|
|
{
|
|
/* Can this really happen? */
|
|
ASSERT_ALWAYS (0 == "Can this happen?");
|
|
return mpn_add (ap + shift, ap + shift, an - shift, cp, cn);
|
|
}
|
|
else if (shift < an)
|
|
return mpn_add (ap + shift, cp, cn,
|
|
ap + shift, an - shift);
|
|
else
|
|
{
|
|
MPN_COPY (ap + shift, cp, cn);
|
|
if (shift > an)
|
|
MPN_ZERO (ap + an, shift - an);
|
|
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
/* Sets a = a - c * 2^(shift * GMP_NUMB_BITS),

        ________________________
      _|____________________a___|
   -  |__________________c_|
                            -->    <-- shift

   Stores the cn + shift limbs of the absolute value at ap, and returns 0
   or -1 depending on the sign.
*/
|
|
static mp_size_t
|
|
bgcd_fix_sub (mp_ptr ap, mp_size_t an, mp_srcptr cp, mp_size_t cn, mp_size_t shift)
|
|
{
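/* Work from the most significant end: first the limbs of a above c's
   range, then the limbs of c above a's range, and finally the
   overlapping limbs, until the sign of the difference is known. */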
|
|
while (an > cn + shift)
|
|
{
|
|
if (ap[an-1] == 0)
|
|
an--;
|
|
else
|
|
{
|
|
ASSERT_NOCARRY (mpn_sub (ap + shift,
|
|
ap + shift, an - shift, cp, cn));
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
while (cn > 0 && cn + shift > an)
|
|
{
|
|
if (cp[cn - 1] == 0)
|
|
ap[shift + --cn] = 0;
|
|
else
|
|
{
|
|
/* Result must be negative */
|
|
if (shift == 0)
|
|
ASSERT_NOCARRY (mpn_sub (ap, cp, cn, ap, an));
|
|
else
|
|
{
|
|
/* Negate the lower part of a */
|
|
if (shift <= an)
|
|
{
|
|
ASSERT_NOCARRY (mpn_sub (ap + shift, cp, cn, ap + shift, an - shift));
|
|
if (negate_diff (ap, shift) == 0)
|
|
MPN_DECR_U (ap + shift, cn, 1);
|
|
}
|
|
else
|
|
{
|
|
if (negate_diff (ap, an) == 0)
|
|
{
|
|
mp_size_t i;
|
|
for (i = an; i < shift; i++)
|
|
ap[i] = GMP_NUMB_MASK;
|
|
|
|
MPN_DECR_U (ap + shift, cn, 1);
|
|
}
|
|
else
|
|
MPN_ZERO (ap + an, shift - an);
|
|
}
|
|
}
|
|
return -1;
|
|
}
|
|
}
|
|
while (cn > 0)
|
|
{
|
|
if (ap[shift + cn - 1] == cp[cn - 1])
|
|
ap[shift + --cn] = 0;
|
|
else if (ap[shift + cn - 1] > cp[cn - 1])
|
|
{
|
|
/* Result must be positive */
|
|
ASSERT_NOCARRY (mpn_sub_n (ap + shift, ap + shift, cp, cn));
|
|
return 0;
|
|
}
|
|
else
|
|
{
|
|
/* Result must be negative */
|
|
ASSERT (ap[shift + cn - 1] < cp[cn - 1]);
|
|
ASSERT_NOCARRY (mpn_sub_n(ap + shift, cp, ap + shift, cn));
|
|
|
|
if (shift > 0 && negate_diff (ap, shift) == 0)
|
|
MPN_DECR_U (ap + shift, cn, 1);
|
|
return -1;
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/* Computes (C ; D) = 2^(-2j) R (A; B) =
|
|
|
|
2^(k * GMP_NUMB_BITS - 2j) R (a ; b) + (c ; d)
|
|
|
|
where a, b are stored at ap[k, ..., n-1], bp[k, ..., n-1],
|
|
and c, d are stored at ap[0, ..., l-1], bp[0, ..., l-1].
|
|
|
|
Results C, D are also stored at ap, bp. If necessary to get
|
|
non-negative values, flip the signs of the rows of the matrix R.
|
|
|
|
Sizes:
|
|
|
|
Input c,d are l limbs. The output C,D must always fit in n limbs
|
|
(and we do *not* have any temporary space beyond the n:th limb).
|
|
|
|
The temporaries R(a;b) must fit in n limbs. This is because
|
|
#(R(a;b)) <= #R + (n-k) GMP_NUMB_BITS + 1. From the bounds #R <=
|
|
11(j+1)/8 and k GMP_NUMB_BITS >= 2(j+1) we get
|
|
|
|
#(R(a;b)) <= n GMP_NUMB_BITS + (11 - 16)(j+1)/8 + 1
|
|
= n GMP_NUMB_BITS - (5j -3)/8 < n GMP_NUMB_BITS.
|
|
|
|
For shifted numbers 2^(k * GMP_NUMB_BITS - 2j) R (a ; b) we get
|
|
|
|
k * GMP_NUMB_BITS - 2j + #R + (n-k) GMP_NUMB_BITS + 1
|
|
<= n * GMP_NUMB_BITS + (11 j + 11 - 16j)/8 + 1
|
|
<= n * GMP_NUMB_BITS + (17 - 5j)/8
|
|
|
|
Taking the floor of this expression, we see that if j >= 2, n
limbs should be sufficient. A more careful study of j = 1 shows
that n limbs is sufficient for all j >= 1.
|
|
|
|
Temporary storage needed: 4*n.
|
|
*/
|
|
static mp_size_t
|
|
bgcd_matrix_apply (struct bgcd_matrix *m,
|
|
mp_ptr ap, mp_ptr bp, mp_size_t n,
|
|
mp_size_t k, mp_size_t l, mp_ptr tp)
|
|
{
|
|
mp_ptr r00 = m->R[0][0];
|
|
mp_ptr r01 = m->R[0][1];
|
|
mp_ptr r10 = m->R[1][0];
|
|
mp_ptr r11 = m->R[1][1];
|
|
int s00 = m->sign[0][0];
|
|
int s01 = m->sign[0][1];
|
|
int s10 = m->sign[1][0];
|
|
int s11 = m->sign[1][1];
|
|
|
|
mp_ptr cp = tp + 2*n;
|
|
mp_ptr dp = tp + 3*n;
|
|
mp_size_t nn;
|
|
mp_limb_t ch;
|
|
mp_limb_t dh;
|
|
int cs;
|
|
int ds;
|
|
|
|
unsigned shift;
|
|
unsigned shift_bits;
|
|
mp_size_t shift_limbs;
|
|
|
|
/* This function probably works for k == 1, but then it's better to
|
|
use hbgcd2 and bgcd_matrix1_apply. */
|
|
ASSERT (k >= 2);
|
|
ASSERT (l <= k);
|
|
ASSERT (2 * m->j < k * GMP_NUMB_BITS);
|
|
|
|
nn = n - k + m->n;
|
|
ASSERT (nn <= n);
|
|
|
|
/* There are three chances for nn to grow: addmul2_n, left shift,
|
|
and bgcd_fix_add. */
|
|
|
|
cs = s00;
|
|
|
|
/* Compute R * (a ; b), and store result at cp, dp */
|
|
if ( (s00 ^ s01) < 0)
|
|
{
|
|
cs ^= submul2_n (cp, ap + k, bp + k, n - k,
|
|
r00, r01, m->n, tp);
|
|
ch = 0;
|
|
}
|
|
else
|
|
ch = addmul2_n (cp, ap + k, bp + k, n - k,
|
|
r00, r01, m->n, tp);
|
|
|
|
ds = s10;
|
|
if ( (s10 ^ s11) < 0)
|
|
{
|
|
ds ^= submul2_n (dp, ap + k, bp + k, n - k,
|
|
r10, r11, m->n, tp);
|
|
dh = 0;
|
|
}
|
|
else
|
|
dh = addmul2_n (dp, ap + k, bp + k, n - k,
|
|
r10, r11, m->n, tp);
|
|
|
|
if ( (ch | dh) > 0)
|
|
{
|
|
ASSERT (nn < n);
|
|
cp[nn] = ch;
|
|
dp[nn] = dh;
|
|
nn++;
|
|
}
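
/* R (a ; b) must still be multiplied by 2^(k * GMP_NUMB_BITS - 2 j)
   before it is added to (c ; d); split that shift into whole limbs and
   remaining bits. */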
|
|
|
|
shift = k * GMP_NUMB_BITS - 2 * m->j;
|
|
shift_limbs = shift / GMP_NUMB_BITS;
|
|
shift_bits = shift % GMP_NUMB_BITS;
|
|
|
|
if (shift_bits > 0)
|
|
{
|
|
ch = mpn_lshift (cp, cp, nn, shift_bits);
|
|
dh = mpn_lshift (dp, dp, nn, shift_bits);
|
|
|
|
if ( (ch | dh) > 0)
|
|
{
|
|
ASSERT (nn < n);
|
|
cp[nn] = ch;
|
|
dp[nn] = dh;
|
|
nn++;
|
|
}
|
|
}
|
|
|
|
/* Update a */
|
|
ASSERT (nn + shift_limbs <= n);
|
|
|
|
if (cs >= 0)
|
|
ch = bgcd_fix_add (ap, l, cp, nn, shift_limbs);
|
|
|
|
else
|
|
{
|
|
int s = bgcd_fix_sub (ap, l, cp, nn, shift_limbs);
|
|
|
|
ch = 0;
|
|
|
|
/* Flip signs of first matrix row, if needed. */
|
|
m->sign[0][0] ^= s;
|
|
m->sign[0][1] ^= s;
|
|
}
|
|
|
|
/* Update b */
|
|
if (ds >= 0)
|
|
dh = bgcd_fix_add (bp, l, dp, nn, shift_limbs);
|
|
else
|
|
{
|
|
int s = bgcd_fix_sub (bp, l, dp, nn, shift_limbs);
|
|
|
|
dh = 0;
|
|
|
|
/* Flip signs of second matrix row, if needed. */
|
|
m->sign[1][0] ^= s;
|
|
m->sign[1][1] ^= s;
|
|
}
|
|
|
|
nn += shift_limbs;
|
|
if (nn < l)
|
|
{
|
|
/* Can this really happen? */
|
|
ASSERT_ALWAYS (0 == "Can this happen?");
|
|
nn = l;
|
|
}
|
|
|
|
if ( (ch | dh) > 0)
|
|
{
|
|
ASSERT (nn < n);
|
|
ap[nn] = ch;
|
|
bp[nn] = dh;
|
|
nn++;
|
|
}
|
|
|
|
return nn;
|
|
}
|
|
|
|
/* Input: b is odd, k < GMP_NUMB_BITS. (Common case is a odd, k > 0).
|
|
|
|
Returns q, such that -2^k <= q < 2^k and v (a + q b) > k, i.e. there
|
|
are at least k+1 zeros at the least significant end.
|
|
*/
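
/* Worked example (hypothetical values): a = 5, b = 3, k = 2. The inverse
   of b mod 2^(k+1) = 8 is 3, so q = (-5 * 3) mod 8 = 1, and indeed
   a + q b = 8 has v (a + q b) = 3 > k. */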
|
|
|
|
static inline mp_limb_signed_t
|
|
bdiv_1 (mp_limb_t a, mp_limb_t b, unsigned k)
|
|
{
|
|
mp_limb_t b_inv;
|
|
mp_limb_signed_t q;
|
|
mp_limb_t bit;
|
|
#if 0
|
|
#define BDIV_TABLE_SIZE 3
|
|
#define BDIV_TABLE_MASK ((1 << (BDIV_TABLE_SIZE + 1)) - 1)
|
|
|
|
/* Table of -a b^{-1} mod 2^{k+1}, indexed by
|
|
|
|
[b_k b_{k-1} ... b_1 a_k a_{k-1} a_{k-2} ... a_0]
|
|
*/
|
|
|
|
static const char qtable[1L << (BDIV_TABLE_SIZE * 2 + 1)] =
|
|
{
|
|
/* binv = 1 */ 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
|
|
/* binv = 11 */ 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11,
|
|
/* binv = 13 */ 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13,
|
|
/* binv = 7 */ 0, 9, 2, 11, 4, 13, 6, 15, 8, 1, 10, 3, 12, 5, 14, 7,
|
|
/* binv = 9 */ 0, 7, 14, 5, 12, 3, 10, 1, 8, 15, 6, 13, 4, 11, 2, 9,
|
|
/* binv = 3 */ 0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3,
|
|
/* binv = 5 */ 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5,
|
|
/* binv = 15 */ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
|
};
|
|
|
|
ASSERT (ODD (b));
|
|
ASSERT (k < GMP_NUMB_BITS);
|
|
|
|
if (k <= BDIV_TABLE_SIZE)
|
|
{
|
|
bit = CNST_LIMB(1) << k;
|
|
q = qtable[ (((b-1) & BDIV_TABLE_MASK) << BDIV_TABLE_SIZE )
|
|
| (a & BDIV_TABLE_MASK) ] & (2*bit - 1);
|
|
}
|
|
else
|
|
#endif
|
|
{
|
|
mp_limb_t mask;
|
|
|
|
/* We need (k+1)-bit inverse */
|
|
b_inv = modlimb_invert_table[(b/2) & 0x7f];
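/* The table gives an 8-bit inverse to start from; each step
   b_inv = 2 b_inv - b_inv^2 b below doubles the number of correct
   low bits (Hensel lifting). */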
|
|
if (k >= 8)
|
|
{
|
|
b_inv = 2 * b_inv - b_inv * b_inv * b;
|
|
if (k >= 16)
|
|
{
|
|
b_inv = 2 * b_inv - b_inv * b_inv * b;
|
|
if (GMP_NUMB_BITS > 32 && k >= 32)
|
|
{
|
|
b_inv = 2 * b_inv - b_inv * b_inv * b;
|
|
if (GMP_NUMB_BITS > 64 && k >= 64)
|
|
{
|
|
int inv_bits = 64;
|
|
do
|
|
{
|
|
b_inv = 2 * b_inv - b_inv * b_inv * b;
|
|
inv_bits *= 2;
|
|
}
|
|
while (inv_bits <= k);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
bit = CNST_LIMB(1) << k;
|
|
mask = 2*bit - 1;
|
|
|
|
ASSERT ( (b_inv * b & mask) == 1);
|
|
|
|
q = (- a * b_inv) & mask;
|
|
}
|
|
if (q >= bit)
|
|
q -= 2*bit;
|
|
|
|
return q;
|
|
}
|
|
|
|
/* Input is two two-limb numbers a, b (typically the least significant
|
|
limbs of some larger numbers). The integer k is the desired number
|
|
of bits of reduction.
|
|
|
|
Returns an array R of single limbs, and an integer j < k. If c, d
|
|
are formed as
|
|
|
|
     /c\        -2j     /a\
     \d/   =   2     R  \b/
|
|
|
|
then c is odd, d is even, and gcd(a,b) = gcd(c, d). R is
|
|
constructed by repeated bdiv, where iteration stops when either
|
|
|
|
v (d) >= k - j
|
|
|
|
or another iteration would cause R to overflow.
|
|
|
|
Returns 0 if no reduction is possible, otherwise j.
|
|
*/
|
|
|
|
/* two's complement version */
|
|
static unsigned
|
|
hbgcd2 (mp_limb_t ah, mp_limb_t al, mp_limb_t bh, mp_limb_t bl,
|
|
unsigned k,
|
|
struct bgcd_matrix1 *m)
|
|
{
|
|
mp_limb_signed_t r00, r01, r10, r11;
|
|
unsigned s;
|
|
unsigned j;
|
|
unsigned v;
|
|
|
|
ASSERT (ODD (al));
|
|
ASSERT (!ODD (bl));
|
|
|
|
/* Multiply by (0, 2^bits; 2^bits, q) */
|
|
#define MUL_Q(R00, R01, R10, R11) do { \
|
|
R00 = (R00 << v) + q * R10; \
|
|
R01 = (R01 << v) + q * R11; \
|
|
R10 <<= v; \
|
|
R11 <<= v; \
|
|
\
|
|
s += v; \
|
|
s += ((R00 ^ (R00 >> 1)) | (R01 ^ (R01 >> 1))) >> s; \
|
|
j += v; \
|
|
} while (0)
|
|
|
|
/* Add q b to a. We treat q as an unsigned in the multiplication. If
   q < 0 and th * w + tl = (q + w) bl is the result of the
   unsigned multiplication, what we want is

     (q + w - w) (bh w + bl)
       = q * bh * w + th w + tl - w bl
       = (q * bh + th - bl) w + tl   (mod w^2)
*/
|
|
|
|
#define ADDMUL2_Q(AH, AL, BH, BL) do { \
|
|
mp_limb_t th, tl; \
|
|
mp_limb_t mask = q >> (GMP_LIMB_BITS - 1); \
|
|
\
|
|
umul_ppmm (th, tl, BL, q); \
|
|
\
|
|
AL += tl; \
|
|
AH += th + (AL < tl) + q * BH - (mask & BL); \
|
|
} while (0)
|
|
|
|
/* NOTE: We can reduce at most GMP_NUMB_BITS - 1 bits. Reducing
   GMP_NUMB_BITS bits would be possible if b were preshifted, so that
   we had 2 * GMP_NUMB_BITS + 1 bits of b. But a reduction of
   GMP_NUMB_BITS bits, without R overflowing, is quite unlikely
   (probability on the order of 1%). */
|
|
if (k > GMP_NUMB_BITS)
|
|
k = GMP_NUMB_BITS;
|
|
|
|
if (bl == 0)
|
|
return 0;
|
|
|
|
count_trailing_zeros (v, bl);
|
|
if (v >= k)
|
|
return 0;
|
|
|
|
RSHIFT2 (bh, bl, v);
|
|
if (v == GMP_NUMB_BITS - 1)
|
|
{
|
|
/* Special case that the two's complement code can't handle */
|
|
mp_limb_signed_t q;
|
|
mp_limb_signed_t sign;
|
|
|
|
q = bdiv_1 (al, bl, v);
|
|
ASSERT (ODD (q));
|
|
|
|
m->R[0][0] = 0;
|
|
m->R[0][1] = m->R[1][0] = CNST_LIMB(1) << (GMP_NUMB_BITS - 1);
|
|
m->sign[0][0] = m->sign[0][1] = m->sign[1][0] = 0;
|
|
|
|
sign = q >> (GMP_LIMB_BITS - 1);
|
|
m->R[1][1] = (q + sign) ^ sign;
|
|
m->sign[1][1] = sign;
|
|
|
|
return (m->j = GMP_NUMB_BITS - 1);
|
|
}
|
|
|
|
r00 = r11 = 1;
|
|
r01 = r10 = 0;
|
|
j = 0;
|
|
s = 1;
|
|
|
|
/* We need 2*(k-j) bits of accuracy */
|
|
/* Loop while we need more than one limb of accuracy. */
|
|
for (;;)
|
|
{
|
|
mp_limb_signed_t q;
|
|
unsigned oldv;
|
|
|
|
/* Divide by b */
|
|
q = bdiv_1 (al, bl, v);
|
|
ASSERT (ODD (q));
|
|
|
|
MUL_Q (r00, r01, r10, r11);
|
|
ASSERT (s < GMP_NUMB_BITS);
|
|
|
|
ADDMUL2_Q(ah, al, bh, bl);
|
|
ASSERT ( (al & ((CNST_LIMB(1) << (v + 1)) - 1)) == 0);
|
|
|
|
if (al == 0)
|
|
goto split_swap;
|
|
|
|
oldv = v;
|
|
count_trailing_zeros (v, al);
|
|
ASSERT (v > oldv);
|
|
|
|
RSHIFT2 (ah, al, v);
|
|
v -= oldv;
|
|
|
|
if (j + v >= k || s + v >= GMP_NUMB_BITS)
|
|
goto split_swap;
|
|
else if (s + v == GMP_NUMB_BITS - 1)
|
|
goto final_swap;
|
|
else if (2*(k-j) <= GMP_NUMB_BITS)
|
|
goto middle;
|
|
|
|
/* Divide by a */
|
|
q = bdiv_1 (bl, al, v);
|
|
ASSERT (ODD (q));
|
|
|
|
MUL_Q (r10, r11, r00, r01);
|
|
ASSERT (s < GMP_NUMB_BITS);
|
|
|
|
ADDMUL2_Q(bh, bl, ah, al);
|
|
ASSERT ( (bl & ((CNST_LIMB(1) << (v + 1)) - 1)) == 0);
|
|
|
|
if (bl == 0)
|
|
goto split;
|
|
|
|
oldv = v;
|
|
count_trailing_zeros (v, bl);
|
|
RSHIFT2 (bh, bl, v);
|
|
v -= oldv;
|
|
|
|
if (j + v >= k || s + v >= GMP_NUMB_BITS)
|
|
goto split;
|
|
else if (s + v == GMP_NUMB_BITS - 1)
|
|
goto final;
|
|
else if (2*(k-j) <= GMP_NUMB_BITS)
|
|
break;
|
|
}
|
|
|
|
/* Now we can forget about ah and bh. On the other hand, we need to
|
|
check for an overflowing R */
|
|
for (;;)
|
|
{
|
|
mp_limb_signed_t q;
|
|
unsigned oldv;
|
|
|
|
/* Divide by b */
|
|
q = bdiv_1 (al, bl, v);
|
|
ASSERT (ODD (q));
|
|
|
|
MUL_Q (r00, r01, r10, r11);
|
|
ASSERT (s < GMP_NUMB_BITS);
|
|
|
|
/* Update a, b */
|
|
al += q*bl;
|
|
|
|
ASSERT ( (al & ((CNST_LIMB(1) << (v + 1)) - 1)) == 0);
|
|
|
|
if (al == 0)
|
|
goto split_swap;
|
|
|
|
oldv = v;
|
|
count_trailing_zeros (v, al);
|
|
al >>= v;
|
|
v -= oldv;
|
|
|
|
if (j + v >= k || s + v >= GMP_NUMB_BITS)
|
|
goto split_swap;
|
|
else if (s + v == GMP_NUMB_BITS - 1)
|
|
goto final_swap;
|
|
|
|
middle:
|
|
/* Divide by a */
|
|
q = bdiv_1 (bl, al, v);
|
|
ASSERT (ODD (q));
|
|
|
|
MUL_Q (r10, r11, r00, r01);
|
|
ASSERT (s < GMP_NUMB_BITS);
|
|
|
|
/* Update a, b */
|
|
bl += q*al;
|
|
|
|
ASSERT ( (bl & ((CNST_LIMB(1) << (v + 1)) - 1)) == 0);
|
|
|
|
if (bl == 0)
|
|
goto split;
|
|
|
|
oldv = v;
|
|
count_trailing_zeros (v, bl);
|
|
bl >>= v;
|
|
v -= oldv;
|
|
|
|
if (j + v >= k || s + v >= GMP_NUMB_BITS)
|
|
goto split;
|
|
else if (s + v == GMP_NUMB_BITS - 1)
|
|
break;
|
|
}
|
|
#undef MUL_Q
|
|
#undef ADDMUL2_Q
|
|
|
|
final:
|
|
if (0)
|
|
{
|
|
final_swap:
|
|
MP_LIMB_SIGNED_T_SWAP (r00, r10);
|
|
MP_LIMB_SIGNED_T_SWAP (r01, r11);
|
|
MP_LIMB_T_SWAP (al, bl);
|
|
}
|
|
if (j + v < k && s + v < GMP_NUMB_BITS)
|
|
{
|
|
/* One more quotient will fit in sign-magnitude representation,
|
|
but two's complement calculation may overflow. */
|
|
|
|
mp_limb_signed_t q;
|
|
mp_limb_signed_t t0, t1;
|
|
mp_limb_signed_t sign;
|
|
|
|
q = bdiv_1 (al, bl, v);
|
|
ASSERT (ODD (q));
|
|
|
|
t0 = r00 << v;
|
|
t1 = q * r10; /* Can't overflow, since q > -2^{-v} */
|
|
|
|
r00 = t0 + t1;
|
|
|
|
/* Correct sign given by the majority function */
|
|
sign = ((t0 & t1) | (r00 & t0) | (r00 & t1)) >> (GMP_LIMB_BITS - 1);
|
|
m->R[1][0] = (r00 + sign) ^ sign;
|
|
m->sign[1][0] = sign;
|
|
|
|
t0 = r01 << v;
|
|
t1 = q * r11;
|
|
r01 = t0 + t1;
|
|
|
|
sign = ((t0 & t1) | (r01 & t0) | (r01 & t1) ) >> (GMP_LIMB_BITS - 1);
|
|
m->R[1][1] = (r01 + sign) ^ sign;
|
|
m->sign[1][1] = sign;
|
|
|
|
sign = r10 >> (GMP_LIMB_BITS - 1);
|
|
m->R[0][0] = ( (r10 << v) + sign) ^ sign;
|
|
m->sign[0][0] = sign;
|
|
|
|
sign = r11 >> (GMP_LIMB_BITS - 1);
|
|
m->R[0][1] = ( (r11 << v) + sign) ^ sign;
|
|
m->sign[0][1] = sign;
|
|
|
|
j += v;
|
|
}
|
|
else
|
|
{
|
|
mp_limb_signed_t sign;
|
|
if (0)
|
|
{
|
|
split_swap:
|
|
MP_LIMB_SIGNED_T_SWAP (r00, r10);
|
|
MP_LIMB_SIGNED_T_SWAP (r01, r11);
|
|
}
|
|
split:
|
|
/* Split into signed/magnitude representation */
|
|
|
|
sign = r00 >> (GMP_LIMB_BITS - 1);
|
|
m->R[0][0] = (r00 + sign) ^sign;
|
|
m->sign[0][0] = sign;
|
|
|
|
sign = r01 >> (GMP_LIMB_BITS - 1);
|
|
m->R[0][1] = (r01 + sign) ^sign;
|
|
m->sign[0][1] = sign;
|
|
|
|
sign = r10 >> (GMP_LIMB_BITS - 1);
|
|
m->R[1][0] = (r10 + sign) ^sign;
|
|
m->sign[1][0] = sign;
|
|
|
|
sign = r11 >> (GMP_LIMB_BITS - 1);
|
|
m->R[1][1] = (r11 + sign) ^sign;
|
|
m->sign[1][1] = sign;
|
|
}
|
|
|
|
ASSERT (j > 0);
|
|
m->j = j;
|
|
return j;
|
|
}
|
|
|
|
|
|
/* Compute the inverse of -b,
|
|
|
|
x * b = -1 (mod 2^(n * GMP_NUMB_BITS))
|
|
|
|
using the iteration formula
|
|
|
|
x_{k+1} = 2 x_k + x_k^2 b
|
|
|
|
In fact, when x_k is IN limbs, the only thing we need is the middle
|
|
part, limb IN to 2 IN, of the product x_k^2 b. The lower limbs are
|
|
known already, and the higher limbs are not significant.
|
|
|
|
Needs 3*xn limbs of temporary storage.
|
|
*/
|
|
|
|
/* NOTE: Handles an input b of fewer limbs than the output x. We should
   usually have 2 * zlimbs <= n. */
|
|
static void
|
|
invert_limbs (mp_ptr xp, mp_size_t xn, mp_srcptr bp, mp_size_t bn, mp_ptr tp)
|
|
{
|
|
mp_limb_t b0;
|
|
mp_limb_t x0;
|
|
|
|
ASSERT (bn >= 1);
|
|
if (bn > xn)
|
|
bn = xn;
|
|
|
|
ASSERT (!MPN_OVERLAP_P (xp, xn, bp, xn));
|
|
ASSERT (!MPN_OVERLAP_P (xp, xn, tp, 3*xn));
|
|
ASSERT (!MPN_OVERLAP_P (bp, bn, tp, 3*xn));
|
|
|
|
b0 = bp[0];
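
/* One-limb starting value: x0 * (-b0) == 1 (mod W), i.e. x0 * b == -1
   for the least significant limb. Each iteration below roughly doubles
   the number of correct limbs. */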
|
|
modlimb_invert (x0, -b0);
|
|
xp[0] = x0;
|
|
|
|
if (xn >= 2)
|
|
{
|
|
/* Iterate once, x1 = 2 x0 + x0^2 b, and keep only the two
|
|
least significant limbs. */
|
|
mp_limb_t sh, sl;
|
|
mp_limb_t t, dummy;
|
|
|
|
/* Compute sh, sl = x0^2 */
|
|
umul_ppmm (sh, sl, x0, x0);
|
|
|
|
/* Compute th, tl = x0^2 b (ignoring higher limbs) */
|
|
umul_ppmm (t, dummy, sl, b0); /* x0^2 b0 */
|
|
|
|
/* We always have a carry from the low limb which we don't
|
|
compute. */
|
|
xp[1] = t + sh * b0 + 1;
|
|
if (bn > 1)
|
|
xp[1] += sl * bp[1];
|
|
|
|
if (xn > 2)
|
|
{
|
|
int count;
|
|
int bitsize;
|
|
mp_size_t mask;
|
|
mp_size_t in;
|
|
|
|
/* We need at most xn + 1 limbs to compute the square,
   but at most xn limbs to store it. */
|
|
mp_ptr sp = tp;
|
|
|
|
/* And this product can be at most 2*xn */
|
|
mp_ptr pp = tp + xn;
|
|
|
|
count_leading_zeros (count, (mp_limb_t) xn-1);
|
|
bitsize = GMP_LIMB_BITS - count;
|
|
ASSERT (bitsize >= 2);
|
|
|
|
mask = 1L << (bitsize - 2);
|
|
|
|
for (in = 2; in < xn; mask /= 2)
|
|
{
|
|
mp_limb_t nn;
|
|
ASSERT (mask != 0);
|
|
|
|
/* We know the in least significant words of the
|
|
inverse, and iterate
|
|
|
|
x = 2*x + x^2 * b
|
|
|
|
to double the size. */
|
|
|
|
mpn_sqr_n (sp, xp, in);
|
|
|
|
/* Drop the high limb if we don't need it. */
|
|
nn = 2 * in - ((-xn & mask) != 0);
|
|
|
|
ASSERT (nn <= xn);
|
|
|
|
/* FIXME: We don't need the nn most significant
|
|
limbs. And we don't need the least significant in
|
|
limbs either, as we already know them. */
|
|
mpn_mul (pp, sp, nn, bp, MIN (nn, bn));
|
|
|
|
/* For the addition of 2x_k, why shouldn't we add in the
|
|
most significant limb of x_k? */
|
|
mpn_add_1 (xp + in, pp + in, nn - in, 1);
|
|
in = nn;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/* For a, b odd, a of size n, b of size n - zlimbs, j = zlimbs *
|
|
GMP_NUMB_BITS + zbits, compute q = - a (b)^-1 mod 2^(j+1), with
|
|
-2^j < q < 2^j, and set
|
|
|
|
r <-- 2^(-j) (a + q b)
|
|
|
|
Stores the n - zlimbs least significant limbs of r at ap, and returns
the most significant limb (which is always 0 or 1). Stores the
quotient, zlimbs + 1 limbs, at qp (if zbits == 0 and zlimbs > 0,
then the most significant limb of q is always zero).
|
|
|
|
Needs 4 * zlimbs of temporary storage to compute the quotient, and
|
|
n limbs to compute the remainder (but not at the same time).
|
|
*/
|
|
|
|
#define BDIVMOD_N_ITCH(n, zlimbs) (MAX (4*(zlimbs), (n)))
|
|
|
|
static mp_limb_t
|
|
bdivmod_n (mp_ptr qp, int *qsign,
|
|
int *rsign,
|
|
mp_ptr ap, mp_srcptr bp, mp_size_t n,
|
|
mp_size_t zlimbs, unsigned zbits,
|
|
mp_ptr tp)
|
|
{
|
|
mp_limb_signed_t q;
|
|
mp_limb_t hi;
|
|
mp_size_t scratch = BDIVMOD_N_ITCH (n, zlimbs);
|
|
|
|
ASSERT (ODD (ap[0]));
|
|
ASSERT (ODD (bp[0]));
|
|
|
|
ASSERT (!MPN_OVERLAP_P (qp, zlimbs + 1, ap, n));
|
|
ASSERT (!MPN_OVERLAP_P (qp, zlimbs + 1, bp, n - zlimbs));
|
|
|
|
ASSERT (!MPN_OVERLAP_P (ap, n, bp, n - zlimbs));
|
|
|
|
ASSERT (!MPN_OVERLAP_P (tp, scratch, ap, n));
|
|
ASSERT (!MPN_OVERLAP_P (tp, scratch, bp, n - zlimbs));
|
|
|
|
if (zlimbs == 0)
|
|
hi = 0;
|
|
else
|
|
{
|
|
/* First compute q such that a + q*b has zlimbs zero limbs at
|
|
the least significant end. */
|
|
|
|
invert_limbs (tp, zlimbs, bp, n - zlimbs, tp + zlimbs);
|
|
|
|
/* Let q = t * a. We only want the least significant
|
|
limbs. */
|
|
mpn_mul_n (tp + zlimbs, tp, ap, zlimbs);
|
|
MPN_COPY (qp, tp + zlimbs, zlimbs);
|
|
|
|
/* Compute q*b. We don't really need the zlimb least
|
|
significant limbs. */
|
|
if (zlimbs <= n - zlimbs)
|
|
mpn_mul (tp, bp, n - zlimbs, qp, zlimbs);
|
|
else
|
|
mpn_mul (tp, qp, zlimbs, bp, n - zlimbs);
|
|
|
|
/* We must get a carry when adding the lower limbs (since a odd
|
|
=> [ap, zlimbs] != 0). This is actually the only piece of
|
|
this code that depends on a being odd. */
|
|
|
|
MPN_COPY_INCR (ap, ap + zlimbs, n - zlimbs);
|
|
n -= zlimbs;
|
|
|
|
hi = mpn_add_n (ap, ap, tp + zlimbs, n);
|
|
hi += mpn_add_1 (ap, ap, n, 1);
|
|
}
|
|
|
|
/* Now we have zbits + 1 bits left to clear of ap[0] */
|
|
q = bdiv_1 (ap[0], bp[0], zbits);
|
|
|
|
if (q >= 0)
|
|
{
|
|
*qsign = 0;
|
|
*rsign = 0;
|
|
|
|
/* Can't overflow, since q < 2^(GMP_NUMB_BITS-1) and hi <= 1 */
|
|
hi += mpn_addmul_1 (ap, bp, n, q);
|
|
}
|
|
else
|
|
{
|
|
mp_limb_t cy;
|
|
|
|
*qsign = -1;
|
|
q = -q;
|
|
|
|
cy = mpn_submul_1 (ap, bp, n, q);
|
|
if (cy <= hi)
|
|
{
|
|
*rsign = 0;
|
|
hi -= cy;
|
|
}
|
|
else
|
|
{
|
|
*rsign = -1;
|
|
|
|
ASSERT_NOCARRY (negate_diff (ap, n));
|
|
hi = cy - hi - 1;
|
|
}
|
|
|
|
if (zlimbs > 0)
|
|
{
|
|
ASSERT_NOCARRY (negate_diff (qp, zlimbs));
|
|
q--;
|
|
}
|
|
}
|
|
|
|
qp[zlimbs] = q;
|
|
|
|
if (zbits > 0)
|
|
{
|
|
mpn_rshift (ap, ap, n, zbits);
|
|
ap[n-1] |= (hi << (GMP_NUMB_BITS - zbits) & GMP_NUMB_MASK);
|
|
hi >>= zbits;
|
|
}
|
|
|
|
return hi;
|
|
}
|
|
|
|
/* Takes as input two n-limb numbers (typically the least
|
|
significant limbs of some larger numbers), a odd, b even, and an
|
|
integer k.
|
|
|
|
Returns an array R, an integer j, two numbers a', b' such that
|
|
|
|
     /a'\        -2j     /a\
     \b'/   =   2     R  \b/
|
|
|
|
where
|
|
|
|
a' is odd
|
|
v (2^j a') < k,
|
|
v (2^j b') >= k
|
|
*/
|
|
|
|
/* Needs temporary space for calling bgcd_matrix1_apply,
   bgcd_matrix_mul_1, bdivmod_n, bgcd_matrix_mul_q, and storing q. The
   most demanding is bdivmod_n, for which we need 5*n/2 limbs. */
|
|
|
|
#define HBGCD_N_BASE_ITCH(n) (5*(n)/2)
|
|
|
|
static mp_size_t
|
|
hbgcd_n_base (mp_ptr ap, mp_ptr bp, mp_size_t n, unsigned k,
|
|
struct bgcd_matrix *m, mp_ptr tp)
|
|
{
|
|
ASSERT (2 * k <= n * GMP_NUMB_BITS);
|
|
ASSERT (MPN_BGCD_MATRIX_ITCH (n) <= m->alloc);
|
|
|
|
/* Should be initialized already */
|
|
ASSERT (m->j == 0);
|
|
|
|
ASSERT (n >= 2);
|
|
ASSERT (ODD (ap[0]));
|
|
ASSERT (!ODD (bp[0]));
|
|
|
|
for (;;)
|
|
{
|
|
mp_size_t zlimbs;
|
|
unsigned zbits;
|
|
|
|
ASSERT (ODD (ap[0]));
|
|
ASSERT (!ODD (bp[0]));
|
|
|
|
zlimbs = power_of_2 (&zbits, bp, n);
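
/* Stop once the reduction done so far (m->j) plus the trailing zeros
   of b reaches the requested k bits. */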
|
|
|
|
if (zlimbs * GMP_NUMB_BITS + zbits + m->j >= k)
|
|
break;
|
|
|
|
if (zlimbs == 0)
|
|
{
|
|
/* FIXME: Some duplication with next case. */
|
|
struct bgcd_matrix1 m1;
|
|
|
|
/* Here, we may have n == 1, and then the values ap[1] and
|
|
bp[1] will not be used. */
|
|
hbgcd2 (ap[1], ap[0], bp[1], bp[0], k - m->j, &m1);
|
|
ASSERT (m1.j > 0);
|
|
|
|
/* Modifies the signs of the matrix elements. */
|
|
n = bgcd_matrix1_apply (&m1, ap, bp, n, tp);
|
|
|
|
bgcd_matrix_mul_1 (m, &m1, tp);
|
|
}
|
|
else
|
|
{
|
|
mp_size_t i;
|
|
mp_limb_t cy;
|
|
int qsign;
|
|
int rsign;
|
|
|
|
ASSERT (zlimbs <= (n-1) /2);
|
|
|
|
if (zbits)
|
|
ASSERT_NOCARRY (mpn_rshift (bp, bp + zlimbs, n - zlimbs, zbits));
|
|
else
|
|
MPN_COPY_INCR (bp, bp + zlimbs, n - zlimbs);
|
|
|
|
/* Needs zlimbs + 1 to store the quotient, and MAX (n, 4
|
|
zlimbs) for the computations. Since zlimbs <= (n-1)/2,
|
|
the total need is at most
|
|
|
|
(n + 1) / 2 + MAX (n, 2*(n-1)) = n/2 + 1/2 + 2*n - 2 <= 5*n/2
|
|
*/
|
|
cy = bdivmod_n (tp, &qsign, &rsign, ap, bp, n,
|
|
zlimbs, zbits, tp + zlimbs + 1);
|
|
|
|
/* Needs zlimbs + 1 + m->n <= 2*n limbs of temporary storage. */
|
|
bgcd_matrix_mul_q (m, zlimbs, zbits, tp, qsign, tp + zlimbs + 1);
|
|
|
|
if (rsign < 0)
|
|
{
|
|
m->sign[1][0] = ~m->sign[1][0];
|
|
m->sign[1][1] = ~m->sign[1][1];
|
|
}
|
|
|
|
n -= zlimbs;
|
|
if (cy > 0)
|
|
{
|
|
ap[n] = cy;
|
|
bp[n] = 0;
|
|
n++;
|
|
}
|
|
|
|
/* Swapping pointers would confuse our caller, so we have to swap
|
|
the contents. */
|
|
for (i = 0; i<n; i++)
|
|
MP_LIMB_T_SWAP (ap[i], bp[i]);
|
|
}
|
|
}
|
|
|
|
return n;
|
|
}
|
|
|
|
/* For input of size n, we need 11*n/2 + 3 limbs of temporary storage.
|
|
|
|
This is sufficient for the first recursive call and the division
|
|
step. For the final recursive call, we need 3*n/2 + 7 for the
|
|
matrix, and 4*(n-1) limbs for the work after the recursive call.
|
|
|
|
The recursive call needs 11*n/4 + 3 limbs of storage, so this works out
|
|
because
|
|
|
|
4*(n-1) - (11*n/4 + 3) = (16 - 11) n / 4 - 4 - 3 >= 3, for n >= 8.
|
|
|
|
Actually, approximately 5*n/4 limbs are unused by the recursive
|
|
call, so perhaps it is possible to rearrange the storage so that we
|
|
get by with only 17*n/4???
|
|
*/
|
|
|
|
mp_size_t
|
|
mpn_hbgcd_n_itch (mp_size_t n)
|
|
{
|
|
if (BELOW_THRESHOLD (n, HBGCD_THRESHOLD))
|
|
return HBGCD_N_BASE_ITCH (n);
|
|
else
|
|
return 11*n/2 + 3;
|
|
}
|
|
|
|
mp_size_t
|
|
mpn_hbgcd_n (mp_ptr ap, mp_ptr bp, mp_size_t n, unsigned k,
|
|
struct bgcd_matrix *m, mp_ptr tp)
|
|
{
|
|
/* Original size */
|
|
mp_size_t l = n;
|
|
|
|
if (BELOW_THRESHOLD (n, HBGCD_THRESHOLD))
|
|
return hbgcd_n_base (ap, bp, n, k, m, tp);
|
|
|
|
/* Should be initialized already */
|
|
ASSERT (m->j == 0);
|
|
|
|
ASSERT (n >= 2);
|
|
ASSERT (ODD (ap[0]));
|
|
ASSERT (!ODD (bp[0]));
|
|
|
|
/* FIXME: Should we allow smaller n? */
|
|
ASSERT (2 * k <= n * GMP_NUMB_BITS);
|
|
|
|
ASSERT (MPN_BGCD_MATRIX_ITCH (n) <= m->alloc);
|
|
|
|
{
|
|
/* First recursive call */
|
|
mp_size_t zlimbs;
|
|
unsigned zbits;
|
|
unsigned k1;
|
|
mp_size_t n1;
|
|
mp_size_t nn;
|
|
|
|
zlimbs = power_of_2 (&zbits, bp, n);
|
|
|
|
if (zlimbs * GMP_NUMB_BITS + zbits >= k)
|
|
goto done;
|
|
|
|
/* Split on a limb boundary */
|
|
n1 = (n+1)/2;
|
|
k1 = n1 * (GMP_NUMB_BITS / 2);
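
/* The low n1 limbs support a reduction of at most n1 * GMP_NUMB_BITS / 2
   bits, and since 2*k <= n * GMP_NUMB_BITS this is at least half of the
   requested reduction. */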
|
|
|
|
ASSERT (k1 >= (k+1)/2);
|
|
|
|
if (zlimbs * GMP_NUMB_BITS + zbits < k1)
|
|
{
|
|
ASSERT (n1 < n);
|
|
|
|
nn = mpn_hbgcd_n (ap, bp, n1, k1, m, tp);
|
|
|
|
ASSERT (m->j > 0);
|
|
|
|
/* Needs space 4 * l */
|
|
n = bgcd_matrix_apply (m, ap, bp, n, n1, nn, tp);
|
|
}
|
|
ASSERT (m->j < k1);
|
|
}
|
|
ASSERT (m->n <= l);
|
|
{
|
|
/* Division. This stage needs temporary storage of 5 * l / 2 limbs
   (same analysis as the division case in hbgcd_n_base) */
|
|
mp_size_t zlimbs;
|
|
unsigned zbits;
|
|
mp_size_t i;
|
|
mp_limb_t cy;
|
|
int qsign;
|
|
int rsign;
|
|
|
|
zlimbs = power_of_2 (&zbits, bp, n);
|
|
|
|
if (zlimbs * GMP_NUMB_BITS + zbits + m->j >= k)
|
|
goto done;
|
|
|
|
ASSERT (zlimbs <= (n-1) /2);
|
|
|
|
if (zbits)
|
|
ASSERT_NOCARRY (mpn_rshift (bp, bp + zlimbs, n - zlimbs, zbits));
|
|
else
|
|
MPN_COPY_INCR (bp, bp + zlimbs, n - zlimbs);
|
|
|
|
cy = bdivmod_n (tp, &qsign, &rsign, ap, bp, n,
|
|
zlimbs, zbits, tp + zlimbs + 1);
|
|
|
|
bgcd_matrix_mul_q (m, zlimbs, zbits, tp, qsign, tp + zlimbs + 1);
|
|
|
|
if (rsign < 0)
|
|
{
|
|
m->sign[1][0] = ~m->sign[1][0];
|
|
m->sign[1][1] = ~m->sign[1][1];
|
|
}
|
|
|
|
n -= zlimbs;
|
|
if (cy > 0)
|
|
{
|
|
ap[n] = cy;
|
|
bp[n] = 0;
|
|
n++;
|
|
}
|
|
|
|
/* Swapping pointers would confuse our caller, so we have to swap
|
|
the contents. */
|
|
for (i = 0; i<n; i++)
|
|
MP_LIMB_T_SWAP (ap[i], bp[i]);
|
|
}
|
|
ASSERT (m->j >= (k+1)/2);
|
|
|
|
ASSERT (n <= 2 + (27 * l + 5) / 32);
|
|
ASSERT (n < l);
|
|
|
|
{
|
|
/* Second and final recursive call */
|
|
mp_size_t zlimbs;
|
|
unsigned zbits;
|
|
unsigned k1;
|
|
mp_size_t n1;
|
|
mp_size_t nn;
|
|
|
|
zlimbs = power_of_2 (&zbits, bp, n);
|
|
|
|
if (zlimbs * GMP_NUMB_BITS + zbits + m->j >= k)
|
|
goto done;
|
|
|
|
k1 = k - m->j;
|
|
|
|
ASSERT (k1 <= (k+1) / 2);
|
|
|
|
if (k1 <= GMP_NUMB_BITS)
|
|
{
|
|
struct bgcd_matrix1 m1;
|
|
|
|
hbgcd2 (ap[1], ap[0], bp[1], bp[0], k1, &m1);
|
|
ASSERT (m1.j > 0);
|
|
|
|
n = bgcd_matrix1_apply (&m1, ap, bp, n, tp);
|
|
bgcd_matrix_mul_1 (m, &m1, tp);
|
|
|
|
/* May need another one */
|
|
k1 = k - m->j;
|
|
|
|
if (hbgcd2 (ap[1], ap[0], bp[1], bp[0], k1, &m1) > 0)
|
|
{
|
|
n = bgcd_matrix1_apply (&m1, ap, bp, n, tp);
|
|
bgcd_matrix_mul_1 (m, &m1, tp);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
struct bgcd_matrix m1;
|
|
|
|
/* Need at least 2 * k1 bits */
|
|
n1 = (2 * k1 + GMP_NUMB_BITS - 1) / GMP_NUMB_BITS;
|
|
ASSERT (n1 < n);
|
|
ASSERT (n1 <= l/2);
|
|
|
|
/* Each matrix element is of size at most
|
|
|
|
MPN_BGCD_MATRIX_ITCH (n1) <= (11*l/2 + 31) / 16
|
|
= 11 * l/32 + 31 / 16 < 3*l/8 + 31/16 - l/32 <= 3*l/8 + 27/16
|
|
|
|
Hence, 3*l/2 + 7 limbs should be enough for the entire matrix.
|
|
*/
|
|
|
|
mpn_bgcd_matrix_init (&m1, tp, MPN_BGCD_MATRIX_ITCH (n1));
|
|
ASSERT (4 * MPN_BGCD_MATRIX_ITCH (n1) <= 3*l/2 + 7);
|
|
tp += 3*l/2 + 7;
|
|
nn = mpn_hbgcd_n (ap, bp, n1, k1, &m1, tp);
|
|
|
|
ASSERT (m->j > 0);
|
|
|
|
/* Needs space 4*n <= 4*(l - 1) */
|
|
n = bgcd_matrix_apply (&m1, ap, bp, n, n1, nn, tp);
|
|
|
|
/* Needs space 3 * (MPN_BGCD_MATRIX_ITCH (l) - 1) + 2 * (MPN_BGCD_MATRIX_ITCH(n1) - 1)
|
|
<= 3 * (11 * l + 15) / 16 + 2 * (11 * l/2 + 15) / 16
|
|
= 44 * l / 16 + (45 + 30)/16
|
|
< 11 * l / 4 + 5 < 3*l + 5
|
|
|
|
==> need <= 3*l + 4 <= 4 * (l - 1);
|
|
|
|
where l denotes the original input size. */
|
|
|
|
bgcd_matrix_mul (m, &m1, tp);
|
|
}
|
|
}
|
|
|
|
done:
|
|
ASSERT (m->j < k);
|
|
|
|
return n;
|
|
}
|
|
|
|
/* For a hbgcd step, we need
|
|
|
|
4*MPN_BGCD_MATRIX_ITCH(n/2) + MAX(hbgcd_n_itch(n/2), 4*n)
|
|
<= 3*n/2 + 7 + MAX (11*n/4 + 3, 4*n)
|
|
== (3/2 + 4) * n + 7 = 11*n/2 + 7
|
|
|
|
For a division step, we need
|
|
|
|
zlimbs + 1 + (MAX (4 * zlimbs, n)) <= n + 4*(n-1) < 5*n
|
|
|
|
For both kinds of steps, 11*n/2 + 7 is sufficient.
|
|
*/
|
|
|
|
#define BGCD_N_ITCH(n) (11*(n)/2 + 7)
|
|
|
|
static mp_size_t
|
|
bgcd_n (mp_ptr gp, mp_ptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp)
|
|
{
|
|
ASSERT (ODD (ap[0]));
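
/* The reduction loop below needs a odd and b even. If both are odd,
   either a == b and we are done, or the larger of the two is replaced
   by the (even) difference. */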
|
|
|
|
if (ODD (bp[0]))
|
|
{
|
|
int cmp = mpn_cmp (ap, bp, n);
|
|
if (cmp == 0)
|
|
{
|
|
MPN_NORMALIZE (ap, n);
|
|
MPN_COPY (gp, ap, n);
|
|
return n;
|
|
}
|
|
else if (cmp > 0)
|
|
MP_PTR_SWAP (ap, bp);
|
|
|
|
ASSERT_NOCARRY (mpn_sub_n (bp, bp, ap, n));
|
|
}
|
|
|
|
while (ABOVE_THRESHOLD (n, BGCD_THRESHOLD))
|
|
{
|
|
struct bgcd_matrix m;
|
|
mp_size_t zlimbs;
|
|
unsigned zbits;
|
|
|
|
mp_size_t n1;
|
|
mp_size_t l;
|
|
|
|
mp_size_t alloc;
|
|
|
|
zlimbs = power_of_2 (&zbits, bp, n);
|
|
if (zlimbs == n)
|
|
{
|
|
MPN_NORMALIZE (ap, n);
|
|
MPN_COPY (gp, ap, n);
|
|
return n;
|
|
}
|
|
|
|
n1 = n/2;
|
|
alloc = MPN_BGCD_MATRIX_ITCH (n1);
|
|
ASSERT (4*alloc <= 3*n/2 + 7);
|
|
mpn_bgcd_matrix_init (&m, tp + 4 * n, alloc);
|
|
|
|
l = mpn_hbgcd_n (ap, bp, n1, n1 * GMP_NUMB_BITS / 2, &m, tp);
|
|
if (m.j > 0)
|
|
n = bgcd_matrix_apply (&m, ap, bp, n, n1, l, tp);
|
|
else
|
|
{
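/* hbgcd made no progress (m.j == 0), so perform one explicit bdivmod
   step on the trailing zeros of b to keep the iteration advancing. */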
|
|
mp_limb_t cy;
|
|
int qsign;
|
|
int rsign;
|
|
|
|
if (zbits)
|
|
ASSERT_NOCARRY (mpn_rshift (bp, bp + zlimbs, n - zlimbs, zbits));
|
|
else
|
|
MPN_COPY_INCR (bp, bp + zlimbs, n - zlimbs);
|
|
|
|
cy = bdivmod_n (tp, &qsign, &rsign, ap, bp, n,
|
|
zlimbs, zbits, tp + zlimbs + 1);
|
|
|
|
n -= zlimbs;
|
|
if (cy > 0)
|
|
{
|
|
ap[n] = cy;
|
|
bp[n] = 0;
|
|
n++;
|
|
}
|
|
|
|
MP_PTR_SWAP (ap, bp);
|
|
}
|
|
}
|
|
|
|
/* Use old algorithm */
|
|
{
|
|
mp_size_t an;
|
|
mp_size_t bn;
|
|
|
|
an = bn = n;
|
|
MPN_NORMALIZE (ap, an);
|
|
MPN_NORMALIZE (bp, bn);
|
|
if (bn == 0)
|
|
{
|
|
MPN_COPY (gp, ap, an);
|
|
return an;
|
|
}
|
|
/* The *smaller* number must be odd */
|
|
if (an > bn || (an == bn && ap[an-1] > bp[an - 1]))
|
|
{
|
|
int bits;
|
|
while (bp[0] == 0)
|
|
{
|
|
bp++;
|
|
bn--;
|
|
}
|
|
count_trailing_zeros (bits, bp[0]);
|
|
if (bits > 0)
|
|
{
|
|
ASSERT_NOCARRY (mpn_rshift (bp, bp, bn, bits));
|
|
bn -= (bp[bn-1] == 0);
|
|
}
|
|
MPN_PTR_SWAP (ap, an, bp, bn);
|
|
}
|
|
|
|
return mpn_gcd (gp, bp, bn, ap, an);
|
|
}
|
|
}
|
|
|
|
/* Needs BDIVMOD_N_ITCH(an, an - bn) + an - bn + 1 for the initial division,
|
|
and BGCD_N_ITCH (bn) for the bgcd_n call. I.e. the max of
|
|
|
|
an - bn + 1 + MAX(4 * (an - bn), an) <= 5*an
|
|
11 * bn/2 + 7
|
|
*/
|
|
#define BGCD_ITCH(an, bn) MAX(5*(an), BGCD_N_ITCH(bn))
|
|
|
|
static mp_size_t
|
|
bgcd (mp_ptr gp, mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t bn, mp_ptr tp)
|
|
{
|
|
/* mpn_gcd uses different conventions than the rest of the code. */
|
|
ASSERT (ODD (bp[0]));
|
|
ASSERT (an >= bn);
|
|
|
|
if (an > bn)
|
|
{
|
|
mp_size_t zlimbs;
|
|
unsigned zbits;
|
|
|
|
zlimbs = power_of_2 (&zbits, ap, an);
|
|
ASSERT (zlimbs < an);
|
|
|
|
if (zbits > 0)
|
|
ASSERT_NOCARRY (mpn_rshift (ap, ap + zlimbs, an - zlimbs, zbits));
|
|
else if (zlimbs > 0)
|
|
MPN_COPY_INCR (ap, ap + zlimbs, an - zlimbs);
|
|
|
|
an = an - zlimbs;
|
|
an -= (ap[an-1] == 0);
|
|
|
|
if (an < bn)
|
|
MPN_ZERO (ap + an, bn - an);
|
|
|
|
else if (an > bn)
|
|
{
|
|
mp_size_t qn = an - bn + 1;
|
|
mp_limb_t cy;
|
|
int qsign;
|
|
int rsign;
|
|
|
|
cy = bdivmod_n (tp, &qsign, &rsign, ap, bp, an,
|
|
an - bn, 0, tp + qn);
|
|
an = bn;
|
|
ASSERT (!ODD (ap[0]));
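
/* If the reduced a does not fit in bn limbs, the carry is a single bit.
   a is now even and b is odd, so halving a (and folding the carry back
   in at the top) does not change the gcd. */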
|
|
|
|
if (cy > 0)
|
|
{
|
|
ASSERT (cy == 1);
|
|
mpn_rshift1 (ap, ap, an);
|
|
ap[an-1] |= GMP_NUMB_HIGHBIT;
|
|
}
|
|
}
|
|
}
|
|
/* Now both numbers are of size bn, and b is still odd */
|
|
return bgcd_n (gp, bp, ap, bn, tp);
|
|
}
|
|
|
|
mp_size_t
|
|
mpn_bgcd (mp_ptr gp, mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t bn)
|
|
{
|
|
mp_ptr tp;
|
|
mp_size_t gn;
|
|
|
|
TMP_DECL;
|
|
|
|
TMP_MARK;
|
|
tp = TMP_ALLOC_LIMBS (BGCD_ITCH (an, bn));
|
|
|
|
gn = bgcd (gp, ap, an, bp, bn, tp);
|
|
|
|
TMP_FREE;
|
|
return gn;
|
|
}
|