/* Stehlé's and Zimmermann's binary recursive gcd algorithm */

#include <stdio.h>  /* for NULL */
#include <string.h> /* for memset */

/* #define WANT_ASSERT 0 */

#include "mpir.h"
#include "gmp-impl.h"
#include "longlong.h"

/* Notation: W = 2^GMP_NUMB_BITS */

#define ODD(x) ((x) & 1)

/* Extract one limb, shifting count bits right:

     ________ ________
    |___xh___||___xl___|
	  |____r____|
	  >count<
*/
#define EXTRACT_LIMB(count, xh, xl)					\
  (((xl) >> (count)) | (((xh) << (GMP_NUMB_BITS - (count))) & GMP_NUMB_MASK))

#define RSHIFT2(h, l, count)						\
  do {									\
    (l) = ((l) >> (count)) | (((h) << (GMP_NUMB_BITS - (count))) & GMP_NUMB_MASK); \
    (h) >>= (count);							\
  } while (0)

#define MP_LIMB_SIGNED_T_SWAP(x, y)					\
  do {									\
    mp_limb_signed_t __mp_limb_signed_t_swap__tmp = (x);		\
    (x) = (y);								\
    (y) = __mp_limb_signed_t_swap__tmp;					\
  } while (0)

#define INT_SWAP(x, y)							\
  do {									\
    int __int_swap__tmp = (x);						\
    (x) = (y);								\
    (y) = __int_swap__tmp;						\
  } while (0)

/* Computes 2^(n * GMP_NUMB_BITS) - x.  Returns 1 in case x was zero,
   otherwise zero.  This function is used when a subtraction yields a
   borrow, to negate the difference. */
static mp_limb_t
negate_diff (mp_ptr xp, mp_size_t n)
{
  mp_size_t i;

  ASSERT (n > 0);

  for (i = 0; i < n; i++)
    {
      if (xp[i] > 0)
	{
	  xp[i] = ~xp[i] + 1;
	  i++;
	  if (i < n)
	    mpn_com_n (xp + i, xp + i, n - i);
	  return 0;
	}
    }
  return 1;
}

/* Computes r = u x + v y.  Returns the most significant limb; the
   carry out of that limb is stored in *cy.  rp and xp may be the
   same. */
static mp_limb_t
addmul2_1 (mp_ptr rp, mp_limb_t *cy, mp_srcptr xp, mp_srcptr yp,
	   mp_size_t n, mp_limb_t u, mp_limb_t v)
{
  mp_limb_t h1;
  mp_limb_t h2;

  ASSERT (rp != yp);

  h1 = mpn_mul_1 (rp, xp, n, u);
  h2 = mpn_addmul_1 (rp, yp, n, v);

  h1 += h2;
  *cy = (h1 < h2);

  return h1;
}

/* Computes r = u x - v y.  rp and xp may be the same.  Returns the
   most significant limb.  Sets *rsign to reflect the sign of the
   result. */
static mp_limb_t
submul2_1 (mp_ptr rp, int *rsign, mp_srcptr xp, mp_srcptr yp,
	   mp_size_t n, mp_limb_t u, mp_limb_t v)
{
  mp_limb_t hi;
  mp_limb_t cy;

  ASSERT (rp != yp);

  hi = mpn_mul_1 (rp, xp, n, u);
  cy = mpn_submul_1 (rp, yp, n, v);
  if (cy <= hi)
    {
      hi -= cy;
      *rsign = 0;
    }
  else
    {
      /* We need to negate the difference,

	 - ( (hi - cy) 2^k + r) = (cy - hi) 2^k - r
				= (cy - hi - 1) 2^k + 2^k - r
      */
      *rsign = -1;
      hi = cy - hi - 1 + negate_diff (rp, n);
    }

  return hi;
}

/* Returns the number of zero limbs at the least significant end of
   x, and the number of zero bits of the first non-zero limb.  If
   x == 0, returns n and sets *bits = 0. */
static mp_size_t
power_of_2 (unsigned *bits, mp_srcptr xp, mp_size_t n)
{
  mp_size_t zlimbs;

  for (zlimbs = 0; zlimbs < n && xp[zlimbs] == 0; zlimbs++)
    ;

  if (zlimbs == n || ODD (xp[zlimbs]))
    *bits = 0;
  else
    {
      int count;
      count_trailing_zeros (count, xp[zlimbs]);
      *bits = count;
    }
  return zlimbs;
}

struct bgcd_matrix1
{
  unsigned j;
  /* Sign/magnitude representation.  sign = 0 for positive, -1 for
     negative. */
  signed char sign[2][2];
  mp_limb_t R[2][2];
};
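/* Illustrative sketch (not part of the original source): the borrow
   handling in negate_diff and submul2_1 relies on the identity
   - ((hi - cy) 2^k + r) = (cy - hi - 1) 2^k + (2^k - r).  The
   standalone demo below, kept under #if 0 so that it does not affect
   this file, replays the negation on two 32-bit "limbs"; all names
   are local to the example. */
#if 0
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* 2^(2*32) - x for a two-limb x, returning 1 iff x was zero
   (the same contract as negate_diff above). */
static uint32_t
demo_negate_diff (uint32_t x[2])
{
  if (x[0] != 0)
    {
      x[0] = ~x[0] + 1;
      x[1] = ~x[1];
      return 0;
    }
  if (x[1] != 0)
    {
      x[1] = ~x[1] + 1;
      return 0;
    }
  return 1;
}

int
main (void)
{
  /* u x - v y with a borrow: 3*5 - 4*7 = -13, represented in two's
     complement across two 32-bit limbs. */
  uint64_t d = (uint64_t) 3 * 5 - (uint64_t) 4 * 7;
  uint32_t r[2] = { (uint32_t) d, (uint32_t) (d >> 32) };

  /* Negating the two-limb difference must give |-13| = 13. */
  assert (demo_negate_diff (r) == 0);
  assert (r[0] == 13 && r[1] == 0);
  printf ("magnitude = %u\n", (unsigned) r[0]);
  return 0;
}
#endif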
/* Multiplies a, b by 2^(-2j) R.  Changes the signs of the rows of R
   if needed to make a, b >= 0.  Needs n limbs of temporary storage.
   Returns the normalized length. */
static mp_size_t
bgcd_matrix1_apply (struct bgcd_matrix1 *m,
		    mp_ptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp)
{
  /* Top two limbs */
  mp_limb_t a0, a1, b0, b1;
  int s0;
  int s1;
  int ts;
  unsigned j;

  /* Make a copy of a */
  MPN_COPY (tp, ap, n);

  s0 = m->sign[0][0];
  s1 = m->sign[0][1];
  if ((s0 ^ s1) < 0)
    {
      a0 = submul2_1 (ap, &ts, ap, bp, n, m->R[0][0], m->R[0][1]);
      a1 = 0;
      ts ^= s0;
      m->sign[0][0] ^= ts;
      m->sign[0][1] ^= ts;
    }
  else
    {
      a0 = addmul2_1 (ap, &a1, ap, bp, n, m->R[0][0], m->R[0][1]);
      m->sign[0][0] = m->sign[0][1] = 0;
    }

  s0 = m->sign[1][0];
  s1 = m->sign[1][1];
  if ((s0 ^ s1) < 0)
    {
      b0 = submul2_1 (bp, &ts, bp, tp, n, m->R[1][1], m->R[1][0]);
      b1 = 0;
      ts ^= s1;
      m->sign[1][0] ^= ts;
      m->sign[1][1] ^= ts;
    }
  else
    {
      b0 = addmul2_1 (bp, &b1, bp, tp, n, m->R[1][1], m->R[1][0]);
      m->sign[1][0] = m->sign[1][1] = 0;
    }

  /* Shift 2j bits right */
  j = 2 * m->j;
  if (j == GMP_NUMB_BITS)
    {
      ASSERT (n >= 2);
      ASSERT (ap[0] == 0);
      ASSERT (bp[0] == 0);
      ASSERT (a1 == 0);
      ASSERT (b1 == 0);

      MPN_COPY_INCR (ap, ap + 1, n - 1);
      MPN_COPY_INCR (bp, bp + 1, n - 1);
      ap[n-1] = a0;
      bp[n-1] = b0;
    }
  else if (j < GMP_NUMB_BITS)
    {
      /* This is the only case where n == 1 is allowed */
      ASSERT (a1 == 0);
      ASSERT (b1 == 0);

      ASSERT_NOCARRY (mpn_rshift (ap, ap, n, j));
      ASSERT_NOCARRY (mpn_rshift (bp, bp, n, j));
      ap[n-1] |= (a0 << (GMP_NUMB_BITS - j)) & GMP_NUMB_MASK;
      bp[n-1] |= (b0 << (GMP_NUMB_BITS - j)) & GMP_NUMB_MASK;
      ASSERT ((a0 >> j) == 0);
      ASSERT ((b0 >> j) == 0);
    }
  else
    {
      ASSERT (n >= 2);
      j -= GMP_NUMB_BITS;
      ASSERT (ap[0] == 0);
      ASSERT (bp[0] == 0);

      ASSERT_NOCARRY (mpn_rshift (ap, ap + 1, n - 1, j));
      ASSERT_NOCARRY (mpn_rshift (bp, bp + 1, n - 1, j));
      ap[n-2] |= (a0 << (GMP_NUMB_BITS - j)) & GMP_NUMB_MASK;
      bp[n-2] |= (b0 << (GMP_NUMB_BITS - j)) & GMP_NUMB_MASK;
      ap[n-1] = (a0 >> j) | (a1 << (GMP_NUMB_BITS - j));
      bp[n-1] = (b0 >> j) | (b1 << (GMP_NUMB_BITS - j));
    }

  ASSERT (ODD (ap[0]));
  ASSERT (!ODD (bp[0]));

  return n - ((ap[n-1] | bp[n-1]) == 0);
}

/* When hbgcd is computed and results in j bits of reduction, the
   matrix elements will fit in floor (11 (j+1) / 8) bits.  If the
   input to hbgcd is n limbs, and k bits of reduction are desired,
   then 2k <= n GMP_NUMB_BITS and j < k.  Then the elements of the
   resulting matrix must fit in

     floor (11 (j+1) / 8) <= 11 k / 8 <= 11 n GMP_NUMB_BITS / 16

   bits.  Rounding up to an integral number of limbs, we get the
   limit ceil (11 n / 16).  Table for small values:

     n   maximum size of matrix entries
     1   1
     2   2
     3   3
     4   3
     5   4
     6   5
     7   5
     8   6

   For the computations, we need one extra limb. */
void
mpn_bgcd_matrix_init (struct bgcd_matrix *m, mp_ptr limbs, mp_size_t alloc)
{
  m->alloc = alloc;
  m->j = 0;
  m->n = 1;
  memset (m->sign, 0, sizeof (m->sign));

  m->R[0][0] = limbs;
  m->R[0][1] = limbs + alloc;
  m->R[1][0] = limbs + 2 * alloc;
  m->R[1][1] = limbs + 3 * alloc;

  MPN_ZERO (limbs, 4 * alloc);
  m->R[0][0][0] = m->R[1][1][0] = 1;
}
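/* Illustrative sketch (not part of the original source): the table
   in the comment above is just ceil (11 n / 16).  The throwaway
   program below, kept under #if 0, reproduces it; no MPIR
   definitions are assumed. */
#if 0
#include <stdio.h>

int
main (void)
{
  long n;
  for (n = 1; n <= 8; n++)
    /* ceil (11 n / 16), computed with integer arithmetic */
    printf ("%ld\t%ld\n", n, (11 * n + 15) / 16);
  return 0;
}
#endif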
/* Needs zlimbs + 1 + m->n <= m->alloc limbs of temporary storage. */
static void
bgcd_matrix_mul_q (struct bgcd_matrix *m,
		   mp_size_t zlimbs, unsigned zbits,
		   mp_ptr qp, int qsign, mp_ptr tp)
{
  unsigned i;
  mp_size_t n, nn;
  mp_limb_t grow;

  ASSERT (tp != qp);
  ASSERT (qp[zlimbs] < ((mp_limb_t) 1 << zbits));

  /* To multiply a column (u ; v) by (0, 2^j ; 2^j, q), set

       u = 2^j u + q v
       v = 2^j v

     and then swap u, v.  We need temporary storage for t = q * v.
     The size should be increased by zlimbs or zlimbs + 1. */

  grow = 0;

  n = m->n;
  nn = n + zlimbs;
  ASSERT (nn + 1 <= m->alloc);

  for (i = 0; i < 2; i++)
    {
      mp_ptr up = m->R[0][i];
      mp_ptr vp = m->R[1][i];
      int us = m->sign[0][i];
      int vs = m->sign[1][i];

      /* Set u <<= j */
      if (zbits)
	up[nn] = mpn_lshift (up + zlimbs, up, n, zbits);
      else
	{
	  MPN_COPY_DECR (up + zlimbs, up, n);
	  ASSERT (up[nn] == 0);
	}

      /* We could make some clever use of negate_diff, but it's
	 easier to clear the low limbs first. */
      MPN_ZERO (up, zlimbs);

      /* Set t = q * v, and u += t */
      if (n > zlimbs)
	mpn_mul (tp, vp, n, qp, zlimbs + 1);
      else
	mpn_mul (tp, qp, zlimbs + 1, vp, n);

      if ((us ^ vs ^ qsign) < 0)
	{
	  mp_limb_t cy;
	  cy = mpn_sub_n (up, up, tp, nn + 1);
	  if (cy > 0)
	    {
	      ASSERT_NOCARRY (negate_diff (up, nn + 1));
	      us = ~us;
	    }
	}
      else
	ASSERT_NOCARRY (mpn_add_n (up, up, tp, nn + 1));

      grow |= up[nn];

      /* Set v <<= j */
      if (zbits)
	{
	  vp[nn] = mpn_lshift (vp + zlimbs, vp, n, zbits);
	  grow |= vp[nn];
	}
      else
	{
	  MPN_COPY_DECR (vp + zlimbs, vp, n);
	  ASSERT (vp[nn] == 0);
	}
      MPN_ZERO (vp, zlimbs);

      MP_PTR_SWAP (m->R[0][i], m->R[1][i]);
      m->sign[0][i] = vs;
      m->sign[1][i] = us;
    }

  m->n = nn + (grow != 0);
  m->j += zlimbs * GMP_NUMB_BITS + zbits;
}

/* Needs m->n limbs of temporary storage. */
static void
bgcd_matrix_mul_1 (struct bgcd_matrix *m, const struct bgcd_matrix1 *m1,
		   mp_ptr tp)
{
  unsigned i;
  mp_limb_t grow;
  mp_limb_t r00 = m1->R[0][0];
  mp_limb_t r01 = m1->R[0][1];
  mp_limb_t r10 = m1->R[1][0];
  mp_limb_t r11 = m1->R[1][1];

  int s00 = m1->sign[0][0];
  int s01 = m1->sign[0][1];
  int s10 = m1->sign[1][0];
  int s11 = m1->sign[1][1];

  /* Carries, for the unlikely case that R grows by two limbs. */
  mp_limb_t c[2][2];

  mp_size_t n = m->n;

  ASSERT (n + 1 <= m->alloc);

  grow = 0;
  for (i = 0; i < 2; i++)
    {
      mp_ptr up = m->R[0][i];
      mp_ptr vp = m->R[1][i];
      int us = m->sign[0][i];
      int vs = m->sign[1][i];
      int ts;
      mp_limb_t uh, vh;

      /* Make a copy of u */
      MPN_COPY (tp, up, n);

      if ((s00 ^ s01 ^ us ^ vs) < 0)
	{
	  uh = submul2_1 (up, &ts, up, vp, n, r00, r01);
	  c[0][i] = 0;
	  ts ^= s00 ^ us;
	}
      else
	{
	  uh = addmul2_1 (up, &c[0][i], up, vp, n, r00, r01);
	  ts = s00 ^ us;
	}

      if ((s10 ^ s11 ^ us ^ vs) < 0)
	{
	  int s;
	  vh = submul2_1 (vp, &s, vp, tp, n, r11, r10);
	  c[1][i] = 0;
	  vs ^= s11 ^ s;
	}
      else
	{
	  vh = addmul2_1 (vp, &c[1][i], vp, tp, n, r11, r10);
	  vs ^= s11;
	}

      up[n] = uh;
      vp[n] = vh;
      grow |= (uh | vh);

      m->sign[0][i] = ts;
      m->sign[1][i] = vs;
    }

  if ((c[0][0] | c[0][1] | c[1][0] | c[1][1]) != 0)
    {
      /* Growth by two limbs. */
      m->n += 2;
      ASSERT (m->n + 1 <= m->alloc);

      m->R[0][0][n+1] = c[0][0];
      m->R[0][1][n+1] = c[0][1];
      m->R[1][0][n+1] = c[1][0];
      m->R[1][1][n+1] = c[1][1];
    }
  else
    {
      m->n += (grow != 0);
      ASSERT (m->n + 1 <= m->alloc);
    }
  m->j += m1->j;
}

/* Computes r = u x + v y.  Needs 2*(xn + un) limbs of temporary
   space.  The least significant xn + un limbs are stored at rp.
   Returns the carry.  rp == tp is allowed. */
static mp_limb_t
addmul2_n (mp_ptr rp,
	   mp_srcptr xp, mp_srcptr yp, mp_size_t xn,
	   mp_srcptr up, mp_srcptr vp, mp_size_t un,
	   mp_ptr tp)
{
  mp_size_t n = xn + un;
  mp_ptr t1p = tp;
  mp_ptr t2p = tp + n;

  if (xn >= un)
    {
      mpn_mul (t1p, xp, xn, up, un);
      mpn_mul (t2p, yp, xn, vp, un);
    }
  else
    {
      mpn_mul (t1p, up, un, xp, xn);
      mpn_mul (t2p, vp, un, yp, xn);
    }

  return mpn_add_n (rp, t1p, t2p, n);
}
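/* Illustrative sketch (not part of the original source): the column
   update in bgcd_matrix_mul_q above multiplies (u ; v) by the
   quotient matrix (0, 2^j ; 2^j, q) as "u = 2^j u + q v; v = 2^j v;
   swap".  The #if 0 demo below checks that shortcut against the
   explicit matrix-vector product on single words; all names are
   local to the example. */
#if 0
#include <assert.h>
#include <stdint.h>

int
main (void)
{
  uint64_t u = 123, v = 45;
  unsigned j = 5;
  uint64_t q = 17;

  /* Shortcut used by bgcd_matrix_mul_q */
  uint64_t t = (u << j) + q * v;
  uint64_t nu = v << j; /* new u, after the swap */
  uint64_t nv = t;      /* new v, after the swap */

  /* Explicit product (0, 2^j ; 2^j, q) (u ; v) */
  assert (nu == 0 * u + (v << j));
  assert (nv == (u << j) + q * v);
  return 0;
}
#endif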
/* Computes r = u x - v y.  Needs 2*(xn + un) limbs of temporary
   space.  The result, xn + un limbs, is stored at rp.  Returns -1 or
   zero depending on the sign of the difference.  rp may be the same
   as either input or tp. */
static int
submul2_n (mp_ptr rp,
	   mp_srcptr xp, mp_srcptr yp, mp_size_t xn,
	   mp_srcptr up, mp_srcptr vp, mp_size_t un,
	   mp_ptr tp)
{
  mp_size_t n = xn + un;
  mp_ptr t1p = tp;
  mp_ptr t2p = tp + n;

  if (xn >= un)
    {
      mpn_mul (t1p, xp, xn, up, un);
      mpn_mul (t2p, yp, xn, vp, un);
    }
  else
    {
      mpn_mul (t1p, up, un, xp, xn);
      mpn_mul (t2p, vp, un, yp, xn);
    }

  while (t1p[n-1] == t2p[n-1])
    {
      rp[--n] = 0;
      if (n == 0)
	return 0;
    }

  if (t1p[n-1] > t2p[n-1])
    {
      ASSERT_NOCARRY (mpn_sub_n (rp, t1p, t2p, n));
      return 0;
    }
  else
    {
      ASSERT (t1p[n-1] < t2p[n-1]);
      ASSERT_NOCARRY (mpn_sub_n (rp, t2p, t1p, n));
      return -1;
    }
}

/* Sets m = s * m.  Needs temporary space

     2 * (m->n + s->n) + m->n = 3 * m->n + 2 * s->n.
*/
static void
bgcd_matrix_mul (struct bgcd_matrix *m, const struct bgcd_matrix *s,
		 mp_ptr tp)
{
  unsigned i;
  mp_limb_t grow = 0;

  mp_ptr r00 = s->R[0][0];
  mp_ptr r01 = s->R[0][1];
  mp_ptr r10 = s->R[1][0];
  mp_ptr r11 = s->R[1][1];

  int s00 = s->sign[0][0];
  int s01 = s->sign[0][1];
  int s10 = s->sign[1][0];
  int s11 = s->sign[1][1];

  mp_size_t mn = m->n;
  mp_size_t sn = s->n;
  mp_size_t n = m->n + s->n;

  /* Space for a copy of u */
  mp_ptr cp = tp + 2 * n;

  ASSERT (n + 1 <= m->alloc);

  for (i = 0; i < 2; i++)
    {
      mp_ptr up = m->R[0][i];
      mp_ptr vp = m->R[1][i];
      int us = m->sign[0][i];
      int vs = m->sign[1][i];
      int ts;

      /* Make a copy of u */
      MPN_COPY (cp, up, mn);

      /* Set u = r00 * u + r01 * v.  If r00 u and r01 v have
	 different signs, we should subtract. */
      ts = us ^ s00;
      if ((s00 ^ us ^ s01 ^ vs) < 0)
	{
	  ts ^= submul2_n (up, cp, vp, mn, r00, r01, sn, tp);
	  up[n] = 0;
	}
      else
	{
	  up[n] = addmul2_n (up, cp, vp, mn, r00, r01, sn, tp);
	  grow |= up[n];
	}

      /* Set v = r10 * (old) u + r11 * v.  If r10 u and r11 v have
	 different signs, we should subtract. */
      if ((s10 ^ us ^ s11 ^ vs) < 0)
	{
	  vs = us ^ s10 ^ submul2_n (vp, cp, vp, mn, r10, r11, sn, tp);
	  vp[n] = 0;
	}
      else
	{
	  vp[n] = addmul2_n (vp, cp, vp, mn, r10, r11, sn, tp);
	  vs = us ^ s10;
	  grow |= vp[n];
	}
      m->sign[0][i] = ts;
      m->sign[1][i] = vs;
    }

  if (grow > 0)
    m->n = n + 1;
  else
    /* We don't need full normalization; stripping one leading zero
       limb should be sufficient to get the size below the bound. */
    m->n = n - (m->R[0][0][n-1] == 0 && m->R[0][1][n-1] == 0
		&& m->R[1][0][n-1] == 0 && m->R[1][1][n-1] == 0);

  ASSERT (m->n + 1 <= m->alloc);
  m->j += s->j;
}

/* Sets a = a + c * 2^(shift * GMP_NUMB_BITS),

       ________________________
     _|____________________a___|
    + |__________________c_|
			   --> <-- shift

   Stores MAX (an, cn + shift) limbs of the result at ap, and returns
   the carry. */
static mp_limb_t
bgcd_fix_add (mp_ptr ap, mp_size_t an,
	      mp_srcptr cp, mp_size_t cn, mp_size_t shift)
{
  if (cn + shift < an)
    {
      /* Can this really happen? */
      ASSERT_ALWAYS (0 == "Can this happen?");
      return mpn_add (ap + shift, ap + shift, an - shift, cp, cn);
    }
  else if (shift < an)
    return mpn_add (ap + shift, cp, cn, ap + shift, an - shift);
  else
    {
      MPN_COPY (ap + shift, cp, cn);
      if (shift > an)
	MPN_ZERO (ap + an, shift - an);
      return 0;
    }
}
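/* Illustrative sketch (not part of the original source): bgcd_fix_add
   above adds c, displaced by `shift` whole limbs, into a.  The #if 0
   demo below replays the same alignment with 16-bit "limbs" and
   checks the result against ordinary 64-bit arithmetic; all names
   are local to the example. */
#if 0
#include <assert.h>
#include <stdint.h>

int
main (void)
{
  /* a = 0x0123456789ab, c = 0xbeef, shift = 1 limb of 16 bits, so
     the operation is a + c * 2^16; everything fits in a uint64_t. */
  uint16_t a[4] = { 0x89ab, 0x4567, 0x0123, 0 };
  uint16_t c[1] = { 0xbeef };
  uint64_t ref = 0x0123456789abULL + ((uint64_t) 0xbeef << 16);
  unsigned shift = 1, i;

  /* Add c at limb position `shift`, then propagate the carry. */
  uint32_t t = (uint32_t) a[shift] + c[0];
  uint32_t cy = t >> 16;
  a[shift] = (uint16_t) t;
  for (i = shift + 1; i < 4 && cy; i++)
    {
      t = (uint32_t) a[i] + cy;
      a[i] = (uint16_t) t;
      cy = t >> 16;
    }

  assert (((uint64_t) a[2] << 32 | (uint64_t) a[1] << 16 | a[0]) == ref);
  assert (a[3] == 0);
  return 0;
}
#endif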
/* Sets a = a - c * 2^(shift * GMP_NUMB_BITS),

       ________________________
     _|____________________a___|
    - |__________________c_|
			   --> <-- shift

   Stores the cn + shift limbs of the absolute value at ap, and
   returns 0 or -1 depending on the sign. */
static mp_size_t
bgcd_fix_sub (mp_ptr ap, mp_size_t an,
	      mp_srcptr cp, mp_size_t cn, mp_size_t shift)
{
  while (an > cn + shift)
    {
      if (ap[an-1] == 0)
	an--;
      else
	{
	  ASSERT_NOCARRY (mpn_sub (ap + shift, ap + shift, an - shift,
				   cp, cn));
	  return 0;
	}
    }

  while (cn > 0 && cn + shift > an)
    {
      if (cp[cn - 1] == 0)
	ap[shift + --cn] = 0;
      else
	{
	  /* Result must be negative */
	  if (shift == 0)
	    ASSERT_NOCARRY (mpn_sub (ap, cp, cn, ap, an));
	  else
	    {
	      /* Negate the lower part of a */
	      if (shift <= an)
		{
		  ASSERT_NOCARRY (mpn_sub (ap + shift, cp, cn,
					   ap + shift, an - shift));
		  if (negate_diff (ap, shift) == 0)
		    MPN_DECR_U (ap + shift, cn, 1);
		}
	      else
		{
		  if (negate_diff (ap, an) == 0)
		    {
		      mp_size_t i;
		      for (i = an; i < shift; i++)
			ap[i] = GMP_NUMB_MASK;
		      MPN_DECR_U (ap + shift, cn, 1);
		    }
		  else
		    MPN_ZERO (ap + an, shift - an);
		}
	    }
	  return -1;
	}
    }

  while (cn > 0)
    {
      if (ap[shift + cn - 1] == cp[cn - 1])
	ap[shift + --cn] = 0;
      else if (ap[shift + cn - 1] > cp[cn - 1])
	{
	  /* Result must be positive */
	  ASSERT_NOCARRY (mpn_sub_n (ap + shift, ap + shift, cp, cn));
	  return 0;
	}
      else
	{
	  /* Result must be negative */
	  ASSERT (ap[shift + cn - 1] < cp[cn - 1]);
	  ASSERT_NOCARRY (mpn_sub_n (ap + shift, cp, ap + shift, cn));
	  if (shift > 0 && negate_diff (ap, shift) == 0)
	    MPN_DECR_U (ap + shift, cn, 1);
	  return -1;
	}
    }
  return 0;
}

/* Computes

     (C ; D) = 2^(-2j) R (A ; B)
	     = 2^(k * GMP_NUMB_BITS - 2j) R (a ; b) + (c ; d)

   where a, b are stored at ap[k, ..., n-1], bp[k, ..., n-1], and c,
   d are stored at ap[0, ..., l-1], bp[0, ..., l-1].  The results C,
   D are also stored at ap, bp.  If necessary to get non-negative
   values, the signs of the rows of the matrix R are flipped.

   Sizes: The inputs c, d are l limbs.  The outputs C, D must always
   fit in n limbs (and we do *not* have any temporary space beyond
   the n:th limb).  The temporaries R(a;b) must fit in n limbs.  This
   is because

     #(R(a;b)) <= #R + (n-k) GMP_NUMB_BITS + 1.

   From the bounds #R <= 11(j+1)/8 and k GMP_NUMB_BITS >= 2(j+1) we
   get

     #(R(a;b)) <= n GMP_NUMB_BITS + (11 - 16)(j+1)/8 + 1
		= n GMP_NUMB_BITS - (5j - 3)/8
		< n GMP_NUMB_BITS.

   For the shifted numbers 2^(k * GMP_NUMB_BITS - 2j) R (a ; b) we
   get

     k GMP_NUMB_BITS - 2j + #R + (n-k) GMP_NUMB_BITS + 1
       <= n GMP_NUMB_BITS + (11j + 11 - 16j)/8 + 1
       <= n GMP_NUMB_BITS + (17 - 5j)/8

   Taking the floor of this expression, we see that if j >= 2, n
   limbs should be sufficient.  A more careful study of j = 1 shows
   that n limbs is sufficient for all j >= 1.

   Temporary storage needed: 4*n. */
static mp_size_t
bgcd_matrix_apply (struct bgcd_matrix *m,
		   mp_ptr ap, mp_ptr bp,
		   mp_size_t n, mp_size_t k, mp_size_t l, mp_ptr tp)
{
  mp_ptr r00 = m->R[0][0];
  mp_ptr r01 = m->R[0][1];
  mp_ptr r10 = m->R[1][0];
  mp_ptr r11 = m->R[1][1];

  int s00 = m->sign[0][0];
  int s01 = m->sign[0][1];
  int s10 = m->sign[1][0];
  int s11 = m->sign[1][1];

  mp_ptr cp = tp + 2*n;
  mp_ptr dp = tp + 3*n;

  mp_size_t nn;
  mp_limb_t ch;
  mp_limb_t dh;
  int cs;
  int ds;

  unsigned shift;
  unsigned shift_bits;
  mp_size_t shift_limbs;

  /* This function probably works for k == 1, but then it's better to
     use hbgcd2 and bgcd_matrix1_apply. */
  ASSERT (k >= 2);
  ASSERT (l <= k);
  ASSERT (2 * m->j < k * GMP_NUMB_BITS);

  nn = n - k + m->n;
  ASSERT (nn <= n);

  /* There are three chances for nn to grow: addmul2_n, the left
     shift, and bgcd_fix_add. */
  cs = s00;

  /* Compute R * (a ; b), and store the result at cp, dp */
  if ((s00 ^ s01) < 0)
    {
      cs ^= submul2_n (cp, ap + k, bp + k, n - k, r00, r01, m->n, tp);
      ch = 0;
    }
  else
    ch = addmul2_n (cp, ap + k, bp + k, n - k, r00, r01, m->n, tp);

  ds = s10;
  if ((s10 ^ s11) < 0)
    {
      ds ^= submul2_n (dp, ap + k, bp + k, n - k, r10, r11, m->n, tp);
      dh = 0;
    }
  else
    dh = addmul2_n (dp, ap + k, bp + k, n - k, r10, r11, m->n, tp);

  if ((ch | dh) > 0)
    {
      ASSERT (nn < n);
      cp[nn] = ch;
      dp[nn] = dh;
      nn++;
    }

  shift = k * GMP_NUMB_BITS - 2 * m->j;
  shift_limbs = shift / GMP_NUMB_BITS;
  shift_bits = shift % GMP_NUMB_BITS;

  if (shift_bits > 0)
    {
      ch = mpn_lshift (cp, cp, nn, shift_bits);
      dh = mpn_lshift (dp, dp, nn, shift_bits);

      if ((ch | dh) > 0)
	{
	  ASSERT (nn < n);
	  cp[nn] = ch;
	  dp[nn] = dh;
	  nn++;
	}
    }

  /* Update a */
  ASSERT (nn + shift_limbs <= n);
  if (cs >= 0)
    ch = bgcd_fix_add (ap, l, cp, nn, shift_limbs);
  else
    {
      int s = bgcd_fix_sub (ap, l, cp, nn, shift_limbs);
      ch = 0;

      /* Flip the signs of the first matrix row, if needed. */
      m->sign[0][0] ^= s;
      m->sign[0][1] ^= s;
    }

  /* Update b */
  if (ds >= 0)
    dh = bgcd_fix_add (bp, l, dp, nn, shift_limbs);
  else
    {
      int s = bgcd_fix_sub (bp, l, dp, nn, shift_limbs);
      dh = 0;

      /* Flip the signs of the second matrix row, if needed. */
      m->sign[1][0] ^= s;
      m->sign[1][1] ^= s;
    }

  nn += shift_limbs;
  if (nn < l)
    {
      /* Can this really happen? */
      ASSERT_ALWAYS (0 == "Can this happen?");
      nn = l;
    }

  if ((ch | dh) > 0)
    {
      ASSERT (nn < n);
      ap[nn] = ch;
      bp[nn] = dh;
      nn++;
    }

  return nn;
}

/* Input: b odd, k < GMP_NUMB_BITS.  (The common case is a odd,
   k > 0.)

   Returns q such that -2^k <= q < 2^k and v (a + q b) > k, i.e.
   there are at least k+1 zeros at the least significant end. */
static inline mp_limb_signed_t
bdiv_1 (mp_limb_t a, mp_limb_t b, unsigned k)
{
  mp_limb_t b_inv;
  mp_limb_signed_t q;
  mp_limb_t bit;

#if 0
#define BDIV_TABLE_SIZE 3
#define BDIV_TABLE_MASK ((1 << (BDIV_TABLE_SIZE + 1)) - 1)

  /* Table of -a b^{-1} mod 2^{k+1}, indexed by
     [b_k b_{k-1} ... b_1 a_k a_{k-1} a_{k-2} ... a_0] */
  static const char qtable[1L << (BDIV_TABLE_SIZE * 2 + 1)] =
    {
      /* binv = 1 */
      0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
      /* binv = 11 */
      0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11,
      /* binv = 13 */
      0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13,
      /* binv = 7 */
      0, 9, 2, 11, 4, 13, 6, 15, 8, 1, 10, 3, 12, 5, 14, 7,
      /* binv = 9 */
      0, 7, 14, 5, 12, 3, 10, 1, 8, 15, 6, 13, 4, 11, 2, 9,
      /* binv = 3 */
      0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3,
      /* binv = 5 */
      0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5,
      /* binv = 15 */
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
    };

  ASSERT (ODD (b));
  ASSERT (k < GMP_NUMB_BITS);

  if (k <= BDIV_TABLE_SIZE)
    {
      bit = CNST_LIMB(1) << k;
      q = qtable[(((b-1) & BDIV_TABLE_MASK) << BDIV_TABLE_SIZE)
		 | (a & BDIV_TABLE_MASK)] & (2*bit - 1);
    }
  else
#endif
    {
      mp_limb_t mask;

      /* We need a (k+1)-bit inverse */
      b_inv = modlimb_invert_table[(b/2) & 0x7f];
      if (k >= 8)
	{
	  b_inv = 2 * b_inv - b_inv * b_inv * b;
	  if (k >= 16)
	    {
	      b_inv = 2 * b_inv - b_inv * b_inv * b;
	      if (GMP_NUMB_BITS > 32 && k >= 32)
		{
		  b_inv = 2 * b_inv - b_inv * b_inv * b;
		  if (GMP_NUMB_BITS > 64 && k >= 64)
		    {
		      int inv_bits = 64;
		      do
			{
			  b_inv = 2 * b_inv - b_inv * b_inv * b;
			  inv_bits *= 2;
			}
		      while (inv_bits <= k);
		    }
		}
	    }
	}

      bit = CNST_LIMB(1) << k;
      mask = 2*bit - 1;

      ASSERT ((b_inv * b & mask) == 1);

      q = (- a * b_inv) & mask;
    }
  if (q >= bit)
    q -= 2*bit;

  return q;
}
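/* Illustrative sketch (not part of the original source): bdiv_1
   above computes q = -a b^{-1} mod 2^(k+1), so that a + q b is
   divisible by 2^(k+1).  The #if 0 demo below reproduces that on
   uint64_t, seeding the inverse with the fact that b^2 = 1 (mod 8)
   for odd b and doubling its precision with the usual Newton step;
   all names are local to the example. */
#if 0
#include <assert.h>
#include <stdint.h>

int
main (void)
{
  uint64_t a = 0x123456789abcdef1u; /* odd */
  uint64_t b = 0x0fedcba987654321u; /* odd */
  unsigned k = 20;

  /* Inverse of b mod 2^64: x = b is correct mod 8, and each step
     x = x (2 - b x) doubles the number of correct low bits
     (3, 6, 12, 24, 48, 96 >= 64). */
  uint64_t x = b;
  int i;
  for (i = 0; i < 5; i++)
    x *= 2 - b * x;
  assert (b * x == 1);

  /* q = -a b^{-1} mod 2^(k+1), interpreted in [-2^k, 2^k) */
  uint64_t mask = ((uint64_t) 2 << k) - 1;
  int64_t q = (int64_t) ((- a * x) & mask);
  if (q >= (int64_t) ((uint64_t) 1 << k))
    q -= (int64_t) ((uint64_t) 2 << k);

  /* a + q b must have at least k+1 trailing zero bits. */
  assert (((a + (uint64_t) q * b) & mask) == 0);
  return 0;
}
#endif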
/* Input is two two-limb numbers a, b (typically the least
   significant limbs of some larger numbers).  The integer k is the
   desired number of bits of reduction.  Returns a matrix R of single
   limbs, and an integer j < k.  If c, d are formed as

     (c)    -2j   (a)
     (d) = 2    R (b)

   then c is odd, d is even, and gcd(a,b) = gcd(c,d).  R is
   constructed by repeated bdiv, where the iteration stops when
   either v (d) >= k - j or another iteration would cause R to
   overflow.  Returns 0 if no reduction is possible, otherwise j. */

/* two's complement version */
static unsigned
hbgcd2 (mp_limb_t ah, mp_limb_t al, mp_limb_t bh, mp_limb_t bl,
	unsigned k, struct bgcd_matrix1 *m)
{
  mp_limb_signed_t r00, r01, r10, r11;
  unsigned s;
  unsigned j;
  unsigned v;

  ASSERT (ODD (al));
  ASSERT (!ODD (bl));

  /* Multiply by (0, 2^bits; 2^bits, q) */
#define MUL_Q(R00, R01, R10, R11) do {					\
    R00 = (R00 << v) + q * R10;						\
    R01 = (R01 << v) + q * R11;						\
    R10 <<= v;								\
    R11 <<= v;								\
									\
    s += v;								\
    s += ((R00 ^ (R00 >> 1)) | (R01 ^ (R01 >> 1))) >> s;		\
    j += v;								\
  } while (0)

  /* Add q b to a.  We treat q as unsigned in the multiplication.  If
     q < 0 and th * w + tl = (q + w) bl is the result of the unsigned
     multiplication, then what we want is

       (q + w - w) (bh w + bl) = q bh w + th w + tl - w bl
			       = (q bh + th - bl) w + tl  (mod w^2)
  */
#define ADDMUL2_Q(AH, AL, BH, BL) do {					\
    mp_limb_t th, tl;							\
    mp_limb_t mask = q >> (GMP_LIMB_BITS - 1);				\
									\
    umul_ppmm (th, tl, BL, q);						\
									\
    AL += tl;								\
    AH += th + (AL < tl) + q * BH - (mask & BL);			\
  } while (0)

  /* NOTE: We can reduce at most GMP_NUMB_BITS - 1 bits.  Reducing
     GMP_NUMB_BITS bits would be possible if b were preshifted, so
     that we had 2 * GMP_NUMB_BITS + 1 bits of b.  But a reduction of
     GMP_NUMB_BITS bits, without R overflowing, is quite unlikely
     (probability on the order of 1%). */
  if (k > GMP_NUMB_BITS)
    k = GMP_NUMB_BITS;

  if (bl == 0)
    return 0;

  count_trailing_zeros (v, bl);
  if (v >= k)
    return 0;

  RSHIFT2 (bh, bl, v);

  if (v == GMP_NUMB_BITS - 1)
    {
      /* Special case that the two's complement code can't handle */
      mp_limb_signed_t q;
      mp_limb_signed_t sign;

      q = bdiv_1 (al, bl, v);
      ASSERT (ODD (q));

      m->R[0][0] = 0;
      m->R[0][1] = m->R[1][0] = CNST_LIMB(1) << (GMP_NUMB_BITS - 1);
      m->sign[0][0] = m->sign[0][1] = m->sign[1][0] = 0;

      sign = q >> (GMP_LIMB_BITS - 1);
      m->R[1][1] = (q + sign) ^ sign;
      m->sign[1][1] = sign;

      return (m->j = GMP_NUMB_BITS - 1);
    }

  r00 = r11 = 1;
  r01 = r10 = 0;

  j = 0;
  s = 1;

  /* We need 2*(k-j) bits of accuracy.  Loop while we need more than
     one limb of accuracy. */
  for (;;)
    {
      mp_limb_signed_t q;
      unsigned oldv;

      /* Divide by b */
      q = bdiv_1 (al, bl, v);
      ASSERT (ODD (q));

      MUL_Q (r00, r01, r10, r11);
      ASSERT (s < GMP_NUMB_BITS);

      ADDMUL2_Q (ah, al, bh, bl);
      ASSERT ((al & ((CNST_LIMB(1) << (v + 1)) - 1)) == 0);
      if (al == 0)
	goto split_swap;

      oldv = v;
      count_trailing_zeros (v, al);
      ASSERT (v > oldv);
      RSHIFT2 (ah, al, v);
      v -= oldv;

      if (j + v >= k || s + v >= GMP_NUMB_BITS)
	goto split_swap;
      else if (s + v == GMP_NUMB_BITS - 1)
	goto final_swap;
      else if (2*(k-j) <= GMP_NUMB_BITS)
	goto middle;

      /* Divide by a */
      q = bdiv_1 (bl, al, v);
      ASSERT (ODD (q));

      MUL_Q (r10, r11, r00, r01);
      ASSERT (s < GMP_NUMB_BITS);

      ADDMUL2_Q (bh, bl, ah, al);
      ASSERT ((bl & ((CNST_LIMB(1) << (v + 1)) - 1)) == 0);
      if (bl == 0)
	goto split;

      oldv = v;
      count_trailing_zeros (v, bl);
      RSHIFT2 (bh, bl, v);
      v -= oldv;

      if (j + v >= k || s + v >= GMP_NUMB_BITS)
	goto split;
      else if (s + v == GMP_NUMB_BITS - 1)
	goto final;
      else if (2*(k-j) <= GMP_NUMB_BITS)
	break;
    }
  /* Now we can forget about ah and bh.  On the other hand, we need
     to check for an overflowing R. */
  for (;;)
    {
      mp_limb_signed_t q;
      unsigned oldv;

      /* Divide by b */
      q = bdiv_1 (al, bl, v);
      ASSERT (ODD (q));

      MUL_Q (r00, r01, r10, r11);
      ASSERT (s < GMP_NUMB_BITS);

      /* Update a, b */
      al += q*bl;
      ASSERT ((al & ((CNST_LIMB(1) << (v + 1)) - 1)) == 0);
      if (al == 0)
	goto split_swap;

      oldv = v;
      count_trailing_zeros (v, al);
      al >>= v;
      v -= oldv;

      if (j + v >= k || s + v >= GMP_NUMB_BITS)
	goto split_swap;
      else if (s + v == GMP_NUMB_BITS - 1)
	goto final_swap;

    middle:
      /* Divide by a */
      q = bdiv_1 (bl, al, v);
      ASSERT (ODD (q));

      MUL_Q (r10, r11, r00, r01);
      ASSERT (s < GMP_NUMB_BITS);

      /* Update a, b */
      bl += q*al;
      ASSERT ((bl & ((CNST_LIMB(1) << (v + 1)) - 1)) == 0);
      if (bl == 0)
	goto split;

      oldv = v;
      count_trailing_zeros (v, bl);
      bl >>= v;
      v -= oldv;

      if (j + v >= k || s + v >= GMP_NUMB_BITS)
	goto split;
      else if (s + v == GMP_NUMB_BITS - 1)
	break;
    }
#undef MUL_Q
#undef ADDMUL2_Q

 final:
  if (0)
    {
    final_swap:
      MP_LIMB_SIGNED_T_SWAP (r00, r10);
      MP_LIMB_SIGNED_T_SWAP (r01, r11);
      MP_LIMB_T_SWAP (al, bl);
    }

  if (j + v < k && s + v < GMP_NUMB_BITS)
    {
      /* One more quotient will fit in sign-magnitude representation,
	 even though the two's complement calculation may
	 overflow. */
      mp_limb_signed_t q;
      mp_limb_signed_t t0, t1;
      mp_limb_signed_t sign;

      q = bdiv_1 (al, bl, v);
      ASSERT (ODD (q));

      t0 = r00 << v;
      t1 = q * r10; /* Can't overflow, since q > -2^{-v} */
      r00 = t0 + t1;

      /* The correct sign is given by the majority function */
      sign = ((t0 & t1) | (r00 & t0) | (r00 & t1)) >> (GMP_LIMB_BITS - 1);
      m->R[1][0] = (r00 + sign) ^ sign;
      m->sign[1][0] = sign;

      t0 = r01 << v;
      t1 = q * r11;
      r01 = t0 + t1;
      sign = ((t0 & t1) | (r01 & t0) | (r01 & t1)) >> (GMP_LIMB_BITS - 1);
      m->R[1][1] = (r01 + sign) ^ sign;
      m->sign[1][1] = sign;

      sign = r10 >> (GMP_LIMB_BITS - 1);
      m->R[0][0] = ((r10 << v) + sign) ^ sign;
      m->sign[0][0] = sign;

      sign = r11 >> (GMP_LIMB_BITS - 1);
      m->R[0][1] = ((r11 << v) + sign) ^ sign;
      m->sign[0][1] = sign;

      j += v;
    }
  else
    {
      mp_limb_signed_t sign;

      if (0)
	{
	split_swap:
	  MP_LIMB_SIGNED_T_SWAP (r00, r10);
	  MP_LIMB_SIGNED_T_SWAP (r01, r11);
	}
    split:
      /* Split into sign/magnitude representation */
      sign = r00 >> (GMP_LIMB_BITS - 1);
      m->R[0][0] = (r00 + sign) ^ sign;
      m->sign[0][0] = sign;

      sign = r01 >> (GMP_LIMB_BITS - 1);
      m->R[0][1] = (r01 + sign) ^ sign;
      m->sign[0][1] = sign;

      sign = r10 >> (GMP_LIMB_BITS - 1);
      m->R[1][0] = (r10 + sign) ^ sign;
      m->sign[1][0] = sign;

      sign = r11 >> (GMP_LIMB_BITS - 1);
      m->R[1][1] = (r11 + sign) ^ sign;
      m->sign[1][1] = sign;
    }
  ASSERT (j > 0);
  m->j = j;
  return j;
}
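/* Illustrative sketch (not part of the original source): the
   iteration used by invert_limbs below doubles the number of correct
   low bits per step.  If x b = -1 (mod 2^m) with error e = x b + 1,
   then (2 x + x^2 b) b + 1 = (x b + 1)^2 = e^2 = 0 (mod 2^(2m)).
   The #if 0 demo below checks this on single 64-bit words; all names
   are local to the example. */
#if 0
#include <assert.h>
#include <stdint.h>

int
main (void)
{
  uint64_t b = 0x123456789abcdef1u; /* odd */

  /* Seed: x = -b satisfies x b = -b^2 = -1 (mod 8), since b^2 = 1
     (mod 8) for odd b. */
  uint64_t x = - b;
  int i;

  /* 3 correct bits double each round: 3, 6, 12, 24, 48, 96 >= 64 */
  for (i = 0; i < 5; i++)
    x = 2 * x + x * x * b;

  /* x b = -1 (mod 2^64) */
  assert (x * b == (uint64_t) -1);
  return 0;
}
#endif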
/* Computes the inverse of -b,

     x * b = -1 (mod 2^(n * GMP_NUMB_BITS))

   using the iteration formula x_{k+1} = 2 x_k + x_k^2 b.  In fact,
   when x_k is IN limbs, the only thing we need is the middle part,
   limb IN to 2 IN, of the product x_k^2 b.  The lower limbs are
   known already, and the higher limbs are not significant.  Needs
   3*xn limbs of temporary storage. */

/* NOTE: Handles input b of fewer limbs than the output x.  We should
   usually have 2 zlimbs <= n. */
static void
invert_limbs (mp_ptr xp, mp_size_t xn,
	      mp_srcptr bp, mp_size_t bn, mp_ptr tp)
{
  mp_limb_t b0;
  mp_limb_t x0;

  ASSERT (bn >= 1);
  if (bn > xn)
    bn = xn;

  ASSERT (!MPN_OVERLAP_P (xp, xn, bp, xn));
  ASSERT (!MPN_OVERLAP_P (xp, xn, tp, 3*xn));
  ASSERT (!MPN_OVERLAP_P (bp, bn, tp, 3*xn));

  b0 = bp[0];
  modlimb_invert (x0, -b0);
  xp[0] = x0;

  if (xn >= 2)
    {
      /* Iterate once, x1 = 2 x0 + x0^2 b, and keep only the two
	 least significant limbs. */
      mp_limb_t sh, sl;
      mp_limb_t t, dummy;

      /* Compute sh, sl = x0^2 */
      umul_ppmm (sh, sl, x0, x0);

      /* Compute th, tl = x0^2 b (ignoring higher limbs) */
      umul_ppmm (t, dummy, sl, b0); /* x0^2 b0 */

      /* We always have a carry from the low limb which we don't
	 compute. */
      xp[1] = t + sh * b0 + 1;
      if (bn > 1)
	xp[1] += sl * bp[1];

      if (xn > 2)
	{
	  int count;
	  int bitsize;
	  mp_size_t mask;
	  mp_size_t in;

	  /* We need at most xn + 1 limbs to compute the square, but
	     at most xn limbs to store it. */
	  mp_ptr sp = tp;
	  /* And this product can be at most 2*xn limbs. */
	  mp_ptr pp = tp + xn;

	  count_leading_zeros (count, (mp_limb_t) xn-1);
	  bitsize = GMP_LIMB_BITS - count;
	  ASSERT (bitsize >= 2);
	  mask = 1L << (bitsize - 2);

	  for (in = 2; in < xn; mask /= 2)
	    {
	      mp_limb_t nn;
	      ASSERT (mask != 0);

	      /* We know the in least significant limbs of the
		 inverse, and iterate x = 2*x + x^2 * b to double the
		 size. */
	      mpn_sqr_n (sp, xp, in);

	      /* Drop the high limb if we don't need it. */
	      nn = 2 * in - ((-xn & mask) != 0);
	      ASSERT (nn <= xn);

	      /* FIXME: We don't need the nn most significant limbs.
		 And we don't need the least significant in limbs
		 either, as we already know them. */
	      mpn_mul (pp, sp, nn, bp, MIN (nn, bn));

	      /* For the addition of 2 x_k, why shouldn't we add in
		 the most significant limb of x_k? */
	      mpn_add_1 (xp + in, pp + in, nn - in, 1);

	      in = nn;
	    }
	}
    }
}
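/* Illustrative sketch (not part of the original source): the first
   half of bdivmod_n below picks q = a * (-b)^{-1} mod
   2^(zlimbs * GMP_NUMB_BITS) so that the low limbs of a + q b all
   vanish.  The #if 0 demo below plays this out with two 32-bit
   "limbs" packed into a uint64_t; all names are local to the
   example. */
#if 0
#include <assert.h>
#include <stdint.h>

int
main (void)
{
  uint64_t a = 0x0123456789abcdefu; /* odd */
  uint64_t b = 0x00000000fedcba99u; /* odd, one "limb" shorter */

  /* Inverse of -b mod 2^32 (one limb), by Newton iteration:
     x = -b is correct mod 8, then 3, 6, 12, 24, 48 >= 32 bits. */
  uint32_t x = - (uint32_t) b;
  int i;
  for (i = 0; i < 4; i++)
    x = 2 * x + x * x * (uint32_t) b;
  assert ((uint32_t) (x * b) == (uint32_t) -1);

  /* q = x * a mod 2^32 clears the low limb of a + q b. */
  uint32_t q = x * (uint32_t) a;
  uint64_t r = a + (uint64_t) q * b;
  assert ((uint32_t) r == 0);

  /* The remaining limbs (r >> 32, plus the carry out of the 64-bit
     sum) are the remainder that bdivmod_n goes on to reduce. */
  return 0;
}
#endif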
/* For a, b odd, a of size n, b of size n - zlimbs, and
   j = zlimbs * GMP_NUMB_BITS + zbits, computes

     q = - a b^-1 mod 2^(j+1), with -2^j < q < 2^j,

   and sets r <-- 2^(-j) (a + q b).

   Stores the n - zlimbs least significant limbs of r at ap, and
   returns the most significant limb (which is always 0 or 1).
   Stores the quotient, zlimbs + 1 limbs, at qp (if zbits == 0 and
   zlimbs > 0, then the most significant limb of q is always zero).

   Needs 4 * zlimbs limbs of temporary storage to compute the
   quotient, and n limbs to compute the remainder (but not at the
   same time). */
#define BDIVMOD_N_ITCH(n, zlimbs) (MAX (4*(zlimbs), (n)))

static mp_limb_t
bdivmod_n (mp_ptr qp, int *qsign, int *rsign,
	   mp_ptr ap, mp_srcptr bp,
	   mp_size_t n, mp_size_t zlimbs, unsigned zbits, mp_ptr tp)
{
  mp_limb_signed_t q;
  mp_limb_t hi;
  mp_size_t scratch = BDIVMOD_N_ITCH (n, zlimbs);

  ASSERT (ODD (ap[0]));
  ASSERT (ODD (bp[0]));
  ASSERT (!MPN_OVERLAP_P (qp, zlimbs + 1, ap, n));
  ASSERT (!MPN_OVERLAP_P (qp, zlimbs + 1, bp, n - zlimbs));
  ASSERT (!MPN_OVERLAP_P (ap, n, bp, n - zlimbs));
  ASSERT (!MPN_OVERLAP_P (tp, scratch, ap, n));
  ASSERT (!MPN_OVERLAP_P (tp, scratch, bp, n - zlimbs));

  if (zlimbs == 0)
    hi = 0;
  else
    {
      /* First compute q such that a + q*b has zlimbs zero limbs at
	 the least significant end. */
      invert_limbs (tp, zlimbs, bp, n - zlimbs, tp + zlimbs);

      /* Let q = t * a.  We only want the least significant limbs. */
      mpn_mul_n (tp + zlimbs, tp, ap, zlimbs);
      MPN_COPY (qp, tp + zlimbs, zlimbs);

      /* Compute q*b.  We don't really need the zlimbs least
	 significant limbs. */
      if (zlimbs <= n - zlimbs)
	mpn_mul (tp, bp, n - zlimbs, qp, zlimbs);
      else
	mpn_mul (tp, qp, zlimbs, bp, n - zlimbs);

      /* We must get a carry when adding the lower limbs (since a is
	 odd, [ap, zlimbs] != 0).  This is actually the only piece of
	 this code that depends on a being odd. */
      MPN_COPY_INCR (ap, ap + zlimbs, n - zlimbs);
      n -= zlimbs;

      hi = mpn_add_n (ap, ap, tp + zlimbs, n);
      hi += mpn_add_1 (ap, ap, n, 1);
    }

  /* Now we have zbits + 1 bits left to clear in ap[0] */
  q = bdiv_1 (ap[0], bp[0], zbits);
  if (q >= 0)
    {
      *qsign = 0;
      *rsign = 0;

      /* Can't overflow, since q < 2^(GMP_NUMB_BITS-1) and hi <= 1 */
      hi += mpn_addmul_1 (ap, bp, n, q);
    }
  else
    {
      mp_limb_t cy;

      *qsign = -1;
      q = -q;
      cy = mpn_submul_1 (ap, bp, n, q);
      if (cy <= hi)
	{
	  *rsign = 0;
	  hi -= cy;
	}
      else
	{
	  *rsign = -1;
	  ASSERT_NOCARRY (negate_diff (ap, n));
	  hi = cy - hi - 1;
	}
      if (zlimbs > 0)
	{
	  ASSERT_NOCARRY (negate_diff (qp, zlimbs));
	  q--;
	}
    }
  qp[zlimbs] = q;

  if (zbits > 0)
    {
      mpn_rshift (ap, ap, n, zbits);
      ap[n-1] |= (hi << (GMP_NUMB_BITS - zbits)) & GMP_NUMB_MASK;
      hi >>= zbits;
    }

  return hi;
}

/* Takes as input two n-limb numbers (typically the least significant
   limbs of some larger numbers), a odd, b even, and an integer k.
   Returns a matrix R, an integer j, and two numbers a', b' such that

     (a')    -2j   (a)
     (b') = 2    R (b)

   where a' is odd, v (2^j a') < k, and v (2^j b') >= k. */

/* Needs temporary space for calling bgcd_matrix1_apply,
   bgcd_matrix_mul_1, bdivmod_n and bgcd_matrix_mul_q, and for
   storing q.  The most demanding is bdivmod, for which we need
   5*n/2 limbs. */
#define HBGCD_N_BASE_ITCH(n) (5*(n)/2)

static mp_size_t
hbgcd_n_base (mp_ptr ap, mp_ptr bp, mp_size_t n, unsigned k,
	      struct bgcd_matrix *m, mp_ptr tp)
{
  ASSERT (2 * k <= n * GMP_NUMB_BITS);
  ASSERT (MPN_BGCD_MATRIX_ITCH (n) <= m->alloc);

  /* Should be initialized already */
  ASSERT (m->j == 0);

  ASSERT (n >= 2);
  ASSERT (ODD (ap[0]));
  ASSERT (!ODD (bp[0]));

  for (;;)
    {
      mp_size_t zlimbs;
      unsigned zbits;

      ASSERT (ODD (ap[0]));
      ASSERT (!ODD (bp[0]));

      zlimbs = power_of_2 (&zbits, bp, n);
      if (zlimbs * GMP_NUMB_BITS + zbits + m->j >= k)
	break;

      if (zlimbs == 0)
	{
	  /* FIXME: Some duplication with the next case. */
	  struct bgcd_matrix1 m1;

	  /* Here, we may have n == 1, and then the values ap[1] and
	     bp[1] will not be used. */
	  hbgcd2 (ap[1], ap[0], bp[1], bp[0], k - m->j, &m1);
	  ASSERT (m1.j > 0);

	  /* Modifies the signs of the matrix elements. */
	  n = bgcd_matrix1_apply (&m1, ap, bp, n, tp);
	  bgcd_matrix_mul_1 (m, &m1, tp);
	}
      else
	{
	  mp_size_t i;
	  mp_limb_t cy;
	  int qsign;
	  int rsign;

	  ASSERT (zlimbs <= (n-1)/2);

	  if (zbits)
	    ASSERT_NOCARRY (mpn_rshift (bp, bp + zlimbs, n - zlimbs,
					zbits));
	  else
	    MPN_COPY_INCR (bp, bp + zlimbs, n - zlimbs);

	  /* Needs zlimbs + 1 limbs to store the quotient, and
	     MAX (n, 4 zlimbs) for the computations.  Since
	     zlimbs <= (n-1)/2, the total need is at most

	       (n + 1)/2 + MAX (n, 2*(n-1)) = n/2 + 1/2 + 2*n - 2
					    <= 5*n/2
	  */
	  cy = bdivmod_n (tp, &qsign, &rsign, ap, bp, n, zlimbs,
			  zbits, tp + zlimbs + 1);

	  /* Needs zlimbs + 1 + m->n <= 2*n limbs of temporary
	     storage. */
	  bgcd_matrix_mul_q (m, zlimbs, zbits, tp, qsign,
			     tp + zlimbs + 1);
	  if (rsign < 0)
	    {
	      m->sign[1][0] = ~m->sign[1][0];
	      m->sign[1][1] = ~m->sign[1][1];
	    }

	  n -= zlimbs;
	  if (cy > 0)
	    {
	      ap[n] = cy;
	      bp[n] = 0;
	      n++;
	    }

	  /* Swapping pointers would confuse our caller, so we have
	     to swap the contents. */
	  for (i = 0; i < n; i++)
	    MP_LIMB_T_SWAP (ap[i], bp[i]);
	}
    }
  return n;
}

/* The scratch requirement for the recursive code below works out to
   11*n/2 + 3, for n >= 8.  Actually, approximately 5*n/4 limbs are
   unused by the recursive call, so perhaps it is possible to
   rearrange the storage so that we get by with only 17*n/4??? */
mp_size_t
mpn_hbgcd_n_itch (mp_size_t n)
{
  if (BELOW_THRESHOLD (n, HBGCD_THRESHOLD))
    return HBGCD_N_BASE_ITCH (n);
  else
    return 11*n/2 + 3;
}

mp_size_t
mpn_hbgcd_n (mp_ptr ap, mp_ptr bp, mp_size_t n, unsigned k,
	     struct bgcd_matrix *m, mp_ptr tp)
{
  /* Original size */
  mp_size_t l = n;

  if (BELOW_THRESHOLD (n, HBGCD_THRESHOLD))
    return hbgcd_n_base (ap, bp, n, k, m, tp);

  /* Should be initialized already */
  ASSERT (m->j == 0);

  ASSERT (n >= 2);
  ASSERT (ODD (ap[0]));
  ASSERT (!ODD (bp[0]));

  /* FIXME: Should we allow smaller n? */
  ASSERT (2 * k <= n * GMP_NUMB_BITS);
  ASSERT (MPN_BGCD_MATRIX_ITCH (n) <= m->alloc);

  {
    /* First recursive call */
    mp_size_t zlimbs;
    unsigned zbits;
    unsigned k1;
    mp_size_t n1;
    mp_size_t nn;

    zlimbs = power_of_2 (&zbits, bp, n);
    if (zlimbs * GMP_NUMB_BITS + zbits >= k)
      goto done;

    /* Split on a limb boundary */
    n1 = (n+1)/2;
    k1 = n1 * (GMP_NUMB_BITS / 2);
    ASSERT (k1 >= (k+1)/2);

    if (zlimbs * GMP_NUMB_BITS + zbits < k1)
      {
	ASSERT (n1 < n);
	nn = mpn_hbgcd_n (ap, bp, n1, k1, m, tp);
	ASSERT (m->j > 0);

	/* Needs space 4 * l */
	n = bgcd_matrix_apply (m, ap, bp, n, n1, nn, tp);
      }
    ASSERT (m->j < k1);
  }

  ASSERT (m->n <= l);

  {
    /* Division.  This stage needs temporary storage of 5 * l / 2
       limbs (same analysis as for the division case in
       hbgcd_n_base). */
    mp_size_t zlimbs;
    unsigned zbits;
    mp_size_t i;
    mp_limb_t cy;
    int qsign;
    int rsign;

    zlimbs = power_of_2 (&zbits, bp, n);
    if (zlimbs * GMP_NUMB_BITS + zbits + m->j >= k)
      goto done;

    ASSERT (zlimbs <= (n-1)/2);

    if (zbits)
      ASSERT_NOCARRY (mpn_rshift (bp, bp + zlimbs, n - zlimbs, zbits));
    else
      MPN_COPY_INCR (bp, bp + zlimbs, n - zlimbs);

    cy = bdivmod_n (tp, &qsign, &rsign, ap, bp, n, zlimbs, zbits,
		    tp + zlimbs + 1);
    bgcd_matrix_mul_q (m, zlimbs, zbits, tp, qsign, tp + zlimbs + 1);
    if (rsign < 0)
      {
	m->sign[1][0] = ~m->sign[1][0];
	m->sign[1][1] = ~m->sign[1][1];
      }

    n -= zlimbs;
    if (cy > 0)
      {
	ap[n] = cy;
	bp[n] = 0;
	n++;
      }

    /* Swapping pointers would confuse our caller, so we have to swap
       the contents. */
    for (i = 0; i < n; i++)
      MP_LIMB_T_SWAP (ap[i], bp[i]);
  }

  ASSERT (m->j >= (k+1)/2);
  ASSERT (n <= 2 + (27 * l + 5) / 32);
  ASSERT (n < l);

  {
    /* Second and final recursive call */
    mp_size_t zlimbs;
    unsigned zbits;
    unsigned k1;
    mp_size_t n1;
    mp_size_t nn;

    zlimbs = power_of_2 (&zbits, bp, n);
    if (zlimbs * GMP_NUMB_BITS + zbits + m->j >= k)
      goto done;

    k1 = k - m->j;
    ASSERT (k1 <= (k+1)/2);

    if (k1 <= GMP_NUMB_BITS)
      {
	struct bgcd_matrix1 m1;

	hbgcd2 (ap[1], ap[0], bp[1], bp[0], k1, &m1);
	ASSERT (m1.j > 0);

	n = bgcd_matrix1_apply (&m1, ap, bp, n, tp);
	bgcd_matrix_mul_1 (m, &m1, tp);

	/* May need another one */
	k1 = k - m->j;
	if (hbgcd2 (ap[1], ap[0], bp[1], bp[0], k1, &m1) > 0)
	  {
	    n = bgcd_matrix1_apply (&m1, ap, bp, n, tp);
	    bgcd_matrix_mul_1 (m, &m1, tp);
	  }
      }
    else
      {
	struct bgcd_matrix m1;

	/* Need at least 2 * k1 bits */
	n1 = (2 * k1 + GMP_NUMB_BITS - 1) / GMP_NUMB_BITS;
	ASSERT (n1 < n);
	ASSERT (n1 <= l/2);

	/* Each matrix element is of size at most

	     MPN_BGCD_MATRIX_ITCH (n1) <= (11*l/2 + 31) / 16
		= 11*l/32 + 31/16
		= 3*l/8 + 31/16 - l/32
	       <= 3*l/8 + 27/16

	   Hence, 3*l/2 + 7 limbs should be enough for the entire
	   matrix. */
	mpn_bgcd_matrix_init (&m1, tp, MPN_BGCD_MATRIX_ITCH (n1));
	ASSERT (4 * MPN_BGCD_MATRIX_ITCH (n1) <= 3*l/2 + 7);
	tp += 3*l/2 + 7;

	nn = mpn_hbgcd_n (ap, bp, n1, k1, &m1, tp);
	ASSERT (m->j > 0);

	/* Needs space 4*n <= 4*(l - 1) */
	n = bgcd_matrix_apply (&m1, ap, bp, n, n1, nn, tp);

	/* Needs space

	     3 * (MPN_BGCD_MATRIX_ITCH (l) - 1)
	       + 2 * (MPN_BGCD_MATRIX_ITCH (n1) - 1)
	     <= 3 * (11 * l + 15) / 16 + 2 * (11 * l/2 + 15) / 16
	      = 44 * l / 16 + (45 + 30)/16
	      < 11 * l / 4 + 5 < 3*l + 5

	   ==> need <= 3*l + 4 <= 4 * (l - 1),

	   where l denotes the original input size. */
	bgcd_matrix_mul (m, &m1, tp);
      }
  }

 done:
  ASSERT (m->j < k);
  return n;
}

/* For a hbgcd step, we need

     4 * MPN_BGCD_MATRIX_ITCH (n/2) + MAX (hbgcd_n_itch (n/2), 4*n)
       <= 3*n/2 + 7 + MAX (11*n/4 + 3, 4*n)
	= (3/2 + 4) * n + 7 = 11*n/2 + 7

   For a division step, we need

     zlimbs + 1 + MAX (4 * zlimbs, n) <= n + 4*(n-1) < 5*n

   For both kinds of steps, 11*n/2 + 7 is sufficient. */
#define BGCD_N_ITCH(n) (11*(n)/2 + 7)

static mp_size_t
bgcd_n (mp_ptr gp, mp_ptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp)
{
  ASSERT (ODD (ap[0]));

  if (ODD (bp[0]))
    {
      int cmp = mpn_cmp (ap, bp, n);
      if (cmp == 0)
	{
	  MPN_NORMALIZE (ap, n);
	  MPN_COPY (gp, ap, n);
	  return n;
	}
      else if (cmp > 0)
	MP_PTR_SWAP (ap, bp);

      ASSERT_NOCARRY (mpn_sub_n (bp, bp, ap, n));
    }

  while (ABOVE_THRESHOLD (n, BGCD_THRESHOLD))
    {
      struct bgcd_matrix m;
      mp_size_t zlimbs;
      unsigned zbits;
      mp_size_t n1;
      mp_size_t l;
      mp_size_t alloc;

      zlimbs = power_of_2 (&zbits, bp, n);
      if (zlimbs == n)
	{
	  MPN_NORMALIZE (ap, n);
	  MPN_COPY (gp, ap, n);
	  return n;
	}

      n1 = n/2;
      alloc = MPN_BGCD_MATRIX_ITCH (n1);
      ASSERT (4*alloc <= 3*n/2 + 7);

      mpn_bgcd_matrix_init (&m, tp + 4 * n, alloc);
      l = mpn_hbgcd_n (ap, bp, n1, n1 * GMP_NUMB_BITS / 2, &m, tp);

      if (m.j > 0)
	n = bgcd_matrix_apply (&m, ap, bp, n, n1, l, tp);
      else
	{
	  mp_limb_t cy;
	  int qsign;
	  int rsign;

	  if (zbits)
	    ASSERT_NOCARRY (mpn_rshift (bp, bp + zlimbs, n - zlimbs,
					zbits));
	  else
	    MPN_COPY_INCR (bp, bp + zlimbs, n - zlimbs);

	  cy = bdivmod_n (tp, &qsign, &rsign, ap, bp, n, zlimbs,
			  zbits, tp + zlimbs + 1);
	  n -= zlimbs;
	  if (cy > 0)
	    {
	      ap[n] = cy;
	      bp[n] = 0;
	      n++;
	    }
	  MP_PTR_SWAP (ap, bp);
	}
    }

  /* Use the old algorithm */
  {
    mp_size_t an;
    mp_size_t bn;

    an = bn = n;
    MPN_NORMALIZE (ap, an);
    MPN_NORMALIZE (bp, bn);

    if (bn == 0)
      {
	MPN_COPY (gp, ap, an);
	return an;
      }

    /* The *smaller* number must be odd */
    if (an > bn || (an == bn && ap[an-1] > bp[an-1]))
      {
	int bits;

	while (bp[0] == 0)
	  {
	    bp++;
	    bn--;
	  }
	count_trailing_zeros (bits, bp[0]);
	if (bits > 0)
	  {
	    ASSERT_NOCARRY (mpn_rshift (bp, bp, bn, bits));
	    bn -= (bp[bn-1] == 0);
	  }
	MPN_PTR_SWAP (ap, an, bp, bn);
      }
    return mpn_gcd (gp, bp, bn, ap, an);
  }
}

/* Needs BDIVMOD_N_ITCH (an, an - bn) + an - bn + 1 limbs for the
   initial division, and BGCD_N_ITCH (bn) for the bgcd_n call, i.e.
   the max of

     an - bn + 1 + MAX (4 * (an - bn), an) <= 5*an
     11 * bn/2 + 7
*/
#define BGCD_ITCH(an, bn) MAX (5*(an), BGCD_N_ITCH (bn))

static mp_size_t
bgcd (mp_ptr gp, mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t bn,
      mp_ptr tp)
{
  /* mpn_gcd uses different conventions from the rest of this
     code. */
  ASSERT (ODD (bp[0]));
  ASSERT (an >= bn);

  if (an > bn)
    {
      mp_size_t zlimbs;
      unsigned zbits;

      zlimbs = power_of_2 (&zbits, ap, an);
      ASSERT (zlimbs < an);
      if (zbits > 0)
	ASSERT_NOCARRY (mpn_rshift (ap, ap + zlimbs, an - zlimbs, zbits));
      else if (zlimbs > 0)
	MPN_COPY_INCR (ap, ap + zlimbs, an - zlimbs);

      an = an - zlimbs;
      an -= (ap[an-1] == 0);

      if (an < bn)
	MPN_ZERO (ap + an, bn - an);
      else if (an > bn)
	{
	  mp_size_t qn = an - bn + 1;
	  mp_limb_t cy;
	  int qsign;
	  int rsign;

	  cy = bdivmod_n (tp, &qsign, &rsign, ap, bp, an, an - bn, 0,
			  tp + qn);
	  an = bn;
	  ASSERT (!ODD (ap[0]));
	  if (cy > 0)
	    {
	      ASSERT (cy == 1);
	      mpn_rshift1 (ap, ap, an);
	      ap[an-1] |= GMP_NUMB_HIGHBIT;
	    }
	}
    }

  /* Now both numbers are of size bn, and b is still odd */
  return bgcd_n (gp, bp, ap, bn, tp);
}

mp_size_t
mpn_bgcd (mp_ptr gp, mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t bn)
{
  mp_ptr tp;
  mp_size_t gn;
  TMP_DECL;

  TMP_MARK;
  tp = TMP_ALLOC_LIMBS (BGCD_ITCH (an, bn));

  gn = bgcd (gp, ap, an, bp, bn, tp);
  TMP_FREE;

  return gn;
}
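/* Illustrative sketch (not part of the original source): a minimal
   caller of mpn_bgcd, assuming an MPIR build where this file and its
   internal headers are available.  Per the conventions of bgcd
   above, b must be odd and an >= bn.  Note that mpn_bgcd clobbers
   both inputs. */
#if 0
#include <stdio.h>
#include "mpir.h"

int
main (void)
{
  /* gcd (420, 15) = 15; b = 15 is odd, and an >= bn. */
  mp_limb_t a[1] = { 420 };
  mp_limb_t b[1] = { 15 };
  mp_limb_t g[1];
  mp_size_t gn;

  gn = mpn_bgcd (g, a, 1, b, 1);
  printf ("gcd has %ld limb(s), low limb %lu\n",
	  (long) gn, (unsigned long) g[0]);
  return 0;
}
#endif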