/* hgcd2.c

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 1996, 1998, 2000, 2001, 2002, 2003, 2004 Free Software Foundation,
Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
MA 02111-1307, USA. */

#include "mpir.h"
#include "gmp-impl.h"
#include "longlong.h"

#if GMP_NAIL_BITS == 0

/* Copied from mpn/generic/gcdext.c, and modified slightly to return
   the remainder. */
/* Two-limb division optimized for small quotients.

   Divides the two-limb number (nh, nl) by the two-limb number (dh, dl)
   using shift-and-subtract.  Returns the quotient; the remainder is
   stored with its low limb in rp[0] and its high limb in rp[1].  */
static inline mp_limb_t
div2 (mp_ptr rp,
      mp_limb_t nh, mp_limb_t nl,
      mp_limb_t dh, mp_limb_t dl)
{
  mp_limb_t q = 0;

  if ((mp_limb_signed_t) nh < 0)
    {
      /* The top bit of the numerator is set.  Shift the divisor left
         until its top bit is set as well; comparing against the
         numerator here could overflow the divisor.  cnt starts at 1
         because the unshifted divisor position also yields a quotient
         bit.  */
      int cnt;
      for (cnt = 1; (mp_limb_signed_t) dh >= 0; cnt++)
        {
          dh = (dh << 1) | (dl >> (GMP_LIMB_BITS - 1));
          dl = dl << 1;
        }

      while (cnt)
        {
          q <<= 1;
          if (nh > dh || (nh == dh && nl >= dl))
            {
              sub_ddmmss (nh, nl, nh, nl, dh, dl);
              q |= 1;
            }
          /* Move the divisor one bit position back to the right.  */
          dl = (dh << (GMP_LIMB_BITS - 1)) | (dl >> 1);
          dh = dh >> 1;
          cnt--;
        }
    }
  else
    {
      /* The top bit of the numerator is clear.  Shift the divisor left
         until it exceeds the numerator, counting the shifts.  */
      int cnt;
      for (cnt = 0; nh > dh || (nh == dh && nl >= dl); cnt++)
        {
          dh = (dh << 1) | (dl >> (GMP_LIMB_BITS - 1));
          dl = dl << 1;
        }

      while (cnt)
        {
          /* Shift back first: the divisor currently exceeds the
             numerator, so that position cannot yield a quotient bit.  */
          dl = (dh << (GMP_LIMB_BITS - 1)) | (dl >> 1);
          dh = dh >> 1;
          q <<= 1;
          if (nh > dh || (nh == dh && nl >= dl))
            {
              sub_ddmmss (nh, nl, nh, nl, dh, dl);
              q |= 1;
            }
          cnt--;
        }
    }

  rp[0] = nl;
  rp[1] = nh;

  return q;
}

#else /* GMP_NAIL_BITS != 0 */

/* Two-limb division optimized for small quotients.  Input words
   include nails, which must be zero.

   Returns the quotient of (nh, nl) divided by (dh, dl); the remainder
   is stored with its low limb in rp[0] and its high limb in rp[1].  */
static inline mp_limb_t
div2 (mp_ptr rp,
      mp_limb_t nh, mp_limb_t nl,
      mp_limb_t dh, mp_limb_t dl)
{
  mp_limb_t q = 0;
  int cnt;

  ASSERT_LIMB (nh);
  ASSERT_LIMB (nl);
  ASSERT_LIMB (dh);
  ASSERT_LIMB (dl);

  /* FIXME: Always called with nh > 0 and dh >0. Then it should be
     enough to look at the high limbs to select cnt. */
  for (cnt = 0; nh > dh || (nh == dh && nl >= dl); cnt++)
    {
      dh = (dh << 1) | (dl >> (GMP_NUMB_BITS - 1));
      dl = (dl << 1) & GMP_NUMB_MASK;
    }

  while (cnt)
    {
      dl = (dh << (GMP_NUMB_BITS - 1)) | (dl >> 1);
      dh = dh >> 1;
      dl &= GMP_NUMB_MASK;

      q <<= 1;
      if (nh > dh || (nh == dh && nl >= dl))
        {
          /* FIXME: We could perhaps optimize this by unrolling the
             loop 2^GMP_NUMB_BITS - 1 times? */
          nl -= dl;
          nh -= dh;
          /* A borrow out of the low limb shows up in the top bit of
             nl; propagate it to the high limb, then clear the nail.  */
          nh -= (nl >> (GMP_LIMB_BITS - 1));
          nl &= GMP_NUMB_MASK;

          q |= 1;
        }
      cnt--;
    }
  /* dh, dl are back at their original values here, so this checks
     remainder < divisor.  */
  ASSERT (nh < dh || (nh == dh && nl < dl));
  rp[0] = nl;
  rp[1] = nh;

  return q;
}

#endif /* GMP_NAIL_BITS != 0 */

/* Two-limb subtraction (w1, w0) = (x1, x0) - (y1, y0).  Without nails
   this is sub_ddmmss; with nails the borrow is taken from SUBC_LIMB
   and the high limb is masked to GMP_NUMB_MASK.  */
#define SUB_2(w1,w0, x1,x0, y1,y0)                      \
  do {                                                  \
    ASSERT_LIMB (x1);                                   \
    ASSERT_LIMB (x0);                                   \
    ASSERT_LIMB (y1);                                   \
    ASSERT_LIMB (y0);                                   \
                                                        \
    if (GMP_NAIL_BITS == 0)                             \
      sub_ddmmss (w1,w0, x1,x0, y1,y0);                 \
    else                                                \
      {                                                 \
        mp_limb_t __w0, __c;                            \
        SUBC_LIMB (__c, __w0, x0, y0);                  \
        (w1) = ((x1) - (y1) - __c) & GMP_NUMB_MASK;     \
        (w0) = __w0;                                    \
      }                                                 \
  } while (0)

/* Produce r_k from r_i and r_j, and the corresponding quotient.

   i, j and k are the digit suffixes of the scalar row variables
   (rh<n>, rl<n>, u<n>, v<n>) that must be in scope at the point of
   use.  q must be an lvalue of limb type; it receives the quotient
   and is read back when updating u_k and v_k.  */
#if __GMP_HAVE_TOKEN_PASTE
#define HGCD2_STEP(i, j, k, q) do {                     \
  SUB_2 (rh ## k, rl ## k,                              \
         rh ## i, rl ## i,                              \
         rh ## j, rl ## j);                             \
                                                        \
    /* Could check here for the special case rh3 == 0,  \
       but it's covered by the below condition as well */ \
    if (      rh ## k <  rh ## j                        \
        || (  rh ## k == rh ## j                        \
           && rl ## k <  rl ## j))                      \
      {                                                 \
        /* Unit quotient */                             \
        u ## k = u ## i + u ## j;                       \
        v ## k = v ## i + v ## j;                       \
                                                        \
        (q) = 1;                                        \
      }                                                 \
    else                                                \
      {                                                 \
        mp_limb_t r[2];                                 \
        (q) = 1 + div2 (r, rh ## k, rl ## k,            \
                        rh ## j, rl ## j);              \
        rl ## k = r[0]; rh ## k = r[1];                 \
        u ## k = u ## i + (q) * u ## j;                 \
        v ## k = v ## i + (q) * v ## j;                 \
      }                                                 \
} while (0)
#else /* ! __GMP_HAVE_TOKEN_PASTE */
#define HGCD2_STEP(i, j, k, q) do {                     \
  SUB_2 (rh/**/k, rl/**/k,                              \
         rh/**/i, rl/**/i,                              \
         rh/**/j, rl/**/j);                             \
                                                        \
    /* Could check here for the special case rh3 == 0,  \
       but it's covered by the below condition as well */ \
    if (      rh/**/k <  rh/**/j                        \
        || (  rh/**/k == rh/**/j                        \
           && rl/**/k <  rl/**/j))                      \
      {                                                 \
        /* Unit quotient */                             \
        u/**/k = u/**/i + u/**/j;                       \
        v/**/k = v/**/i + v/**/j;                       \
                                                        \
        (q) = 1;                                        \
      }                                                 \
    else                                                \
      {                                                 \
        mp_limb_t r[2];                                 \
        (q) = 1 + div2 (r, rh/**/k, rl/**/k,            \
                        rh/**/j, rl/**/j);              \
        rl/**/k = r[0]; rh/**/k = r[1];                 \
        u/**/k = u/**/i + (q) * u/**/j;                 \
        v/**/k = v/**/i + (q) * v/**/j;                 \
      }                                                 \
} while (0)
#endif /* ! __GMP_HAVE_TOKEN_PASTE */

/* Repeatedly divides A by B, until the remainder is a single limb.
   Stores cofactors and quotients in HGCD.

   On success, HGCD->row[0, 1, 2] correspond to remainders that are
   larger than one limb, while HGCD->row[3] corresponds to a remainder
   that fits in a single limb.

   Return 0 on failure (if B or A mod B fits in a single limb).  Return
   1 if r0 and r1 are correct, but we still make no progress because
   r0 = A, r1 = B.

   Otherwise return 2, 3 or 4 depending on how many of the r:s satisfy
   Jebelean's criterion. */
/* FIXME: A further micro optimization that could be done to this
   code:

   The div2 function starts with checking the most significant bit of
   the numerator.  When we call div2, that bit is known in advance for
   all but the one or two first calls, so we could split div2 in two
   functions, and call the right one. */

int
mpn_hgcd2 (struct hgcd2 *hgcd,
           mp_limb_t ah, mp_limb_t al,
           mp_limb_t bh, mp_limb_t bl)
{
  /* For all divisions, we special case q = 1, which accounts for
     approximately 41% of the quotients for random numbers (Knuth,
     TAOCP 4.5.3) */

  /* Use scalar variables */
  mp_limb_t rh1, rl1, u1, v1;
  mp_limb_t rh2, rl2, u2, v2;
  mp_limb_t rh3, rl3, u3, v3;

  mp_limb_t q0, q1;

  ASSERT_LIMB (ah);
  ASSERT_LIMB (al);
  ASSERT_LIMB (bh);
  ASSERT_LIMB (bl);
  ASSERT (ah > bh || (ah == bh && al >= bl));

  if (bh == 0)
    return 0;

  {
    mp_limb_t rh0, rl0, u0, v0;

    /* Initialize first two rows */
    rh0 = ah; rl0 = al; u0 = 1; v0 = 0;
    rh1 = bh; rl1 = bl; u1 = 0; v1 = 1;

    SUB_2 (rh2, rl2, rh0, rl0, rh1, rl1);

    if (rh2 == 0)
      return 0;

    if (rh2 < rh1 || (rh2 == rh1 && rl2 < rl1))
      {
        /* Unit quotient */
        q0 = 1;
      }
    else
      {
        mp_limb_t r[2];
        q0 = 1 + div2 (r, rh2, rl2, rh1, rl1);
        rl2 = r[0]; rh2 = r[1];

        if (rh2 == 0)
          return 0;
      }

    /* Row 2 is row 0 + q0 * row 1 = (1, 0) + q0 * (0, 1).  */
    u2 = 1; v2 = q0;

    /* The simple version of the loop is as follows:
       |
       |   hgcd->sign = 0;
       |   for (;;)
       |     {
       |       (q, rh3, rl3) = divmod (r1, r2);
       |       u[3] = u1 + q * u2;
       |       v[3] = v1 + q * v2;
       |
       |       if (rh3 == 0)
       |         break;
       |
       |       HGCD2_SHIFT4_LEFT (hgcd->row);
       |       hgcd->sign = ~hgcd->sign;
       |     }
       |
       | But then we special case for q = 1, and unroll the loop four times
       | to avoid data movement. */

    /* Each exit below stores the oldest live row in hgcd->row[0] and
       rotates the scalars so that rows 1, 2, 3 end up in the variables
       with suffix 1, 2, 3, the single-limb remainder being row 3.  */
    for (;;)
      {
        HGCD2_STEP (1, 2, 3, q1);
        if (rh3 == 0)
          {
            hgcd->row[0].u = u0; hgcd->row[0].v = v0;

            hgcd->sign = 0;

            break;
          }

        HGCD2_STEP (2, 3, 0, q0);
        if (rh0 == 0)
          {
            hgcd->row[0].u = u1; hgcd->row[0].v = v1;

            rh1 = rh2; rl1 = rl2; u1 = u2; v1 = v2;
            rh2 = rh3; rl2 = rl3; u2 = u3; v2 = v3;
            rh3 = rh0; rl3 = rl0; u3 = u0; v3 = v0;

            hgcd->sign = -1;
            break;
          }

        HGCD2_STEP (3, 0, 1, q1);
        if (rh1 == 0)
          {
            hgcd->row[0].u = u2; hgcd->row[0].v = v2;
            rh2 = rh0; rl2 = rl0; u2 = u0; v2 = v0;

            MP_LIMB_T_SWAP (rh1, rh3); MP_LIMB_T_SWAP (rl1, rl3);
            MP_LIMB_T_SWAP ( u1,  u3); MP_LIMB_T_SWAP ( v1,  v3);

            hgcd->sign = 0;
            break;
          }

        HGCD2_STEP (0, 1, 2, q0);
        if (rh2 == 0)
          {
            hgcd->row[0].u = u3; hgcd->row[0].v = v3;

            rh3 = rh2; rl3 = rl2; u3 = u2; v3 = v2;
            rh2 = rh1; rl2 = rl1; u2 = u1; v2 = v1;
            rh1 = rh0; rl1 = rl0; u1 = u0; v1 = v0;

            hgcd->sign = -1;
            break;
          }
      }
  }

  ASSERT (rh1 != 0);
  ASSERT (rh2 != 0);
  ASSERT (rh3 == 0);
  ASSERT (rh1 > rh2 || (rh1 == rh2 && rl1 > rl2));
  ASSERT (rh2 > rh3 || (rh2 == rh3 && rl2 > rl3));

  /* Coefficients to be returned */
  hgcd->row[1].u = u1; hgcd->row[1].v = v1;
  hgcd->row[2].u = u2; hgcd->row[2].v = v2;
  hgcd->row[3].u = u3; hgcd->row[3].v = v3;

  /* q0 and q1 are written alternately by the unrolled steps; which of
     them is the older quotient depends on where the loop exited, and
     that is what the sign records.  */
  if (hgcd->sign >= 0)
    {
      hgcd->q[0] = q0;
      hgcd->q[1] = q1;
    }
  else
    {
      hgcd->q[0] = q1;
      hgcd->q[1] = q0;
    }

  /* Rows 1, 2 and 3 are used below, rh0, rl0, u0 and v0 are not. */
#if GMP_NAIL_BITS == 0
  {
    mp_limb_t sh;
    mp_limb_t sl;
    mp_limb_t th;
    mp_limb_t tl;

    /* Check r2 */
    /* We always have r2 > u2, v2 */

    if (hgcd->sign >= 0)
      {
        /* Check if r1 - r2 >= u2 - u1 = |u2| + |u1| */
        sl = u2 + u1;
        sh = (sl < u1);
      }
    else
      {
        /* Check if r1 - r2 >= v2 - v1 = |v2| + |v1| */
        sl = v2 + v1;
        sh = (sl < v1);
      }

    sub_ddmmss (th, tl, rh1, rl1, rh2, rl2);

    /* row[0].v == 0 means row 0 is still the initial (1, 0) row, i.e.
       no progress was made beyond r0 = A, r1 = B: return 1.  */
    if (th < sh || (th == sh && tl < sl))
      return 2 - (hgcd->row[0].v == 0);

    /* Check r3 */

    if (hgcd->sign >= 0)
      {
        /* Check r3 >= max (-u3, -v3) = |u3| */
        if (rl3 < u3)
          return 3;

        /* Check r2 - r3 >= v3 - v2 = |v3| + |v2| */
        sl = v3 + v2;
        sh = (sl < v2);
      }
    else
      {
        /* Check r3 >= max (-u3, -v3) = |v3| */
        if (rl3 < v3)
          return 3;

        /* Check r2 - r3 >= u3 - u2 = |u3| + |u2| */
        sl = u3 + u2;
        sh = (sl < u2);
      }

    /* r3 fits in a single limb, so its high limb is 0.  */
    sub_ddmmss (th, tl, rh2, rl2, 0, rl3);

    if (th < sh || (th == sh && tl < sl))
      return 3;

    return 4;
  }
#else /* GMP_NAIL_BITS > 0 */
  {
    mp_limb_t sl;
    mp_limb_t th;
    mp_limb_t tl;

    /* Check r2 */
    /* We always have r2 > u2, v2 */

    if (hgcd->sign >= 0)
      {
        /* Check if r1 - r2 >= u2 - u1 = |u2| + |u1| */
        sl = u2 + u1;
      }
    else
      {
        /* Check if r1 - r2 >= v2 - v1 = |v2| + |v1| */
        sl = v2 + v1;
      }

    tl = rl1 - rl2;
    th = rh1 - rh2 - (tl >> (GMP_LIMB_BITS - 1));
    ASSERT_LIMB (th);

    /* The difference is only compared against sl when it fits in one
       full limb (numb bits of tl plus at most nail bits from th).  */
    if (th < (CNST_LIMB (1) << GMP_NAIL_BITS)
        && ((th << GMP_NUMB_BITS) | (tl & GMP_NUMB_MASK)) < sl)
      return 2 - (hgcd->row[0].v == 0);

    /* Check r3 */

    if (hgcd->sign >= 0)
      {
        /* Check r3 >= max (-u3, -v3) = |u3| */
        if (rl3 < u3)
          return 3;

        /* Check r2 - r3 >= v3 - v2 = |v3| + |v2| */
        sl = v3 + v2;
      }
    else
      {
        /* Check r3 >= max (-u3, -v3) = |v3| */
        if (rl3 < v3)
          return 3;

        /* Check r2 - r3 >= u3 - u2 = |u3| + |u2| */
        sl = u3 + u2;
      }

    tl = rl2 - rl3;
    th = rh2 - (tl >> (GMP_LIMB_BITS - 1));
    ASSERT_LIMB (th);

    if (th < (CNST_LIMB (1) << GMP_NAIL_BITS)
        && ((th << GMP_NUMB_BITS) | (tl & GMP_NUMB_MASK)) < sl)
      return 3;

    return 4;
  }
#endif /* GMP_NAIL_BITS > 0 */
}

/* Computes {rp, rsize} = u * {ap, asize} - v * {bp, bsize} and returns
   rsize.  When sign < 0, u and v and the two operands are swapped
   first, so the product that is subtracted is the one formed with the
   original u and {ap, asize}.

   ralloc is the number of limbs available at rp; it is only checked
   by assertions.  High zero limbs are stripped from the result only
   when a subtraction was performed (v > 0 after the swap).  */
mp_size_t
mpn_hgcd2_fix (mp_ptr rp, mp_size_t ralloc,
               int sign,
               mp_limb_t u, mp_srcptr ap, mp_size_t asize,
               mp_limb_t v, mp_srcptr bp, mp_size_t bsize)
{
  mp_size_t rsize;
  mp_limb_t cy;

  ASSERT_LIMB (u);
  ASSERT_LIMB (v);

  if (sign < 0)
    {
      MP_LIMB_T_SWAP (u,v);
      MPN_SRCPTR_SWAP (ap, asize, bp, bsize);
    }

  ASSERT (u > 0);

  ASSERT (asize <= ralloc);
  rsize = asize;
  cy = mpn_mul_1 (rp, ap, asize, u);
  if (cy)
    {
      /* The product grew by one limb.  */
      ASSERT (rsize < ralloc);
      rp[rsize++] = cy;
    }

  if (v > 0)
    {
      ASSERT (bsize <= rsize);
      cy = mpn_submul_1 (rp, bp, bsize, v);
      if (cy)
        {
          /* Propagate the borrow into the limbs above bsize; the
             overall result must not go negative.  */
          ASSERT (bsize < rsize);
          ASSERT_NOCARRY (mpn_sub_1 (rp + bsize,
                                     rp + bsize, rsize - bsize, cy));
        }

      MPN_NORMALIZE (rp, rsize);
    }
  return rsize;
}

#undef HGCD2_STEP