diff --git a/gmp-impl.h b/gmp-impl.h index fad5eac3..63952244 100644 --- a/gmp-impl.h +++ b/gmp-impl.h @@ -2745,6 +2745,70 @@ mp_limb_t mpn_invert_limb _PROTO ((mp_limb_t)) ATTRIBUTE_CONST; dinv = v; \ } while (0) +#define add_sssaaaaaa(sh, sm, sl, ah, am, al, bh, bm, bl) \ + __asm__ ("addq %8,%q2\n\tadcq %6,%q1\n\tadcq %4,%q0" \ + : "=r" (sh), "=r" (sm), "=&r" (sl) \ + : "0" ((mp_limb_t)(ah)), "rme" ((mp_limb_t)(bh)), \ + "1" ((mp_limb_t)(am)), "rme" ((mp_limb_t)(bm)), \ + "2" ((mp_limb_t)(al)), "rme" ((mp_limb_t)(bl))) \ + +#define sub_dddmmmsss(sh, sm, sl, ah, am, al, bh, bm, bl) \ + __asm__ ("subq %8,%q2\n\tsbbq %6,%q1\n\tsbbq %4,%q0" \ + : "=r" (sh), "=r" (sm), "=&r" (sl) \ + : "0" ((mp_limb_t)(ah)), "rme" ((mp_limb_t)(bh)), \ + "1" ((mp_limb_t)(am)), "rme" ((mp_limb_t)(bm)), \ + "2" ((mp_limb_t)(al)), "rme" ((mp_limb_t)(bl))) \ + +#define mpir_invert_pi2(dinv, d1, d2) \ +do { \ + mp_limb_t __q, __r[2], __p[2], __cy; \ + \ + if ((d2) + 1 == 0 && (d1) + 1 == 0) \ + (dinv) = 0; \ + else { \ + if ((d1) + 1 == 0) \ + (dinv) = ~(d1), __r[1] = ~(d2); \ + else \ + udiv_qrnnd((dinv), __r[1], ~(d1), ~(d2), (d1) + 1); \ + \ + if ((d2) + 1 != 0) { \ + __r[0] = 0; \ + umul_ppmm(__p[1], __p[0], (dinv), ~(d2)); \ + __cy = mpn_add_n(__r, __r, __p, 2); \ + \ + __p[0] = (d2) + 1, __p[1] = (d1) + ((d2) + 1 == 0); \ + while (__cy || mpn_cmp(__r, __p, 2) >= 0) \ + { (dinv)++; __cy -= mpn_sub_n(__r, __r, __p, 2); } \ + } \ + } \ +} while (0) + +#define mpir_div32_preinv2(q, a1, a2, a3, d1, d2, dinv) \ + do { \ + mp_limb_t __q2, __q3, __q4, __r2, __r3, __p1, __p2; \ + umul_ppmm((q), __q2, (a1), (dinv)); \ + add_ssaaaa((q), __q2, (q), __q2, (a1), (a2)); \ + umul_ppmm(__p1, __p2, (q), (d2)); \ + sub_ddmmss(__r2, __r3, (a2) - (q)*(d1), (a3), __p1, __p2); \ + sub_ddmmss(__r2, __r3, __r2, __r3, (d1), (d2)); \ + if (__r2 < __q2) (q)++; \ + } while (0) + +#define mpir_divrem32_preinv2(q, r2, r3, a1, a2, a3, d1, d2, dinv) \ + do { \ + mp_limb_t __q2, __q3, __q4, __p1, __p2, __cy; \ + umul_ppmm((q), __q2, (a1), (dinv)); \ + add_ssaaaa((q), __q2, (q), __q2, (a1), (a2)); \ + umul_ppmm(__p1, __p2, (q), (d2)); \ + (r3) = (a3); \ + (r2) = (a2) - (q)*(d1); \ + sub_ddmmss((r2), (r3), (r2), (r3), __p1, __p2); \ + sub_ddmmss((r2), (r3), (r2), (r3), (d1), (d2)); \ + (q)++; \ + if ((r2) >= __q2) \ + { (q)--; add_ssaaaa((r2), (r3), (r2), (r3), (d1), (d2)); } \ + } while (0) + /* Compute quotient the quotient and remainder for n / d. Requires d >= B^2 / 2 and n < d B. di is the inverse diff --git a/mpn/generic/sb_div_qr.c b/mpn/generic/sb_div_qr.c index 7495eb21..ef9f2624 100644 --- a/mpn/generic/sb_div_qr.c +++ b/mpn/generic/sb_div_qr.c @@ -40,8 +40,8 @@ mpn_sb_div_qr (mp_ptr qp, mp_limb_t qh; mp_size_t i; mp_limb_t n1, n0; - mp_limb_t d1, d0; - mp_limb_t cy, cy1; + mp_limb_t d1, d0, d01, d11; + mp_limb_t cy, cy1, cy2; mp_limb_t q; ASSERT (dn > 2); @@ -61,6 +61,9 @@ mpn_sb_div_qr (mp_ptr qp, d1 = dp[dn + 1]; d0 = dp[dn + 0]; + d01 = d0 + 1; + d11 = d1 + (d01 == 0); + np -= 2; n1 = np[1]; @@ -76,20 +79,36 @@ mpn_sb_div_qr (mp_ptr qp, } else { - tdiv_qr_3by2 (q, n1, n0, n1, np[1], np[0], d1, d0, dinv); + mpir_divrem32_preinv2(q, n1, n0, n1, np[1], np[0], d11, d01, dinv); - cy = mpn_submul_1 (np - dn, dp, dn, q); + add_sssaaaaaa(cy, n1, n0, 0, n1, n0, 0, 0, q); + while (UNLIKELY(cy != 0 || n1 >= d1)) + { + if (n1 == d1 && n0 < d0 && cy == 0) break; + sub_dddmmmsss(cy, n1, n0, cy, n1, n0, 0, d1, d0); + (q)++; + } + + cy2 = mpn_submul_1 (np - dn, dp, dn, q); + + sub_dddmmmsss(cy, n1, n0, 0, n1, n0, 0, 0, cy2); + /*cy1 = n0 < cy2; + n0 = (n0 - cy2); + cy = -(n1 < cy1); + n1 = (n1 - cy1);*/ + + /*add_sssaaaaaa(cy, n1, n0, cy, n1, n0, 0, 0, q);*/ + /*cy1 = (n0 + q < n0); + n0 = (n0 + q); + cy += (n1 + cy1 < n1); + n1 = (n1 + cy1);*/ - cy1 = n0 < cy; - n0 = (n0 - cy) & GMP_NUMB_MASK; - cy = n1 < cy1; - n1 = (n1 - cy1) & GMP_NUMB_MASK; np[0] = n0; if (UNLIKELY (cy != 0)) { n1 += d1 + mpn_add_n (np - dn, np - dn, dp, dn + 1); - q--; + q--; } } diff --git a/tests/mpn/t-sb_div_qr.c b/tests/mpn/t-sb_div_qr.c index 600bd4ed..09d61c0f 100644 --- a/tests/mpn/t-sb_div_qr.c +++ b/tests/mpn/t-sb_div_qr.c @@ -64,7 +64,7 @@ check_sb_div_qr (void) MPN_COPY(np2, np, nn); - invert_1(dip, dp[dn - 1], dp[dn - 2]); + mpir_invert_pi2(dip, dp[dn - 1], dp[dn - 2]); qn = nn - dn + 1; @@ -82,12 +82,14 @@ check_sb_div_qr (void) if (rn > nn) { + printf("iteration = %ld\n", i); printf("failed: q*d has too many limbs\n"); abort(); } if (mpn_cmp(rp, np2, nn) > 0) { + printf("iteration = %ld\n", i); printf("failed: remainder negative\n"); abort(); } @@ -104,6 +106,7 @@ check_sb_div_qr (void) s = (rn < dn) ? -1 : (rn > dn) ? 1 : mpn_cmp(rp, dp, dn); if (s >= 0) { + printf("iteration = %ld\n", i); printf ("failed:\n"); printf ("nn = %lu, dn = %lu, qn = %lu, rn = %lu\n\n", nn, dn, qn, rn); gmp_printf (" np: %Nx\n\n", np2, nn); @@ -115,6 +118,7 @@ check_sb_div_qr (void) if (mpn_cmp(rp, np, rn) != 0) { + printf("iteration = %ld\n", i); printf("failed: remainder does not match\n"); gmp_printf (" np: %Nx\n\n", np2, nn); gmp_printf (" dp: %Nx\n\n", dp, dn);