Fast sb_divappr_q code.
This commit is contained in:
parent
019ddbfb99
commit
0b033b07f2
@ -32,6 +32,20 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
|
||||
#include "gmp-impl.h"
|
||||
#include "longlong.h"
|
||||
|
||||
#define SB_DIVAPPR_Q_SMALL_THRESHOLD 30
|
||||
|
||||
void __divapprox_helper(mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t qn)
|
||||
{
|
||||
mpn_sub_n(np + 1, np + 1, dp, qn + 1);
|
||||
np[1] += dp[qn];
|
||||
|
||||
for (qn--; qn >= 0; qn--)
|
||||
{
|
||||
qp[qn] = ~(mp_limb_t) 0;
|
||||
add_ssaaaa(np[1], np[0], np[1], np[0], 0, dp[qn]);
|
||||
}
|
||||
}
|
||||
|
||||
mp_limb_t
|
||||
mpn_sb_divappr_q (mp_ptr qp,
|
||||
mp_ptr np, mp_size_t nn,
|
||||
@ -41,7 +55,7 @@ mpn_sb_divappr_q (mp_ptr qp,
|
||||
mp_limb_t qh;
|
||||
mp_size_t qn, i;
|
||||
mp_limb_t n1, n0;
|
||||
mp_limb_t d1, d0;
|
||||
mp_limb_t d1, d0, d11, d01;
|
||||
mp_limb_t cy, cy1;
|
||||
mp_limb_t q;
|
||||
mp_limb_t flag;
|
||||
@ -63,6 +77,70 @@ mpn_sb_divappr_q (mp_ptr qp,
|
||||
if (qh != 0)
|
||||
mpn_sub_n (np - dn, np - dn, dp, dn);
|
||||
|
||||
if (dn <= SB_DIVAPPR_Q_SMALL_THRESHOLD)
|
||||
{
|
||||
/* Reduce until n - 2 >= qn */
|
||||
for (qn--, np--; qn > dn - 2; qn--)
|
||||
{
|
||||
/* fetch next word */
|
||||
cy = np[0];
|
||||
|
||||
np--;
|
||||
mpir_divapprox32_preinv2(q, cy, np[0], dinv);
|
||||
|
||||
/* a -= d*q1 */
|
||||
cy -= mpn_submul_1(np - dn + 1, dp, dn, q);
|
||||
|
||||
/* correct if remainder is too large */
|
||||
if (UNLIKELY(cy || np[0] >= dp[dn - 1]))
|
||||
{
|
||||
if (cy || mpn_cmp(np - dn + 1, dp, dn) >= 0)
|
||||
{
|
||||
q++;
|
||||
cy -= mpn_sub_n(np - dn + 1, np - dn + 1, dp, dn);
|
||||
}
|
||||
}
|
||||
|
||||
qp[qn] = q;
|
||||
}
|
||||
|
||||
qn++;
|
||||
dp = dp + dn - qn - 1; /* make d length qn + 1 */
|
||||
|
||||
for ( ; qn > 0; qn--)
|
||||
{
|
||||
/* fetch next word */
|
||||
cy = np[0];
|
||||
|
||||
np--;
|
||||
/* rare case where truncation ruins normalisation */
|
||||
if (cy > dp[qn] || (cy == dp[qn] && mpn_cmp(np - qn + 1, dp, qn) >= 0))
|
||||
{
|
||||
__divapprox_helper(qp, np - qn, dp, qn);
|
||||
return qh;
|
||||
}
|
||||
|
||||
mpir_divapprox32_preinv2(q, cy, np[0], dinv);
|
||||
|
||||
/* a -= d*q1 */
|
||||
cy -= mpn_submul_1(np - qn, dp, qn + 1, q);
|
||||
|
||||
/* correct if remainder is too large */
|
||||
if (UNLIKELY(cy || np[0] >= dp[qn]))
|
||||
{
|
||||
if (cy || mpn_cmp(np - qn, dp, qn + 1) >= 0)
|
||||
{
|
||||
q++;
|
||||
cy -= mpn_sub_n(np - qn, np - qn, dp, qn + 1);
|
||||
}
|
||||
}
|
||||
|
||||
qp[qn - 1] = q;
|
||||
dp++;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
qp += qn;
|
||||
|
||||
dn -= 2; /* offset dn by 2 for main division loops,
|
||||
@ -74,6 +152,9 @@ mpn_sb_divappr_q (mp_ptr qp,
|
||||
|
||||
n1 = np[1];
|
||||
|
||||
d01 = d0 + 1;
|
||||
d11 = d1 + (d01 < d0);
|
||||
|
||||
for (i = qn - (dn + 2); i >= 0; i--)
|
||||
{
|
||||
np--;
|
||||
@ -85,7 +166,7 @@ mpn_sb_divappr_q (mp_ptr qp,
|
||||
}
|
||||
else
|
||||
{
|
||||
udiv_qr_3by2 (q, n1, n0, n1, np[1], np[0], d1, d0, dinv);
|
||||
mpir_divrem32_preinv2 (q, n1, n0, n1, np[1], np[0], d11, d01, d1, d0, dinv);
|
||||
|
||||
cy = mpn_submul_1 (np - dn, dp, dn, q);
|
||||
|
||||
@ -131,7 +212,7 @@ mpn_sb_divappr_q (mp_ptr qp,
|
||||
}
|
||||
else
|
||||
{
|
||||
udiv_qr_3by2 (q, n1, n0, n1, np[1], np[0], d1, d0, dinv);
|
||||
mpir_divrem32_preinv2 (q, n1, n0, n1, np[1], np[0], d11, d01, d1, d0, dinv);
|
||||
|
||||
cy = mpn_submul_1 (np - dn, dp, dn, q);
|
||||
|
||||
@ -175,7 +256,7 @@ mpn_sb_divappr_q (mp_ptr qp,
|
||||
}
|
||||
else
|
||||
{
|
||||
udiv_qr_3by2 (q, n1, n0, n1, np[1], np[0], d1, d0, dinv);
|
||||
mpir_divrem32_preinv2 (q, n1, n0, n1, np[1], np[0], d11, d01, d1, d0, dinv);
|
||||
|
||||
np[1] = n1;
|
||||
np[0] = n0;
|
||||
@ -185,6 +266,7 @@ mpn_sb_divappr_q (mp_ptr qp,
|
||||
}
|
||||
|
||||
ASSERT_ALWAYS (np[1] == n1);
|
||||
}
|
||||
|
||||
return qh;
|
||||
}
|
||||
|
@ -64,7 +64,7 @@ check_sb_divappr_q (void)
|
||||
|
||||
MPN_COPY(np2, np, nn);
|
||||
|
||||
mpir_invert_pi1(dip, dp[dn - 1], dp[dn - 2]);
|
||||
mpir_invert_pi2(dip, dp[dn - 1], dp[dn - 2]);
|
||||
|
||||
qn = nn - dn + 1;
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user