Factored out mpn_toom3_sqr_n and mpn_toom3_mul_n and removed duplication
of mpn_toom3_interpolate. Rewrote mpn_toom3_sqr_n.
This commit is contained in:
parent
44dadcf975
commit
9c79e0a98b
2
configure
vendored
2
configure
vendored
@ -31125,7 +31125,7 @@ gmp_mpn_functions="$extra_functions \
|
||||
rootrem sqrtrem get_str set_str scan0 scan1 popcount hamdist cmp perfsqr \
|
||||
bdivmod gcd gcd_1 gcdext tdiv_qr dc_divrem_n sb_divrem_mn jacbase get_d \
|
||||
mullow_n mullow_basecase redc_basecase \
|
||||
$gmp_mpn_functions_optional toom3_mul toom4_mul_n toom7_mul_n"
|
||||
$gmp_mpn_functions_optional toom3_mul toom3_mul_n toom4_mul_n toom7_mul_n"
|
||||
|
||||
|
||||
|
||||
|
@ -2494,7 +2494,7 @@ gmp_mpn_functions="$extra_functions \
|
||||
rootrem sqrtrem get_str set_str scan0 scan1 popcount hamdist cmp perfsqr \
|
||||
bdivmod gcd gcd_1 gcdext tdiv_qr dc_divrem_n sb_divrem_mn jacbase get_d \
|
||||
mullow_n mullow_basecase redc_basecase \
|
||||
$gmp_mpn_functions_optional toom3_mul toom4_mul_n toom7_mul_n"
|
||||
$gmp_mpn_functions_optional toom3_mul toom3_mul_n toom4_mul_n toom7_mul_n"
|
||||
|
||||
define(GMP_MULFUNC_CHOICES,
|
||||
[# functions that can be provided by multi-function files
|
||||
|
@ -46,12 +46,13 @@ nodist_EXTRA_libmpn_la_SOURCES = \
|
||||
gcd_finda.c gcd_1.c gcdext.c get_d.c get_str.c \
|
||||
hamdist.c invert_limb.c lshift1.c rshift1.c \
|
||||
ior_n.c iorn_n.c jacbase.c lshift.c mod_1.c mod_34lsub1.c mode1o.c \
|
||||
mul.c mul_1.c mul_2.c mul_fft.c mul_n.c mul_basecase.c toom3_mul.c toom4_mul_n.c toom7_mul_n.c \
|
||||
mul.c mul_1.c mul_2.c mul_fft.c mul_n.c mul_basecase.c toom3_mul.c \
|
||||
toom3_mul_n.c toom4_mul_n.c toom7_mul_n.c \
|
||||
mullow_n.c mullow_basecase.c nand_n.c nior_n.c perfsqr.c popcount.c \
|
||||
pre_divrem_1.c pre_mod_1.c pow_1.c random.c random2.c rshift.c \
|
||||
rootrem.c redc_basecase.c sb_divrem_mn.c scan0.c scan1.c set_str.c \
|
||||
sqr_basecase.c sqr_diagonal.c \
|
||||
sqrtrem.c sub.c sub_1.c sub_n.c subadd_n.c submul_1.c \
|
||||
sqrtrem.c sub.c sub_1.c sub_n.c subadd_n.c submul_1.c \
|
||||
tdiv_qr.c udiv_qrnnd.c udiv_w_sdiv.c xor_n.c xnor_n.c
|
||||
|
||||
noinst_LTLIBRARIES = libmpn.la
|
||||
|
@ -293,12 +293,13 @@ nodist_EXTRA_libmpn_la_SOURCES = \
|
||||
gcd_finda.c gcd_1.c gcdext.c get_d.c get_str.c \
|
||||
hamdist.c invert_limb.c lshift1.c rshift1.c \
|
||||
ior_n.c iorn_n.c jacbase.c lshift.c mod_1.c mod_34lsub1.c mode1o.c \
|
||||
mul.c mul_1.c mul_2.c mul_fft.c mul_n.c mul_basecase.c toom3_mul.c toom4_mul_n.c toom7_mul_n.c \
|
||||
mul.c mul_1.c mul_2.c mul_fft.c mul_n.c mul_basecase.c toom3_mul.c \
|
||||
toom3_mul_n.c toom4_mul_n.c toom7_mul_n.c \
|
||||
mullow_n.c mullow_basecase.c nand_n.c nior_n.c perfsqr.c popcount.c \
|
||||
pre_divrem_1.c pre_mod_1.c pow_1.c random.c random2.c rshift.c \
|
||||
rootrem.c redc_basecase.c sb_divrem_mn.c scan0.c scan1.c set_str.c \
|
||||
sqr_basecase.c sqr_diagonal.c \
|
||||
sqrtrem.c sub.c sub_1.c sub_n.c subadd_n.c submul_1.c \
|
||||
sqrtrem.c sub.c sub_1.c sub_n.c subadd_n.c submul_1.c \
|
||||
tdiv_qr.c udiv_qrnnd.c udiv_w_sdiv.c xor_n.c xnor_n.c
|
||||
|
||||
noinst_LTLIBRARIES = libmpn.la
|
||||
@ -593,6 +594,8 @@ tdiv_qr_.c: tdiv_qr.c $(ANSI2KNR)
|
||||
$(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/tdiv_qr.c; then echo $(srcdir)/tdiv_qr.c; else echo tdiv_qr.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
|
||||
toom3_mul_.c: toom3_mul.c $(ANSI2KNR)
|
||||
$(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/toom3_mul.c; then echo $(srcdir)/toom3_mul.c; else echo toom3_mul.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
|
||||
toom3_mul_n_.c: toom3_mul_n.c $(ANSI2KNR)
|
||||
$(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/toom3_mul_n.c; then echo $(srcdir)/toom3_mul_n.c; else echo toom3_mul_n.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
|
||||
toom4_mul_n_.c: toom4_mul_n.c $(ANSI2KNR)
|
||||
$(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/toom4_mul_n.c; then echo $(srcdir)/toom4_mul_n.c; else echo toom4_mul_n.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
|
||||
toom7_mul_n_.c: toom7_mul_n.c $(ANSI2KNR)
|
||||
@ -653,10 +656,11 @@ sqr_diagonal_.lo sqrtrem_.$(OBJEXT) sqrtrem_.lo sub_.$(OBJEXT) sub_.lo \
|
||||
sub_1_.$(OBJEXT) sub_1_.lo sub_n_.$(OBJEXT) sub_n_.lo \
|
||||
subadd_n_.$(OBJEXT) subadd_n_.lo submul_1_.$(OBJEXT) submul_1_.lo \
|
||||
tdiv_qr_.$(OBJEXT) tdiv_qr_.lo toom3_mul_.$(OBJEXT) toom3_mul_.lo \
|
||||
toom4_mul_n_.$(OBJEXT) toom4_mul_n_.lo toom7_mul_n_.$(OBJEXT) \
|
||||
toom7_mul_n_.lo udiv_qrnnd_.$(OBJEXT) udiv_qrnnd_.lo \
|
||||
udiv_w_sdiv_.$(OBJEXT) udiv_w_sdiv_.lo xnor_n_.$(OBJEXT) xnor_n_.lo \
|
||||
xor_n_.$(OBJEXT) xor_n_.lo : $(ANSI2KNR)
|
||||
toom3_mul_n_.$(OBJEXT) toom3_mul_n_.lo toom4_mul_n_.$(OBJEXT) \
|
||||
toom4_mul_n_.lo toom7_mul_n_.$(OBJEXT) toom7_mul_n_.lo \
|
||||
udiv_qrnnd_.$(OBJEXT) udiv_qrnnd_.lo udiv_w_sdiv_.$(OBJEXT) \
|
||||
udiv_w_sdiv_.lo xnor_n_.$(OBJEXT) xnor_n_.lo xor_n_.$(OBJEXT) \
|
||||
xor_n_.lo : $(ANSI2KNR)
|
||||
|
||||
mostlyclean-libtool:
|
||||
-rm -f *.lo
|
||||
|
@ -421,603 +421,6 @@ mpn_kara_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws)
|
||||
}
|
||||
}
|
||||
|
||||
/******************************************************************************
|
||||
* *
|
||||
* Toom 3-way multiplication and squaring *
|
||||
* *
|
||||
*****************************************************************************/
|
||||
|
||||
/* put in {c, 2n} where n = 2k+r the value of {v0,2k} (already in place)
|
||||
+ B^k * [{v1, 2k+1} - {t1, 2k+1}]
|
||||
+ B^(2k) * [{t2, 2k+1} - {v0+vinf, 2k}]
|
||||
+ B^(3k) * [{t1, 2k+1} - {t2, 2k+1}]
|
||||
+ B^(4k) * {vinf,2r} (high 2r-1 limbs already in place)
|
||||
where {t1, 2k+1} = (3*{v0,2k}+2*sa*{vm1,2k+1}+{v2,2k+1})/6-2*{vinf,2r}
|
||||
{t2, 2k+1} = ({v1, 2k+1} + sa * {vm1, 2k+1})/2
|
||||
(sa is the sign of {vm1, 2k+1}).
|
||||
|
||||
{vinf, 2r} stores the content of {v0, 2r} + {vinf, 2r}, with carry in cinf0.
|
||||
vinf0 is the low limb of vinf.
|
||||
|
||||
ws is temporary space, and should have at least 2r limbs.
|
||||
|
||||
Think about:
|
||||
|
||||
The evaluated point a-b+c stands a good chance of having a zero carry
|
||||
limb, a+b+c would have a 1/4 chance, and 4*a+2*b+c a 1/8 chance, roughly.
|
||||
Perhaps this could be tested and stripped. Doing so before recursing
|
||||
would be better than stripping at the start of mpn_toom3_mul_n/sqr_n,
|
||||
since then the recursion could be based on the new size. Although in
|
||||
truth the kara vs toom3 crossover is never so exact that one limb either
|
||||
way makes a difference.
|
||||
|
||||
A small value like 1 or 2 for the carry could perhaps also be handled
|
||||
with an add_n or addlsh1_n. Would that be faster than an extra limb on a
|
||||
(recursed) multiply/square?
|
||||
*/
|
||||
static void
|
||||
toom3_interpolate (mp_ptr c, mp_srcptr v1, mp_ptr v2, mp_ptr vm1,
|
||||
mp_ptr vinf, mp_size_t k, mp_size_t r, int sa,
|
||||
mp_limb_t vinf0, mp_limb_t cinf0, mp_ptr ws)
|
||||
{
|
||||
mp_limb_t cy, saved;
|
||||
unsigned long twok = k + k;
|
||||
unsigned long kk1 = twok + 1;
|
||||
unsigned long twor = r + r;
|
||||
mp_ptr c1, c2, c3, c4, c5;
|
||||
mp_limb_t cout; /* final carry, should be zero at the end */
|
||||
|
||||
c1 = c + k;
|
||||
c2 = c1 + k;
|
||||
c3 = c2 + k;
|
||||
c4 = c3 + k;
|
||||
c5 = c4 + k;
|
||||
|
||||
#define v0 (c)
|
||||
/* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
|
||||
v0 |vm1| hi(vinf) v1 v2+2vm1 vinf
|
||||
+lo(v0) */
|
||||
|
||||
ASSERT_NOCARRY (mpn_divexact_by3 (v2, v2, kk1)); /* v2 <- v2 / 3 */
|
||||
#ifdef HAVE_NATIVE_mpn_rsh1add_n
|
||||
mpn_rsh1add_n (v2, v2, v0, twok); /* v2 <- (lo(v2)+v0) / 2, exact */
|
||||
cy = v2[twok] & 1; /* add high limb of v2 divided by 2 */
|
||||
v2[twok] >>= 1;
|
||||
MPN_INCR_U (v2 + twok - 1, 2, cy << (GMP_NUMB_BITS - 1));
|
||||
#else
|
||||
v2[twok] += mpn_add_n (v2, v2, v0, twok);
|
||||
mpn_rshift1 (v2, v2, kk1);
|
||||
#endif
|
||||
|
||||
/* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
|
||||
v0 |vm1| hi(vinf) v1 (3v0+2vm1+v2) vinf
|
||||
/6 +lo(v0) */
|
||||
|
||||
/* vm1 <- t2 := (v1 + sa*vm1) / 2
|
||||
t2 = a0*b0+a0*b2+a1*b1+a2*b0+a2*b2 >= 0
|
||||
No carry comes out from {v1, kk1} +/- {vm1, kk1},
|
||||
and the division by two is exact */
|
||||
if (sa >= 0)
|
||||
{
|
||||
#ifdef HAVE_NATIVE_mpn_rsh1add_n
|
||||
mpn_rsh1add_n (vm1, v1, vm1, kk1);
|
||||
#else
|
||||
mpn_add_n (vm1, vm1, v1, kk1);
|
||||
mpn_rshift1 (vm1, vm1, kk1);
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
#ifdef HAVE_NATIVE_mpn_rsh1sub_n
|
||||
mpn_rsh1sub_n (vm1, v1, vm1, kk1);
|
||||
#else
|
||||
mpn_sub_n (vm1, v1, vm1, kk1);
|
||||
mpn_rshift1 (vm1, vm1, kk1);
|
||||
#endif
|
||||
}
|
||||
|
||||
/* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
|
||||
v0 t2 hi(vinf) v1 t1 vinf+lo(v0) */
|
||||
|
||||
/* subtract 2*vinf to v2,
|
||||
result is t1 := a0*b0+a0*b2+a1*b1+a1*b2+a2*b0+a2*b1+a2*b2 >= 0 */
|
||||
saved = c4[0];
|
||||
c4[0] = vinf0;
|
||||
#ifdef HAVE_NATIVE_mpn_sublsh1_n
|
||||
cy = mpn_sublsh1_n (v2, v2, c4, twor);
|
||||
#else
|
||||
cy = mpn_lshift1 (ws, c4, twor);
|
||||
cy += mpn_sub_n (v2, v2, ws, twor);
|
||||
#endif
|
||||
MPN_DECR_U (v2 + twor, kk1 - twor, cy);
|
||||
c4[0] = saved;
|
||||
|
||||
/* subtract {t2, 2k+1} in {c+3k, 2k+1} i.e. in {t2+k, 2k+1}:
|
||||
by chunks of k limbs from right to left to avoid overlap */
|
||||
#define t2 (vm1)
|
||||
/* a borrow may occur in one of the 2 following __GMPN_SUB_1 calls, but since
|
||||
the final result is nonnegative, it will be compensated later on */
|
||||
__GMPN_SUB_1 (cout, c5, c5, twor - k, t2[twok]);
|
||||
cy = mpn_sub_n (c4, c4, t2 + k, k);
|
||||
__GMPN_SUB_1 (cout, c5, c5, twor - k, cy);
|
||||
cy = mpn_sub_n (c3, c3, t2, k);
|
||||
__GMPN_SUB_1 (cout, c4, c4, twor, cy);
|
||||
|
||||
/* don't forget to add vinf0 in {c+4k, ...} */
|
||||
__GMPN_ADD_1 (cout, c4, c4, twor, vinf0);
|
||||
|
||||
/* c c+k c+2k c+3k c+4k+1 t t+2k+1 t+4k+2
|
||||
v0 t2 hi(vinf) v1 t1 vinf
|
||||
-t2 +lo(v0)
|
||||
*/
|
||||
|
||||
/* c c+k c+2k c+3k c+4k t t+2k+1 t+4k+2
|
||||
v0 t2 vinf v1 t1 vinf
|
||||
-t2 +lo(v0)
|
||||
*/
|
||||
|
||||
/* subtract v0+vinf in {c+2k, ...} */
|
||||
cy = cinf0 + mpn_sub_n (c2, c2, vinf, twor);
|
||||
if (twor < twok)
|
||||
{
|
||||
__GMPN_SUB_1 (cy, c2 + twor, c2 + twor, twok - twor, cy);
|
||||
cy += mpn_sub_n (c2 + twor, c2 + twor, v0 + twor, twok - twor);
|
||||
}
|
||||
__GMPN_SUB_1 (cout, c4, c4, twor, cy); /* 2n-4k = 2r */
|
||||
|
||||
/* c c+k c+2k c+3k c+4k t t+2k+1 t+4k+2
|
||||
v0 t2 vinf v1 t1 vinf
|
||||
-v0 -t2 +lo(v0)
|
||||
-vinf */
|
||||
|
||||
/* subtract t1 in {c+k, ...} */
|
||||
cy = mpn_sub_n (c1, c1, v2, kk1);
|
||||
__GMPN_SUB_1 (cout, c3 + 1, c3 + 1, twor + k - 1, cy); /* 2n-(3k+1)=k+2r-1 */
|
||||
|
||||
/* c c+k c+2k c+3k c+4k t t+2k+1 t+4k+2
|
||||
v0 t2 vinf v1 t1 vinf
|
||||
-t1 -v0 -t2
|
||||
-vinf */
|
||||
|
||||
/* add t1 in {c+3k, ...} */
|
||||
cy = mpn_add_n (c3, c3, v2, kk1);
|
||||
__GMPN_ADD_1 (cout, c5 + 1, c5 + 1, twor - k - 1, cy); /* 2n-(5k+1) = 2r-k-1 */
|
||||
|
||||
/* c c+k c+2k c+3k c+4k t t+2k+1 t+4k+2
|
||||
v0 t2 t1 vinf v1 t1 vinf
|
||||
-t1 -v0 -t2
|
||||
-vinf */
|
||||
|
||||
/* add v1 in {c+k, ...} */
|
||||
cy = mpn_add_n (c1, c1, v1, kk1);
|
||||
__GMPN_ADD_1 (cout, c3 + 1, c3 + 1, twor + k - 1, cy); /* 2n-(3k+1) = 2r+k-1 */
|
||||
|
||||
/* c c+k c+2k c+3k c+4k t t+2k+1 t+4k+2
|
||||
v0 v1 t2 t1 vinf v1 t1 vinf
|
||||
-t1 -v0 -t2
|
||||
-vinf */
|
||||
#undef v0
|
||||
#undef t2
|
||||
}
|
||||
|
||||
/*
|
||||
We have
|
||||
|
||||
{v0,2k} {v1,2k+1} {c+4k+1,r+r2-1}
|
||||
v0 v1 {-}vinf
|
||||
|
||||
vinf0 is the first limb of vinf, which is overwritten by v1
|
||||
|
||||
{vm1,2k+1} {v2, 2k+1}
|
||||
|
||||
ws is temporary space
|
||||
|
||||
sa is the sign of vm1
|
||||
|
||||
rr2 is r+r2
|
||||
|
||||
We want to compute
|
||||
|
||||
t1 <- (3*v0+2*vm1+v2)/6-2*vinf
|
||||
t2 <- (v1+vm1)/2
|
||||
then the result is c0+c1*t+c2*t^2+c3*t^3+c4*t^4 where
|
||||
c0 <- v0
|
||||
c1 <- v1 - t1
|
||||
c2 <- t2 - v0 - vinf
|
||||
c3 <- t1 - t2
|
||||
c4 <- vinf
|
||||
*/
|
||||
static void
|
||||
toom3_n_interpolate (mp_ptr c, mp_ptr v1, mp_ptr v2, mp_ptr vm1,
|
||||
mp_ptr vinf, mp_size_t k, mp_size_t rr2, int sa,
|
||||
mp_limb_t vinf0, mp_ptr ws)
|
||||
{
|
||||
mp_limb_t cy, saved;
|
||||
unsigned long twok = k + k;
|
||||
unsigned long kk1 = twok + 1;
|
||||
mp_ptr c1, c2, c3, c4, c5;
|
||||
mp_limb_t cout; /* final carry, should be zero at the end */
|
||||
|
||||
c1 = c + k;
|
||||
c2 = c1 + k;
|
||||
c3 = c2 + k;
|
||||
c4 = c3 + k;
|
||||
c5 = c4 + k;
|
||||
|
||||
#define v0 (c)
|
||||
|
||||
/* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1}
|
||||
v0 v1 {-}vinf
|
||||
|
||||
{vm1,2k+1} {v2, 2k+1}
|
||||
*/
|
||||
|
||||
/* v2 <- v2 - vm1 */
|
||||
if (sa < 0)
|
||||
{
|
||||
mpn_add_n(v2, v2, vm1, kk1);
|
||||
} else
|
||||
{
|
||||
mpn_sub_n(v2, v2, vm1, kk1);
|
||||
}
|
||||
|
||||
ASSERT_NOCARRY (mpn_divexact_by3 (v2, v2, kk1)); /* v2 <- v2 / 3 */
|
||||
|
||||
/* vm1 <- t2 := (v1 - sa*vm1) / 2
|
||||
*/
|
||||
if (sa < 0)
|
||||
{
|
||||
#ifdef HAVE_NATIVE_mpn_rsh1add_n
|
||||
mpn_rsh1add_n (vm1, v1, vm1, kk1);
|
||||
#else
|
||||
mpn_add_n (vm1, vm1, v1, kk1);
|
||||
mpn_rshift1 (vm1, vm1, kk1);
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
#ifdef HAVE_NATIVE_mpn_rsh1sub_n
|
||||
mpn_rsh1sub_n (vm1, v1, vm1, kk1);
|
||||
#else
|
||||
mpn_sub_n (vm1, v1, vm1, kk1);
|
||||
mpn_rshift1 (vm1, vm1, kk1);
|
||||
#endif
|
||||
}
|
||||
|
||||
/* v1 <- v1 - v0 - vinf */
|
||||
|
||||
saved = c4[0];
|
||||
c4[0] = vinf0;
|
||||
#if HAVE_NATIVE_mpn_subadd_n
|
||||
cy = mpn_subadd_n(v1, v1, v0, c4, rr2);
|
||||
#else
|
||||
cy = mpn_sub_n(v1, v1, v0, rr2);
|
||||
cy += mpn_sub_n(v1, v1, c4, rr2);
|
||||
#endif
|
||||
c4[0] = saved;
|
||||
if (rr2 < twok)
|
||||
{
|
||||
v1[twok] -= mpn_sub_n(v1 + rr2, v1 + rr2, v0 + rr2, twok - rr2);
|
||||
MPN_DECR_U(v1 + rr2, kk1 - rr2, cy);
|
||||
}
|
||||
else v1[twok] -= cy;
|
||||
|
||||
saved = c4[0];
|
||||
c4[0] = vinf0;
|
||||
/* subtract 5*vinf from v2,
|
||||
*/
|
||||
cy = mpn_submul_1 (v2, c4, rr2, CNST_LIMB(5));
|
||||
MPN_DECR_U (v2 + rr2, kk1 - rr2, cy);
|
||||
c4[0] = saved;
|
||||
|
||||
/* v2 = (v2 - v1)/2 (exact)
|
||||
*/
|
||||
#ifdef HAVE_NATIVE_mpn_rsh1sub_n
|
||||
mpn_rsh1sub_n (v2, v2, v1, kk1);
|
||||
#else
|
||||
mpn_sub_n (v2, v2, v1, kk1);
|
||||
mpn_rshift1 (v2, v2, kk1);
|
||||
#endif
|
||||
|
||||
/* v1 = v1 - vm1
|
||||
*/
|
||||
mpn_sub_n(v1, v1, vm1, kk1);
|
||||
|
||||
/* vm1 = vm1 - v2 and add vm1 in {c+k, ...} */
|
||||
#if HAVE_NATIVE_mpn_addsub_n
|
||||
cy = mpn_addsub_n(c1, c1, vm1, v2, kk1);
|
||||
#else
|
||||
mpn_sub_n(vm1, vm1, v2, kk1);
|
||||
cy = mpn_add_n (c1, c1, vm1, kk1);
|
||||
#endif
|
||||
mpn_add_1(c3 + 1, c3 + 1, rr2 + k - 1, cy); /* 4k+rr2-(3k+1) = rr2+k-1 */
|
||||
|
||||
/* don't forget to add vinf0 in {c+4k, ...} */
|
||||
mpn_add_1(c4, c4, rr2, vinf0);
|
||||
|
||||
|
||||
/* add v2 in {c+3k, ...} */
|
||||
cy = mpn_add_n (c3, c3, v2, kk1);
|
||||
mpn_add_1(c5 + 1, c5 + 1, rr2 - k - 1, cy); /* 4k+rr2-(5k+1) = rr2-k-1 */
|
||||
|
||||
#undef v0
|
||||
}
|
||||
|
||||
#define TOOM3_MUL_REC(p, a, b, n, ws) \
|
||||
do { \
|
||||
if (MUL_TOOM3_THRESHOLD / 3 < MUL_KARATSUBA_THRESHOLD \
|
||||
&& BELOW_THRESHOLD (n, MUL_KARATSUBA_THRESHOLD)) \
|
||||
mpn_mul_basecase (p, a, n, b, n); \
|
||||
else if (BELOW_THRESHOLD (n, MUL_TOOM3_THRESHOLD)) \
|
||||
mpn_kara_mul_n (p, a, b, n, ws); \
|
||||
else \
|
||||
mpn_toom3_mul_n (p, a, b, n, ws); \
|
||||
} while (0)
|
||||
|
||||
#define TOOM3_SQR_REC(p, a, n, ws) \
|
||||
do { \
|
||||
if (SQR_TOOM3_THRESHOLD / 3 < SQR_BASECASE_THRESHOLD \
|
||||
&& BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD)) \
|
||||
mpn_mul_basecase (p, a, n, a, n); \
|
||||
else if (SQR_TOOM3_THRESHOLD / 3 < SQR_KARATSUBA_THRESHOLD \
|
||||
&& BELOW_THRESHOLD (n, SQR_KARATSUBA_THRESHOLD)) \
|
||||
mpn_sqr_basecase (p, a, n); \
|
||||
else if (BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD)) \
|
||||
mpn_kara_sqr_n (p, a, n, ws); \
|
||||
else \
|
||||
mpn_toom3_sqr_n (p, a, n, ws); \
|
||||
} while (0)
|
||||
|
||||
/* The necessary temporary space T(n) satisfies T(n)=0 for n < THRESHOLD,
|
||||
and T(n) <= max(2n+2, 6k+3, 4k+3+T(k+1)) otherwise, where k = ceil(n/3).
|
||||
|
||||
Assuming T(n) >= 2n, 6k+3 <= 4k+3+T(k+1).
|
||||
Similarly, 2n+2 <= 6k+2 <= 4k+3+T(k+1).
|
||||
|
||||
With T(n) = 2n+S(n), this simplifies to S(n) <= 9 + S(k+1).
|
||||
Since THRESHOLD >= 17, we have n/(k+1) >= 19/8
|
||||
thus S(n) <= S(n/(19/8)) + 9 thus S(n) <= 9*log(n)/log(19/8) <= 8*log2(n).
|
||||
|
||||
We need in addition 2*r for mpn_sublsh1_n, so the total is at most
|
||||
8/3*n+8*log2(n).
|
||||
*/
|
||||
void
|
||||
mpn_toom3_mul_n (mp_ptr c, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr t)
|
||||
{
|
||||
mp_size_t k, k1, kk1, r, r2, twok, rr2;
|
||||
mp_limb_t cy, cc, saved, vinf0;
|
||||
mp_ptr trec;
|
||||
int sa, sb;
|
||||
mp_ptr c1, c2, c3, c4, c5, t1, t2, t3, t4;
|
||||
|
||||
ASSERT(GMP_NUMB_BITS >= 6);
|
||||
|
||||
k = (n + 2) / 3; /* ceil(n/3) */
|
||||
ASSERT(GMP_NUMB_BITS >= 6);
|
||||
ASSERT(n >= 17); /* so that r <> 0 and 5k+3 <= 2n */
|
||||
|
||||
twok = 2 * k;
|
||||
k1 = k + 1;
|
||||
kk1 = k + k1;
|
||||
r = n - twok; /* last chunk */
|
||||
rr2 = 2*r;
|
||||
|
||||
c1 = c + k;
|
||||
c2 = c1 + k;
|
||||
c3 = c2 + k;
|
||||
c4 = c3 + k;
|
||||
c5 = c4 + k;
|
||||
|
||||
t1 = t + k;
|
||||
t2 = t1 + k;
|
||||
t3 = t2 + k;
|
||||
t4 = t3 + k;
|
||||
|
||||
trec = t + 4 * k + 3;
|
||||
|
||||
/* put a0+a2 in {c, k+1}, and b0+b2 in {c4 + 2, k+1};
|
||||
put a0+a1+a2 in {t2 + 1, k+1} and b0+b1+b2 in {t3 + 2,k+1}
|
||||
*/
|
||||
cy = mpn_add_n (c, a, a + twok, r);
|
||||
cc = mpn_add_n (c4 + 2, b, b + twok, r);
|
||||
if (r < k)
|
||||
{
|
||||
__GMPN_ADD_1 (cy, c + r, a + r, k - r, cy);
|
||||
__GMPN_ADD_1 (cc, c4 + 2 + r, b + r, k - r, cc);
|
||||
}
|
||||
t3[1] = (c1[0] = cy) + mpn_add_n (t2 + 1, c, a + k, k);
|
||||
t4[2] = (c5[2] = cc) + mpn_add_n (t3 + 2, c4 + 2, b + k, k);
|
||||
|
||||
/* compute v1 := (a0+a1+a2)*(b0+b1+b2) in {c2, 2k+1};
|
||||
since v1 < 9*B^(2k), v1 uses only 2k+1 words if GMP_NUMB_BITS >= 4 */
|
||||
TOOM3_MUL_REC (c2, t2 + 1, t3 + 2, k1, trec);
|
||||
|
||||
/* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1}
|
||||
v1
|
||||
*/
|
||||
|
||||
/* put |a0-a1+a2| in {c,k+1} and |b0-b1+b2| in {c4 + 2,k+1} */
|
||||
/* sa = sign(a0-a1+a2) */
|
||||
/* sb = sign(b0-b1+b2) */
|
||||
sa = (c[k] != 0) ? 1 : mpn_cmp (c, a + k, k);
|
||||
c[k] = (sa >= 0) ? c[k] - mpn_sub_n (c, c, a + k, k)
|
||||
: mpn_sub_n (c, a + k, c, k);
|
||||
/* b0+b2 is in {c4+2, k+1} now */
|
||||
sb = (c5[2] != 0) ? 1 : mpn_cmp (c4 + 2, b + k, k);
|
||||
c5[2] = (sb >= 0) ? c5[2] - mpn_sub_n (c4 + 2, c4 + 2, b + k, k)
|
||||
: mpn_sub_n (c4 + 2, b + k, c4 + 2, k);
|
||||
sa *= sb; /* sign of vm1 */
|
||||
|
||||
/* compute vm1 := (a0-a1+a2)*(b0-b1+b2) in {t, 2k+1};
|
||||
since |vm1| < 4*B^(2k), vm1 uses only 2k+1 limbs */
|
||||
TOOM3_MUL_REC (t, c, c4 + 2, k1, trec);
|
||||
|
||||
/* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1}
|
||||
v1
|
||||
|
||||
{t, 2k+1} {t+2k+1, 2k + 1}
|
||||
vm1
|
||||
*/
|
||||
|
||||
/*
|
||||
compute a0+2a1+4a2 in {c, k+1} and b0+2b1+4b2 in {c4 + 2, k+1}
|
||||
*/
|
||||
#if HAVE_NATIVE_mpn_addlsh1_n
|
||||
c1[0] = mpn_addlsh1_n (c, a + k, a + twok, r);
|
||||
c5[2] = mpn_addlsh1_n (c4 + 2, b + k, b + twok, r);
|
||||
if (r < k)
|
||||
{
|
||||
__GMPN_ADD_1 (c1[0], c + r, a + k + r, k - r, c1[0]);
|
||||
__GMPN_ADD_1 (c5[2], c4 + 2 + r, b + k + r, k - r, c5[2]);
|
||||
}
|
||||
c1[0] = 2 * c1[0] + mpn_addlsh1_n (c, a, c, k);
|
||||
c5[2] = 2 * c5[2] + mpn_addlsh1_n (c4 + 2, b, c4 + 2, k);
|
||||
#else
|
||||
c[r] = mpn_lshift1 (c, a + twok, r);
|
||||
c4[r + 2] = mpn_lshift1 (c4 + 2, b + twok, r);
|
||||
if (r < k)
|
||||
{
|
||||
MPN_ZERO(c + r + 1, k - r);
|
||||
MPN_ZERO(c4 + r + 3, k - r);
|
||||
}
|
||||
c1[0] += mpn_add_n (c, c, a + k, k);
|
||||
c5[2] += mpn_add_n (c4 + 2, c4 + 2, b + k, k);
|
||||
mpn_lshift1 (c, c, k1);
|
||||
mpn_lshift1 (c4 + 2, c4 + 2, k1);
|
||||
c1[0] += mpn_add_n (c, c, a, k);
|
||||
c5[2] += mpn_add_n (c4 + 2, c4 + 2, b, k);
|
||||
#endif
|
||||
|
||||
#define v2 (t+2*k+1)
|
||||
|
||||
/* compute v2 := (a0+2a1+4a2)*(b0+2b1+4b2) in {t+2k+1, 2k+1}
|
||||
v2 < 49*B^k so v2 uses at most 2k+1 limbs if GMP_NUMB_BITS >= 6 */
|
||||
TOOM3_MUL_REC (v2, c, c4 + 2, k1, trec);
|
||||
|
||||
/* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1}
|
||||
v1
|
||||
|
||||
{t, 2k+1} {t+2k+1, 2k + 1}
|
||||
vm1 v2
|
||||
*/
|
||||
|
||||
/* compute v0 := a0*b0 in {c, 2k} */
|
||||
TOOM3_MUL_REC (c, a, b, k, trec);
|
||||
|
||||
/* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1}
|
||||
v0 v1
|
||||
|
||||
{t, 2k+1} {t+2k+1, 2k + 1}
|
||||
vm1 v2
|
||||
*/
|
||||
|
||||
#define vinf (c+4*k)
|
||||
|
||||
/* compute vinf := a2*b2 in {c4, r + r2},
|
||||
*/
|
||||
saved = c4[0];
|
||||
|
||||
TOOM3_MUL_REC (c4, a + twok, b + twok, r, trec);
|
||||
|
||||
vinf0 = c4[0];
|
||||
c4[0] = saved;
|
||||
|
||||
/* {c,2k} {c+2k,2k+1} {c+4k+1,r+r2-1}
|
||||
v0 v1 {-}vinf
|
||||
|
||||
{t, 2k+1} {t+2k+1, 2k + 1}
|
||||
vm1 v2
|
||||
|
||||
vinf0 = {-}
|
||||
*/
|
||||
|
||||
toom3_n_interpolate (c, c2, v2, t, vinf, k, rr2, sa, vinf0, t4+2);
|
||||
|
||||
#undef v2
|
||||
#undef vinf
|
||||
}
|
||||
|
||||
void
|
||||
mpn_toom3_sqr_n (mp_ptr c, mp_srcptr a, mp_size_t n, mp_ptr t)
|
||||
{
|
||||
mp_size_t k, k1, kk1, r, twok, twor;
|
||||
mp_limb_t cy, saved, vinf0, cinf0;
|
||||
mp_ptr trec;
|
||||
int sa;
|
||||
mp_ptr c1, c2, c3, c4;
|
||||
|
||||
ASSERT(GMP_NUMB_BITS >= 6);
|
||||
ASSERT(n >= 17); /* so that r <> 0 and 5k+3 <= 2n */
|
||||
|
||||
/* the algorithm is the same as mpn_mul_n_tc3, with b=a */
|
||||
|
||||
k = (n + 2) / 3; /* ceil(n/3) */
|
||||
twok = 2 * k;
|
||||
k1 = k + 1;
|
||||
kk1 = k + k1;
|
||||
r = n - twok; /* last chunk */
|
||||
twor = 2 * r;
|
||||
|
||||
c1 = c + k;
|
||||
c2 = c1 + k;
|
||||
c3 = c2 + k;
|
||||
c4 = c3 + k;
|
||||
|
||||
trec = t + 4 * k + 3; /* trec = v2 + (2k+2) */
|
||||
|
||||
cy = mpn_add_n (c, a, a + twok, r);
|
||||
if (r < k)
|
||||
__GMPN_ADD_1 (cy, c + r, a + r, k - r, cy);
|
||||
c3[2] = (c1[0] = cy) + mpn_add_n (c2 + 2, c, a + k, k);
|
||||
|
||||
#define v2 (t+2*k+1)
|
||||
#define vinf (t+4*k+2)
|
||||
|
||||
TOOM3_SQR_REC (t, c2 + 2, k1, trec);
|
||||
|
||||
sa = (c[k] != 0) ? 1 : mpn_cmp (c, a + k, k);
|
||||
c[k] = (sa >= 0) ? c[k] - mpn_sub_n (c, c, a + k, k)
|
||||
: mpn_sub_n (c, a + k, c, k);
|
||||
|
||||
TOOM3_SQR_REC (c2, c, k1, trec);
|
||||
|
||||
#ifdef HAVE_NATIVE_mpn_addlsh1_n
|
||||
c1[0] = mpn_addlsh1_n (c, a + k, a + twok, r);
|
||||
if (r < k)
|
||||
__GMPN_ADD_1 (c1[0], c + r, a + k + r, k - r, c1[0]);
|
||||
c1[0] = 2 * c1[0] + mpn_addlsh1_n (c, a, c, k);
|
||||
#else
|
||||
c[r] = mpn_lshift1 (c, a + twok, r);
|
||||
if (r < k)
|
||||
MPN_ZERO(c + r + 1, k - r);
|
||||
c1[0] += mpn_add_n (c, c, a + k, k);
|
||||
mpn_lshift1 (c, c, k1);
|
||||
c1[0] += mpn_add_n (c, c, a, k);
|
||||
#endif
|
||||
|
||||
TOOM3_SQR_REC (v2, c, k1, trec);
|
||||
|
||||
TOOM3_SQR_REC (c, a, k, trec);
|
||||
|
||||
#ifdef HAVE_NATIVE_mpn_addlsh1_n
|
||||
mpn_addlsh1_n (v2, v2, c2, kk1);
|
||||
#else
|
||||
mpn_lshift1 (t + 4 * k + 2, c2, kk1);
|
||||
mpn_add_n (v2, v2, t + 4 * k + 2, kk1);
|
||||
#endif
|
||||
|
||||
saved = c4[0];
|
||||
TOOM3_SQR_REC (c4, a + twok, r, trec);
|
||||
cinf0 = mpn_add_n (vinf, c4, c, twor);
|
||||
vinf0 = c4[0];
|
||||
c4[0] = saved;
|
||||
|
||||
toom3_interpolate (c, t, v2, c2, vinf, k, r, 1, vinf0, cinf0, vinf + twor);
|
||||
|
||||
#undef v2
|
||||
#undef vinf
|
||||
}
|
||||
|
||||
void
|
||||
mpn_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n)
|
||||
{
|
||||
|
@ -66,8 +66,8 @@ MA 02110-1301, USA. */
|
||||
c3 <- t1 - t2
|
||||
c4 <- vinf
|
||||
*/
|
||||
static void
|
||||
toom3_interpolate (mp_ptr c, mp_ptr v1, mp_ptr v2, mp_ptr vm1,
|
||||
void
|
||||
mpn_toom3_interpolate (mp_ptr c, mp_ptr v1, mp_ptr v2, mp_ptr vm1,
|
||||
mp_ptr vinf, mp_size_t k, mp_size_t rr2, int sa,
|
||||
mp_limb_t vinf0, mp_ptr ws)
|
||||
{
|
||||
@ -401,7 +401,7 @@ mpn_toom3_mul (mp_ptr c, mp_srcptr a, mp_size_t an, mp_srcptr b, mp_size_t bn, m
|
||||
vinf0 = {-}
|
||||
*/
|
||||
|
||||
toom3_interpolate (c, c2, v2, t, vinf, k, rr2, sa, vinf0, t4+2);
|
||||
mpn_toom3_interpolate (c, c2, v2, t, vinf, k, rr2, sa, vinf0, t4+2);
|
||||
|
||||
#undef v2
|
||||
#undef vinf
|
||||
|
Loading…
Reference in New Issue
Block a user