First hack at toom4_mul (unbalanced toom4).
This commit is contained in:
parent
9a475c7038
commit
5d3cf5509d
@ -2503,8 +2503,8 @@ gmp_mpn_functions="$extra_functions \
|
||||
mul mul_fft mul_n mul_basecase sqr_basecase random random2 pow_1 \
|
||||
rootrem sqrtrem get_str set_str scan0 scan1 popcount hamdist cmp perfsqr \
|
||||
bdivmod gcd gcd_1 gcdext tdiv_qr dc_divrem_n sb_divrem_mn jacbase get_d \
|
||||
mullow_n mullow_basecase redc_basecase \
|
||||
$gmp_mpn_functions_optional toom3_mul toom3_mul_n toom4_mul_n toom7_mul_n"
|
||||
mullow_n mullow_basecase redc_basecase toom3_mul toom3_mul_n \
|
||||
toom4_mul toom4_mul_n toom7_mul_n $gmp_mpn_functions_optional"
|
||||
|
||||
define(GMP_MULFUNC_CHOICES,
|
||||
[# functions that can be provided by multi-function files
|
||||
|
@ -47,7 +47,7 @@ nodist_EXTRA_libmpn_la_SOURCES = \
|
||||
hamdist.c invert_limb.c lshift1.c rshift1.c lshift2.c rshift2.c \
|
||||
ior_n.c iorn_n.c jacbase.c lshift.c mod_1.c mod_34lsub1.c mode1o.c \
|
||||
mul.c mul_1.c mul_2.c mul_fft.c mul_n.c mul_basecase.c toom3_mul.c \
|
||||
toom3_mul_n.c toom4_mul_n.c toom7_mul_n.c \
|
||||
toom3_mul_n.c toom4_mul_n.c toom4.c toom7_mul_n.c \
|
||||
mullow_n.c mullow_basecase.c nand_n.c nior_n.c perfsqr.c popcount.c \
|
||||
pre_divrem_1.c pre_mod_1.c pow_1.c random.c random2.c rshift.c \
|
||||
rootrem.c redc_basecase.c sb_divrem_mn.c scan0.c scan1.c set_str.c \
|
||||
|
@ -146,7 +146,11 @@ mpn_mul (mp_ptr prodp,
|
||||
|
||||
k = (un + 3)/4; // ceil(un/4)
|
||||
|
||||
if (ABOVE_THRESHOLD (un + vn, 2*MUL_TOOM3_THRESHOLD) && (vn > k))
|
||||
if (ABOVE_THRESHOLD (un + vn, 2*MUL_TOOM4_THRESHOLD) && (vn > 3*k))
|
||||
{
|
||||
mpn_toom4_mul(prodp, up, un, vp, vn);
|
||||
return prodp[un + vn - 1];
|
||||
} else if (ABOVE_THRESHOLD (un + vn, 2*MUL_TOOM3_THRESHOLD) && (vn > k))
|
||||
{
|
||||
mp_ptr ws;
|
||||
TMP_DECL;
|
||||
|
899
mpn/generic/toom4_mul.c
Normal file
899
mpn/generic/toom4_mul.c
Normal file
@ -0,0 +1,899 @@
|
||||
/* mpn_toom4_mul -- Internal routine to multiply two natural numbers
|
||||
using unbalanced toom4.
|
||||
|
||||
THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY
|
||||
SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.
|
||||
*/
|
||||
|
||||
/* Implementation of the Bodrato-Zanoni algorithm for Toom-Cook 4-way.
|
||||
|
||||
Copyright 2001, 2002, 2004, 2005, 2006 Free Software Foundation, Inc.
|
||||
Copyright 2009 William Hart
|
||||
|
||||
This file is part of the GNU MP Library.
|
||||
|
||||
The GNU MP Library is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU Lesser General Public License as published by
|
||||
the Free Software Foundation; either version 2.1 of the License, or (at your
|
||||
option) any later version.
|
||||
|
||||
The GNU MP Library is distributed in the hope that it will be useful, but
|
||||
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||||
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||||
License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with the GNU MP Library; see the file COPYING.LIB. If not, write to
|
||||
the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
||||
MA 02110-1301, USA. */
|
||||
|
||||
/*
|
||||
This implementation is based on that of Paul Zimmmermann, which is available
|
||||
for mpz_t's at http://www.loria.fr/~zimmerma/software/toom4.c
|
||||
*/
|
||||
|
||||
#include "mpir.h"
|
||||
#include "gmp-impl.h"
|
||||
#include "longlong.h"
|
||||
|
||||
void _tc4_add(mp_ptr rp, mp_size_t * rn, mp_srcptr r1, mp_size_t r1n, mp_srcptr r2, mp_size_t r2n)
|
||||
{
|
||||
mp_limb_t cy;
|
||||
mp_size_t s1 = ABS(r1n);
|
||||
mp_size_t s2 = ABS(r2n);
|
||||
|
||||
if (!s1)
|
||||
{
|
||||
*rn = 0;
|
||||
} else if (!s2)
|
||||
{
|
||||
if (rp != r1) MPN_COPY(rp, r1, s1);
|
||||
*rn = r1n;
|
||||
} else if ((r1n ^ r2n) >= 0)
|
||||
{
|
||||
*rn = r1n;
|
||||
cy = mpn_add(rp, r1, s1, r2, s2);
|
||||
if (cy)
|
||||
{
|
||||
rp[s1] = cy;
|
||||
if ((*rn) < 0) (*rn)--;
|
||||
else (*rn)++;
|
||||
}
|
||||
} else
|
||||
{
|
||||
mp_size_t ct;
|
||||
if (s1 != s2) ct = 1;
|
||||
else MPN_CMP(ct, r1, r2, s1);
|
||||
|
||||
if (!ct) *rn = 0;
|
||||
else if (ct > 0)
|
||||
{
|
||||
mpn_sub(rp, r1, s1, r2, s2);
|
||||
*rn = s1;
|
||||
MPN_NORMALIZE(rp, (*rn));
|
||||
if (r1n < 0) *rn = -(*rn);
|
||||
}
|
||||
else
|
||||
{
|
||||
mpn_sub_n(rp, r2, r1, s1);
|
||||
*rn = s1;
|
||||
MPN_NORMALIZE(rp, (*rn));
|
||||
if (r1n > 0) *rn = -(*rn);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void tc4_add(mp_ptr rp, mp_size_t * rn, mp_srcptr r1, mp_size_t r1n, mp_srcptr r2, mp_size_t r2n)
|
||||
{
|
||||
mp_size_t s1 = ABS(r1n);
|
||||
mp_size_t s2 = ABS(r2n);
|
||||
|
||||
if (s1 < s2) _tc4_add(rp, rn, r2, r2n, r1, r1n);
|
||||
else _tc4_add(rp, rn, r1, r1n, r2, r2n);
|
||||
}
|
||||
|
||||
#if HAVE_NATIVE_mpn_sumdiff_n
|
||||
void tc4_sumdiff(mp_ptr rp, mp_size_t * rn, mp_ptr sp, mp_size_t * sn, mp_ptr r1, mp_size_t r1n, mp_ptr r2, mp_size_t r2n)
|
||||
{
|
||||
mp_limb_t cy, cy2;
|
||||
mp_size_t s1 = ABS(r1n);
|
||||
mp_size_t s2 = ABS(r2n);
|
||||
int swapped = 0;
|
||||
|
||||
if (s1 < s2)
|
||||
{
|
||||
MPN_PTR_SWAP(r1, r1n, r2, r2n);
|
||||
MP_SIZE_T_SWAP(s1, s2);
|
||||
swapped = 1;
|
||||
}
|
||||
|
||||
if (!s1)
|
||||
{
|
||||
*rn = 0;
|
||||
*sn = 0;
|
||||
} else if (!s2)
|
||||
{
|
||||
if (rp != r1) MPN_COPY(rp, r1, s1);
|
||||
if (sp != r1) MPN_COPY(sp, r1, s1);
|
||||
*rn = r1n;
|
||||
*sn = (swapped ? -r1n : r1n);
|
||||
} else
|
||||
{
|
||||
mp_size_t ct;
|
||||
if (s1 != s2) ct = 1;
|
||||
else MPN_CMP(ct, r1, r2, s1);
|
||||
|
||||
if (!ct)
|
||||
{
|
||||
if ((r1n ^ r2n) >= 0)
|
||||
{
|
||||
*sn = 0;
|
||||
*rn = r1n;
|
||||
cy = mpn_lshift(rp, r1, s1, 1);
|
||||
if (cy)
|
||||
{
|
||||
rp[s1] = cy;
|
||||
if ((*rn) < 0) (*rn)--;
|
||||
else (*rn)++;
|
||||
}
|
||||
} else
|
||||
{
|
||||
*rn = 0;
|
||||
*sn = (swapped ? -r1n : r1n);
|
||||
cy = mpn_lshift(sp, r1, s1, 1);
|
||||
if (cy)
|
||||
{
|
||||
sp[s1] = cy;
|
||||
if ((*sn) < 0) (*sn)--;
|
||||
else (*sn)++;
|
||||
}
|
||||
}
|
||||
} else if (ct > 0) // r1 is bigger than r2
|
||||
{
|
||||
if ((r1n ^ r2n) >= 0) // both inputs are same sign
|
||||
{
|
||||
cy = mpn_sumdiff_n(rp, sp, r1, r2, s2);
|
||||
if (s1 > s2) // r1 has more limbs than r2
|
||||
{
|
||||
*rn = r1n;
|
||||
cy2 = mpn_add_1(rp + s2, r1 + s2, s1 - s2, cy>>1);
|
||||
if (cy2)
|
||||
{
|
||||
rp[s1] = cy2;
|
||||
if ((*rn) < 0) (*rn)--;
|
||||
else (*rn)++;
|
||||
}
|
||||
mpn_sub_1(sp + s2, r1 + s2, s1 - s2, cy&1);
|
||||
} else // both inputs have same number of limbs
|
||||
{
|
||||
*rn = r1n;
|
||||
if (cy>>1)
|
||||
{
|
||||
rp[s1] = 1;
|
||||
if ((*rn) < 0) (*rn)--;
|
||||
else (*rn)++;
|
||||
}
|
||||
}
|
||||
*sn = s1;
|
||||
MPN_NORMALIZE(sp, (*sn));
|
||||
if (r1n ^ (~swapped) < 0) *sn = -(*sn);
|
||||
} else // inputs are different sign
|
||||
{
|
||||
cy = mpn_sumdiff_n(sp, rp, r1, r2, s2);
|
||||
if (s1 > s2) // r1 has more limbs than r2
|
||||
{
|
||||
*sn = r1n;
|
||||
cy2 = mpn_add_1(sp + s2, r1 + s2, s1 - s2, cy>>1);
|
||||
if (cy2)
|
||||
{
|
||||
sp[s1] = cy2;
|
||||
if ((*sn) < 0) (*sn)--;
|
||||
else (*sn)++;
|
||||
}
|
||||
mpn_sub_1(rp + s2, r1 + s2, s1 - s2, cy&1);
|
||||
} else // both inputs have same number of limbs
|
||||
{
|
||||
*sn = r1n;
|
||||
if (cy>>1)
|
||||
{
|
||||
sp[s1] = 1;
|
||||
if ((*sn) < 0) (*sn)--;
|
||||
else (*sn)++;
|
||||
}
|
||||
}
|
||||
*rn = s1;
|
||||
MPN_NORMALIZE(rp, (*rn));
|
||||
if (r1n ^ (~swapped) < 0) *rn = -(*rn);
|
||||
}
|
||||
} else // r2 is bigger than r1 (but same number of limbs)
|
||||
{
|
||||
if ((r1n ^ r2n) >= 0) // both inputs are same sign
|
||||
{
|
||||
cy = mpn_sumdiff_n(rp, sp, r2, r1, s1);
|
||||
*rn = r1n;
|
||||
if (cy>>1)
|
||||
{
|
||||
rp[s1] = 1;
|
||||
if ((*rn) < 0) (*rn)--;
|
||||
else (*rn)++;
|
||||
}
|
||||
*sn = s1;
|
||||
MPN_NORMALIZE(sp, (*sn));
|
||||
if (r1n ^ (~swapped) > 0) *sn = -(*sn);
|
||||
} else // inputs are different sign
|
||||
{
|
||||
cy = mpn_sumdiff_n(sp, rp, r2, r1, s1);
|
||||
*sn = r1n;
|
||||
if (cy>>1)
|
||||
{
|
||||
sp[s1] = 1;
|
||||
if ((*sn) < 0) (*sn)--;
|
||||
else (*sn)++;
|
||||
}
|
||||
*rn = s1;
|
||||
MPN_NORMALIZE(rp, (*rn));
|
||||
if (r1n ^ (~swapped) > 0) *rn = -(*rn);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void tc4_sumdiff_unsigned(mp_ptr rp, mp_size_t * rn, mp_ptr sp, mp_size_t * sn, mp_ptr r1, mp_size_t r1n, mp_ptr r2, mp_size_t r2n)
|
||||
{
|
||||
mp_limb_t cy, cy2;
|
||||
mp_size_t s1 = ABS(r1n);
|
||||
mp_size_t s2 = ABS(r2n);
|
||||
int swapped = 0;
|
||||
|
||||
if (s1 < s2)
|
||||
{
|
||||
MPN_PTR_SWAP(r1, r1n, r2, r2n);
|
||||
MP_SIZE_T_SWAP(s1, s2);
|
||||
swapped = 1;
|
||||
}
|
||||
|
||||
if (!s1)
|
||||
{
|
||||
*rn = 0;
|
||||
*sn = 0;
|
||||
} else if (!s2)
|
||||
{
|
||||
if (rp != r1) MPN_COPY(rp, r1, s1);
|
||||
if (sp != r1) MPN_COPY(sp, r1, s1);
|
||||
*rn = r1n;
|
||||
*sn = (swapped ? -r1n : r1n);
|
||||
} else
|
||||
{
|
||||
mp_size_t ct;
|
||||
if (s1 != s2) ct = 1;
|
||||
else MPN_CMP(ct, r1, r2, s1);
|
||||
|
||||
if (!ct)
|
||||
{
|
||||
*sn = 0;
|
||||
*rn = r1n;
|
||||
cy = mpn_lshift(rp, r1, s1, 1);
|
||||
if (cy)
|
||||
{
|
||||
rp[s1] = cy;
|
||||
(*rn)++;
|
||||
}
|
||||
} else if (ct > 0) // r1 is bigger than r2
|
||||
{
|
||||
cy = mpn_sumdiff_n(rp, sp, r1, r2, s2);
|
||||
if (s1 > s2) // r1 has more limbs than r2
|
||||
{
|
||||
*rn = r1n;
|
||||
cy2 = mpn_add_1(rp + s2, r1 + s2, s1 - s2, cy>>1);
|
||||
if (cy2)
|
||||
{
|
||||
rp[s1] = cy2;
|
||||
(*rn)++;
|
||||
}
|
||||
mpn_sub_1(sp + s2, r1 + s2, s1 - s2, cy&1);
|
||||
} else // both inputs have same number of limbs
|
||||
{
|
||||
*rn = r1n;
|
||||
if (cy>>1)
|
||||
{
|
||||
rp[s1] = 1;
|
||||
(*rn)++;
|
||||
}
|
||||
}
|
||||
*sn = s1;
|
||||
MPN_NORMALIZE(sp, (*sn));
|
||||
if (swapped) *sn = -(*sn);
|
||||
} else // r2 is bigger than r1 (but same number of limbs)
|
||||
{
|
||||
cy = mpn_sumdiff_n(rp, sp, r2, r1, s1);
|
||||
*rn = r1n;
|
||||
if (cy>>1)
|
||||
{
|
||||
rp[s1] = 1;
|
||||
(*rn)++;
|
||||
}
|
||||
*sn = s1;
|
||||
MPN_NORMALIZE(sp, (*sn));
|
||||
if (!swapped) *sn = -(*sn);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
void _tc4_add_unsigned(mp_ptr rp, mp_size_t * rn, mp_srcptr r1, mp_size_t r1n, mp_srcptr r2, mp_size_t r2n)
|
||||
{
|
||||
mp_limb_t cy;
|
||||
mp_size_t s1 = r1n;
|
||||
mp_size_t s2 = r2n;
|
||||
|
||||
if (!s2)
|
||||
{
|
||||
if (!s1) *rn = 0;
|
||||
else
|
||||
{
|
||||
if (rp != r1) MPN_COPY(rp, r1, s1);
|
||||
*rn = r1n;
|
||||
}
|
||||
} else
|
||||
{
|
||||
*rn = r1n;
|
||||
cy = mpn_add(rp, r1, s1, r2, s2);
|
||||
if (cy)
|
||||
{
|
||||
rp[s1] = cy;
|
||||
if ((*rn) < 0) (*rn)--;
|
||||
else (*rn)++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void tc4_add_unsigned(mp_ptr rp, mp_size_t * rn, mp_srcptr r1, mp_size_t r1n, mp_srcptr r2, mp_size_t r2n)
|
||||
{
|
||||
if (r1n < r2n) _tc4_add_unsigned(rp, rn, r2, r2n, r1, r1n);
|
||||
else _tc4_add_unsigned(rp, rn, r1, r1n, r2, r2n);
|
||||
}
|
||||
|
||||
void tc4_sub(mp_ptr rp, mp_size_t * rn, mp_ptr r1, mp_size_t r1n, mp_ptr r2, mp_size_t r2n)
|
||||
{
|
||||
tc4_add(rp, rn, r1, r1n, r2, -r2n);
|
||||
}
|
||||
|
||||
void tc4_lshift(mp_ptr rp, mp_size_t * rn, mp_srcptr xp, mp_size_t xn, mp_size_t bits)
|
||||
{
|
||||
if (xn == 0) *rn = 0;
|
||||
else
|
||||
{
|
||||
mp_size_t xu = ABS(xn);
|
||||
mp_limb_t msl = mpn_lshift(rp, xp, xu, bits);
|
||||
if (msl)
|
||||
{
|
||||
rp[xu] = msl;
|
||||
*rn = (xn >= 0 ? xn + 1 : xn - 1);
|
||||
} else
|
||||
*rn = xn;
|
||||
}
|
||||
}
|
||||
|
||||
void tc4_rshift_inplace(mp_ptr rp, mp_size_t * rn, mp_size_t bits)
|
||||
{
|
||||
if (*rn)
|
||||
{
|
||||
if ((*rn) > 0)
|
||||
{
|
||||
mpn_rshift(rp, rp, *rn, bits);
|
||||
if (rp[(*rn) - 1] == CNST_LIMB(0)) (*rn)--;
|
||||
} else
|
||||
{
|
||||
mpn_rshift(rp, rp, -(*rn), bits);
|
||||
if (rp[-(*rn) - 1] == CNST_LIMB(0)) (*rn)++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#if HAVE_NATIVE_mpn_addlsh1_n
|
||||
void tc4_addlsh1_unsigned(mp_ptr rp, mp_size_t * rn, mp_srcptr xp, mp_size_t xn)
|
||||
{
|
||||
if (xn)
|
||||
{
|
||||
if (xn >= *rn)
|
||||
{
|
||||
mp_limb_t cy;
|
||||
if (xn > *rn) MPN_ZERO(rp + *rn, xn - *rn);
|
||||
cy = mpn_addlsh1_n(rp, rp, xp, xn);
|
||||
if (cy)
|
||||
{
|
||||
rp[xn] = cy;
|
||||
*rn = xn + 1;
|
||||
} else *rn = xn;
|
||||
} else
|
||||
{
|
||||
mp_limb_t cy = mpn_addlsh1_n(rp, rp, xp, xn);
|
||||
if (cy) cy = mpn_add_1(rp + xn, rp + xn, *rn - xn, cy);
|
||||
if (cy)
|
||||
{
|
||||
rp[*rn] = cy;
|
||||
(*rn)++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
void tc4_divexact_ui(mp_ptr rp, mp_size_t * rn, mp_ptr x, mp_size_t xn, mp_limb_t c)
|
||||
{
|
||||
mp_size_t abs_size;
|
||||
if (xn == 0)
|
||||
{
|
||||
*rn = 0;
|
||||
return;
|
||||
}
|
||||
abs_size = ABS (xn);
|
||||
|
||||
MPN_DIVREM_OR_DIVEXACT_1 (rp, x, abs_size, c);
|
||||
abs_size -= (rp[abs_size-1] == 0);
|
||||
*rn = (xn >= 0 ? abs_size : -abs_size);
|
||||
}
|
||||
|
||||
void tc4_divexact_by3(mp_ptr rp, mp_size_t * rn, mp_ptr x, mp_size_t xn)
|
||||
{
|
||||
if (xn)
|
||||
{
|
||||
mp_size_t xu = ABS(xn);
|
||||
mpn_divexact_by3(rp, x, xu);
|
||||
if (xn > 0)
|
||||
{
|
||||
if (rp[xu - 1] == CNST_LIMB(0)) *rn = xn - 1;
|
||||
else *rn = xn;
|
||||
} else
|
||||
{
|
||||
if (rp[xu - 1] == CNST_LIMB(0)) *rn = xn + 1;
|
||||
else *rn = xn;
|
||||
}
|
||||
} else *rn = 0;
|
||||
}
|
||||
|
||||
#if HAVE_NATIVE_mpn_divexact_byBm1of
|
||||
void tc4_divexact_by15(mp_ptr rp, mp_size_t * rn, mp_ptr x, mp_size_t xn)
|
||||
{
|
||||
if (xn)
|
||||
{
|
||||
mp_size_t xu = ABS(xn);
|
||||
mpn_divexact_byBm1of(rp, x, xu, CNST_LIMB(15), CNST_LIMB((~0)/15)); // works for 32 and 64 bits
|
||||
if (xn > 0)
|
||||
{
|
||||
if (rp[xu - 1] == CNST_LIMB(0)) *rn = xn - 1;
|
||||
else *rn = xn;
|
||||
} else
|
||||
{
|
||||
if (rp[xu - 1] == CNST_LIMB(0)) *rn = xn + 1;
|
||||
else *rn = xn;
|
||||
}
|
||||
} else *rn = 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
void tc4_addmul_1(mp_ptr wp, mp_size_t * wn, mp_srcptr xp, mp_size_t xn, mp_limb_t y)
|
||||
{
|
||||
mp_size_t sign, wu, xu, ws, new_wn, min_size, dsize;
|
||||
mp_limb_t cy;
|
||||
|
||||
/* w unaffected if x==0 or y==0 */
|
||||
if (xn == 0 || y == 0)
|
||||
return;
|
||||
|
||||
sign = xn;
|
||||
xu = ABS (xn);
|
||||
|
||||
ws = *wn;
|
||||
if (*wn == 0)
|
||||
{
|
||||
/* nothing to add to, just set x*y, "sign" gives the sign */
|
||||
cy = mpn_mul_1 (wp, xp, xu, y);
|
||||
if (cy)
|
||||
{
|
||||
wp[xu] = cy;
|
||||
xu = xu + 1;
|
||||
}
|
||||
*wn = (sign >= 0 ? xu : -xu);
|
||||
return;
|
||||
}
|
||||
|
||||
sign ^= *wn;
|
||||
wu = ABS (*wn);
|
||||
|
||||
new_wn = MAX (wu, xu);
|
||||
min_size = MIN (wu, xu);
|
||||
|
||||
if (sign >= 0)
|
||||
{
|
||||
/* addmul of absolute values */
|
||||
|
||||
cy = mpn_addmul_1 (wp, xp, min_size, y);
|
||||
|
||||
dsize = xu - wu;
|
||||
#if HAVE_NATIVE_mpn_mul_1c
|
||||
if (dsize > 0)
|
||||
cy = mpn_mul_1c (wp + min_size, xp + min_size, dsize, y, cy);
|
||||
else if (dsize < 0)
|
||||
{
|
||||
dsize = -dsize;
|
||||
cy = mpn_add_1 (wp + min_size, wp + min_size, dsize, cy);
|
||||
}
|
||||
#else
|
||||
if (dsize != 0)
|
||||
{
|
||||
mp_limb_t cy2;
|
||||
if (dsize > 0)
|
||||
cy2 = mpn_mul_1 (wp + min_size, xp + min_size, dsize, y);
|
||||
else
|
||||
{
|
||||
dsize = -dsize;
|
||||
cy2 = 0;
|
||||
}
|
||||
cy = cy2 + mpn_add_1 (wp + min_size, wp + min_size, dsize, cy);
|
||||
}
|
||||
#endif
|
||||
|
||||
if (cy)
|
||||
{
|
||||
wp[dsize + min_size] = cy;
|
||||
new_wn ++;
|
||||
}
|
||||
} else
|
||||
{
|
||||
/* submul of absolute values */
|
||||
|
||||
cy = mpn_submul_1 (wp, xp, min_size, y);
|
||||
if (wu >= xu)
|
||||
{
|
||||
/* if w bigger than x, then propagate borrow through it */
|
||||
if (wu != xu)
|
||||
cy = mpn_sub_1 (wp + xu, wp + xu, wu - xu, cy);
|
||||
|
||||
if (cy != 0)
|
||||
{
|
||||
/* Borrow out of w, take twos complement negative to get
|
||||
absolute value, flip sign of w. */
|
||||
wp[new_wn] = ~-cy; /* extra limb is 0-cy */
|
||||
mpn_com_n (wp, wp, new_wn);
|
||||
new_wn++;
|
||||
MPN_INCR_U (wp, new_wn, CNST_LIMB(1));
|
||||
ws = -*wn;
|
||||
}
|
||||
} else /* wu < xu */
|
||||
{
|
||||
/* x bigger than w, so want x*y-w. Submul has given w-x*y, so
|
||||
take twos complement and use an mpn_mul_1 for the rest. */
|
||||
|
||||
mp_limb_t cy2;
|
||||
|
||||
/* -(-cy*b^n + w-x*y) = (cy-1)*b^n + ~(w-x*y) + 1 */
|
||||
mpn_com_n (wp, wp, wu);
|
||||
cy += mpn_add_1 (wp, wp, wu, CNST_LIMB(1));
|
||||
cy -= 1;
|
||||
|
||||
/* If cy-1 == -1 then hold that -1 for latter. mpn_submul_1 never
|
||||
returns cy==MP_LIMB_T_MAX so that value always indicates a -1. */
|
||||
cy2 = (cy == MP_LIMB_T_MAX);
|
||||
cy += cy2;
|
||||
MPN_MUL_1C (cy, wp + wu, xp + wu, xu - wu, y, cy);
|
||||
wp[new_wn] = cy;
|
||||
new_wn += (cy != 0);
|
||||
|
||||
/* Apply any -1 from above. The value at wp+wsize is non-zero
|
||||
because y!=0 and the high limb of x will be non-zero. */
|
||||
if (cy2)
|
||||
MPN_DECR_U (wp+wu, new_wn - wu, CNST_LIMB(1));
|
||||
|
||||
ws = -*wn;
|
||||
}
|
||||
|
||||
/* submul can produce high zero limbs due to cancellation, both when w
|
||||
has more limbs or x has more */
|
||||
MPN_NORMALIZE (wp, new_wn);
|
||||
}
|
||||
|
||||
*wn = (ws >= 0 ? new_wn : -new_wn);
|
||||
|
||||
ASSERT (new_wn == 0 || wp[new_wn - 1] != 0);
|
||||
}
|
||||
|
||||
void tc4_submul_1(mp_ptr wp, mp_size_t * wn, mp_srcptr x, mp_size_t xn, mp_limb_t y)
|
||||
{
|
||||
tc4_addmul_1(wp, wn, x, -xn, y);
|
||||
}
|
||||
|
||||
void tc4_copy (mp_ptr yp, mp_size_t * yn, mp_size_t offset, mp_srcptr xp, mp_size_t xn)
|
||||
{
|
||||
mp_size_t yu = ABS(*yn);
|
||||
mp_size_t xu = ABS(xn);
|
||||
mp_limb_t cy = 0;
|
||||
|
||||
if (xn == 0)
|
||||
return;
|
||||
|
||||
if (offset < yu) /* low part of x overlaps with y */
|
||||
{
|
||||
if (offset + xu <= yu) /* x entirely inside y */
|
||||
{
|
||||
cy = mpn_add_n (yp + offset, yp + offset, xp, xu);
|
||||
if (offset + xu < yu)
|
||||
cy = mpn_add_1 (yp + offset + xu, yp + offset + xu,
|
||||
yu - (offset + xu), cy);
|
||||
} else
|
||||
cy = mpn_add_n (yp + offset, yp + offset, xp, yu - offset);
|
||||
/* now cy is the carry at yp + yu */
|
||||
if (xu + offset > yu) /* high part of x exceeds y */
|
||||
{
|
||||
MPN_COPY (yp + yu, xp + yu - offset, xu + offset - yu);
|
||||
cy = mpn_add_1 (yp + yu, yp + yu, xu + offset - yu, cy);
|
||||
yu = xu + offset;
|
||||
}
|
||||
/* now cy is the carry at yp + yn */
|
||||
if (cy)
|
||||
yp[yu++] = cy;
|
||||
MPN_NORMALIZE(yp, yu);
|
||||
*yn = yu;
|
||||
} else /* x does not overlap */
|
||||
{
|
||||
if (offset > yu)
|
||||
MPN_ZERO (yp + yu, offset - yu);
|
||||
MPN_COPY (yp + offset, xp, xu);
|
||||
*yn = offset + xu;
|
||||
}
|
||||
}
|
||||
|
||||
#define MUL_TC4_UNSIGNED(r3xx, n3xx, r1xx, n1xx, r2xx, n2xx) \
|
||||
do \
|
||||
{ \
|
||||
if ((n1xx != 0) && (n2xx != 0)) \
|
||||
{ mp_size_t len; \
|
||||
if (n1xx == n2xx) \
|
||||
{ \
|
||||
if (n1xx > MUL_TOOM4_THRESHOLD) mpn_toom4_mul_n(r3xx, r1xx, r2xx, n1xx); \
|
||||
else mpn_mul_n(r3xx, r1xx, r2xx, n1xx); \
|
||||
} else if (n1xx > n2xx) \
|
||||
mpn_mul(r3xx, r1xx, n1xx, r2xx, n2xx); \
|
||||
else \
|
||||
mpn_mul(r3xx, r2xx, n2xx, r1xx, n1xx); \
|
||||
len = n1xx + n2xx; \
|
||||
MPN_NORMALIZE(r3xx, len); \
|
||||
n3xx = len; \
|
||||
} else \
|
||||
n3xx = 0; \
|
||||
} while (0)
|
||||
|
||||
#define MUL_TC4(r3xx, n3xx, r1xx, n1xx, r2xx, n2xx) \
|
||||
do \
|
||||
{ \
|
||||
mp_size_t sign = n1xx ^ n2xx; \
|
||||
mp_size_t un1 = ABS(n1xx); \
|
||||
mp_size_t un2 = ABS(n2xx); \
|
||||
MUL_TC4_UNSIGNED(r3xx, n3xx, r1xx, un1, r2xx, un2); \
|
||||
if (sign < 0) n3xx = -n3xx; \
|
||||
} while (0)
|
||||
|
||||
#define TC4_NORM(rxx, nxx, sxx) \
|
||||
do \
|
||||
{ \
|
||||
nxx = sxx; \
|
||||
MPN_NORMALIZE(rxx, nxx); \
|
||||
} while(0)
|
||||
|
||||
/* Zero out limbs to end of integer */
|
||||
#define TC4_DENORM(rxx, nxx, sxx) \
|
||||
do { \
|
||||
MPN_ZERO(rxx + ABS(nxx), sxx - ABS(nxx)); \
|
||||
} while (0)
|
||||
|
||||
/* Two's complement divexact by power of 2 */
|
||||
#define TC4_DIVEXACT_2EXP(rxx, nxx, sxx) \
|
||||
do { \
|
||||
mp_limb_t sign = (LIMB_HIGHBIT_TO_MASK(rxx[nxx-1]) << (GMP_LIMB_BITS - sxx)); \
|
||||
mpn_rshift(rxx, rxx, nxx, sxx); \
|
||||
rxx[nxx-1] |= sign; \
|
||||
} while (0)
|
||||
|
||||
#if HAVE_NATIVE_mpn_rshift1
|
||||
#define TC4_RSHIFT1(rxx, nxx) \
|
||||
do { \
|
||||
mp_limb_t sign = (LIMB_HIGHBIT_TO_MASK(rxx[nxx-1]) << (GMP_LIMB_BITS - 1)); \
|
||||
mpn_rshift1(rxx, rxx, nxx); \
|
||||
rxx[nxx-1] |= sign; \
|
||||
} while (0)
|
||||
#else
|
||||
#define TC4_RSHIFT1(rxx, nxx) \
|
||||
do { \
|
||||
mp_limb_t sign = (LIMB_HIGHBIT_TO_MASK(rxx[nxx-1]) << (GMP_LIMB_BITS - 1)); \
|
||||
mpn_rshift(rxx, rxx, nxx, 1); \
|
||||
rxx[nxx-1] |= sign; \
|
||||
} while (0)
|
||||
#endif
|
||||
|
||||
#define r1 (tp)
|
||||
#define r2 (tp + t4)
|
||||
#define r4 (tp + 2*t4)
|
||||
#define r6 (tp + 3*t4)
|
||||
|
||||
#define r3 (rp + 4*sn)
|
||||
#define r5 (rp + 2*sn)
|
||||
#define r7 (rp)
|
||||
|
||||
#define mpn_clearit(rxx, nxx) \
|
||||
do { \
|
||||
mp_size_t ind = 0; \
|
||||
for ( ; ind < nxx; ind++) \
|
||||
(rxx)[ind] = CNST_LIMB(0); \
|
||||
} while (0)
|
||||
|
||||
/* Multiply {up, un} by {vp, vn} and write the result to
|
||||
{prodp, un + vn}.
|
||||
|
||||
Note that prodp gets un + vn limbs stored, even if the actual
|
||||
result only needs un + vn - 1.
|
||||
*/
|
||||
void
|
||||
mpn_toom4_mul (mp_ptr rp, mp_srcptr up, mp_srcptr un,
|
||||
mp_srcptr vp, mp_size_t vn)
|
||||
{
|
||||
mp_size_t ind;
|
||||
mp_limb_t cy, r30, r31;
|
||||
mp_ptr tp;
|
||||
mp_size_t a0n, a1n, a2n, a3n, b0n, b1n, b2n, b3n, sn, n1, n2, n3, n4, n5, n6, n7, n8, rpn, t4;
|
||||
|
||||
MPN_NORMALIZE(up, len1);
|
||||
MPN_NORMALIZE(vp, len2);
|
||||
|
||||
sn = (un + 3) / 4;
|
||||
|
||||
ASSERT (vn > 3*sn);
|
||||
|
||||
#define a0 (up)
|
||||
#define a1 (up + sn)
|
||||
#define a2 (up + 2*sn)
|
||||
#define a3 (up + 3*sn)
|
||||
#define b0 (vp)
|
||||
#define b1 (vp + sn)
|
||||
#define b2 (vp + 2*sn)
|
||||
#define b3 (vp + 3*sn)
|
||||
|
||||
TC4_NORM(a0, a0n, sn);
|
||||
TC4_NORM(a1, a1n, sn);
|
||||
TC4_NORM(a2, a2n, sn);
|
||||
TC4_NORM(a3, a3n, un - 3*sn);
|
||||
TC4_NORM(b0, b0n, sn);
|
||||
TC4_NORM(b1, b1n, sn);
|
||||
TC4_NORM(b2, b2n, sn);
|
||||
TC4_NORM(b3, b3n, vn - 3*sn);
|
||||
|
||||
t4 = 2*sn+2; // allows mult of 2 integers of sn + 1 limbs
|
||||
|
||||
tp = __GMP_ALLOCATE_FUNC_LIMBS(4*t4 + 5*(sn + 1));
|
||||
|
||||
#define u2 (tp + 4*t4)
|
||||
#define u3 (tp + 4*t4 + (sn+1))
|
||||
#define u4 (tp + 4*t4 + 2*(sn+1))
|
||||
#define u5 (tp + 4*t4 + 3*(sn+1))
|
||||
#define u6 (tp + 4*t4 + 4*(sn+1))
|
||||
|
||||
tc4_add_unsigned(u6, &n6, a3, a3n, a1, a1n);
|
||||
tc4_add_unsigned(u5, &n5, a2, a2n, a0, a0n);
|
||||
#if HAVE_NATIVE_mpn_sumdiff_n
|
||||
tc4_sumdiff_unsigned(u3, &n3, u4, &n4, u5, n5, u6, n6);
|
||||
#else
|
||||
tc4_add_unsigned(u3, &n3, u5, n5, u6, n6);
|
||||
tc4_sub(u4, &n4, u5, n5, u6, n6);
|
||||
#endif
|
||||
tc4_add_unsigned(u6, &n6, b3, b3n, b1, b1n);
|
||||
tc4_add_unsigned(u5, &n5, b2, b2n, b0, b0n);
|
||||
#if HAVE_NATIVE_mpn_sumdiff_n
|
||||
tc4_sumdiff_unsigned(r2, &n8, u5, &n5, u5, n5, u6, n6);
|
||||
#else
|
||||
tc4_add_unsigned(r2, &n8, u5, n5, u6, n6);
|
||||
tc4_sub(u5, &n5, u5, n5, u6, n6);
|
||||
#endif
|
||||
|
||||
MUL_TC4_UNSIGNED(r3, n3, u3, n3, r2, n8);
|
||||
MUL_TC4(r4, n4, u4, n4, u5, n5);
|
||||
|
||||
tc4_lshift(r1, &n1, a0, a0n, 3);
|
||||
#if HAVE_NATIVE_mpn_addlsh1_n
|
||||
tc4_addlsh1_unsigned(r1, &n1, a2, a2n);
|
||||
#else
|
||||
tc4_addmul_1(r1, &n1, a2, a2n, 2);
|
||||
#endif
|
||||
tc4_lshift(r2, &n8, a1, a1n, 2);
|
||||
tc4_add(r2, &n8, r2, n8, a3, a3n);
|
||||
tc4_add(u5, &n5, r1, n1, r2, n8);
|
||||
tc4_sub(u6, &n6, r1, n1, r2, n8);
|
||||
tc4_lshift(r1, &n1, b0, b0n, 3);
|
||||
#if HAVE_NATIVE_mpn_addlsh1_n
|
||||
tc4_addlsh1_unsigned(r1, &n1, b2, b2n);
|
||||
#else
|
||||
tc4_addmul_1(r1, &n1, b2, b2n, 2);
|
||||
#endif
|
||||
tc4_lshift(r2, &n8, b1, b1n, 2);
|
||||
tc4_add(r2, &n8, r2, n8, b3, b3n);
|
||||
tc4_add(u2, &n2, r1, n1, r2, n8);
|
||||
tc4_sub(r2, &n8, r1, n1, r2, n8);
|
||||
|
||||
r30 = r3[0];
|
||||
if (!n3) r30 = CNST_LIMB(0);
|
||||
r31 = r3[1];
|
||||
MUL_TC4_UNSIGNED(r5, n5, u5, n5, u2, n2);
|
||||
MUL_TC4(r6, n6, u6, n6, r2, n8);
|
||||
r3[1] = r31;
|
||||
|
||||
tc4_lshift(u2, &n2, a3, a3n, 3);
|
||||
tc4_addmul_1(u2, &n2, a2, a2n, 4);
|
||||
#if HAVE_NATIVE_mpn_addlsh1_n
|
||||
tc4_addlsh1_unsigned(u2, &n2, a1, a1n);
|
||||
#else
|
||||
tc4_addmul_1(u2, &n2, a1, a1n, 2);
|
||||
#endif
|
||||
tc4_add(u2, &n2, u2, n2, a0, a0n);
|
||||
tc4_lshift(r1, &n1, b3, b3n, 3);
|
||||
tc4_addmul_1(r1, &n1, b2, b2n, 4);
|
||||
#if HAVE_NATIVE_mpn_addlsh1_n
|
||||
tc4_addlsh1_unsigned(r1, &n1, b1, b1n);
|
||||
#else
|
||||
tc4_addmul_1(r1, &n1, b1, b1n, 2);
|
||||
#endif
|
||||
tc4_add(r1, &n1, r1, n1, b0, b0n);
|
||||
|
||||
MUL_TC4_UNSIGNED(r2, n2, u2, n2, r1, n1);
|
||||
MUL_TC4_UNSIGNED(r1, n1, a3, a3n, b3, b3n);
|
||||
MUL_TC4_UNSIGNED(r7, n7, a0, a0n, b0, b0n);
|
||||
|
||||
TC4_DENORM(r1, n1, t4 - 1);
|
||||
TC4_DENORM(r2, n2, t4 - 1);
|
||||
if (n3)
|
||||
TC4_DENORM(r3, n3, t4 - 1);
|
||||
else {
|
||||
/* MPN_ZERO defeats gcc 4.1.2 here, hence the explicit for loop */
|
||||
for (ind = 1 ; ind < t4 - 1; ind++)
|
||||
(r3)[ind] = CNST_LIMB(0);
|
||||
}
|
||||
TC4_DENORM(r4, n4, t4 - 1);
|
||||
TC4_DENORM(r5, n5, t4 - 1);
|
||||
TC4_DENORM(r6, n6, t4 - 1);
|
||||
TC4_DENORM(r7, n7, t4 - 2); // we treat r7 differently (it cannot exceed t4-2 in length)
|
||||
|
||||
/* rp rp1 rp2 rp3 rp4 rp5 rp6 rp7
|
||||
<----------- r7-----------><------------r5-------------->
|
||||
<-------------r3------------->
|
||||
|
||||
<-------------r6-------------> < -----------r2------------>{ }
|
||||
<-------------r4--------------> <--------------r1---->
|
||||
*/
|
||||
|
||||
toom4_interpolate(rp, &rpn, sn, tp, t4 - 1, n4, n6, r30);
|
||||
|
||||
if (rpn != un + vn)
|
||||
{
|
||||
MPN_ZERO((rp + rpn), un + vn - rpn);
|
||||
}
|
||||
|
||||
__GMP_FREE_FUNC_LIMBS (tp, 4*t4 + 5*(sn+1));
|
||||
}
|
||||
|
||||
/*
|
||||
Toom 4 interpolation. Interpolates the value at 2^(sn*B) of a
|
||||
polynomial p(x) with 7 coefficients given the values
|
||||
p(oo), p(2), p(1), p(-1), 2^6*p(1/2), 2^6*p(-1/2), p(0).
|
||||
The output is placed in rp and the final number of limbs of the
|
||||
output is given in rpn.
|
||||
The 4th and 6th values may be negative, and if so, n4 and n6
|
||||
should be set to a negative value respectively.
|
||||
To save space we pass r3, r5, r7 in place in the output rp.
|
||||
The other r's are stored separately in space tp.
|
||||
The low limb of r3 is stored in r30, as it will be overwritten
|
||||
by the high limb of r5.
|
||||
|
||||
rp rp1 rp2 rp3 rp4 rp5 rp6 rp7
|
||||
<----------- r7-----------><------------r5-------------->
|
||||
<-------------r3------------->
|
||||
|
||||
We assume that r1 is stored at tp, r2 at (tp + t4), r4 at (tp + 2*t4)
|
||||
and r6 (tp + 3*t4). Each of these r's has t4 = s4 + 1 limbs allocated.
|
||||
*/
|
Loading…
Reference in New Issue
Block a user