/* mpir/mpn/generic/toom7_mul_n.c — 2009 lines, 49 KiB, C */
/* mpn_toom7_mul_n -- Internal routine to multiply two natural numbers
of length n.
THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY
SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.
*/
/* Implementation of the Bodrato-Zanoni algorithm for Toom-Cook 7-way.
Copyright 2001, 2002, 2004, 2005, 2006 Free Software Foundation, Inc.
Copyright Marco Bodrato, November 2006
Copyright 2009 William Hart
This file is part of the MPIR Library.
The MPIR Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or (at your
option) any later version.
The MPIR Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
License for more details.
You should have received a copy of the GNU Lesser General Public License
along with the MPIR Library; see the file COPYING.LIB. If not, write to
the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA. */
/*
This implementation is based on that of Paul Zimmermann, which is available
for Toom 4 for mpz_t's at http://www.loria.fr/~zimmerma/software/toom7.c
and uses the Toom 7 sequence as generated by Bodrato and Zanoni at
http://bodrato.it/software/tc3-7.27nov2006.tar.bz2
Please see the papers of Bodrato (and Zanoni) at http://www.bodrato.it/
In particular see the paper by these authors:
Integer and Polynomial Multiplication: Towards
Optimal Toom-Cook Matrices in Proceedings of the ISSAC 2007 conference,
Ontario, Canada, July 29-August 1, 2007, ACM press.
*/
#include "mpir.h"
#include "gmp-impl.h"
#include "longlong.h"
#define TC7_LIB 1 // library build only
#if !TC7_LIB
#include <stdio.h>
#include <stdlib.h>
#define TC7_TEST 0 // test code
#define TC7_TIME 1 // timing code
#else
#define TC7_TEST 0
#define TC7_TIME 0
#endif
void
mpn_toom7_mul_n (mp_ptr rp, mp_srcptr up,
mp_srcptr vp, mp_size_t n);
void toom7_interpolate(mp_ptr rp, mp_size_t * rpn, mp_size_t sn,
mp_ptr tp, mp_size_t s7, mp_size_t n2, mp_size_t n6,
mp_size_t n8, mp_size_t n9, mp_size_t n11);
/* Signed mpn addition: {rp, *rn} = {r1, r1n} + {r2, r2n}, where a
   negative size field denotes a negative operand.  Requires
   ABS(r1n) >= ABS(r2n); callers go through tc7_add, which orders the
   operands.  rp must have room for ABS(r1n) + 1 limbs. */
void _tc7_add(mp_ptr rp, mp_size_t * rn, mp_srcptr r1,
mp_size_t r1n, mp_srcptr r2, mp_size_t r2n)
{
   mp_limb_t cy;
   mp_size_t s1 = ABS(r1n);
   mp_size_t s2 = ABS(r2n);

   if (!s1)
   {
      /* both operands are zero (s1 >= s2) */
      *rn = 0;
   } else if (!s2)
   {
      /* r2 == 0: result is just r1 */
      if (rp != r1) MPN_COPY(rp, r1, s1);
      *rn = r1n;
   } else if ((r1n ^ r2n) >= 0)
   {
      /* same sign: add magnitudes, keep the common sign */
      *rn = r1n;
      cy = mpn_add(rp, r1, s1, r2, s2);
      if (cy)
      {
         rp[s1] = cy;
         if ((*rn) < 0) (*rn)--;   /* grow by one limb, preserving sign */
         else (*rn)++;
      }
   } else
   {
      /* opposite signs: subtract the smaller magnitude from the larger */
      mp_size_t ct;
      if (s1 != s2) ct = 1;              /* s1 > s2 implies |r1| > |r2| */
      else MPN_CMP(ct, r1, r2, s1);
      if (!ct) *rn = 0;                  /* equal magnitudes cancel */
      else if (ct > 0)
      {
         /* |r1| > |r2|: result has the sign of r1 */
         mpn_sub(rp, r1, s1, r2, s2);
         *rn = s1;
         MPN_NORMALIZE(rp, (*rn));       /* cancellation can shrink result */
         if (r1n < 0) *rn = -(*rn);
      }
      else
      {
         /* |r2| > |r1| (same limb count): result has the sign of r2 */
         mpn_sub_n(rp, r2, r1, s1);
         *rn = s1;
         MPN_NORMALIZE(rp, (*rn));
         if (r1n > 0) *rn = -(*rn);
      }
   }
}
/* Signed mpn addition wrapper: orders the operands so the first has
   at least as many limbs as the second, then delegates to _tc7_add.
   Size fields carry the signs of the operands. */
void tc7_add(mp_ptr rp, mp_size_t * rn, mp_srcptr r1, mp_size_t r1n, mp_srcptr r2, mp_size_t r2n)
{
   if (ABS(r1n) >= ABS(r2n))
      _tc7_add(rp, rn, r1, r1n, r2, r2n);
   else
      _tc7_add(rp, rn, r2, r2n, r1, r1n);
}
#if HAVE_NATIVE_mpn_sumdiff_n
/* Simultaneous signed add/sub using the native mpn_sumdiff_n:
   {rp, *rn} = r1 + r2 and {sp, *sn} = r1 - r2, where a negative size
   field denotes a negative operand.  rp and sp must each have room
   for MAX(ABS(r1n), ABS(r2n)) + 1 limbs. */
void tc7_sumdiff(mp_ptr rp, mp_size_t * rn, mp_ptr sp, mp_size_t * sn, mp_ptr r1, mp_size_t r1n, mp_ptr r2, mp_size_t r2n)
{
   mp_limb_t cy, cy2;
   mp_size_t s1 = ABS(r1n);
   mp_size_t s2 = ABS(r2n);
   int swapped = 0;

   /* make r1 the operand with at least as many limbs; remember the
      swap so the sign of the difference can be corrected below */
   if (s1 < s2)
   {
      MPN_PTR_SWAP(r1, r1n, r2, r2n);
      MP_SIZE_T_SWAP(s1, s2);
      swapped = 1;
   }

   if (!s1)
   {
      /* both operands zero */
      *rn = 0;
      *sn = 0;
   } else if (!s2)
   {
      /* r2 == 0: sum and difference both equal r1 (difference is
         negated if the operands were swapped) */
      if (rp != r1) MPN_COPY(rp, r1, s1);
      if (sp != r1) MPN_COPY(sp, r1, s1);
      *rn = r1n;
      *sn = (swapped ? -r1n : r1n);
   } else
   {
      mp_size_t ct;
      if (s1 != s2) ct = 1;       /* more limbs implies larger magnitude */
      else MPN_CMP(ct, r1, r2, s1);
      if (!ct)
      {
         /* equal magnitudes: one of sum/difference is zero and the
            other is 2*r1 */
         if ((r1n ^ r2n) >= 0)
         {
            *sn = 0;
            *rn = r1n;
            cy = mpn_lshift1(rp, r1, s1);
            if (cy)
            {
               rp[s1] = cy;
               if ((*rn) < 0) (*rn)--;
               else (*rn)++;
            }
         } else
         {
            *rn = 0;
            *sn = (swapped ? -r1n : r1n);
            cy = mpn_lshift1(sp, r1, s1);
            if (cy)
            {
               sp[s1] = cy;
               if ((*sn) < 0) (*sn)--;
               else (*sn)++;
            }
         }
      } else if (ct > 0) // r1 is bigger than r2
      {
         if ((r1n ^ r2n) >= 0) // both inputs are same sign
         {
            /* mpn_sumdiff_n's return packs (add carry << 1) | sub borrow */
            cy = mpn_sumdiff_n(rp, sp, r1, r2, s2);
            if (s1 > s2) // r1 has more limbs than r2
            {
               *rn = r1n;
               cy2 = mpn_add_1(rp + s2, r1 + s2, s1 - s2, cy>>1);
               if (cy2)
               {
                  rp[s1] = cy2;
                  if ((*rn) < 0) (*rn)--;
                  else (*rn)++;
               }
               mpn_sub_1(sp + s2, r1 + s2, s1 - s2, cy&1);
            } else // both inputs have same number of limbs
            {
               *rn = r1n;
               if (cy>>1)
               {
                  rp[s1] = 1;
                  if ((*rn) < 0) (*rn)--;
                  else (*rn)++;
               }
            }
            *sn = s1;
            MPN_NORMALIZE(sp, (*sn));
            /* NOTE(review): `<` binds tighter than `^`, so this parses as
               r1n ^ ((~swapped) < 0) == r1n ^ 1, which looks unintended;
               a combination of the sign of r1n with `swapped` was
               presumably meant — confirm against upstream MPIR. */
            if (r1n ^ (~swapped) < 0) *sn = -(*sn);
         } else // inputs are different sign
         {
            cy = mpn_sumdiff_n(sp, rp, r1, r2, s2);
            if (s1 > s2) // r1 has more limbs than r2
            {
               *sn = r1n;
               cy2 = mpn_add_1(sp + s2, r1 + s2, s1 - s2, cy>>1);
               if (cy2)
               {
                  sp[s1] = cy2;
                  if ((*sn) < 0) (*sn)--;
                  else (*sn)++;
               }
               mpn_sub_1(rp + s2, r1 + s2, s1 - s2, cy&1);
            } else // both inputs have same number of limbs
            {
               *sn = r1n;
               if (cy>>1)
               {
                  sp[s1] = 1;
                  if ((*sn) < 0) (*sn)--;
                  else (*sn)++;
               }
            }
            *rn = s1;
            MPN_NORMALIZE(rp, (*rn));
            /* NOTE(review): same precedence concern as above */
            if (r1n ^ (~swapped) < 0) *rn = -(*rn);
         }
      } else // r2 is bigger than r1 (but same number of limbs)
      {
         if ((r1n ^ r2n) >= 0) // both inputs are same sign
         {
            cy = mpn_sumdiff_n(rp, sp, r2, r1, s1);
            *rn = r1n;
            if (cy>>1)
            {
               rp[s1] = 1;
               if ((*rn) < 0) (*rn)--;
               else (*rn)++;
            }
            *sn = s1;
            MPN_NORMALIZE(sp, (*sn));
            /* NOTE(review): parses as r1n ^ ((~swapped) > 0) == r1n ^ 0;
               same precedence concern as above */
            if (r1n ^ (~swapped) > 0) *sn = -(*sn);
         } else // inputs are different sign
         {
            cy = mpn_sumdiff_n(sp, rp, r2, r1, s1);
            *sn = r1n;
            if (cy>>1)
            {
               sp[s1] = 1;
               if ((*sn) < 0) (*sn)--;
               else (*sn)++;
            }
            *rn = s1;
            MPN_NORMALIZE(rp, (*rn));
            /* NOTE(review): same precedence concern as above */
            if (r1n ^ (~swapped) > 0) *rn = -(*rn);
         }
      }
   }
}
/* Unsigned variant of tc7_sumdiff: {rp, *rn} = r1 + r2 and
   {sp, *sn} = r1 - r2 for non-negative operands; *sn comes out
   negative when r1 < r2.  rp and sp each need
   MAX(r1n, r2n) + 1 limbs of space. */
void tc7_sumdiff_unsigned(mp_ptr rp, mp_size_t * rn, mp_ptr sp, mp_size_t * sn, mp_ptr r1, mp_size_t r1n, mp_ptr r2, mp_size_t r2n)
{
   mp_limb_t cy, cy2;
   mp_size_t s1 = ABS(r1n);
   mp_size_t s2 = ABS(r2n);
   int swapped = 0;

   /* make r1 the longer operand; `swapped` records that the true
      difference is the negative of the one computed below */
   if (s1 < s2)
   {
      MPN_PTR_SWAP(r1, r1n, r2, r2n);
      MP_SIZE_T_SWAP(s1, s2);
      swapped = 1;
   }

   if (!s1)
   {
      /* both operands zero */
      *rn = 0;
      *sn = 0;
   } else if (!s2)
   {
      /* r2 == 0: both results equal r1 (difference negated on swap) */
      if (rp != r1) MPN_COPY(rp, r1, s1);
      if (sp != r1) MPN_COPY(sp, r1, s1);
      *rn = r1n;
      *sn = (swapped ? -r1n : r1n);
   } else
   {
      mp_size_t ct;
      if (s1 != s2) ct = 1;       /* more limbs implies larger value */
      else MPN_CMP(ct, r1, r2, s1);
      if (!ct)
      {
         /* operands equal: difference 0, sum 2*r1 */
         *sn = 0;
         *rn = r1n;
         cy = mpn_lshift1(rp, r1, s1);
         if (cy)
         {
            rp[s1] = cy;
            (*rn)++;
         }
      } else if (ct > 0) // r1 is bigger than r2
      {
         /* mpn_sumdiff_n's return packs (add carry << 1) | sub borrow */
         cy = mpn_sumdiff_n(rp, sp, r1, r2, s2);
         if (s1 > s2) // r1 has more limbs than r2
         {
            *rn = r1n;
            cy2 = mpn_add_1(rp + s2, r1 + s2, s1 - s2, cy>>1);
            if (cy2)
            {
               rp[s1] = cy2;
               (*rn)++;
            }
            mpn_sub_1(sp + s2, r1 + s2, s1 - s2, cy&1);
         } else // both inputs have same number of limbs
         {
            *rn = r1n;
            if (cy>>1)
            {
               rp[s1] = 1;
               (*rn)++;
            }
         }
         *sn = s1;
         MPN_NORMALIZE(sp, (*sn));
         if (swapped) *sn = -(*sn);    /* we computed r1 - r2 post-swap */
      } else // r2 is bigger than r1 (but same number of limbs)
      {
         cy = mpn_sumdiff_n(rp, sp, r2, r1, s1);
         *rn = r1n;
         if (cy>>1)
         {
            rp[s1] = 1;
            (*rn)++;
         }
         *sn = s1;
         MPN_NORMALIZE(sp, (*sn));
         if (!swapped) *sn = -(*sn);   /* we computed r2 - r1 */
      }
   }
}
#endif
/* ~~~~~~~~~~~~~~~~~~~~UNTESTED CODE~~~~~~~~~~~~~~~~~~~~~~~~~~
#define MPN_ADDSUB_CMP(cyxx, r1xx, r2xx, r3xx, snxx) \
do { \
mp_limb_t t[2]; \
add_ssaaaa(t[1], t[0], CNST_LIMB(0), r1xx[snxx - 1], CNST_LIMB(0), r2xx[snxx - 1]); \
if (t[1]) cyxx = 1; \
else if (t[0] > r3xx[snxx - 1]) cyxx = 1; \
else if (t[0] < r3xx[snxx - 1] - CNST_LIMB(1)) cyxx = -1; \
else cyxx = 0; \
} while (0)
void tc7_addadd(mp_ptr rp, mp_size_t * rn, mp_ptr r1, mp_size_t r1n, mp_ptr r2, mp_size_t r2n, mp_ptr r3, n3, mp_size_t r3n)
{
mp_size_t s1 = ABS(r1n);
mp_size_t s2 = ABS(r2n);
mp_size_t s3 = ABS(r3n);
if ((s1 != s2) || (s1 != s3))
{
tc7_add(rp, rn, r1, r1n, r2, r2n);
tc7_add(rp, rn, rp, *rn, r3, r3n);
} else
{
mp_limb_t cy;
mp_size_t cy2;
if (((r1n ^ r2n) >= 0) && ((r1n ^ r3n) >= 0)) // all same sign addadd
{
cy = mpn_addadd_n(rp, r1, r2, r3, s1);
*rn = r1n;
if (cy)
{
rp[s1] = cy;
if ((*rn) < 0) (*rn)--;
else (*rn)++;
}
} else if (((r1n ^ r2n) >= 0) && ((r1n ^ r3n) < 0)) // addsub
{
MPN_ADDSUB_CMP(cy2, r1, r2, r3, s1);
if (cy2 > 0)
{
cy = mpn_addsub_n(rp, r1, r2, r3, s1);
*rn = r1n;
if (cy)
{
rp[s1] = cy;
if ((*rn) < 0) (*rn)--;
else (*rn)++;
}
} else if (cy2 < 0)
{
cy = mpn_subadd_n(rp, r3, r1, r2, s1);
if (cy) abort();
*rn = s1;
MPN_NORMALIZE(rp, (*rn));
if (r1n < 0) *rn = -(*rn);
} else
{
tc7_add(rp, rn, r1, r1n, r2, r2n);
tc7_add(rp, rn, rp, *rn, r3, r3n);
}
} else if (((r1n ^ r2n) < 0) && ((r1n ^ r3n) >= 0)) // subadd
{
MPN_ADDSUB_CMP(cy2, r1, r3, r2, s1);
if (cy2 > 0)
{
cy = mpn_addsub_n(rp, r1, r3, r2, s1);
*rn = r1n;
if (cy)
{
rp[s1] = cy;
if ((*rn) < 0) (*rn)--;
else (*rn)++;
}
} else if (cy2 < 0)
{
mpn_subadd_n(rp, r2, r1, r3, s1);
*rn = s1;
MPN_NORMALIZE(rp, (*rn));
if (r1n > 0) *rn = -(*rn);
} else
{
tc7_add(rp, rn, r1, r1n, r2, r2n);
tc7_add(rp, rn, rp, *rn, r3, r3n);
}
} else // add final two and subtract first
{
MPN_ADDSUB_CMP(cy2, r2, r3, r1, s1);
if (cy2 > 0)
{
cy = mpn_addsub_n(rp, r2, r3, r1, s1);
*rn = r1n;
if (cy)
{
rp[s1] = cy;
if ((*rn) < 0) (*rn)--;
else (*rn)++;
}
} else if (cy2 < 0)
{
mpn_subadd_n(rp, r1, r2, r3, s1);
*rn = s1;
MPN_NORMALIZE(rp, (*rn));
if (r1n < 0) *rn = -(*rn);
} else
{
tc7_add(rp, rn, r1, r1n, r2, r2n);
tc7_add(rp, rn, rp, *rn, r3, r3n);
}
}
}
}
void tc7_addsub(mp_ptr rp, mp_size_t * rn, mp_ptr r1, mp_size_t r1n, mp_ptr r2, mp_size_t r2n, mp_ptr r3, mp_size_t r3n)
{
tc7_addadd(rp, rn, r1, r1n, r2, r2n, r3, -r3n);
}
void tc7_subsub(mp_ptr rp, mp_size_t * rn, mp_ptr r1, mp_size_t r1n, mp_ptr r2, mp_size_t r2n, mp_ptr r3, mp_size_t r3n)
{
tc7_addadd(rp, rn, r1, r1n, r2, -r2n, r3, -r3n);
}
*/
/* Unsigned mpn addition: {rp, *rn} = {r1, r1n} + {r2, r2n}, with
   r1n >= r2n >= 0 (callers go through tc7_add_unsigned, which orders
   the operands).  rp must have room for r1n + 1 limbs. */
void _tc7_add_unsigned(mp_ptr rp, mp_size_t * rn, mp_srcptr r1, mp_size_t r1n, mp_srcptr r2, mp_size_t r2n)
{
   mp_limb_t top;

   if (r2n == 0)
   {
      /* r2 == 0: result is r1 (or zero when both are empty) */
      if (r1n == 0)
      {
         *rn = 0;
         return;
      }
      if (rp != r1) MPN_COPY(rp, r1, r1n);
      *rn = r1n;
      return;
   }

   *rn = r1n;
   top = mpn_add(rp, r1, r1n, r2, r2n);
   if (top != 0)
   {
      /* carry out: result grows by one limb */
      rp[r1n] = top;
      if ((*rn) < 0) (*rn)--;
      else (*rn)++;
   }
}
/* Unsigned addition wrapper: dispatches to _tc7_add_unsigned with
   the longer operand first.  Sizes must be non-negative. */
void tc7_add_unsigned(mp_ptr rp, mp_size_t * rn, mp_srcptr r1, mp_size_t r1n, mp_srcptr r2, mp_size_t r2n)
{
   if (r1n >= r2n)
      _tc7_add_unsigned(rp, rn, r1, r1n, r2, r2n);
   else
      _tc7_add_unsigned(rp, rn, r2, r2n, r1, r1n);
}
/* Signed subtraction: {rp, *rn} = {r1, r1n} - {r2, r2n}, implemented
   as an addition with the second operand's sign flipped. */
void tc7_sub(mp_ptr rp, mp_size_t * rn, mp_ptr r1, mp_size_t r1n, mp_ptr r2, mp_size_t r2n)
{
   mp_size_t neg_r2n = -r2n;
   tc7_add(rp, rn, r1, r1n, r2, neg_r2n);
}
/* Left shift: {rp, *rn} = {xp, xn} << bits, sign carried by the size
   field.  Requires 0 < bits < GMP_LIMB_BITS and room for
   ABS(xn) + 1 limbs at rp. */
void tc7_lshift(mp_ptr rp, mp_size_t * rn, mp_srcptr xp, mp_size_t xn, mp_size_t bits)
{
   mp_size_t limbs;
   mp_limb_t spill;

   if (!xn)
   {
      *rn = 0;
      return;
   }
   limbs = ABS(xn);
   spill = mpn_lshift(rp, xp, limbs, bits);
   *rn = xn;
   if (spill)
   {
      /* shifted-out bits occupy one extra limb */
      rp[limbs] = spill;
      *rn += (xn >= 0) ? 1 : -1;
   }
}
/* In-place right shift of the signed value {rp, *rn} by bits, where
   0 < bits < GMP_LIMB_BITS.  A high limb that becomes zero is
   stripped from the (signed) size. */
void tc7_rshift_inplace(mp_ptr rp, mp_size_t * rn, mp_size_t bits)
{
   mp_size_t size = *rn;
   mp_size_t limbs;

   if (size == 0)
      return;
   limbs = ABS(size);
   mpn_rshift(rp, rp, limbs, bits);
   if (rp[limbs - 1] == CNST_LIMB(0))
      *rn = (size > 0) ? size - 1 : size + 1;
}
#if HAVE_NATIVE_mpn_addlsh1_n
/* {rp, *rn} += 2 * {xp, xn} for non-negative sizes, using the native
   addlsh1 primitive.  rp must have room for MAX(*rn, xn) + 1 limbs. */
void tc7_addlsh1_unsigned(mp_ptr rp, mp_size_t * rn, mp_srcptr xp, mp_size_t xn)
{
   mp_limb_t cy;

   if (xn == 0)
      return;

   if (xn >= *rn)
   {
      /* x spans at least as many limbs as r: pad r with zeros first */
      if (xn > *rn)
         MPN_ZERO(rp + *rn, xn - *rn);
      cy = mpn_addlsh1_n(rp, rp, xp, xn);
      *rn = xn;
      if (cy)
      {
         rp[xn] = cy;
         (*rn)++;
      }
   }
   else
   {
      /* r is longer: propagate the carry through its upper limbs */
      cy = mpn_addlsh1_n(rp, rp, xp, xn);
      if (cy)
         cy = mpn_add_1(rp + xn, rp + xn, *rn - xn, cy);
      if (cy)
      {
         rp[*rn] = cy;
         (*rn)++;
      }
   }
}
#endif
/*
void tc7_divexact_ui(mp_ptr rp, mp_size_t * rn, mp_srcptr x, mp_size_t xn, mp_limb_t c)
{
if (xn)
{
mp_size_t xu = ABS(xn);
mp_limb_t cy = mpn_divmod_1(rp, x, xu, c);
if (xn > 0)
{
if (rp[xu - 1] == 0) *rn = xn - 1;
else *rn = xn;
} else
{
if (rp[xu - 1] == 0) *rn = xn + 1;
else *rn = xn;
}
} else *rn = 0;
}
*/
/* {rp, *rn} = {xp, xn} * c, sign carried by the size field.  c is
   never zero at the toom7 call sites.  rp must have room for
   ABS(xn) + 1 limbs. */
void tc7_mul_1(mp_ptr rp, mp_size_t * rn, mp_srcptr xp, mp_size_t xn, mp_limb_t c)
{
   mp_size_t limbs;
   mp_limb_t top;

   if (!xn)
   {
      *rn = 0;
      return;
   }
   limbs = ABS(xn);
   top = mpn_mul_1(rp, xp, limbs, c);
   *rn = xn;
   if (top)
   {
      /* high limb of the product is nonzero: result grows by one */
      rp[limbs] = top;
      *rn += (xn >= 0) ? 1 : -1;
   }
}
/* Exact division: {rp, *rn} = {x, xn} / c, where c must divide the
   input exactly.  The quotient can shrink by at most one limb. */
void tc7_divexact_1(mp_ptr rp, mp_size_t * rn, mp_ptr x, mp_size_t xn, mp_limb_t c)
{
   mp_size_t limbs;

   if (!xn)
   {
      *rn = 0;
      return;
   }
   limbs = ABS (xn);
   MPN_DIVREM_OR_DIVEXACT_1 (rp, x, limbs, c);
   if (rp[limbs - 1] == 0)
      limbs--;                      /* quotient lost its top limb */
   *rn = (xn < 0) ? -limbs : limbs;
}
/* Exact division by 3: {rp, *rn} = {x, xn} / 3, sign carried by the
   size field.  The input must be an exact multiple of 3. */
void tc7_divexact_by3(mp_ptr rp, mp_size_t * rn, mp_ptr x, mp_size_t xn)
{
   mp_size_t limbs;

   if (!xn)
   {
      *rn = 0;
      return;
   }
   limbs = ABS(xn);
   mpn_divexact_by3(rp, x, limbs);
   if (rp[limbs - 1] == CNST_LIMB(0))
      limbs--;                      /* quotient lost its top limb */
   *rn = (xn < 0) ? -limbs : limbs;
}
#if HAVE_NATIVE_mpn_divexact_byBm1of
/* Exact division by 15 via the native byBm1of primitive (15 divides
   B - 1 for both 32- and 64-bit limbs).  Sign carried by the size
   field; the input must be an exact multiple of 15.
   NOTE(review): the inverse operand is written CNST_LIMB((~0)/15);
   if CNST_LIMB pastes a suffix onto its argument this evaluates
   (~0)/15L in signed arithmetic (which is 0) rather than (B-1)/15 —
   confirm against the CNST_LIMB definition (cf. line using
   CNST_LIMB(~0/15) in toom7_interpolate). */
void tc7_divexact_by15(mp_ptr rp, mp_size_t * rn, mp_ptr x, mp_size_t xn)
{
   mp_size_t limbs;

   if (!xn)
   {
      *rn = 0;
      return;
   }
   limbs = ABS(xn);
   mpn_divexact_byBm1of(rp, x, limbs, CNST_LIMB(15), CNST_LIMB((~0)/15)); // works for 32 and 64 bits
   if (rp[limbs - 1] == CNST_LIMB(0))
      limbs--;                      /* quotient lost its top limb */
   *rn = (xn < 0) ? -limbs : limbs;
}
#endif
/* MPN_MUL_1C(cout, dst, src, size, n, cin): dst = src * n + cin with
   the out-carry in cout.  Uses the native mpn_mul_1c when available,
   otherwise composes mpn_mul_1 with a carry add. */
#if HAVE_NATIVE_mpn_mul_1c
#define MPN_MUL_1C(cout, dst, src, size, n, cin) \
   do { \
      (cout) = mpn_mul_1c (dst, src, size, n, cin); \
   } while (0)
#else
#define MPN_MUL_1C(cout, dst, src, size, n, cin) \
   do { \
      mp_limb_t __cy; \
      __cy = mpn_mul_1 (dst, src, size, n); \
      (cout) = __cy + mpn_add_1 (dst, dst, size, cin); \
   } while (0)
#endif
/* {wp, *wn} += {xp, xn} * y, where the sign of each value is carried
   by its size field (modelled on GMP's mpz aorsmul logic).  wp must
   have room for the result plus one spare limb.  When the signs of w
   and x*y differ this becomes a magnitude subtraction, which may
   flip the sign of w and/or shrink it through cancellation. */
void tc7_addmul_1(mp_ptr wp, mp_size_t * wn, mp_srcptr xp, mp_size_t xn, mp_limb_t y)
{
   mp_size_t sign, wu, xu, ws, new_wn, min_size, dsize;
   mp_limb_t cy;

   /* w unaffected if x==0 or y==0 */
   if (xn == 0 || y == 0)
      return;

   sign = xn;
   xu = ABS (xn);
   ws = *wn;

   if (*wn == 0)
   {
      /* nothing to add to, just set x*y, "sign" gives the sign */
      cy = mpn_mul_1 (wp, xp, xu, y);
      if (cy)
      {
         wp[xu] = cy;
         xu = xu + 1;
      }
      *wn = (sign >= 0 ? xu : -xu);
      return;
   }

   sign ^= *wn;                /* sign < 0 iff w and x*y have opposite signs */
   wu = ABS (*wn);
   new_wn = MAX (wu, xu);
   min_size = MIN (wu, xu);

   if (sign >= 0)
   {
      /* addmul of absolute values */
      cy = mpn_addmul_1 (wp, xp, min_size, y);
      dsize = xu - wu;
#if HAVE_NATIVE_mpn_mul_1c
      if (dsize > 0)
         cy = mpn_mul_1c (wp + min_size, xp + min_size, dsize, y, cy);
      else if (dsize < 0)
      {
         dsize = -dsize;
         cy = mpn_add_1 (wp + min_size, wp + min_size, dsize, cy);
      }
#else
      if (dsize != 0)
      {
         mp_limb_t cy2;
         if (dsize > 0)
            cy2 = mpn_mul_1 (wp + min_size, xp + min_size, dsize, y);
         else
         {
            dsize = -dsize;
            cy2 = 0;
         }
         cy = cy2 + mpn_add_1 (wp + min_size, wp + min_size, dsize, cy);
      }
#endif
      if (cy)
      {
         wp[dsize + min_size] = cy;
         new_wn ++;
      }
   } else
   {
      /* submul of absolute values */
      cy = mpn_submul_1 (wp, xp, min_size, y);
      if (wu >= xu)
      {
         /* if w bigger than x, then propagate borrow through it */
         if (wu != xu)
            cy = mpn_sub_1 (wp + xu, wp + xu, wu - xu, cy);
         if (cy != 0)
         {
            /* Borrow out of w, take twos complement negative to get
               absolute value, flip sign of w. */
            wp[new_wn] = ~-cy; /* extra limb is 0-cy */
            mpn_com_n (wp, wp, new_wn);
            new_wn++;
            MPN_INCR_U (wp, new_wn, CNST_LIMB(1));
            ws = -*wn;
         }
      } else /* wu < xu */
      {
         /* x bigger than w, so want x*y-w. Submul has given w-x*y, so
            take twos complement and use an mpn_mul_1 for the rest. */
         mp_limb_t cy2;
         /* -(-cy*b^n + w-x*y) = (cy-1)*b^n + ~(w-x*y) + 1 */
         mpn_com_n (wp, wp, wu);
         cy += mpn_add_1 (wp, wp, wu, CNST_LIMB(1));
         cy -= 1;
         /* If cy-1 == -1 then hold that -1 for latter. mpn_submul_1 never
            returns cy==MP_LIMB_T_MAX so that value always indicates a -1. */
         cy2 = (cy == MP_LIMB_T_MAX);
         cy += cy2;
         MPN_MUL_1C (cy, wp + wu, xp + wu, xu - wu, y, cy);
         wp[new_wn] = cy;
         new_wn += (cy != 0);
         /* Apply any -1 from above. The value at wp+wsize is non-zero
            because y!=0 and the high limb of x will be non-zero. */
         if (cy2)
            MPN_DECR_U (wp+wu, new_wn - wu, CNST_LIMB(1));
         ws = -*wn;
      }
      /* submul can produce high zero limbs due to cancellation, both when w
         has more limbs or x has more */
      MPN_NORMALIZE (wp, new_wn);
   }

   *wn = (ws >= 0 ? new_wn : -new_wn);
   ASSERT (new_wn == 0 || wp[new_wn - 1] != 0);
}
/* {wp, *wn} -= {x, xn} * y, by flipping the sign of x and delegating
   to tc7_addmul_1. */
void tc7_submul_1(mp_ptr wp, mp_size_t * wn, mp_srcptr x, mp_size_t xn, mp_limb_t y)
{
   mp_size_t negated = -xn;
   tc7_addmul_1(wp, wn, x, negated, y);
}
/* Accumulating "copy": adds {xp, ABS(xn)} shifted left by `offset`
   limbs into the value {yp, *yn}, i.e. y += |x| * B^offset, and
   updates *yn.  Any part of x overlapping the current y is added
   with carry propagation; the part beyond y is copied in.  yp must
   be large enough for the result plus any carry limb. */
void tc7_copy (mp_ptr yp, mp_size_t * yn, mp_size_t offset, mp_srcptr xp, mp_size_t xn)
{
   mp_size_t yu = ABS(*yn);
   mp_size_t xu = ABS(xn);
   mp_limb_t cy = 0;

   if (xn == 0)
      return;

   if (offset < yu) /* low part of x overlaps with y */
   {
      if (offset + xu <= yu) /* x entirely inside y */
      {
         cy = mpn_add_n (yp + offset, yp + offset, xp, xu);
         /* propagate the carry through the rest of y */
         if (offset + xu < yu)
            cy = mpn_add_1 (yp + offset + xu, yp + offset + xu,
                            yu - (offset + xu), cy);
      } else
         cy = mpn_add_n (yp + offset, yp + offset, xp, yu - offset);
      /* now cy is the carry at yp + yu */
      if (xu + offset > yu) /* high part of x exceeds y */
      {
         MPN_COPY (yp + yu, xp + yu - offset, xu + offset - yu);
         cy = mpn_add_1 (yp + yu, yp + yu, xu + offset - yu, cy);
         yu = xu + offset;
      }
      /* now cy is the carry at yp + yn */
      if (cy)
         yp[yu++] = cy;
      MPN_NORMALIZE(yp, yu);
      *yn = yu;
   } else /* x does not overlap */
   {
      /* zero the gap between the end of y and offset, then place x */
      if (offset > yu)
         MPN_ZERO (yp + yu, offset - yu);
      MPN_COPY (yp + offset, xp, xu);
      *yn = offset + xu;
   }
}
/* MUL_TC7_UNSIGNED(r3, n3, r1, n1, r2, n2): unsigned product of
   {r1, n1} and {r2, n2} into r3, recursing into toom7 above the
   threshold; n3 receives the normalized product length. */
#define MUL_TC7_UNSIGNED(r3xx, n3xx, r1xx, n1xx, r2xx, n2xx) \
   do \
   { \
      if ((n1xx != 0) && (n2xx != 0)) \
      { mp_size_t len; \
         if (n1xx == n2xx) \
         { \
            if (n1xx > MUL_TOOM7_THRESHOLD) mpn_toom7_mul_n(r3xx, r1xx, r2xx, n1xx); \
            else mpn_mul_n(r3xx, r1xx, r2xx, n1xx); \
         } else if (n1xx > n2xx) \
            mpn_mul(r3xx, r1xx, n1xx, r2xx, n2xx); \
         else \
            mpn_mul(r3xx, r2xx, n2xx, r1xx, n1xx); \
         len = n1xx + n2xx; \
         MPN_NORMALIZE(r3xx, len); \
         n3xx = len; \
      } else \
         n3xx = 0; \
   } while (0)
/* Signed product wrapper: multiplies magnitudes and negates the
   output length when the operand signs differ. */
#define MUL_TC7(r3xx, n3xx, r1xx, n1xx, r2xx, n2xx) \
   do \
   { \
      mp_size_t sign = n1xx ^ n2xx; \
      mp_size_t un1 = ABS(n1xx); \
      mp_size_t un2 = ABS(n2xx); \
      MUL_TC7_UNSIGNED(r3xx, n3xx, r1xx, un1, r2xx, un2); \
      if (sign < 0) n3xx = -n3xx; \
   } while (0)
/* Squaring analogue of MUL_TC7_UNSIGNED. */
#define SQR_TC7_UNSIGNED(r3xx, n3xx, r1xx, n1xx) \
   do \
   { \
      if (n1xx != 0) \
      { mp_size_t len; \
         if (n1xx > MUL_TOOM7_THRESHOLD) mpn_toom7_sqr_n(r3xx, r1xx, n1xx); \
         else mpn_sqr_n(r3xx, r1xx, n1xx); \
         len = 2*n1xx; \
         MPN_NORMALIZE(r3xx, len); \
         n3xx = len; \
      } else \
         n3xx = 0; \
   } while (0)
/* Signed squaring wrapper (a square is never negative). */
#define SQR_TC7(r3xx, n3xx, r1xx, n1xx) \
   do \
   { \
      mp_size_t un1 = ABS(n1xx); \
      SQR_TC7_UNSIGNED(r3xx, n3xx, r1xx, un1); \
   } while (0)
/* TC7_NORM(r, n, s): set n to the normalized length of {r, s}. */
#define TC7_NORM(rxx, nxx, sxx) \
   do \
   { \
      nxx = sxx; \
      MPN_NORMALIZE(rxx, nxx); \
   } while(0)
/* Single-limb debug printout helpers, compiled only in the
   standalone test/timing builds (TC7_TEST / TC7_TIME). */
#if TC7_TEST || TC7_TIME
#define p2(axx, anxx, bxx, bnxx) \
   do \
   { \
      printf("s1 = "); \
      if (anxx < 0) printf("-"); \
      if (anxx == 0) printf("0, "); \
      else printf("%ld, ", axx[0]); \
      printf("s2 = "); \
      if (bnxx < 0) printf("-"); \
      if (bnxx == 0) printf("0"); \
      else printf("%ld\n", bxx[0]); \
   } while (0)
#define p(axx, anxx) \
   do \
   { \
      printf("r = "); \
      if (anxx < 0) printf("-"); \
      if (anxx == 0) printf("0\n"); \
      else printf("%ld\n", axx[0]); \
   } while (0)
#endif
/* Zero out limbs past end of integer, padding {r, |n|} out to s limbs */
#define TC7_DENORM(rxx, nxx, sxx) \
   do { \
      MPN_ZERO(rxx + ABS(nxx), sxx - ABS(nxx)); \
   } while (0)
/* Two's complement divexact by power of 2: an arithmetic right shift
   by s bits of the fixed-size two's complement value {r, n}, keeping
   the sign bits that the word shift would otherwise lose */
#define TC7_DIVEXACT_2EXP(rxx, nxx, sxx) \
   do { \
      mp_limb_t sign = (LIMB_HIGHBIT_TO_MASK(rxx[nxx-1]) << (GMP_LIMB_BITS - sxx)); \
      mpn_rshift(rxx, rxx, nxx, sxx); \
      rxx[nxx-1] |= sign; \
   } while (0)
/* Sign-preserving right shift by one bit of a two's complement value */
#if HAVE_NATIVE_mpn_rshift1
#define TC7_RSHIFT1(rxx, nxx) \
   do { \
      mp_limb_t sign = (LIMB_HIGHBIT_TO_MASK(rxx[nxx-1]) << (GMP_LIMB_BITS - 1)); \
      mpn_rshift1(rxx, rxx, nxx); \
      rxx[nxx-1] |= sign; \
   } while (0)
#else
#define TC7_RSHIFT1(rxx, nxx) \
   do { \
      mp_limb_t sign = (LIMB_HIGHBIT_TO_MASK(rxx[nxx-1]) << (GMP_LIMB_BITS - 1)); \
      mpn_rshift(rxx, rxx, nxx, 1); \
      rxx[nxx-1] |= sign; \
   } while (0)
#endif
/* The 13 scratch slots holding evaluated values and pointwise
   products, each t7 limbs wide within the workspace tp */
#define r1 (tp)
#define r2 (tp + t7)
#define r3 (tp + 2*t7)
#define r4 (tp + 3*t7)
#define r5 (tp + 4*t7)
#define r6 (tp + 5*t7)
#define r7 (tp + 6*t7)
#define r8 (tp + 7*t7)
#define r9 (tp + 8*t7)
#define r10 (tp + 9*t7)
#define r11 (tp + 10*t7)
#define r12 (tp + 11*t7)
#define r13 (tp + 12*t7)
/* Multiply {up, n} by {vp, n} and write the result to
{prodp, 2n}.
Note that prodp gets 2n limbs stored, even if the actual result
only needs 2n - 1.
*/
/* Multiply {up, n} by {vp, n} and write the 2n-limb result to rp
   using the Bodrato-Zanoni Toom-Cook 7-way scheme: split each
   operand into 7 pieces of sn limbs, evaluate both operand
   polynomials at 13 points, form the 13 pointwise products and
   interpolate.  rp always gets 2n limbs stored, even when the
   product needs only 2n - 1. */
void
mpn_toom7_mul_n (mp_ptr rp, mp_srcptr up,
                 mp_srcptr vp, mp_size_t n)
{
   mp_size_t len1, len2;
   mp_limb_t cy;
   mp_ptr tp;
   mp_size_t a0n, a1n, a2n, a3n, a4n, a5n, a6n;
   mp_size_t b0n, b1n, b2n, b3n, b4n, b5n, b6n;
   mp_size_t sn, n1, n2, n3, n4, n5, n6, n7, n8, n9, n10, n11, n12, n13, rpn, t7;

   len1 = n;
   len2 = n;
   ASSERT (n >= 1);
   MPN_NORMALIZE(up, len1);
   MPN_NORMALIZE(vp, len2);

   sn = (n - 1) / 7 + 1;  /* piece size; 7 pieces, the last may be short */

/* the 7 coefficients of each operand polynomial */
#define a0 (up)
#define a1 (up + sn)
#define a2 (up + 2*sn)
#define a3 (up + 3*sn)
#define a4 (up + 4*sn)
#define a5 (up + 5*sn)
#define a6 (up + 6*sn)
#define b0 (vp)
#define b1 (vp + sn)
#define b2 (vp + 2*sn)
#define b3 (vp + 3*sn)
#define b4 (vp + 4*sn)
#define b5 (vp + 5*sn)
#define b6 (vp + 6*sn)

   /* normalized (signed) lengths of each piece */
   TC7_NORM(a0, a0n, sn);
   TC7_NORM(a1, a1n, sn);
   TC7_NORM(a2, a2n, sn);
   TC7_NORM(a3, a3n, sn);
   TC7_NORM(a4, a4n, sn);
   TC7_NORM(a5, a5n, sn);
   TC7_NORM(a6, a6n, n - 6*sn);
   TC7_NORM(b0, b0n, sn);
   TC7_NORM(b1, b1n, sn);
   TC7_NORM(b2, b2n, sn);
   TC7_NORM(b3, b3n, sn);
   TC7_NORM(b4, b4n, sn);
   TC7_NORM(b5, b5n, sn);
   TC7_NORM(b6, b6n, n - 6*sn);

   t7 = 2*sn+2; // allows mult of 2 integers of sn + 1 limbs

   /* workspace: 13 product slots of t7 limbs plus 11 evaluation
      buffers of sn + 1 limbs */
   tp = __GMP_ALLOCATE_FUNC_LIMBS(13*t7 + 11*(sn+1));

#define u2 (tp + 13*t7)
#define u3 (tp + 13*t7 + (sn+1))
#define u4 (tp + 13*t7 + 2*(sn+1))
#define u5 (tp + 13*t7 + 3*(sn+1))
#define u6 (tp + 13*t7 + 4*(sn+1))
#define u7 (tp + 13*t7 + 5*(sn+1))
#define u8 (tp + 13*t7 + 6*(sn+1))
#define u9 (tp + 13*t7 + 7*(sn+1))
#define u10 (tp + 13*t7 + 8*(sn+1))
#define u11 (tp + 13*t7 + 9*(sn+1))
#define u12 (tp + 13*t7 + 10*(sn+1))

   /* Evaluation phase: Bodrato's auto-generated sequence.  Each pair
      of MUL_TC7 calls multiplies matching (scaled) evaluations of the
      two operands at one of the 13 points listed in the header of
      toom7_interpolate. */
   tc7_lshift (r6, &n6, a2, a2n, 4);
   tc7_lshift (r7, &n7, b2, b2n, 4);
   tc7_lshift (u10, &n10, a4, a4n, 4);
   tc7_lshift (u11, &n11, b4, b4n, 4);
   tc7_lshift (u5, &n5, a3, a3n, 3);
   tc7_lshift (u2, &n2, b3, b3n, 3);
   tc7_lshift (r1, &n1, a1, a1n, 5);
   tc7_add (r1, &n1, r1, n1, u5, n5);
   tc7_addmul_1 (r1, &n1, a5, a5n, 2);
   tc7_add (u3, &n3, r6, n6, a6, a6n);
   tc7_addmul_1 (u3, &n3, a0, a0n, 64);
   tc7_addmul_1 (u3, &n3, a4, a4n, 4);
   tc7_sub (r13, &n13, r1, n1, u3, n3);
   tc7_add (u3, &n3, u3, n3, r1, n1);
   tc7_lshift (r1, &n1, b1, b1n, 5);
   tc7_add (r1, &n1, r1, n1, u2, n2);
   tc7_addmul_1 (r1, &n1, b5, b5n, 2);
   tc7_add (u8, &n8, r7, n7, b6, b6n);
   tc7_addmul_1 (u8, &n8, b0, b0n, 64);
   tc7_addmul_1 (u8, &n8, b4, b4n, 4);
   tc7_add (r12, &n12, r1, n1, u8, n8);
   tc7_sub (u8, &n8, r1, n1, u8, n8);
   MUL_TC7 (r3, n3, u3, n3, r12, n12);
   MUL_TC7 (r8, n8, u8, n8, r13, n13);
   tc7_add (r1, &n1, u10, n10, a0, a0n);
   tc7_addmul_1 (r1, &n1, a6, a6n,64);
   tc7_addmul_1 (r1, &n1, a2, a2n, 4);
   tc7_addmul_1 (u5, &n5, a1, a1n, 2);
   tc7_addmul_1 (u5, &n5, a5, a5n, 32);
   tc7_sub (r13, &n13, r1, n1, u5, n5);
   tc7_add (u5, &n5, u5, n5, r1, n1);
   tc7_add (r1, &n1, u11, n11, b0, b0n);
   tc7_addmul_1 (r1, &n1, b6, b6n,64);
   tc7_addmul_1 (r1, &n1, b2, b2n, 4);
   tc7_addmul_1 (u2, &n2, b1, b1n, 2);
   tc7_addmul_1 (u2, &n2, b5, b5n, 32);
   tc7_add (r12, &n12, u2, n2, r1, n1);
   tc7_sub (u2, &n2, r1, n1, u2, n2);
   MUL_TC7 (r5, n5, u5, n5, r12, n12);
   MUL_TC7 (r2, n2, u2, n2, r13, n13);
   tc7_add (r1, &n1, r6, n6, a0, a0n);
   tc7_addmul_1 (r1, &n1, u10, n10, 16);
   tc7_addmul_1 (r1, &n1, a6, a6n, 4096);
   tc7_add (u10, &n10, u10, n10, a6, a6n);
   tc7_addmul_1 (u10, &n10, r6, n6, 16);
   tc7_addmul_1 (u10, &n10, a0, a0n, 4096);
   tc7_lshift (u6, &n6, a3, a3n, 4);
   tc7_add (u4, &n4, u6, n6, a1, a1n);
   tc7_addmul_1 (u4, &n4, a5, a5n, 256);
   tc7_lshift (u4, &n4, u4, n4, 2);
   tc7_add (u6, &n6, u6, n6, a5, a5n);
   tc7_addmul_1 (u6, &n6, a1, a1n, 256);
   tc7_lshift (u6, &n6, u6, n6, 2);
   tc7_sub (u9, &n9, u4, n4, r1, n1);
   tc7_add (u4, &n4, u4, n4, r1, n1);
   tc7_add (r1, &n1, r7, n7, b0, b0n);
   tc7_addmul_1 (r1, &n1, u11, n11, 16);
   tc7_addmul_1 (r1, &n1, b6, b6n, 4096);
   tc7_add (u11, &n11, u11, n11, b6, b6n);
   tc7_addmul_1 (u11, &n11, r7, n7, 16);
   tc7_addmul_1 (u11, &n11, b0, b0n, 4096);
   tc7_lshift (r7, &n7, b3, b3n, 4);
   tc7_add (r13, &n13, r7, n7, b1, b1n);
   tc7_addmul_1 (r13, &n13, b5, b5n, 256);
   tc7_lshift (r13, &n13, r13, n13, 2);
   tc7_add (r7, &n7, r7, n7, b5, b5n);
   tc7_addmul_1 (r7, &n7, b1, b1n, 256);
   tc7_lshift (r7, &n7, r7, n7, 2);
   tc7_sub (r12, &n12, r13, n13, r1, n1);
   tc7_add (r13, &n13, r13, n13, r1, n1);
   MUL_TC7 (r9, n9, u9, n9, r12, n12);
   MUL_TC7 (r4, n4, u4, n4, r13, n13);
   tc7_sub (r12, &n12, u10, n10, u6, n6);
   tc7_add (u10, &n10, u10, n10, u6, n6);
   tc7_add (r13, &n13, u11, n11, r7, n7);
   tc7_sub (u11, &n11, u11, n11, r7, n7);
   MUL_TC7 (r10, n10, u10, n10, r13, n13);
   MUL_TC7 (r11, n11, u11, n11, r12, n12);
   tc7_add (u7, &n7, a3, a3n, a1, a1n);
   tc7_add (u6, &n6, a2, a2n, a0, a0n);
   tc7_add (u6, &n6, u6, n6, a4, a4n);
   tc7_add (u6, &n6, u6, n6, a6, a6n);
   tc7_add (u7, &n7, u7, n7, a5, a5n);
   tc7_add (r1, &n1, u7, n7, u6, n6);
   tc7_sub (u6, &n6, u6, n6, u7, n7);
   tc7_add (u7, &n7, b3, b3n, b1, b1n);
   tc7_add (r13, &n13, b2, b2n, b0, b0n);
   tc7_add (r13, &n13, r13, n13, b4, b4n);
   tc7_add (r13, &n13, r13, n13, b6, b6n);
   tc7_add (u7, &n7, u7, n7, b5, b5n);
   tc7_sub (r12, &n12, r13, n13, u7, n7);
   tc7_add (u7, &n7, u7, n7, r13, n13);
   MUL_TC7 (r7, n7, u7, n7, r1, n1);
   MUL_TC7 (r6, n6, u6, n6, r12, n12);
   /* evaluation at 3: 729*c6 + 243*c5 + 81*c4 + 27*c3 + 9*c2 + 3*c1 + c0 */
   tc7_mul_1 (u12, &n12, b6, b6n, 729);
   tc7_addmul_1 (u12, &n12, b5, b5n, 243);
   tc7_addmul_1 (u12, &n12, b4, b4n, 81);
   tc7_addmul_1 (u12, &n12, b3, b3n, 27);
   tc7_addmul_1 (u12, &n12, b2, b2n, 9);
   tc7_addmul_1 (u12, &n12, b1, b1n, 3);
   tc7_add (u12, &n12, u12, n12, b0, b0n);
   tc7_mul_1 (r13, &n13, a6, a6n, 729);
   tc7_addmul_1 (r13, &n13, a5, a5n, 243);
   tc7_addmul_1 (r13, &n13, a4, a4n, 81);
   tc7_addmul_1 (r13, &n13, a3, a3n, 27);
   tc7_addmul_1 (r13, &n13, a2, a2n, 9);
   tc7_addmul_1 (r13, &n13, a1, a1n, 3);
   tc7_add (r13, &n13, r13, n13, a0, a0n);
   MUL_TC7 (r12, n12, u12, n12, r13, n13);
   /* values at infinity (a6*b6) and at zero (a0*b0) */
   MUL_TC7 (r1, n1, a6, a6n, b6, b6n);
   MUL_TC7 (r13, n13, a0, a0n, b0, b0n);

   /* zero-pad every product out to t7 limbs so interpolation can
      operate on fixed-size two's complement values */
   TC7_DENORM(r1, n1, t7);
   TC7_DENORM(r2, n2, t7);
   TC7_DENORM(r3, n3, t7);
   TC7_DENORM(r4, n4, t7);
   TC7_DENORM(r5, n5, t7);
   TC7_DENORM(r6, n6, t7);
   TC7_DENORM(r7, n7, t7);
   TC7_DENORM(r8, n8, t7);
   TC7_DENORM(r9, n9, t7);
   TC7_DENORM(r10, n10, t7);
   TC7_DENORM(r11, n11, t7);
   TC7_DENORM(r12, n12, t7);
   TC7_DENORM(r13, n13, t7);

   /* recover the 13 product coefficients and recombine them at
      radix B^sn; the five possibly-negative values pass their signs */
   toom7_interpolate(rp, &rpn, sn, tp, t7 - 1, n2, n6, n8, n9, n11);

   /* pad the result out to exactly 2n limbs */
   if (rpn != 2*n)
   {
      MPN_ZERO((rp + rpn), 2*n - rpn);
   }

   __GMP_FREE_FUNC_LIMBS (tp, 13*t7 + 11*(sn+1));
}
/* Square {up, n} and write the result to {prodp, 2n}.
Note that prodp gets 2n limbs stored, even if the actual result
only needs 2n - 1.
*/
/* Square {up, n} and write the 2n-limb result to rp using the same
   Toom-Cook 7-way evaluation sequence as mpn_toom7_mul_n, with each
   pointwise multiplication replaced by a squaring.  Reuses the piece
   macros a0..a6 and scratch macros r1..r13 / u2..u8 defined above;
   only 7 evaluation buffers (u2..u8) are allocated here. */
void
mpn_toom7_sqr_n (mp_ptr rp, mp_srcptr up, mp_size_t n)
{
   mp_size_t len1, len2;
   mp_limb_t cy;
   mp_ptr tp;
   mp_size_t a0n, a1n, a2n, a3n, a4n, a5n, a6n;
   mp_size_t sn, n1, n2, n3, n4, n5, n6, n7, n8, n9, n10, n11, n12, n13, n14, n15, rpn, t7;

   len1 = n;
   ASSERT (n >= 1);
   MPN_NORMALIZE(up, len1);

   sn = (n - 1) / 7 + 1;  /* piece size; 7 pieces, the last may be short */

   /* normalized (signed) lengths of each piece */
   TC7_NORM(a0, a0n, sn);
   TC7_NORM(a1, a1n, sn);
   TC7_NORM(a2, a2n, sn);
   TC7_NORM(a3, a3n, sn);
   TC7_NORM(a4, a4n, sn);
   TC7_NORM(a5, a5n, sn);
   TC7_NORM(a6, a6n, n - 6*sn);

   t7 = 2*sn+2; // allows mult of 2 integers of sn + 1 limbs

   /* workspace: 13 product slots of t7 limbs plus the 7 evaluation
      buffers u2..u8 of sn + 1 limbs each */
   tp = __GMP_ALLOCATE_FUNC_LIMBS(13*t7 + 7*(sn+1));

   /* Evaluation phase: same sequence as the multiply, but each point
      value is squared; n15 tracks the length held in u8, n14 in u2 */
   tc7_lshift (r6, &n6, a2, a2n, 4);
   tc7_lshift (u8, &n15, a4, a4n, 4);
   tc7_lshift (u5, &n5, a3, a3n, 3);
   tc7_lshift (r1, &n1, a1, a1n, 5);
   tc7_add (r1, &n1, r1, n1, u5, n5);
   tc7_addmul_1 (r1, &n1, a5, a5n, 2);
   tc7_add (u3, &n3, r6, n6, a6, a6n);
   tc7_addmul_1 (u3, &n3, a0, a0n, 64);
   tc7_addmul_1 (u3, &n3, a4, a4n, 4);
   tc7_sub (r13, &n13, r1, n1, u3, n3);
   tc7_add (u3, &n3, u3, n3, r1, n1);
   SQR_TC7 (r3, n3, u3, n3);
   SQR_TC7 (r8, n8, r13, n13);
   tc7_add (r1, &n1, u8, n15, a0, a0n);
   tc7_addmul_1 (r1, &n1, a6, a6n,64);
   tc7_addmul_1 (r1, &n1, a2, a2n, 4);
   tc7_addmul_1 (u5, &n5, a1, a1n, 2);
   tc7_addmul_1 (u5, &n5, a5, a5n, 32);
   tc7_sub (r13, &n13, r1, n1, u5, n5);
   tc7_add (u5, &n5, u5, n5, r1, n1);
   SQR_TC7 (r5, n5, u5, n5);
   SQR_TC7 (r2, n2, r13, n13);
   tc7_add (r1, &n1, r6, n6, a0, a0n);
   tc7_addmul_1 (r1, &n1, u8, n15, 16);
   tc7_addmul_1 (r1, &n1, a6, a6n, 4096);
   tc7_add (u8, &n15, u8, n15, a6, a6n);
   tc7_addmul_1 (u8, &n15, r6, n6, 16);
   tc7_addmul_1 (u8, &n15, a0, a0n, 4096);
   tc7_lshift (u6, &n6, a3, a3n, 4);
   tc7_add (u4, &n4, u6, n6, a1, a1n);
   tc7_addmul_1 (u4, &n4, a5, a5n, 256);
   tc7_lshift (u4, &n4, u4, n4, 2);
   tc7_add (u6, &n6, u6, n6, a5, a5n);
   tc7_addmul_1 (u6, &n6, a1, a1n, 256);
   tc7_lshift (u6, &n6, u6, n6, 2);
   tc7_sub (u2, &n14, u4, n4, r1, n1);
   tc7_add (u4, &n4, u4, n4, r1, n1);
   SQR_TC7 (r9, n9, u2, n14);
   SQR_TC7 (r4, n4, u4, n4);
   tc7_sub (r12, &n12, u8, n15, u6, n6);
   tc7_add (u8, &n15, u8, n15, u6, n6);
   SQR_TC7 (r10, n10, u8, n15);
   SQR_TC7 (r11, n11, r12, n12);
   tc7_add (u7, &n7, a3, a3n, a1, a1n);
   tc7_add (u6, &n6, a2, a2n, a0, a0n);
   tc7_add (u6, &n6, u6, n6, a4, a4n);
   tc7_add (u6, &n6, u6, n6, a6, a6n);
   tc7_add (u7, &n7, u7, n7, a5, a5n);
   tc7_add (r1, &n1, u7, n7, u6, n6);
   tc7_sub (u6, &n6, u6, n6, u7, n7);
   SQR_TC7 (r7, n7, r1, n1);
   SQR_TC7 (r6, n6, u6, n6);
   /* evaluation at 3: 729*a6 + 243*a5 + 81*a4 + 27*a3 + 9*a2 + 3*a1 + a0 */
   tc7_mul_1 (r13, &n13, a6, a6n, 729);
   tc7_addmul_1 (r13, &n13, a5, a5n, 243);
   tc7_addmul_1 (r13, &n13, a4, a4n, 81);
   tc7_addmul_1 (r13, &n13, a3, a3n, 27);
   tc7_addmul_1 (r13, &n13, a2, a2n, 9);
   tc7_addmul_1 (r13, &n13, a1, a1n, 3);
   tc7_add (r13, &n13, r13, n13, a0, a0n);
   SQR_TC7 (r12, n12, r13, n13);
   /* values at infinity (a6^2) and at zero (a0^2) */
   SQR_TC7 (r1, n1, a6, a6n);
   SQR_TC7 (r13, n13, a0, a0n);

   /* zero-pad every square out to t7 limbs for interpolation */
   TC7_DENORM(r1, n1, t7);
   TC7_DENORM(r2, n2, t7);
   TC7_DENORM(r3, n3, t7);
   TC7_DENORM(r4, n4, t7);
   TC7_DENORM(r5, n5, t7);
   TC7_DENORM(r6, n6, t7);
   TC7_DENORM(r7, n7, t7);
   TC7_DENORM(r8, n8, t7);
   TC7_DENORM(r9, n9, t7);
   TC7_DENORM(r10, n10, t7);
   TC7_DENORM(r11, n11, t7);
   TC7_DENORM(r12, n12, t7);
   TC7_DENORM(r13, n13, t7);

   toom7_interpolate(rp, &rpn, sn, tp, t7 - 1, n2, n6, n8, n9, n11);

   /* pad the result out to exactly 2n limbs */
   if (rpn != 2*n)
   {
      MPN_ZERO((rp + rpn), 2*n - rpn);
   }

   __GMP_FREE_FUNC_LIMBS (tp, 13*t7 + 7*(sn+1));
}
/*
Toom 7 interpolation. Interpolates the value at 2^(sn*B) of a
polynomial p(x) with 13 coefficients given the values
p(oo), p(-2), 2^12*p(1/2), p(4), p(2), p(-1), p(1),
2^12*p(-1/2), p(-4), 4^12*p(1/4), 4^12*p(-1/4), p(3), p(0)
The values are assumed to be stored in tp, each separated by
s7 + 1 limbs, each of no more than s7 limbs.
The output is placed in rp and the final number of limbs of the
output is given in rpn.
The 2nd, 6th, 8th, 9th and 11th values may be negative, and if so,
n2, n6, n8, n9 and n11 should be set to a negative value respectively.
*/
void toom7_interpolate(mp_ptr rp, mp_size_t * rpn, mp_size_t sn,
mp_ptr tp, mp_size_t s7, mp_size_t n2, mp_size_t n6,
mp_size_t n8, mp_size_t n9, mp_size_t n11)
{
mp_size_t n1, n3, n4, n5, n7, n10, n12, n13, t7;
t7 = s7 + 1;
// Marco Bodrato's auto generated sequence
if (n9 < 0)
mpn_add_n(r9, r4, r9, s7);
else
mpn_sub_n(r9, r4, r9, s7);
/* r9 is now in 2s complement form */
TC7_RSHIFT1(r9, s7);
mpn_submul_1(r4, r1, s7, 16777216);
mpn_sub_n(r12, r12, r5, s7);
#if HAVE_NATIVE_mpn_subadd_n
mpn_subadd_n(r4, r4, r9, r13, s7);
#else
mpn_sub_n(r4, r4, r9, s7);
mpn_sub_n(r4, r4, r13, s7);
#endif
if (n2 < 0)
mpn_add_n(r2, r5, r2, s7);
else
mpn_sub_n(r2, r5, r2, s7);
/* r2 is now in 2s complement form */
TC7_RSHIFT1(r2, s7);
#if HAVE_NATIVE_mpn_subadd_n
mpn_subadd_n(r5, r5, r13, r2, s7);
#else
mpn_sub_n(r5, r5, r13, s7);
mpn_sub_n(r5, r5, r2, s7);
#endif
mpn_submul_1(r5, r1, s7, 4096);
if (n6 < 0)
mpn_add_n(r6, r7, r6, s7);
else
mpn_sub_n(r6, r7, r6, s7);
/* r6 is now in 2s complement form */
TC7_RSHIFT1(r6, s7);
mpn_sub_n(r7, r7, r13, s7);
mpn_sub_n(r12, r12, r7, s7);
#if HAVE_NATIVE_mpn_subadd_n
mpn_subadd_n(r7, r7, r6, r1, s7);
#else
mpn_sub_n(r7, r7, r6, s7);
mpn_sub_n(r7, r7, r1, s7);
#endif
if (n8 < 0)
mpn_add_n(r8, r3, r8, s7);
else
mpn_sub_n(r8, r3, r8, s7);
/* r8 is now in 2s complement form */
TC7_RSHIFT1(r8, s7);
#if HAVE_NATIVE_mpn_subadd_n
mpn_subadd_n(r3, r3, r8, r1, s7);
#else
mpn_sub_n(r3, r3, r8, s7);
mpn_sub_n(r3, r3, r1, s7);
#endif
mpn_add_n(r3, r5, r3, s7);
mpn_submul_1(r3, r13, s7, 4096);
mpn_submul_1(r3, r7, s7, 128);
if (n11 < 0)
mpn_add_n(r11, r10, r11, s7);
else
mpn_sub_n(r11, r10, r11, s7);
/* r11 is now in 2s complement form */
TC7_RSHIFT1(r11, s7);
#if HAVE_NATIVE_mpn_subadd_n
mpn_subadd_n(r10, r10, r11, r1, s7);
#else
mpn_sub_n(r10, r10, r11, s7);
mpn_sub_n(r10, r10, r1, s7);
#endif
mpn_add_n(r10, r4, r10, s7);
mpn_submul_1(r10, r13, s7, 16777216);
mpn_submul_1(r10, r7, s7, 8192);
mpn_submul_1(r10, r3, s7, 400);
mpn_divexact_1(r10, r10, s7, 680400);
mpn_submul_1(r3, r10, s7, 900);
mpn_divexact_1(r3, r3, s7, 144);
mpn_submul_1(r5, r3, s7, 16);
mpn_sub_n(r12, r12, r5, s7);
mpn_submul_1(r12, r3, s7, 64);
#if HAVE_NATIVE_mpn_subadd_n
mpn_subadd_n(r7, r7, r3, r10, s7);
#else
mpn_sub_n(r7, r7, r3, s7);
mpn_sub_n(r7, r7, r10, s7);
#endif
mpn_submul_1(r5, r7, s7, 64);
mpn_submul_1(r4, r7, s7, 4096);
mpn_submul_1(r4, r5, s7, 4);
mpn_submul_1(r4, r3, s7, 256);
mpn_submul_1(r5, r10, s7, 4);
mpn_submul_1(r4, r5, s7, 268);
mpn_divexact_1(r4, r4, s7, 771120);
mpn_submul_1(r12, r5, s7, 25);
mpn_submul_1(r12, r7, s7, 600);
mpn_submul_1(r12, r4, s7, 31500);
mpn_submul_1(r12, r1, s7, 527344);
mpn_submul_1(r5, r4, s7, 1020);
mpn_divexact_byBm1of(r5, r5, s7, CNST_LIMB(15), CNST_LIMB(~0/15));
TC7_DIVEXACT_2EXP(r5, s7, 4);
mpn_sub_n(r10, r10, r4, s7);
mpn_sub_n(r3, r3, r5, s7);
mpn_add_n(r11, r9, r11, s7);
mpn_add_n(r8, r2, r8, s7);
mpn_submul_1(r11, r6, s7, 17408);
mpn_submul_1(r8, r6, s7, 160);
mpn_submul_1(r11, r8, s7, 680);
mpn_divexact_1(r11, r11, s7, 2891700);
mpn_submul_1(r8, r11, s7, 1890);
mpn_divexact_1(r8, r8, s7, 360);
#if HAVE_NATIVE_mpn_subadd_n
mpn_subadd_n(r6, r6, r11, r8, s7);
#else
mpn_sub_n(r6, r6, r11, s7);
mpn_sub_n(r6, r6, r8, s7);
#endif
mpn_submul_1(r12, r6, s7, 210);
mpn_submul_1(r12, r8, s7, 18);
mpn_submul_1(r9, r6, s7, 1024);
mpn_submul_1(r9, r8, s7, 64);
mpn_submul_1(r9, r11, s7, 4);
mpn_submul_1(r2, r6, s7, 32);
mpn_submul_1(r2, r8, s7, 8);
mpn_submul_1(r2, r11, s7, 2);
mpn_submul_1(r9, r2, s7, 160);
mpn_divexact_1(r9, r9, s7, 11340);
mpn_divexact_by3(r2, r2, s7);
TC7_RSHIFT1(r2, s7);
mpn_sub_n(r2, r2, r9, s7);
mpn_lshift(r12, r12, s7, 3);
mpn_submul_1(r12, r9, s7, 5649);
mpn_com_n(r12, r12, s7); //r12 = -r12;
mpn_add_1(r12, r12, s7, 1);
mpn_addmul_1(r12, r2, s7, 924);
mpn_divexact_1(r12, r12, s7, 525525);
mpn_submul_1(r9, r12, s7, 341);
mpn_sub_n(r11, r11, r12, s7);
TC7_DIVEXACT_2EXP(r9, s7, 4);
mpn_submul_1(r2, r9, s7, 68);
mpn_sub_n(r8, r8, r9, s7);
TC7_DIVEXACT_2EXP(r2, s7, 4);
mpn_sub_n(r6, r6, r2, s7);
TC7_NORM(r1, n1, s7);
TC7_NORM(r2, n2, s7);
TC7_NORM(r3, n3, s7);
TC7_NORM(r4, n4, s7);
TC7_NORM(r5, n5, s7);
TC7_NORM(r6, n6, s7);
TC7_NORM(r7, n7, s7);
TC7_NORM(r8, n8, s7);
TC7_NORM(r9, n9, s7);
TC7_NORM(r10, n10, s7);
TC7_NORM(r11, n11, s7);
TC7_NORM(r12, n12, s7);
TC7_NORM(r13, n13, s7);
*rpn = 0;
tc7_copy(rp, rpn, 0, r13, n13);
tc7_copy(rp, rpn, sn, r11, n11);
tc7_copy(rp, rpn, 2*sn, r10, n10);
tc7_copy(rp, rpn, 3*sn, r8, n8);
tc7_copy(rp, rpn, 4*sn, r3, n3);
tc7_copy(rp, rpn, 5*sn, r6, n6);
tc7_copy(rp, rpn, 6*sn, r7, n7);
tc7_copy(rp, rpn, 7*sn, r2, n2);
tc7_copy(rp, rpn, 8*sn, r5, n5);
tc7_copy(rp, rpn, 9*sn, r9, n9);
tc7_copy(rp, rpn, 10*sn, r4, n4);
tc7_copy(rp, rpn, 11*sn, r12, n12);
tc7_copy(rp, rpn, 12*sn, r1, n1);
}
#if TC7_TEST
/* Compare mpn_toom7_mul_n against the reference mpn_mul_n on the same
   operands {up, n} and {vp, n}.
   Returns 1 if all 2*n product limbs agree, 0 on the first mismatch
   (reporting its limb index) or on allocation failure. */
int tc7_test(mp_ptr up, mp_ptr vp, mp_size_t n)
{
   mp_limb_t * rp1 = malloc(2*n*sizeof(mp_limb_t));
   mp_limb_t * rp2 = malloc(2*n*sizeof(mp_limb_t));
   mp_size_t i;
   int ok = 1;

   if ((rp1 == NULL) || (rp2 == NULL))
   {
      printf("Unable to allocate memory in tc7_test\n");
      free(rp1); /* free(NULL) is a no-op */
      free(rp2);
      return 0;
   }

   mpn_mul_n(rp1, up, vp, n);       /* reference product */
   mpn_toom7_mul_n(rp2, up, vp, n); /* product under test */

   for (i = 0; i < 2*n; i++)
   {
      if (rp1[i] != rp2[i])
      {
         /* mp_size_t is wider than int on LP64 targets; cast for printf */
         printf("First error in limb %ld\n", (long) i);
         ok = 0;
         break;
      }
   }

   free(rp1);
   free(rp2);
   return ok;
}
/* Deterministic pseudo-random size generator for the test driver.
   Implements the LCG step x <- (a*x + c) mod m with m = 4294967311.
   With limit == 0 the raw state is returned; otherwise the result is
   reduced into [0, limit) by a modulo (bias is irrelevant for tests). */
mp_size_t randsize(mp_size_t limit)
{
   static uint64_t state = 4035456057U;
   const uint64_t mult = 1025416097U;
   const uint64_t inc  = 286824430U;
   const uint64_t mod  = 4294967311U;

   state = (state*mult + inc)%mod;

   if (limit == 0L)
      return (mp_size_t) state;

   return ((mp_size_t) state)%limit;
}
/* Randomised test driver: multiplies operands of random size in
   [500, 15500) with both mpn_mul_n and mpn_toom7_mul_n, stopping at
   the first discrepancy reported by tc7_test. */
int main(void)
{
   mp_limb_t * up = malloc(20000*sizeof(mp_limb_t));
   mp_limb_t * vp = malloc(20000*sizeof(mp_limb_t));
   mp_size_t i, n;

   if ((up == NULL) || (vp == NULL))
   {
      printf("Unable to allocate memory for test operands\n");
      free(up); /* free(NULL) is a no-op */
      free(vp);
      return 1;
   }

   for (i = 0; i < 20000; i++)
   {
      /* largest n (15499) fits the 20000-limb buffers */
      n = randsize(15000) + 500;
      /* mp_size_t need not be int; cast to match the format */
      printf("n = %ld\n", (long) n);

      mpn_random2(up, n);
      mpn_random2(vp, n);

      if (!tc7_test(up, vp, n)) break;
   }

   free(up);
   free(vp);

   return 0;
}
#endif
#if TC7_TIME
/* Timing driver: repeatedly multiplies 2048-limb operands with
   mpn_toom7_mul_n, refreshing the random inputs every 32 iterations
   so timings are not dominated by a single cached operand pair. */
int main(void)
{
   mp_limb_t * up = malloc(40096*sizeof(mp_limb_t));
   mp_limb_t * vp = malloc(40096*sizeof(mp_limb_t));
   mp_limb_t * rp = malloc(80192*sizeof(mp_limb_t));
   mp_size_t i, n;

   if ((up == NULL) || (vp == NULL) || (rp == NULL))
   {
      fprintf(stderr, "Unable to allocate memory for timing run\n");
      free(up); /* free(NULL) is a no-op */
      free(vp);
      free(rp);
      return 1;
   }

   n = 2048;
   mpn_random(up, n);
   mpn_random(vp, n);

   for (i = 0; i < 50000; i++)
   {
      if ((i & 31) == 0)
      {
         /* periodically change the operands being timed */
         mpn_random(up, n);
         mpn_random(vp, n);
      }

      //mpn_mul_n(rp, up, vp, n);
      mpn_toom7_mul_n(rp, up, vp, n);
   }

   free(up);
   free(vp);
   free(rp);

   return 0;
}
#endif
/* Bill Hart's hand-derived interpolation sequence (slightly slower than Bodrato's).
The following code verifies this sequence in Sage:
A=Matrix(ZZ,13)
A[0]=[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
A[1]=[16777216, 4194304, 1048576, 262144, 65536, 16384, 4096, 1024, 256, 64, 16, 4, 1]
A[2]=[16777216, -4194304, 1048576, -262144, 65536, -16384, 4096, -1024, 256, -64, 16, -4, 1]
A[3]=[531441, 177147, 59049, 19683, 6561, 2187, 729, 243, 81, 27, 9, 3, 1]
A[4]=[4096, 2048, 1024, 512, 256, 128, 64, 32, 16, 8, 4, 2, 1]
A[5]=[4096, -2048, 1024, -512, 256, -128, 64, -32, 16, -8, 4, -2, 1]
A[6]=[1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216]
A[7]=[1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096]
A[8]=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
A[9]=[1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1]
A[10]=[1, -2, 4, -8, 16, -32, 64, -128, 256, -512, 1024, -2048, 4096]
A[11]=[1, -4, 16, -64, 256, -1024, 4096, -16384, 65536, -262144, 1048576, -4194304, 16777216]
A[12]=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
A[1]=A[1]-A[2]
A[1]=A[1]/8
A[2]=A[2]-16777216*A[0]
A[4]=A[4]-A[5]
A[5]=A[5]-4096*A[0]
A[3]=A[3]-531441*A[0]
A[6]=A[6]-A[11]
A[11]=A[11]-16777216*A[12]
A[10]=A[10]-A[7]
A[7]=A[7]-4096*A[12]
A[8]=A[8]-A[9]
A[8]=A[8]/2
A[10]=A[10]/4
A[6]=A[6]+16384*A[10]
A[2]=A[2]+4*A[1]
A[6]=A[6]/8
A[11]=A[11]-4*A[6]
A[6]=A[6]-1024*A[10]
A[7]=A[7]+2*A[10]
A[11]=A[11]-1024*A[7]
A[9]=A[9]+A[8]
A[4]=A[4]+2*A[5]
A[4]=A[4]/2
A[5]=A[5]-A[4]
A[1]=A[1]+512*A[5]
A[5]=A[5]+2048*A[8]
A[10]=A[10]+A[8]
A[5]=A[5]+2*A[10]
A[5]=A[5]/90
A[11]=A[11]+1023*A[0]
A[7]=A[7]-A[0]
A[1]=A[1]-A[10]
A[1]=A[1]+11565*A[5]
A[9]=A[9]-A[0]
A[9]=A[9]-A[12]
A[4]=A[4]-A[12]
A[2]=A[2]-1024*A[4]
A[2]=A[2]-A[12]
A[10]=A[10]+1023*A[8]
A[11]=A[11]-8*A[10]
A[6]=A[6]+A[10]
A[6]=A[6]+11520*A[5]
A[6]=A[6]/11340
A[5]=A[5]-A[6]
A[11]=A[11]-92160*A[5]
A[1]=A[1]-11340*A[5]
A[1]=A[1]/170100
A[7]=A[7]-4*A[9]
A[5]=A[5]-A[1]
A[8]=A[8]-A[1]
A[10]=A[10]+15*A[5]
A[10]=A[10]-1023*A[8]
A[10]=A[10]+3*A[6]
A[10]=A[10]-1068*A[1]
A[2]=A[2]+4*A[7]
A[2]=A[2]/720
A[4]=A[4]-4*A[9]
A[11]=A[11]+4*A[4]
A[11]=A[11]-9360*A[2]
A[11]=A[11]-1440*A[6]
A[7]=A[7]-1020*A[9]
A[7]=A[7]+A[4]
A[7]=A[7]-36*A[2]
A[11]=A[11]-280*A[7]
A[11]=A[11]/129600
A[7]=A[7]/432
A[4]=A[4]-12*A[7]
A[2]=A[2]+13*A[7]
A[2]=A[2]+20*A[11]
A[2]=A[2]/21
A[2]=-A[2]
A[7]=A[7]-5*A[11]
A[7]=A[7]/21
A[9]=A[9]-A[11]
A[9]=A[9]-A[7]
A[6]=A[6]-16*A[1]
A[5]=A[5]-17*A[8]
A[5]=A[5]+A[6]
A[5]=A[5]-4*A[1]
A[4]=A[4]-12*A[2]
A[4]=A[4]/1020
A[9]=A[9]-A[2]
A[5]=A[5]/17
A[5]=-A[5]
A[8]=A[8]-A[5]
A[9]=A[9]-A[4]
A[3]=A[3]-A[12]
A[3]=A[3]-3*A[5]
A[3]=A[3]-9*A[9]
A[3]=A[3]-27*A[8]
A[3]=A[3]-81*A[2]
A[3]=A[3]-243*A[1]
A[3]=A[3]-729*A[11]
A[3]=A[3]-486*A[6]
A[3]=A[3]-6561*A[7]
A[10]=A[10]+1023*A[5]
A[3]=A[3]-59049*A[4]
A[3]=A[3]*31-A[10]*5368
A[3]=A[3]/95550
A[6]=A[6]-17*A[3]
A[10]=A[10]-48*A[3]
A[10]=A[10]/1023
A[6]=A[6]/4
A[1]=A[1]-A[6]
A[8]=A[8]-A[3]
A[5]=A[5]-A[10]
tc7_sub(r4, &n4, r4, n4, r9, n9);
tc7_rshift_inplace(r4, &n4, 3);
tc7_submul_1(r9, &n9, r1, n1, 16777216);
tc7_sub(r5, &n5, r5, n5, r2, n2);
tc7_submul_1(r2, &n2, r1, n1, 4096);
tc7_submul_1(r12, &n12, r1, n1, 531441);
tc7_sub(r10, &n10, r10, n10, r11, n11);
tc7_submul_1(r11, &n11, r13, n13, 16777216);
tc7_sub(r8, &n8, r8, n8, r3, n3);
tc7_submul_1(r3, &n3, r13, n13, 4096);
tc7_sub(r7, &n7, r7, n7, r6, n6);
tc7_rshift_inplace(r7, &n7, 1);
tc7_rshift_inplace(r8, &n8, 2);
tc7_addmul_1(r10, &n10, r8, n8, 16384);
tc7_addmul_1(r9, &n9, r4, n4, 4);
tc7_rshift_inplace(r10, &n10, 3);
tc7_submul_1(r11, &n11, r10, n10, 4);
tc7_submul_1(r10, &n10, r8, n8, 1024);
tc7_addmul_1(r3, &n3, r8, n8, 2);
tc7_submul_1(r11, &n11, r3, n3, 1024);
tc7_add(r6, &n6, r6, n6, r7, n7);
tc7_addmul_1(r5, &n5, r2, n2, 2);
tc7_rshift_inplace(r5, &n5, 1);
tc7_sub(r2, &n2, r2, n2, r5, n5);
tc7_addmul_1(r4, &n4, r2, n2, 512);
tc7_addmul_1(r2, &n2, r7, n7, 2048);
tc7_add(r8, &n8, r8, n8, r7, n7);
tc7_addmul_1(r2, &n2, r8, n8, 2);
tc7_divexact_1(r2, &n2, r2, n2, 90);
tc7_addmul_1(r11, &n11, r1, n1, 1023);
tc7_sub(r3, &n3, r3, n3, r1, n1);
tc7_sub(r4, &n4, r4, n4, r8, n8);
tc7_addmul_1(r4, &n4, r2, n2, 11565);
tc7_sub(r6, &n6, r6, n6, r1, n1);
tc7_sub(r6, &n6, r6, n6, r13, n13);
tc7_sub(r5, &n5, r5, n5, r13, n13);
tc7_submul_1(r9, &n9, r5, n5, 1024);
tc7_sub(r9, &n9, r9, n9, r13, n13);
tc7_addmul_1(r8, &n8, r7, n7, 1023);
tc7_submul_1(r11, &n11, r8, n8, 8);
tc7_add(r10, &n10, r10, n10, r8, n8);
tc7_addmul_1(r10, &n10, r2, n2, 11520);
tc7_divexact_1(r10, &n10, r10, n10, 11340);
tc7_sub(r2, &n2, r2, n2, r10, n10);
tc7_submul_1(r11, &n11, r2, n2, 92160);
tc7_submul_1(r4, &n4, r2, n2, 11340);
tc7_divexact_1(r4, &n4, r4, n4, 170100);
tc7_submul_1(r3, &n3, r6, n6, 4);
tc7_sub(r2, &n2, r2, n2, r4, n4);
tc7_sub(r7, &n7, r7, n7, r4, n4);
tc7_addmul_1(r8, &n8, r2, n2, 15);
tc7_submul_1(r8, &n8, r7, n7, 1023);
tc7_addmul_1(r8, &n8, r10, n10, 3);
tc7_submul_1(r8, &n8, r4, n4, 1068);
tc7_addmul_1(r9, &n9, r3, n3, 4);
tc7_divexact_1(r9, &n9, r9, n9, 720);
tc7_submul_1(r5, &n5, r6, n6, 4);
tc7_addmul_1(r11, &n11, r5, n5, 4);
tc7_submul_1(r11, &n11, r9, n9, 9360);
tc7_submul_1(r11, &n11, r10, n10, 1440);
tc7_submul_1(r3, &n3, r6, n6, 1020);
tc7_add(r3, &n3, r3, n3, r5, n5);
tc7_submul_1(r3, &n3, r9, n9, 36);
tc7_submul_1(r11, &n11, r3, n3, 280);
tc7_divexact_1(r11, &n11, r11, n11, 129600);
tc7_divexact_1(r3, &n3, r3, n3, 432);
tc7_submul_1(r5, &n5, r3, n3, 12);
tc7_addmul_1(r9, &n9, r3, n3, 13);
tc7_addmul_1(r9, &n9, r11, n11, 20);
tc7_divexact_1(r9, &n9, r9, n9, 21);
n9 = -n9;
tc7_submul_1(r3, &n3, r11, n11, 5);
tc7_divexact_1(r3, &n3, r3, n3, 21);
tc7_sub(r6, &n6, r6, n6, r11, n11);
tc7_sub(r6, &n6, r6, n6, r3, n3);
tc7_submul_1(r10, &n10, r4, n4, 16);
tc7_submul_1(r2, &n2, r7, n7, 17);
tc7_add(r2, &n2, r2, n2, r10, n10);
tc7_submul_1(r2, &n2, r4, n4, 4);
tc7_submul_1(r5, &n5, r9, n9, 12);
tc7_divexact_1(r5, &n5, r5, n5, 1020);
tc7_sub(r6, &n6, r6, n6, r9, n9);
tc7_divexact_1(r2, &n2, r2, n2, 17);
n2 = -n2;
tc7_sub(r7, &n7, r7, n7, r2, n2);
tc7_sub(r6, &n6, r6, n6, r5, n5);
tc7_sub(r12, &n12, r12, n12, r13, n13);
tc7_submul_1(r12, &n12, r2, n2, 3);
tc7_submul_1(r12, &n12, r6, n6, 9);
tc7_submul_1(r12, &n12, r7, n7, 27);
tc7_submul_1(r12, &n12, r9, n9, 81);
tc7_submul_1(r12, &n12, r4, n4, 243);
tc7_submul_1(r12, &n12, r11, n11, 729);
tc7_submul_1(r12, &n12, r10, n10, 486);
tc7_submul_1(r12, &n12, r3, n3, 6561);
tc7_addmul_1(r8, &n8, r2, n2, 1023);
tc7_submul_1(r12, &n12, r5, n5, 59049);
tc7_mul_1(r12, &n12, r12, n12, 31);
tc7_submul_1(r12, &n12, r8, n8, 5368);
tc7_divexact_1(r12, &n12, r12, n12, 95550);
tc7_submul_1(r10, &n10, r12, n12, 17);
tc7_submul_1(r8, &n8, r12, n12, 48);
tc7_divexact_1(r8, &n8, r8, n8, 1023);
tc7_rshift_inplace(r10, &n10, 2);
tc7_sub(r4, &n4, r4, n4, r10, n10);
tc7_sub(r7, &n7, r7, n7, r12, n12);
tc7_sub(r2, &n2, r2, n2, r8, n8);
rpn = 0;
tc7_copy(rp, &rpn, 0, r13, n13);
tc7_copy(rp, &rpn, sn, r2, n2);
tc7_copy(rp, &rpn, 2*sn, r6, n6);
tc7_copy(rp, &rpn, 3*sn, r7, n7);
tc7_copy(rp, &rpn, 4*sn, r9, n9);
tc7_copy(rp, &rpn, 5*sn, r4, n4);
tc7_copy(rp, &rpn, 6*sn, r11, n11);
tc7_copy(rp, &rpn, 7*sn, r10, n10);
tc7_copy(rp, &rpn, 8*sn, r3, n3);
tc7_copy(rp, &rpn, 9*sn, r12, n12);
tc7_copy(rp, &rpn, 10*sn, r5, n5);
tc7_copy(rp, &rpn, 11*sn, r8, n8);
tc7_copy(rp, &rpn, 12*sn, r1, n1);
*/