/* (extraction artifact removed: stray "lines / KiB / language" metadata) */
/* mpn_toom7_mul_n -- Internal routine to multiply two natural numbers
|
|
of length n.
|
|
|
|
THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY
|
|
SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.
|
|
*/
|
|
|
|
/* Implementation of the Bodrato-Zanoni algorithm for Toom-Cook 7-way.
|
|
|
|
Copyright 2001, 2002, 2004, 2005, 2006 Free Software Foundation, Inc.
|
|
Copyright Marco Bodrato, November 2006
|
|
Copyright 2009 William Hart
|
|
|
|
This file is part of the MPIR Library.
|
|
|
|
The MPIR Library is free software; you can redistribute it and/or modify
|
|
it under the terms of the GNU Lesser General Public License as published by
|
|
the Free Software Foundation; either version 2.1 of the License, or (at your
|
|
option) any later version.
|
|
|
|
The MPIR Library is distributed in the hope that it will be useful, but
|
|
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
|
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
|
License for more details.
|
|
|
|
You should have received a copy of the GNU Lesser General Public License
|
|
along with the MPIR Library; see the file COPYING.LIB. If not, write to
|
|
the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
|
MA 02110-1301, USA. */
|
|
|
|
/*
|
|
This implementation is based on that of Paul Zimmermann, which is available
|
|
for Toom 4 for mpz_t's at http://www.loria.fr/~zimmerma/software/toom7.c
|
|
and uses the Toom 7 sequence as generated by Bodrato and Zanoni at
|
|
http://bodrato.it/software/tc3-7.27nov2006.tar.bz2
|
|
|
|
Please see the papers of Bodrato (and Zanoni) at http://www.bodrato.it/
|
|
In particular see the paper by these authors:
|
|
|
|
Integer and Polynomial Multiplication: Towards
|
|
Optimal Toom-Cook Matrices in Proceedings of the ISSAC 2007 conference,
|
|
Ontario, Canada, July 29-August 1, 2007, ACM press.
|
|
*/
|
|
|
|
#include "mpir.h"
|
|
#include "gmp-impl.h"
|
|
#include "longlong.h"
|
|
|
|
#define TC7_LIB 1 // library build only
|
|
|
|
#if !TC7_LIB
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#define TC7_TEST 0 // test code
|
|
#define TC7_TIME 1 // timing code
|
|
#else
|
|
#define TC7_TEST 0
|
|
#define TC7_TIME 0
|
|
#endif
|
|
|
|
void
|
|
mpn_toom7_mul_n (mp_ptr rp, mp_srcptr up,
|
|
mp_srcptr vp, mp_size_t n);
|
|
|
|
void toom7_interpolate(mp_ptr rp, mp_size_t * rpn, mp_size_t sn,
|
|
mp_ptr tp, mp_size_t s7, mp_size_t n2, mp_size_t n6,
|
|
mp_size_t n8, mp_size_t n9, mp_size_t n11);
|
|
|
|
/* Signed bignum addition kernel: rp <- r1 + r2.  Operands use the tc7
   sign-magnitude convention: a signed length (negative length means a
   negative value, |length| = limb count) together with a limb array.
   Requires ABS(r1n) >= ABS(r2n); callers go through tc7_add, which orders
   the operands.  The signed result length is written to *rn.
   rp must have room for ABS(r1n) + 1 limbs (possible carry limb). */
void _tc7_add(mp_ptr rp, mp_size_t * rn, mp_srcptr r1,
			  mp_size_t r1n, mp_srcptr r2, mp_size_t r2n)
{
   mp_limb_t cy;
   mp_size_t s1 = ABS(r1n);
   mp_size_t s2 = ABS(r2n);

   if (!s1)
   {
      /* s1 >= s2, so both operands are zero */
      *rn = 0;
   } else if (!s2)
   {
      /* r2 == 0: the result is just r1 */
      if (rp != r1) MPN_COPY(rp, r1, s1);
      *rn = r1n;
   } else if ((r1n ^ r2n) >= 0)
   {
      /* same sign: add magnitudes, result keeps the common sign */
      *rn = r1n;
      cy = mpn_add(rp, r1, s1, r2, s2);
      if (cy)
      {
         rp[s1] = cy;
         /* grow |*rn| by one limb, preserving its sign */
         if ((*rn) < 0) (*rn)--;
         else (*rn)++;
      }
   } else
   {
      /* opposite signs: subtract the smaller magnitude from the larger */
      mp_size_t ct;
      if (s1 != s2) ct = 1; /* more limbs => strictly larger magnitude */
      else MPN_CMP(ct, r1, r2, s1);

      if (!ct) *rn = 0; /* equal magnitudes cancel exactly */
      else if (ct > 0)
      {
         /* |r1| > |r2|: result takes r1's sign */
         mpn_sub(rp, r1, s1, r2, s2);
         *rn = s1;
         MPN_NORMALIZE(rp, (*rn));
         if (r1n < 0) *rn = -(*rn);
      }
      else
      {
         /* |r2| > |r1| (only possible with equal limb counts here):
            result takes r2's sign, i.e. the opposite of r1's */
         mpn_sub_n(rp, r2, r1, s1);
         *rn = s1;
         MPN_NORMALIZE(rp, (*rn));
         if (r1n > 0) *rn = -(*rn);
      }
   }
}
|
|
|
|
/* Public signed add: rp <- r1 + r2 in the tc7 signed-length convention.
   Orders the operands so the one with the larger magnitude (more limbs)
   is passed first to the _tc7_add kernel, which requires that. */
void tc7_add(mp_ptr rp, mp_size_t * rn, mp_srcptr r1, mp_size_t r1n, mp_srcptr r2, mp_size_t r2n)
{
   if (ABS(r2n) > ABS(r1n))
      _tc7_add(rp, rn, r2, r2n, r1, r1n);
   else
      _tc7_add(rp, rn, r1, r1n, r2, r2n);
}
|
|
|
|
#if HAVE_NATIVE_mpn_sumdiff_n
|
|
/* Fused sum/difference for signed (signed-length) operands:
      rp <- r1 + r2,  sp <- r1 - r2
   using a single mpn_sumdiff_n pass over the common limbs where possible.
   Signed result lengths are written to *rn and *sn.  rp and sp each need
   room for MAX(|r1n|, |r2n|) + 1 limbs.
   Only compiled when a native mpn_sumdiff_n exists. */
void tc7_sumdiff(mp_ptr rp, mp_size_t * rn, mp_ptr sp, mp_size_t * sn, mp_ptr r1, mp_size_t r1n, mp_ptr r2, mp_size_t r2n)
{
   mp_limb_t cy, cy2;
   mp_size_t s1 = ABS(r1n);
   mp_size_t s2 = ABS(r2n);
   int swapped = 0;

   /* order so |r1| has at least as many limbs as |r2|; remember the swap
      so the sign of the difference can be fixed up afterwards */
   if (s1 < s2)
   {
      MPN_PTR_SWAP(r1, r1n, r2, r2n);
      MP_SIZE_T_SWAP(s1, s2);
      swapped = 1;
   }

   if (!s1)
   {
      /* both zero */
      *rn = 0;
      *sn = 0;
   } else if (!s2)
   {
      /* r2 == 0: sum and difference are both r1 (difference negated if
         the operands were swapped) */
      if (rp != r1) MPN_COPY(rp, r1, s1);
      if (sp != r1) MPN_COPY(sp, r1, s1);
      *rn = r1n;
      *sn = (swapped ? -r1n : r1n);
   } else
   {
      mp_size_t ct;
      if (s1 != s2) ct = 1;
      else MPN_CMP(ct, r1, r2, s1);

      if (!ct)
      {
         /* equal magnitudes: one of sum/difference cancels to zero and
            the other is 2*r1 */
         if ((r1n ^ r2n) >= 0)
         {
            *sn = 0;
            *rn = r1n;
            cy = mpn_lshift1(rp, r1, s1);
            if (cy)
            {
               rp[s1] = cy;
               if ((*rn) < 0) (*rn)--;
               else (*rn)++;
            }
         } else
         {
            *rn = 0;
            *sn = (swapped ? -r1n : r1n);
            cy = mpn_lshift1(sp, r1, s1);
            if (cy)
            {
               sp[s1] = cy;
               if ((*sn) < 0) (*sn)--;
               else (*sn)++;
            }
         }
      } else if (ct > 0) // r1 is bigger than r2
      {
         if ((r1n ^ r2n) >= 0) // both inputs are same sign
         {
            /* cy packs two carries: bit 1 = carry of the sum,
               bit 0 = borrow of the difference */
            cy = mpn_sumdiff_n(rp, sp, r1, r2, s2);
            if (s1 > s2) // r1 has more limbs than r2
            {
               *rn = r1n;
               cy2 = mpn_add_1(rp + s2, r1 + s2, s1 - s2, cy>>1);
               if (cy2)
               {
                  rp[s1] = cy2;
                  if ((*rn) < 0) (*rn)--;
                  else (*rn)++;
               }
               mpn_sub_1(sp + s2, r1 + s2, s1 - s2, cy&1);
            } else // both inputs have same number of limbs
            {
               *rn = r1n;
               if (cy>>1)
               {
                  rp[s1] = 1;
                  if ((*rn) < 0) (*rn)--;
                  else (*rn)++;
               }
            }
            *sn = s1;
            MPN_NORMALIZE(sp, (*sn));
            /* NOTE(review): `<` binds tighter than `^`, so this parses as
               r1n ^ ((~swapped) < 0), NOT (r1n ^ ~swapped) < 0.  Verify
               the intended sign rule for the difference. */
            if (r1n ^ (~swapped) < 0) *sn = -(*sn);
         } else // inputs are different sign
         {
            /* roles swap: sp receives the sum of magnitudes, rp the
               difference, because the signs differ */
            cy = mpn_sumdiff_n(sp, rp, r1, r2, s2);
            if (s1 > s2) // r1 has more limbs than r2
            {
               *sn = r1n;
               cy2 = mpn_add_1(sp + s2, r1 + s2, s1 - s2, cy>>1);
               if (cy2)
               {
                  sp[s1] = cy2;
                  if ((*sn) < 0) (*sn)--;
                  else (*sn)++;
               }
               mpn_sub_1(rp + s2, r1 + s2, s1 - s2, cy&1);
            } else // both inputs have same number of limbs
            {
               *sn = r1n;
               if (cy>>1)
               {
                  sp[s1] = 1;
                  if ((*sn) < 0) (*sn)--;
                  else (*sn)++;
               }
            }
            *rn = s1;
            MPN_NORMALIZE(rp, (*rn));
            /* NOTE(review): same precedence concern as above */
            if (r1n ^ (~swapped) < 0) *rn = -(*rn);
         }
      } else // r2 is bigger than r1 (but same number of limbs)
      {
         if ((r1n ^ r2n) >= 0) // both inputs are same sign
         {
            cy = mpn_sumdiff_n(rp, sp, r2, r1, s1);
            *rn = r1n;
            if (cy>>1)
            {
               rp[s1] = 1;
               if ((*rn) < 0) (*rn)--;
               else (*rn)++;
            }
            *sn = s1;
            MPN_NORMALIZE(sp, (*sn));
            /* NOTE(review): same precedence concern; also `> 0` here vs
               `< 0` above — confirm intended */
            if (r1n ^ (~swapped) > 0) *sn = -(*sn);
         } else // inputs are different sign
         {
            cy = mpn_sumdiff_n(sp, rp, r2, r1, s1);
            *sn = r1n;
            if (cy>>1)
            {
               sp[s1] = 1;
               if ((*sn) < 0) (*sn)--;
               else (*sn)++;
            }
            *rn = s1;
            MPN_NORMALIZE(rp, (*rn));
            /* NOTE(review): same precedence concern */
            if (r1n ^ (~swapped) > 0) *rn = -(*rn);
         }
      }
   }
}
|
|
|
|
/* Fused sum/difference for NON-NEGATIVE operands:
      rp <- r1 + r2,  sp <- |r1 - r2|
   with *sn made negative when r1 - r2 is negative.  Lengths r1n, r2n are
   assumed non-negative here.  rp and sp each need room for
   MAX(r1n, r2n) + 1 limbs.  Only compiled with native mpn_sumdiff_n. */
void tc7_sumdiff_unsigned(mp_ptr rp, mp_size_t * rn, mp_ptr sp, mp_size_t * sn, mp_ptr r1, mp_size_t r1n, mp_ptr r2, mp_size_t r2n)
{
   mp_limb_t cy, cy2;
   mp_size_t s1 = ABS(r1n);
   mp_size_t s2 = ABS(r2n);
   int swapped = 0;

   /* order so r1 is the (weakly) longer operand; record the swap so the
      difference can be negated back afterwards */
   if (s1 < s2)
   {
      MPN_PTR_SWAP(r1, r1n, r2, r2n);
      MP_SIZE_T_SWAP(s1, s2);
      swapped = 1;
   }

   if (!s1)
   {
      /* both zero */
      *rn = 0;
      *sn = 0;
   } else if (!s2)
   {
      /* r2 == 0: sum and difference both equal r1 */
      if (rp != r1) MPN_COPY(rp, r1, s1);
      if (sp != r1) MPN_COPY(sp, r1, s1);
      *rn = r1n;
      *sn = (swapped ? -r1n : r1n);
   } else
   {
      mp_size_t ct;
      if (s1 != s2) ct = 1;
      else MPN_CMP(ct, r1, r2, s1);

      if (!ct)
      {
         /* equal operands: difference is zero, sum is 2*r1 */
         *sn = 0;
         *rn = r1n;
         cy = mpn_lshift1(rp, r1, s1);
         if (cy)
         {
            rp[s1] = cy;
            (*rn)++;
         }
      } else if (ct > 0) // r1 is bigger than r2
      {
         /* cy bit 1 = sum carry, bit 0 = difference borrow */
         cy = mpn_sumdiff_n(rp, sp, r1, r2, s2);
         if (s1 > s2) // r1 has more limbs than r2
         {
            *rn = r1n;
            cy2 = mpn_add_1(rp + s2, r1 + s2, s1 - s2, cy>>1);
            if (cy2)
            {
               rp[s1] = cy2;
               (*rn)++;
            }
            mpn_sub_1(sp + s2, r1 + s2, s1 - s2, cy&1);
         } else // both inputs have same number of limbs
         {
            *rn = r1n;
            if (cy>>1)
            {
               rp[s1] = 1;
               (*rn)++;
            }
         }
         *sn = s1;
         MPN_NORMALIZE(sp, (*sn));
         /* difference is r1 - r2 of the swapped order: negate if the
            original first operand was the smaller one */
         if (swapped) *sn = -(*sn);
      } else // r2 is bigger than r1 (but same number of limbs)
      {
         cy = mpn_sumdiff_n(rp, sp, r2, r1, s1);
         *rn = r1n;
         if (cy>>1)
         {
            rp[s1] = 1;
            (*rn)++;
         }
         *sn = s1;
         MPN_NORMALIZE(sp, (*sn));
         /* here r2 > r1, so the true difference is negative unless the
            operands were swapped on entry */
         if (!swapped) *sn = -(*sn);
      }
   }
}
|
|
#endif
|
|
|
|
/* ~~~~~~~~~~~~~~~~~~~~UNTESTED CODE~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
|
|
#define MPN_ADDSUB_CMP(cyxx, r1xx, r2xx, r3xx, snxx) \
|
|
do { \
|
|
mp_limb_t t[2]; \
|
|
add_ssaaaa(t[1], t[0], CNST_LIMB(0), r1xx[snxx - 1], CNST_LIMB(0), r2xx[snxx - 1]); \
|
|
if (t[1]) cyxx = 1; \
|
|
else if (t[0] > r3xx[snxx - 1]) cyxx = 1; \
|
|
else if (t[0] < r3xx[snxx - 1] - CNST_LIMB(1)) cyxx = -1; \
|
|
else cyxx = 0; \
|
|
} while (0)
|
|
|
|
void tc7_addadd(mp_ptr rp, mp_size_t * rn, mp_ptr r1, mp_size_t r1n, mp_ptr r2, mp_size_t r2n, mp_ptr r3, n3, mp_size_t r3n)
|
|
{
|
|
mp_size_t s1 = ABS(r1n);
|
|
mp_size_t s2 = ABS(r2n);
|
|
mp_size_t s3 = ABS(r3n);
|
|
|
|
if ((s1 != s2) || (s1 != s3))
|
|
{
|
|
tc7_add(rp, rn, r1, r1n, r2, r2n);
|
|
tc7_add(rp, rn, rp, *rn, r3, r3n);
|
|
} else
|
|
{
|
|
mp_limb_t cy;
|
|
mp_size_t cy2;
|
|
if (((r1n ^ r2n) >= 0) && ((r1n ^ r3n) >= 0)) // all same sign addadd
|
|
{
|
|
cy = mpn_addadd_n(rp, r1, r2, r3, s1);
|
|
*rn = r1n;
|
|
if (cy)
|
|
{
|
|
rp[s1] = cy;
|
|
if ((*rn) < 0) (*rn)--;
|
|
else (*rn)++;
|
|
}
|
|
} else if (((r1n ^ r2n) >= 0) && ((r1n ^ r3n) < 0)) // addsub
|
|
{
|
|
MPN_ADDSUB_CMP(cy2, r1, r2, r3, s1);
|
|
|
|
if (cy2 > 0)
|
|
{
|
|
cy = mpn_addsub_n(rp, r1, r2, r3, s1);
|
|
*rn = r1n;
|
|
if (cy)
|
|
{
|
|
rp[s1] = cy;
|
|
if ((*rn) < 0) (*rn)--;
|
|
else (*rn)++;
|
|
}
|
|
} else if (cy2 < 0)
|
|
{
|
|
cy = mpn_subadd_n(rp, r3, r1, r2, s1);
|
|
if (cy) abort();
|
|
*rn = s1;
|
|
MPN_NORMALIZE(rp, (*rn));
|
|
if (r1n < 0) *rn = -(*rn);
|
|
} else
|
|
{
|
|
tc7_add(rp, rn, r1, r1n, r2, r2n);
|
|
tc7_add(rp, rn, rp, *rn, r3, r3n);
|
|
}
|
|
} else if (((r1n ^ r2n) < 0) && ((r1n ^ r3n) >= 0)) // subadd
|
|
{
|
|
MPN_ADDSUB_CMP(cy2, r1, r3, r2, s1);
|
|
|
|
if (cy2 > 0)
|
|
{
|
|
cy = mpn_addsub_n(rp, r1, r3, r2, s1);
|
|
*rn = r1n;
|
|
if (cy)
|
|
{
|
|
rp[s1] = cy;
|
|
if ((*rn) < 0) (*rn)--;
|
|
else (*rn)++;
|
|
}
|
|
} else if (cy2 < 0)
|
|
{
|
|
mpn_subadd_n(rp, r2, r1, r3, s1);
|
|
*rn = s1;
|
|
MPN_NORMALIZE(rp, (*rn));
|
|
if (r1n > 0) *rn = -(*rn);
|
|
} else
|
|
{
|
|
tc7_add(rp, rn, r1, r1n, r2, r2n);
|
|
tc7_add(rp, rn, rp, *rn, r3, r3n);
|
|
}
|
|
} else // add final two and subtract first
|
|
{
|
|
MPN_ADDSUB_CMP(cy2, r2, r3, r1, s1);
|
|
|
|
if (cy2 > 0)
|
|
{
|
|
cy = mpn_addsub_n(rp, r2, r3, r1, s1);
|
|
*rn = r1n;
|
|
if (cy)
|
|
{
|
|
rp[s1] = cy;
|
|
if ((*rn) < 0) (*rn)--;
|
|
else (*rn)++;
|
|
}
|
|
} else if (cy2 < 0)
|
|
{
|
|
mpn_subadd_n(rp, r1, r2, r3, s1);
|
|
*rn = s1;
|
|
MPN_NORMALIZE(rp, (*rn));
|
|
if (r1n < 0) *rn = -(*rn);
|
|
} else
|
|
{
|
|
tc7_add(rp, rn, r1, r1n, r2, r2n);
|
|
tc7_add(rp, rn, rp, *rn, r3, r3n);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
void tc7_addsub(mp_ptr rp, mp_size_t * rn, mp_ptr r1, mp_size_t r1n, mp_ptr r2, mp_size_t r2n, mp_ptr r3, mp_size_t r3n)
|
|
{
|
|
tc7_addadd(rp, rn, r1, r1n, r2, r2n, r3, -r3n);
|
|
}
|
|
|
|
void tc7_subsub(mp_ptr rp, mp_size_t * rn, mp_ptr r1, mp_size_t r1n, mp_ptr r2, mp_size_t r2n, mp_ptr r3, mp_size_t r3n)
|
|
{
|
|
tc7_addadd(rp, rn, r1, r1n, r2, -r2n, r3, -r3n);
|
|
}
|
|
|
|
*/
|
|
|
|
/* Unsigned addition kernel: rp <- r1 + r2 for non-negative lengths.
   Requires r1n >= r2n (enforced by the tc7_add_unsigned wrapper).
   The result length goes to *rn; rp needs room for r1n + 1 limbs. */
void _tc7_add_unsigned(mp_ptr rp, mp_size_t * rn, mp_srcptr r1, mp_size_t r1n, mp_srcptr r2, mp_size_t r2n)
{
   mp_limb_t cy;
   mp_size_t s1 = r1n;
   mp_size_t s2 = r2n;

   if (!s2)
   {
      if (!s1) *rn = 0;
      else
      {
         /* r2 == 0: result is r1 */
         if (rp != r1) MPN_COPY(rp, r1, s1);
         *rn = r1n;
      }
   } else
   {
      *rn = r1n;
      cy = mpn_add(rp, r1, s1, r2, s2);
      if (cy)
      {
         rp[s1] = cy;
         /* NOTE(review): lengths are non-negative for this "unsigned"
            routine, so the negative branch looks unreachable; kept for
            symmetry with _tc7_add */
         if ((*rn) < 0) (*rn)--;
         else (*rn)++;
      }
   }
}
|
|
|
|
/* Unsigned add wrapper: rp <- r1 + r2, lengths non-negative.  Passes the
   longer operand first, as _tc7_add_unsigned requires. */
void tc7_add_unsigned(mp_ptr rp, mp_size_t * rn, mp_srcptr r1, mp_size_t r1n, mp_srcptr r2, mp_size_t r2n)
{
   if (r2n > r1n)
      _tc7_add_unsigned(rp, rn, r2, r2n, r1, r1n);
   else
      _tc7_add_unsigned(rp, rn, r1, r1n, r2, r2n);
}
|
|
|
|
/* Signed subtraction: rp <- r1 - r2, implemented as addition of the
   negated second operand (sign lives in the length in this convention). */
void tc7_sub(mp_ptr rp, mp_size_t * rn, mp_ptr r1, mp_size_t r1n, mp_ptr r2, mp_size_t r2n)
{
   tc7_add(rp, rn, r1, r1n, r2, -r2n);
}
|
|
|
|
/* rp <- xp << bits (0 < bits < GMP_LIMB_BITS), preserving the sign carried
   by xn.  rp needs room for ABS(xn) + 1 limbs; *rn receives the signed
   result length. */
void tc7_lshift(mp_ptr rp, mp_size_t * rn, mp_srcptr xp, mp_size_t xn, mp_size_t bits)
{
   mp_size_t xu;
   mp_limb_t top;

   if (xn == 0)
   {
      *rn = 0;
      return;
   }

   xu = ABS(xn);
   top = mpn_lshift(rp, xp, xu, bits);
   if (top == 0)
   {
      *rn = xn;
      return;
   }

   /* bits shifted out the top: result grows by one limb */
   rp[xu] = top;
   *rn = (xn >= 0) ? xn + 1 : xn - 1;
}
|
|
|
|
/* In-place rp >>= bits (0 < bits < GMP_LIMB_BITS), keeping the sign in
   *rn.  If the top limb becomes zero the signed length shrinks by one. */
void tc7_rshift_inplace(mp_ptr rp, mp_size_t * rn, mp_size_t bits)
{
   mp_size_t ru = ABS(*rn);

   if (ru == 0)
      return;

   mpn_rshift(rp, rp, ru, bits);
   if (rp[ru - 1] == CNST_LIMB(0))
   {
      /* shrink |*rn| by one limb, preserving its sign */
      if ((*rn) > 0) (*rn)--;
      else (*rn)++;
   }
}
|
|
|
|
#if HAVE_NATIVE_mpn_addlsh1_n
|
|
/* rp <- rp + 2*xp for non-negative operands, updating the length *rn.
   Uses the native combined add-and-shift mpn_addlsh1_n.  rp must have
   room for MAX(*rn, xn) + 1 limbs. */
void tc7_addlsh1_unsigned(mp_ptr rp, mp_size_t * rn, mp_srcptr xp, mp_size_t xn)
{
   if (xn)
   {
      if (xn >= *rn)
      {
         /* x is at least as long as r: zero-pad r up to xn first */
         mp_limb_t cy;
         if (xn > *rn) MPN_ZERO(rp + *rn, xn - *rn);
         cy = mpn_addlsh1_n(rp, rp, xp, xn);
         if (cy)
         {
            rp[xn] = cy;
            *rn = xn + 1;
         } else *rn = xn;
      } else
      {
         /* r is longer: add over the low xn limbs then propagate the
            carry through the remainder of r */
         mp_limb_t cy = mpn_addlsh1_n(rp, rp, xp, xn);
         if (cy) cy = mpn_add_1(rp + xn, rp + xn, *rn - xn, cy);
         if (cy)
         {
            rp[*rn] = cy;
            (*rn)++;
         }
      }
   }
}
|
|
#endif
|
|
/*
|
|
void tc7_divexact_ui(mp_ptr rp, mp_size_t * rn, mp_srcptr x, mp_size_t xn, mp_limb_t c)
|
|
{
|
|
if (xn)
|
|
{
|
|
mp_size_t xu = ABS(xn);
|
|
mp_limb_t cy = mpn_divmod_1(rp, x, xu, c);
|
|
if (xn > 0)
|
|
{
|
|
if (rp[xu - 1] == 0) *rn = xn - 1;
|
|
else *rn = xn;
|
|
} else
|
|
{
|
|
if (rp[xu - 1] == 0) *rn = xn + 1;
|
|
else *rn = xn;
|
|
}
|
|
} else *rn = 0;
|
|
}
|
|
*/
|
|
|
|
/* rp <- xp * c, where c is a single limb; the sign carried in xn is
   preserved.  rp needs room for ABS(xn) + 1 limbs; the signed result
   length is written to *rn.  (c is never zero at the tc7 call sites.) */
void tc7_mul_1(mp_ptr rp, mp_size_t * rn, mp_srcptr xp, mp_size_t xn, mp_limb_t c)
{
   mp_size_t xu;
   mp_limb_t top;

   if (xn == 0)
   {
      *rn = 0;
      return;
   }

   xu = ABS(xn);
   top = mpn_mul_1(rp, xp, xu, c);
   if (top == 0)
   {
      *rn = xn;
      return;
   }

   /* non-zero high limb: result is one limb longer */
   rp[xu] = top;
   *rn = (xn >= 0) ? xn + 1 : xn - 1;
}
|
|
|
|
/* Exact division by a single limb: rp <- x / c, where c divides x
   exactly.  The sign carried by xn is preserved; the (possibly one limb
   shorter) signed result length is written to *rn. */
void tc7_divexact_1(mp_ptr rp, mp_size_t * rn, mp_ptr x, mp_size_t xn, mp_limb_t c)
{
   mp_size_t abs_size;
   if (xn == 0)
   {
      *rn = 0;
      return;
   }
   abs_size = ABS (xn);

   MPN_DIVREM_OR_DIVEXACT_1 (rp, x, abs_size, c);
   /* exact division by a limb shrinks the size by at most one */
   abs_size -= (rp[abs_size-1] == 0);
   *rn = (xn >= 0 ? abs_size : -abs_size);
}
|
|
|
|
/* Exact division by 3: rp <- x / 3, where 3 divides x exactly.  The sign
   carried by xn is preserved; *rn receives the signed result length,
   shrunk by one if the top limb vanished. */
void tc7_divexact_by3(mp_ptr rp, mp_size_t * rn, mp_ptr x, mp_size_t xn)
{
   mp_size_t xu;

   if (xn == 0)
   {
      *rn = 0;
      return;
   }

   xu = ABS(xn);
   mpn_divexact_by3(rp, x, xu);

   *rn = xn;
   if (rp[xu - 1] == CNST_LIMB(0))
      *rn += (xn > 0) ? -1 : 1; /* drop the zero high limb, keep sign */
}
|
|
|
|
#if HAVE_NATIVE_mpn_divexact_byBm1of
|
|
/* Exact division by 15: rp <- x / 15, where 15 divides x exactly, using
   the native divexact-by-(B-1)/f primitive.  Sign in xn preserved; the
   signed result length goes to *rn. */
void tc7_divexact_by15(mp_ptr rp, mp_size_t * rn, mp_ptr x, mp_size_t xn)
{
   if (xn)
   {
      mp_size_t xu = ABS(xn);
      /* (~0)/15 is the required inverse constant for both 32- and 64-bit limbs */
      mpn_divexact_byBm1of(rp, x, xu, CNST_LIMB(15), CNST_LIMB((~0)/15)); // works for 32 and 64 bits
      if (xn > 0)
      {
         if (rp[xu - 1] == CNST_LIMB(0)) *rn = xn - 1;
         else *rn = xn;
      } else
      {
         if (rp[xu - 1] == CNST_LIMB(0)) *rn = xn + 1;
         else *rn = xn;
      }
   } else *rn = 0;
}
|
|
#endif
|
|
|
|
/* MPN_MUL_1C(cout, dst, src, size, n, cin): dst <- src * n + cin, writing
   the carry-out limb to cout.  Uses the native mpn_mul_1c when the build
   provides one, otherwise emulates it with mpn_mul_1 + mpn_add_1. */
#if HAVE_NATIVE_mpn_mul_1c
#define MPN_MUL_1C(cout, dst, src, size, n, cin)  \
  do {                                            \
    (cout) = mpn_mul_1c (dst, src, size, n, cin); \
  } while (0)
#else
#define MPN_MUL_1C(cout, dst, src, size, n, cin)         \
  do {                                                   \
    mp_limb_t __cy;                                      \
    __cy = mpn_mul_1 (dst, src, size, n);                \
    (cout) = __cy + mpn_add_1 (dst, dst, size, cin);     \
  } while (0)
#endif
|
|
|
|
/* wp <- wp + xp * y for a single-limb multiplier y, in the tc7
   signed-length convention (sign of x carried in xn, sign of w in *wn).
   Handles all sign combinations, including the case where the submul
   underflows and the result must be two's-complement negated.
   wp must be large enough for MAX(|*wn|, |xn|) + 1 limbs. */
void tc7_addmul_1(mp_ptr wp, mp_size_t * wn, mp_srcptr xp, mp_size_t xn, mp_limb_t y)
{
   mp_size_t sign, wu, xu, ws, new_wn, min_size, dsize;
   mp_limb_t cy;

   /* w unaffected if x==0 or y==0 */
   if (xn == 0 || y == 0)
      return;

   sign = xn;
   xu = ABS (xn);

   ws = *wn;
   if (*wn == 0)
   {
      /* nothing to add to, just set x*y, "sign" gives the sign */
      cy = mpn_mul_1 (wp, xp, xu, y);
      if (cy)
      {
         wp[xu] = cy;
         xu = xu + 1;
      }
      *wn = (sign >= 0 ? xu : -xu);
      return;
   }

   /* sign >= 0 iff w and x*y have the same sign (magnitudes add) */
   sign ^= *wn;
   wu = ABS (*wn);

   new_wn = MAX (wu, xu);
   min_size = MIN (wu, xu);

   if (sign >= 0)
   {
      /* addmul of absolute values */

      cy = mpn_addmul_1 (wp, xp, min_size, y);

      dsize = xu - wu;
#if HAVE_NATIVE_mpn_mul_1c
      if (dsize > 0)
         /* x longer than w: multiply its tail, folding in the carry */
         cy = mpn_mul_1c (wp + min_size, xp + min_size, dsize, y, cy);
      else if (dsize < 0)
      {
         /* w longer than x: propagate the carry through w's tail */
         dsize = -dsize;
         cy = mpn_add_1 (wp + min_size, wp + min_size, dsize, cy);
      }
#else
      if (dsize != 0)
      {
         mp_limb_t cy2;
         if (dsize > 0)
            cy2 = mpn_mul_1 (wp + min_size, xp + min_size, dsize, y);
         else
         {
            dsize = -dsize;
            cy2 = 0;
         }
         cy = cy2 + mpn_add_1 (wp + min_size, wp + min_size, dsize, cy);
      }
#endif

      if (cy)
      {
         wp[dsize + min_size] = cy;
         new_wn ++;
      }
   } else
   {
      /* submul of absolute values */

      cy = mpn_submul_1 (wp, xp, min_size, y);
      if (wu >= xu)
      {
         /* if w bigger than x, then propagate borrow through it */
         if (wu != xu)
            cy = mpn_sub_1 (wp + xu, wp + xu, wu - xu, cy);

         if (cy != 0)
         {
            /* Borrow out of w, take twos complement negative to get
               absolute value, flip sign of w. */
            wp[new_wn] = ~-cy;  /* extra limb is 0-cy */
            mpn_com_n (wp, wp, new_wn);
            new_wn++;
            MPN_INCR_U (wp, new_wn, CNST_LIMB(1));
            ws = -*wn;
         }
      } else /* wu < xu */
      {
         /* x bigger than w, so want x*y-w.  Submul has given w-x*y, so
            take twos complement and use an mpn_mul_1 for the rest. */

         mp_limb_t cy2;

         /* -(-cy*b^n + w-x*y) = (cy-1)*b^n + ~(w-x*y) + 1 */
         mpn_com_n (wp, wp, wu);
         cy += mpn_add_1 (wp, wp, wu, CNST_LIMB(1));
         cy -= 1;

         /* If cy-1 == -1 then hold that -1 for latter.  mpn_submul_1 never
            returns cy==MP_LIMB_T_MAX so that value always indicates a -1. */
         cy2 = (cy == MP_LIMB_T_MAX);
         cy += cy2;
         MPN_MUL_1C (cy, wp + wu, xp + wu, xu - wu, y, cy);
         wp[new_wn] = cy;
         new_wn += (cy != 0);

         /* Apply any -1 from above.  The value at wp+wsize is non-zero
            because y!=0 and the high limb of x will be non-zero. */
         if (cy2)
            MPN_DECR_U (wp+wu, new_wn - wu, CNST_LIMB(1));

         ws = -*wn;
      }

      /* submul can produce high zero limbs due to cancellation, both when w
         has more limbs or x has more */
      MPN_NORMALIZE (wp, new_wn);
   }

   *wn = (ws >= 0 ? new_wn : -new_wn);

   ASSERT (new_wn == 0 || wp[new_wn - 1] != 0);
}
|
|
|
|
/* wp <- wp - x * y: addmul with the sign of x flipped via its length. */
void tc7_submul_1(mp_ptr wp, mp_size_t * wn, mp_srcptr x, mp_size_t xn, mp_limb_t y)
{
   tc7_addmul_1(wp, wn, x, -xn, y);
}
|
|
|
|
/* Despite the name this ACCUMULATES: y <- y + x * B^offset, i.e. adds
   {xp, xn} into {yp, *yn} shifted left by `offset` whole limbs, updating
   *yn.  Used by the interpolation step to recombine evaluation products.
   NOTE(review): *yn is treated as a plain limb count here (no ABS on the
   final store in the overlap branch) — presumably callers only pass
   non-negative y; verify. */
void tc7_copy (mp_ptr yp, mp_size_t * yn, mp_size_t offset, mp_srcptr xp, mp_size_t xn)
{
   mp_size_t yu = ABS(*yn);
   mp_size_t xu = ABS(xn);
   mp_limb_t cy = 0;

   if (xn == 0)
      return;

   if (offset < yu) /* low part of x overlaps with y */
   {
      if (offset + xu <= yu) /* x entirely inside y */
      {
         cy = mpn_add_n (yp + offset, yp + offset, xp, xu);
         if (offset + xu < yu)
            cy = mpn_add_1 (yp + offset + xu, yp + offset + xu,
                            yu - (offset + xu), cy);
      } else
         cy = mpn_add_n (yp + offset, yp + offset, xp, yu - offset);
      /* now cy is the carry at yp + yu */
      if (xu + offset > yu) /* high part of x exceeds y */
      {
         MPN_COPY (yp + yu, xp + yu - offset, xu + offset - yu);
         cy = mpn_add_1 (yp + yu, yp + yu, xu + offset - yu, cy);
         yu = xu + offset;
      }
      /* now cy is the carry at yp + yn */
      if (cy)
         yp[yu++] = cy;
      MPN_NORMALIZE(yp, yu);
      *yn = yu;
   } else /* x does not overlap */
   {
      /* gap between the top of y and the offset must be zero-filled */
      if (offset > yu)
         MPN_ZERO (yp + yu, offset - yu);
      MPN_COPY (yp + offset, xp, xu);
      *yn = offset + xu;
   }
}
|
|
|
|
/* MUL_TC7_UNSIGNED(r3, n3, r1, n1, r2, n2): multiply the unsigned
   operands {r1, n1} x {r2, n2} into r3, writing the normalised product
   length to n3.  Recurses into toom7 above the threshold for balanced
   operands, otherwise falls back to mpn_mul_n / mpn_mul. */
#define MUL_TC7_UNSIGNED(r3xx, n3xx, r1xx, n1xx, r2xx, n2xx) \
	do \
	{ \
	   if ((n1xx != 0) && (n2xx != 0)) \
	   { mp_size_t len; \
	      if (n1xx == n2xx) \
		   { \
			   if (n1xx > MUL_TOOM7_THRESHOLD) mpn_toom7_mul_n(r3xx, r1xx, r2xx, n1xx); \
            else mpn_mul_n(r3xx, r1xx, r2xx, n1xx); \
		   } else if (n1xx > n2xx) \
		      mpn_mul(r3xx, r1xx, n1xx, r2xx, n2xx); \
		   else \
		      mpn_mul(r3xx, r2xx, n2xx, r1xx, n1xx); \
	      len = n1xx + n2xx; \
		   MPN_NORMALIZE(r3xx, len); \
		   n3xx = len; \
	   } else \
	      n3xx = 0; \
	} while (0)

/* Signed multiply: the product's sign is the XOR of the operand signs,
   applied to the length produced by the unsigned kernel. */
#define MUL_TC7(r3xx, n3xx, r1xx, n1xx, r2xx, n2xx) \
	do \
	{ \
	   mp_size_t sign = n1xx ^ n2xx; \
	   mp_size_t un1 = ABS(n1xx); \
	   mp_size_t un2 = ABS(n2xx); \
	   MUL_TC7_UNSIGNED(r3xx, n3xx, r1xx, un1, r2xx, un2); \
	   if (sign < 0) n3xx = -n3xx; \
	} while (0)

/* SQR_TC7_UNSIGNED(r3, n3, r1, n1): square the unsigned operand
   {r1, n1} into r3 with normalised length n3. */
#define SQR_TC7_UNSIGNED(r3xx, n3xx, r1xx, n1xx) \
	do \
	{ \
	   if (n1xx != 0) \
	   { mp_size_t len; \
	      if (n1xx > SQR_TOOM7_THRESHOLD) mpn_toom7_sqr_n(r3xx, r1xx, n1xx); \
         else mpn_sqr_n(r3xx, r1xx, n1xx); \
	      len = 2*n1xx; \
		   MPN_NORMALIZE(r3xx, len); \
		   n3xx = len; \
	   } else \
	      n3xx = 0; \
	} while (0)

/* Signed square: a square is never negative, so only the magnitude of
   the length matters. */
#define SQR_TC7(r3xx, n3xx, r1xx, n1xx) \
	do \
	{ \
	   mp_size_t un1 = ABS(n1xx); \
	   SQR_TC7_UNSIGNED(r3xx, n3xx, r1xx, un1); \
	} while (0)

/* TC7_NORM(r, n, s): set n to the normalised length of the s-limb array r
   (i.e. strip high zero limbs). */
#define TC7_NORM(rxx, nxx, sxx) \
	do \
	{ \
	   nxx = sxx; \
	   MPN_NORMALIZE(rxx, nxx); \
	} while(0)
|
|
|
|
#if TC7_TEST || TC7_TIME
|
|
#define p2(axx, anxx, bxx, bnxx) \
|
|
do \
|
|
{ \
|
|
printf("s1 = "); \
|
|
if (anxx < 0) printf("-"); \
|
|
if (anxx == 0) printf("0, "); \
|
|
else printf("%ld, ", axx[0]); \
|
|
printf("s2 = "); \
|
|
if (bnxx < 0) printf("-"); \
|
|
if (bnxx == 0) printf("0"); \
|
|
else printf("%ld\n", bxx[0]); \
|
|
} while (0)
|
|
|
|
#define p(axx, anxx) \
|
|
do \
|
|
{ \
|
|
printf("r = "); \
|
|
if (anxx < 0) printf("-"); \
|
|
if (anxx == 0) printf("0\n"); \
|
|
else printf("%ld\n", axx[0]); \
|
|
} while (0)
|
|
#endif
|
|
|
|
/* Zero out limbs past end of integer: pad {r, |n|} up to s limbs so the
   interpolation code can operate on fixed-width two's complement rows. */
#define TC7_DENORM(rxx, nxx, sxx) \
   do { \
   MPN_ZERO(rxx + ABS(nxx), sxx - ABS(nxx)); \
   } while (0)

/* Two's complement divexact by power of 2: arithmetic right shift of an
   n-limb two's complement number by sxx bits (sign bits re-inserted at
   the top after the logical shift). */
#define TC7_DIVEXACT_2EXP(rxx, nxx, sxx) \
   do { \
	   mp_limb_t sign = (LIMB_HIGHBIT_TO_MASK(rxx[nxx-1]) << (GMP_LIMB_BITS - sxx)); \
      mpn_rshift(rxx, rxx, nxx, sxx); \
	   rxx[nxx-1] |= sign; \
   } while (0)

/* TC7_RSHIFT1: arithmetic right shift by one bit of an n-limb two's
   complement number, using the native single-bit shift when available. */
#if HAVE_NATIVE_mpn_rshift1
#define TC7_RSHIFT1(rxx, nxx) \
   do { \
	   mp_limb_t sign = (LIMB_HIGHBIT_TO_MASK(rxx[nxx-1]) << (GMP_LIMB_BITS - 1)); \
      mpn_rshift1(rxx, rxx, nxx); \
	   rxx[nxx-1] |= sign; \
   } while (0)
#else
#define TC7_RSHIFT1(rxx, nxx) \
   do { \
	   mp_limb_t sign = (LIMB_HIGHBIT_TO_MASK(rxx[nxx-1]) << (GMP_LIMB_BITS - 1)); \
      mpn_rshift(rxx, rxx, nxx, 1); \
	   rxx[nxx-1] |= sign; \
   } while (0)
#endif
|
|
|
|
/* The 13 evaluation-product slots used by mul/sqr and the interpolation
   step: consecutive t7-limb rows inside the scratch buffer tp. */
#define r1 (tp)
#define r2 (tp + t7)
#define r3 (tp + 2*t7)
#define r4 (tp + 3*t7)
#define r5 (tp + 4*t7)
#define r6 (tp + 5*t7)
#define r7 (tp + 6*t7)
#define r8 (tp + 7*t7)
#define r9 (tp + 8*t7)
#define r10 (tp + 9*t7)
#define r11 (tp + 10*t7)
#define r12 (tp + 11*t7)
#define r13 (tp + 12*t7)
|
|
|
|
/* Multiply {up, n} by {vp, n} and write the result to
|
|
{prodp, 2n}.
|
|
|
|
Note that prodp gets 2n limbs stored, even if the actual result
|
|
only needs 2n - 1.
|
|
*/
|
|
|
|
/* Toom-Cook 7-way multiplication: rp{2n} <- up{n} * vp{n}.  Each input is
   split into 7 pieces of sn = ceil(n/7) limbs; the 13 evaluation products
   needed by the Bodrato-Zanoni 13-point scheme are computed (recursively)
   and then recombined by toom7_interpolate.  rp always receives 2n limbs. */
void
mpn_toom7_mul_n (mp_ptr rp, mp_srcptr up,
		          mp_srcptr vp, mp_size_t n)
{
   mp_size_t len1, len2;
   mp_limb_t cy;
   mp_ptr tp;
   mp_size_t a0n, a1n, a2n, a3n, a4n, a5n, a6n;
   mp_size_t b0n, b1n, b2n, b3n, b4n, b5n, b6n;
   mp_size_t sn, n1, n2, n3, n4, n5, n6, n7, n8, n9, n10, n11, n12, n13, rpn, t7;

   /* NOTE(review): len1/len2 are normalised but never used afterwards */
   len1 = n;
   len2 = n;
   ASSERT (n >= 1);

   MPN_NORMALIZE(up, len1);
   MPN_NORMALIZE(vp, len2);

   /* piece size: each operand is viewed as 7 chunks of sn limbs (the top
      chunk may be shorter) */
   sn = (n - 1) / 7 + 1;

#define a0 (up)
#define a1 (up + sn)
#define a2 (up + 2*sn)
#define a3 (up + 3*sn)
#define a4 (up + 4*sn)
#define a5 (up + 5*sn)
#define a6 (up + 6*sn)
#define b0 (vp)
#define b1 (vp + sn)
#define b2 (vp + 2*sn)
#define b3 (vp + 3*sn)
#define b4 (vp + 4*sn)
#define b5 (vp + 5*sn)
#define b6 (vp + 6*sn)

   /* normalised lengths of the 7 + 7 pieces */
   TC7_NORM(a0, a0n, sn);
   TC7_NORM(a1, a1n, sn);
   TC7_NORM(a2, a2n, sn);
   TC7_NORM(a3, a3n, sn);
   TC7_NORM(a4, a4n, sn);
   TC7_NORM(a5, a5n, sn);
   TC7_NORM(a6, a6n, n - 6*sn);
   TC7_NORM(b0, b0n, sn);
   TC7_NORM(b1, b1n, sn);
   TC7_NORM(b2, b2n, sn);
   TC7_NORM(b3, b3n, sn);
   TC7_NORM(b4, b4n, sn);
   TC7_NORM(b5, b5n, sn);
   TC7_NORM(b6, b6n, n - 6*sn);

   t7 = 2*sn+2; // allows mult of 2 integers of sn + 1 limbs

   /* 13 product rows of t7 limbs plus 11 temporaries of sn+1 limbs */
   tp = __GMP_ALLOCATE_FUNC_LIMBS(13*t7 + 11*(sn+1));

#define u2 (tp + 13*t7)
#define u3 (tp + 13*t7 + (sn+1))
#define u4 (tp + 13*t7 + 2*(sn+1))
#define u5 (tp + 13*t7 + 3*(sn+1))
#define u6 (tp + 13*t7 + 4*(sn+1))
#define u7 (tp + 13*t7 + 5*(sn+1))
#define u8 (tp + 13*t7 + 6*(sn+1))
#define u9 (tp + 13*t7 + 7*(sn+1))
#define u10 (tp + 13*t7 + 8*(sn+1))
#define u11 (tp + 13*t7 + 9*(sn+1))
#define u12 (tp + 13*t7 + 10*(sn+1))

   /* Evaluation phase (Bodrato/Zanoni auto-generated sequence): build the
      operand evaluations at the 13 interpolation points with shifts,
      adds and single-limb addmuls, then multiply matching pairs.
      The exact statement order matters — temporaries are reused. */

   tc7_lshift (r6, &n6, a2, a2n, 4);
   tc7_lshift (r7, &n7, b2, b2n, 4);

   tc7_lshift (u10, &n10, a4, a4n, 4);
   tc7_lshift (u11, &n11, b4, b4n, 4);

   tc7_lshift (u5, &n5, a3, a3n, 3);
   tc7_lshift (u2, &n2, b3, b3n, 3);

   /* evaluations at +/- 1/2 (scaled by 2^6) for the a-pieces */
   tc7_lshift (r1, &n1, a1, a1n, 5);
   tc7_add (r1, &n1, r1, n1, u5, n5);
   tc7_addmul_1 (r1, &n1, a5, a5n, 2);
   tc7_add (u3, &n3, r6, n6, a6, a6n);
   tc7_addmul_1 (u3, &n3, a0, a0n, 64);
   tc7_addmul_1 (u3, &n3, a4, a4n, 4);

   tc7_sub (r13, &n13, r1, n1, u3, n3);
   tc7_add (u3, &n3, u3, n3, r1, n1);

   /* same for the b-pieces */
   tc7_lshift (r1, &n1, b1, b1n, 5);
   tc7_add (r1, &n1, r1, n1, u2, n2);
   tc7_addmul_1 (r1, &n1, b5, b5n, 2);
   tc7_add (u8, &n8, r7, n7, b6, b6n);
   tc7_addmul_1 (u8, &n8, b0, b0n, 64);
   tc7_addmul_1 (u8, &n8, b4, b4n, 4);

   tc7_add (r12, &n12, r1, n1, u8, n8);
   tc7_sub (u8, &n8, r1, n1, u8, n8);

   MUL_TC7 (r3, n3, u3, n3, r12, n12);
   MUL_TC7 (r8, n8, u8, n8, r13, n13);

   /* evaluations at +/- 2 */
   tc7_add (r1, &n1, u10, n10, a0, a0n);
   tc7_addmul_1 (r1, &n1, a6, a6n,64);
   tc7_addmul_1 (r1, &n1, a2, a2n, 4);
   tc7_addmul_1 (u5, &n5, a1, a1n, 2);
   tc7_addmul_1 (u5, &n5, a5, a5n, 32);

   tc7_sub (r13, &n13, r1, n1, u5, n5);
   tc7_add (u5, &n5, u5, n5, r1, n1);

   tc7_add (r1, &n1, u11, n11, b0, b0n);
   tc7_addmul_1 (r1, &n1, b6, b6n,64);
   tc7_addmul_1 (r1, &n1, b2, b2n, 4);
   tc7_addmul_1 (u2, &n2, b1, b1n, 2);
   tc7_addmul_1 (u2, &n2, b5, b5n, 32);

   tc7_add (r12, &n12, u2, n2, r1, n1);
   tc7_sub (u2, &n2, r1, n1, u2, n2);

   MUL_TC7 (r5, n5, u5, n5, r12, n12);
   MUL_TC7 (r2, n2, u2, n2, r13, n13);

   /* evaluations at +/- 4 */
   tc7_add (r1, &n1, r6, n6, a0, a0n);
   tc7_addmul_1 (r1, &n1, u10, n10, 16);
   tc7_addmul_1 (r1, &n1, a6, a6n, 4096);
   tc7_add (u10, &n10, u10, n10, a6, a6n);
   tc7_addmul_1 (u10, &n10, r6, n6, 16);
   tc7_addmul_1 (u10, &n10, a0, a0n, 4096);
   tc7_lshift (u6, &n6, a3, a3n, 4);
   tc7_add (u4, &n4, u6, n6, a1, a1n);
   tc7_addmul_1 (u4, &n4, a5, a5n, 256);
   tc7_lshift (u4, &n4, u4, n4, 2);
   tc7_add (u6, &n6, u6, n6, a5, a5n);
   tc7_addmul_1 (u6, &n6, a1, a1n, 256);
   tc7_lshift (u6, &n6, u6, n6, 2);

   tc7_sub (u9, &n9, u4, n4, r1, n1);
   tc7_add (u4, &n4, u4, n4, r1, n1);

   tc7_add (r1, &n1, r7, n7, b0, b0n);
   tc7_addmul_1 (r1, &n1, u11, n11, 16);
   tc7_addmul_1 (r1, &n1, b6, b6n, 4096);
   tc7_add (u11, &n11, u11, n11, b6, b6n);
   tc7_addmul_1 (u11, &n11, r7, n7, 16);
   tc7_addmul_1 (u11, &n11, b0, b0n, 4096);
   tc7_lshift (r7, &n7, b3, b3n, 4);
   tc7_add (r13, &n13, r7, n7, b1, b1n);
   tc7_addmul_1 (r13, &n13, b5, b5n, 256);
   tc7_lshift (r13, &n13, r13, n13, 2);
   tc7_add (r7, &n7, r7, n7, b5, b5n);
   tc7_addmul_1 (r7, &n7, b1, b1n, 256);
   tc7_lshift (r7, &n7, r7, n7, 2);

   tc7_sub (r12, &n12, r13, n13, r1, n1);
   tc7_add (r13, &n13, r13, n13, r1, n1);

   MUL_TC7 (r9, n9, u9, n9, r12, n12);
   MUL_TC7 (r4, n4, u4, n4, r13, n13);

   /* evaluations at +/- 1/4 (scaled) */
   tc7_sub (r12, &n12, u10, n10, u6, n6);
   tc7_add (u10, &n10, u10, n10, u6, n6);

   tc7_add (r13, &n13, u11, n11, r7, n7);
   tc7_sub (u11, &n11, u11, n11, r7, n7);

   MUL_TC7 (r10, n10, u10, n10, r13, n13);
   MUL_TC7 (r11, n11, u11, n11, r12, n12);

   /* evaluations at +/- 1 (even/odd piece sums) */
   tc7_add (u7, &n7, a3, a3n, a1, a1n);
   tc7_add (u6, &n6, a2, a2n, a0, a0n);
   tc7_add (u6, &n6, u6, n6, a4, a4n);
   tc7_add (u6, &n6, u6, n6, a6, a6n);
   tc7_add (u7, &n7, u7, n7, a5, a5n);
   tc7_add (r1, &n1, u7, n7, u6, n6);
   tc7_sub (u6, &n6, u6, n6, u7, n7);
   tc7_add (u7, &n7, b3, b3n, b1, b1n);
   tc7_add (r13, &n13, b2, b2n, b0, b0n);
   tc7_add (r13, &n13, r13, n13, b4, b4n);
   tc7_add (r13, &n13, r13, n13, b6, b6n);
   tc7_add (u7, &n7, u7, n7, b5, b5n);
   tc7_sub (r12, &n12, r13, n13, u7, n7);
   tc7_add (u7, &n7, u7, n7, r13, n13);

   MUL_TC7 (r7, n7, u7, n7, r1, n1);
   MUL_TC7 (r6, n6, u6, n6, r12, n12);

   /* evaluation at 3 (powers of 3 as coefficients) */
   tc7_mul_1 (u12, &n12, b6, b6n, 729);
   tc7_addmul_1 (u12, &n12, b5, b5n, 243);
   tc7_addmul_1 (u12, &n12, b4, b4n, 81);
   tc7_addmul_1 (u12, &n12, b3, b3n, 27);
   tc7_addmul_1 (u12, &n12, b2, b2n, 9);
   tc7_addmul_1 (u12, &n12, b1, b1n, 3);
   tc7_add (u12, &n12, u12, n12, b0, b0n);
   tc7_mul_1 (r13, &n13, a6, a6n, 729);
   tc7_addmul_1 (r13, &n13, a5, a5n, 243);
   tc7_addmul_1 (r13, &n13, a4, a4n, 81);
   tc7_addmul_1 (r13, &n13, a3, a3n, 27);
   tc7_addmul_1 (r13, &n13, a2, a2n, 9);
   tc7_addmul_1 (r13, &n13, a1, a1n, 3);
   tc7_add (r13, &n13, r13, n13, a0, a0n);

   MUL_TC7 (r12, n12, u12, n12, r13, n13);

   /* evaluations at infinity (top pieces) and 0 (bottom pieces) */
   MUL_TC7 (r1, n1, a6, a6n, b6, b6n);

   MUL_TC7 (r13, n13, a0, a0n, b0, b0n);

   /* pad every product row to the fixed width t7 for interpolation */
   TC7_DENORM(r1, n1, t7);
   TC7_DENORM(r2, n2, t7);
   TC7_DENORM(r3, n3, t7);
   TC7_DENORM(r4, n4, t7);
   TC7_DENORM(r5, n5, t7);
   TC7_DENORM(r6, n6, t7);
   TC7_DENORM(r7, n7, t7);
   TC7_DENORM(r8, n8, t7);
   TC7_DENORM(r9, n9, t7);
   TC7_DENORM(r10, n10, t7);
   TC7_DENORM(r11, n11, t7);
   TC7_DENORM(r12, n12, t7);
   TC7_DENORM(r13, n13, t7);

   /* recover the 13 polynomial coefficients and recombine at B^sn */
   toom7_interpolate(rp, &rpn, sn, tp, t7 - 1, n2, n6, n8, n9, n11);

   /* caller expects exactly 2n limbs in rp */
   if (rpn != 2*n)
   {
	   MPN_ZERO((rp + rpn), 2*n - rpn);
   }

   __GMP_FREE_FUNC_LIMBS (tp, 13*t7 + 11*(sn+1));
}
|
|
|
|
/* Square {up, n} and write the result to {prodp, 2n}.
|
|
|
|
Note that prodp gets 2n limbs stored, even if the actual result
|
|
only needs 2n - 1.
|
|
*/
|
|
|
|
/* Toom-Cook 7-way squaring: rp{2n} <- up{n}^2.  Same evaluation points
   and interpolation as mpn_toom7_mul_n, but each pairwise product
   collapses to a square since both operands are the same.  Relies on the
   a0..a6 and u2..u12 macros defined above for mpn_toom7_mul_n. */
void
mpn_toom7_sqr_n (mp_ptr rp, mp_srcptr up, mp_size_t n)
{
   /* NOTE(review): len2 and cy are declared but unused here */
   mp_size_t len1, len2;
   mp_limb_t cy;
   mp_ptr tp;
   mp_size_t a0n, a1n, a2n, a3n, a4n, a5n, a6n;
   mp_size_t sn, n1, n2, n3, n4, n5, n6, n7, n8, n9, n10, n11, n12, n13, n14, n15, rpn, t7;

   len1 = n;
   ASSERT (n >= 1);

   MPN_NORMALIZE(up, len1);

   /* piece size for the 7-way split */
   sn = (n - 1) / 7 + 1;

   TC7_NORM(a0, a0n, sn);
   TC7_NORM(a1, a1n, sn);
   TC7_NORM(a2, a2n, sn);
   TC7_NORM(a3, a3n, sn);
   TC7_NORM(a4, a4n, sn);
   TC7_NORM(a5, a5n, sn);
   TC7_NORM(a6, a6n, n - 6*sn);

   t7 = 2*sn+2; // allows mult of 2 integers of sn + 1 limbs

   /* only 7 temporaries needed (one operand), vs 11 in the mul case */
   tp = __GMP_ALLOCATE_FUNC_LIMBS(13*t7 + 7*(sn+1));

   /* Evaluation phase — same sequence as the a-side of toom7_mul_n;
      n14/n15 shadow the lengths of the u2/u8 temporaries. */

   tc7_lshift (r6, &n6, a2, a2n, 4);

   tc7_lshift (u8, &n15, a4, a4n, 4);

   tc7_lshift (u5, &n5, a3, a3n, 3);

   /* +/- 1/2 (scaled) */
   tc7_lshift (r1, &n1, a1, a1n, 5);
   tc7_add (r1, &n1, r1, n1, u5, n5);
   tc7_addmul_1 (r1, &n1, a5, a5n, 2);
   tc7_add (u3, &n3, r6, n6, a6, a6n);
   tc7_addmul_1 (u3, &n3, a0, a0n, 64);
   tc7_addmul_1 (u3, &n3, a4, a4n, 4);

   tc7_sub (r13, &n13, r1, n1, u3, n3);
   tc7_add (u3, &n3, u3, n3, r1, n1);

   SQR_TC7 (r3, n3, u3, n3);
   SQR_TC7 (r8, n8, r13, n13);

   /* +/- 2 */
   tc7_add (r1, &n1, u8, n15, a0, a0n);
   tc7_addmul_1 (r1, &n1, a6, a6n,64);
   tc7_addmul_1 (r1, &n1, a2, a2n, 4);
   tc7_addmul_1 (u5, &n5, a1, a1n, 2);
   tc7_addmul_1 (u5, &n5, a5, a5n, 32);

   tc7_sub (r13, &n13, r1, n1, u5, n5);
   tc7_add (u5, &n5, u5, n5, r1, n1);

   SQR_TC7 (r5, n5, u5, n5);
   SQR_TC7 (r2, n2, r13, n13);

   /* +/- 4 */
   tc7_add (r1, &n1, r6, n6, a0, a0n);
   tc7_addmul_1 (r1, &n1, u8, n15, 16);
   tc7_addmul_1 (r1, &n1, a6, a6n, 4096);
   tc7_add (u8, &n15, u8, n15, a6, a6n);
   tc7_addmul_1 (u8, &n15, r6, n6, 16);
   tc7_addmul_1 (u8, &n15, a0, a0n, 4096);
   tc7_lshift (u6, &n6, a3, a3n, 4);
   tc7_add (u4, &n4, u6, n6, a1, a1n);
   tc7_addmul_1 (u4, &n4, a5, a5n, 256);
   tc7_lshift (u4, &n4, u4, n4, 2);
   tc7_add (u6, &n6, u6, n6, a5, a5n);
   tc7_addmul_1 (u6, &n6, a1, a1n, 256);
   tc7_lshift (u6, &n6, u6, n6, 2);

   tc7_sub (u2, &n14, u4, n4, r1, n1);
   tc7_add (u4, &n4, u4, n4, r1, n1);

   SQR_TC7 (r9, n9, u2, n14);
   SQR_TC7 (r4, n4, u4, n4);

   /* +/- 1/4 (scaled) */
   tc7_sub (r12, &n12, u8, n15, u6, n6);
   tc7_add (u8, &n15, u8, n15, u6, n6);

   SQR_TC7 (r10, n10, u8, n15);
   SQR_TC7 (r11, n11, r12, n12);

   /* +/- 1 */
   tc7_add (u7, &n7, a3, a3n, a1, a1n);
   tc7_add (u6, &n6, a2, a2n, a0, a0n);
   tc7_add (u6, &n6, u6, n6, a4, a4n);
   tc7_add (u6, &n6, u6, n6, a6, a6n);
   tc7_add (u7, &n7, u7, n7, a5, a5n);
   tc7_add (r1, &n1, u7, n7, u6, n6);
   tc7_sub (u6, &n6, u6, n6, u7, n7);

   SQR_TC7 (r7, n7, r1, n1);
   SQR_TC7 (r6, n6, u6, n6);

   /* 3 */
   tc7_mul_1 (r13, &n13, a6, a6n, 729);
   tc7_addmul_1 (r13, &n13, a5, a5n, 243);
   tc7_addmul_1 (r13, &n13, a4, a4n, 81);
   tc7_addmul_1 (r13, &n13, a3, a3n, 27);
   tc7_addmul_1 (r13, &n13, a2, a2n, 9);
   tc7_addmul_1 (r13, &n13, a1, a1n, 3);
   tc7_add (r13, &n13, r13, n13, a0, a0n);

   SQR_TC7 (r12, n12, r13, n13);

   /* infinity and 0 */
   SQR_TC7 (r1, n1, a6, a6n);

   SQR_TC7 (r13, n13, a0, a0n);

   /* pad rows to width t7 for the interpolation */
   TC7_DENORM(r1, n1, t7);
   TC7_DENORM(r2, n2, t7);
   TC7_DENORM(r3, n3, t7);
   TC7_DENORM(r4, n4, t7);
   TC7_DENORM(r5, n5, t7);
   TC7_DENORM(r6, n6, t7);
   TC7_DENORM(r7, n7, t7);
   TC7_DENORM(r8, n8, t7);
   TC7_DENORM(r9, n9, t7);
   TC7_DENORM(r10, n10, t7);
   TC7_DENORM(r11, n11, t7);
   TC7_DENORM(r12, n12, t7);
   TC7_DENORM(r13, n13, t7);

   toom7_interpolate(rp, &rpn, sn, tp, t7 - 1, n2, n6, n8, n9, n11);

   /* caller expects exactly 2n limbs in rp */
   if (rpn != 2*n)
   {
	   MPN_ZERO((rp + rpn), 2*n - rpn);
   }

   __GMP_FREE_FUNC_LIMBS (tp, 13*t7 + 7*(sn+1));
}
|
|
|
|
/*
|
|
Toom 7 interpolation. Interpolates the value at 2^(sn*B) of a
|
|
polynomial p(x) with 13 coefficients given the values
|
|
p(oo), p(-2), 2^12*p(1/2), p(4), p(2), p(-1), p(1),
|
|
2^12*p(-1/2), p(-4), 4^12*p(1/4), 4^12*p(-1/4), p(3), p(0)
|
|
The values are assumed to be stored in tp, each separated by
|
|
s7 + 1 limbs, each of no more than s7 limbs.
|
|
The output is placed in rp and the final number of limbs of the
|
|
output is given in rpn.
|
|
The 2nd, 6th, 8th, 9th and 11th values may be negative, and if so,
|
|
n2, n6, n8, n9 and n11 should be set to a negative value respectively.
|
|
|
|
*/
|
|
|
|
void toom7_interpolate(mp_ptr rp, mp_size_t * rpn, mp_size_t sn,
           mp_ptr tp, mp_size_t s7, mp_size_t n2, mp_size_t n6,
           mp_size_t n8, mp_size_t n9, mp_size_t n11)
{
   /* Signed limb counts of the thirteen coefficients.  n2, n6, n8, n9
      and n11 arrive as parameters because those evaluation values may
      be negative (see the comment above this function); the remaining
      counts are produced locally by TC7_NORM at the end. */
   mp_size_t n1, n3, n4, n5, n7, n10, n12, n13, t7;

   /* NOTE(review): the r1 .. r13 names below are macros (defined
      earlier in this file, outside this excerpt) that presumably
      address the thirteen value slots inside tp, spaced t7 limbs
      apart -- confirm against the macro definitions. */
   t7 = s7 + 1;

   // Marco Bodrato's auto generated sequence

   /* What follows is a fixed straight-line elimination of the 13x13
      evaluation matrix (see the Sage verification of the alternative
      sequence at the bottom of this file for the matrix layout).
      Statement order matters throughout: every step consumes values
      produced by earlier steps, and every mpn_divexact_1 divides by a
      constant which the preceding steps guarantee divides exactly. */

   /* r9 <- (r4 - r9)/2: fold a +/- evaluation pair, honouring the
      recorded sign of r9. */
   if (n9 < 0)
      mpn_add_n(r9, r4, r9, s7);
   else
      mpn_sub_n(r9, r4, r9, s7);
   /* r9 is now in 2s complement form */

   TC7_RSHIFT1(r9, s7);

   /* Remove the top coefficient (r1 = p(oo)) scaled by 2^24 from r4. */
   mpn_submul_1(r4, r1, s7, 16777216);

   mpn_sub_n(r12, r12, r5, s7);

   /* r4 <- r4 - r9 - r13, fused where a native subadd exists. */
#if HAVE_NATIVE_mpn_subadd_n
   mpn_subadd_n(r4, r4, r9, r13, s7);
#else
   mpn_sub_n(r4, r4, r9, s7);
   mpn_sub_n(r4, r4, r13, s7);
#endif

   /* r2 <- (r5 - r2)/2, honouring the sign of r2. */
   if (n2 < 0)
      mpn_add_n(r2, r5, r2, s7);
   else
      mpn_sub_n(r2, r5, r2, s7);
   /* r2 is now in 2s complement form */

   TC7_RSHIFT1(r2, s7);

   /* r5 <- r5 - r13 - r2. */
#if HAVE_NATIVE_mpn_subadd_n
   mpn_subadd_n(r5, r5, r13, r2, s7);
#else
   mpn_sub_n(r5, r5, r13, s7);
   mpn_sub_n(r5, r5, r2, s7);
#endif

   /* Remove 2^12 * p(oo) from r5. */
   mpn_submul_1(r5, r1, s7, 4096);

   /* r6 <- (r7 - r6)/2, honouring the sign of r6. */
   if (n6 < 0)
      mpn_add_n(r6, r7, r6, s7);
   else
      mpn_sub_n(r6, r7, r6, s7);
   /* r6 is now in 2s complement form */

   TC7_RSHIFT1(r6, s7);

   mpn_sub_n(r7, r7, r13, s7);

   mpn_sub_n(r12, r12, r7, s7);

   /* r7 <- r7 - r6 - r1. */
#if HAVE_NATIVE_mpn_subadd_n
   mpn_subadd_n(r7, r7, r6, r1, s7);
#else
   mpn_sub_n(r7, r7, r6, s7);
   mpn_sub_n(r7, r7, r1, s7);
#endif

   /* r8 <- (r3 - r8)/2, honouring the sign of r8. */
   if (n8 < 0)
      mpn_add_n(r8, r3, r8, s7);
   else
      mpn_sub_n(r8, r3, r8, s7);
   /* r8 is now in 2s complement form */

   TC7_RSHIFT1(r8, s7);

   /* r3 <- r3 - r8 - r1. */
#if HAVE_NATIVE_mpn_subadd_n
   mpn_subadd_n(r3, r3, r8, r1, s7);
#else
   mpn_sub_n(r3, r3, r8, s7);
   mpn_sub_n(r3, r3, r1, s7);
#endif

   mpn_add_n(r3, r5, r3, s7);

   mpn_submul_1(r3, r13, s7, 4096);

   mpn_submul_1(r3, r7, s7, 128);

   /* r11 <- (r10 - r11)/2, honouring the sign of r11. */
   if (n11 < 0)
      mpn_add_n(r11, r10, r11, s7);
   else
      mpn_sub_n(r11, r10, r11, s7);
   /* r11 is now in 2s complement form */

   TC7_RSHIFT1(r11, s7);

   /* r10 <- r10 - r11 - r1. */
#if HAVE_NATIVE_mpn_subadd_n
   mpn_subadd_n(r10, r10, r11, r1, s7);
#else
   mpn_sub_n(r10, r10, r11, s7);
   mpn_sub_n(r10, r10, r1, s7);
#endif

   mpn_add_n(r10, r4, r10, s7);

   mpn_submul_1(r10, r13, s7, 16777216);

   mpn_submul_1(r10, r7, s7, 8192);

   mpn_submul_1(r10, r3, s7, 400);

   /* Exact division: the sequence guarantees 680400 | r10 here. */
   mpn_divexact_1(r10, r10, s7, 680400);

   mpn_submul_1(r3, r10, s7, 900);

   mpn_divexact_1(r3, r3, s7, 144);

   mpn_submul_1(r5, r3, s7, 16);

   mpn_sub_n(r12, r12, r5, s7);

   mpn_submul_1(r12, r3, s7, 64);

   /* r7 <- r7 - r3 - r10. */
#if HAVE_NATIVE_mpn_subadd_n
   mpn_subadd_n(r7, r7, r3, r10, s7);
#else
   mpn_sub_n(r7, r7, r3, s7);
   mpn_sub_n(r7, r7, r10, s7);
#endif

   mpn_submul_1(r5, r7, s7, 64);

   mpn_submul_1(r4, r7, s7, 4096);

   mpn_submul_1(r4, r5, s7, 4);

   mpn_submul_1(r4, r3, s7, 256);

   mpn_submul_1(r5, r10, s7, 4);

   mpn_submul_1(r4, r5, s7, 268);

   mpn_divexact_1(r4, r4, s7, 771120);

   mpn_submul_1(r12, r5, s7, 25);

   mpn_submul_1(r12, r7, s7, 600);

   mpn_submul_1(r12, r4, s7, 31500);

   mpn_submul_1(r12, r1, s7, 527344);

   mpn_submul_1(r5, r4, s7, 1020);

   /* Divide r5 exactly by 15 (divisor of the form B^k - 1 style
      shortcut), then by 2^4; together an exact division by 240. */
   mpn_divexact_byBm1of(r5, r5, s7, CNST_LIMB(15), CNST_LIMB(~0/15));

   TC7_DIVEXACT_2EXP(r5, s7, 4);

   mpn_sub_n(r10, r10, r4, s7);

   mpn_sub_n(r3, r3, r5, s7);

   mpn_add_n(r11, r9, r11, s7);

   mpn_add_n(r8, r2, r8, s7);

   mpn_submul_1(r11, r6, s7, 17408);

   mpn_submul_1(r8, r6, s7, 160);

   mpn_submul_1(r11, r8, s7, 680);

   mpn_divexact_1(r11, r11, s7, 2891700);

   mpn_submul_1(r8, r11, s7, 1890);

   mpn_divexact_1(r8, r8, s7, 360);

   /* r6 <- r6 - r11 - r8. */
#if HAVE_NATIVE_mpn_subadd_n
   mpn_subadd_n(r6, r6, r11, r8, s7);
#else
   mpn_sub_n(r6, r6, r11, s7);
   mpn_sub_n(r6, r6, r8, s7);
#endif

   mpn_submul_1(r12, r6, s7, 210);

   mpn_submul_1(r12, r8, s7, 18);

   mpn_submul_1(r9, r6, s7, 1024);

   mpn_submul_1(r9, r8, s7, 64);

   mpn_submul_1(r9, r11, s7, 4);

   mpn_submul_1(r2, r6, s7, 32);

   mpn_submul_1(r2, r8, s7, 8);

   mpn_submul_1(r2, r11, s7, 2);

   mpn_submul_1(r9, r2, s7, 160);

   mpn_divexact_1(r9, r9, s7, 11340);

   /* Divide r2 exactly by 3 and then by 2 (exact division by 6). */
   mpn_divexact_by3(r2, r2, s7);

   TC7_RSHIFT1(r2, s7);

   mpn_sub_n(r2, r2, r9, s7);

   /* r12 <- 8*r12 (shift left by 3 bits). */
   mpn_lshift(r12, r12, s7, 3);

   mpn_submul_1(r12, r9, s7, 5649);

   /* Negate r12 in 2s complement: complement then add 1. */
   mpn_com_n(r12, r12, s7); //r12 = -r12;

   mpn_add_1(r12, r12, s7, 1);

   mpn_addmul_1(r12, r2, s7, 924);

   mpn_divexact_1(r12, r12, s7, 525525);

   mpn_submul_1(r9, r12, s7, 341);

   mpn_sub_n(r11, r11, r12, s7);

   TC7_DIVEXACT_2EXP(r9, s7, 4);

   mpn_submul_1(r2, r9, s7, 68);

   mpn_sub_n(r8, r8, r9, s7);

   TC7_DIVEXACT_2EXP(r2, s7, 4);

   mpn_sub_n(r6, r6, r2, s7);

   /* Normalise every coefficient: TC7_NORM presumably strips high
      zero limbs and records the resulting (signed) limb count --
      TODO confirm against the macro definition. */
   TC7_NORM(r1, n1, s7);
   TC7_NORM(r2, n2, s7);
   TC7_NORM(r3, n3, s7);
   TC7_NORM(r4, n4, s7);
   TC7_NORM(r5, n5, s7);
   TC7_NORM(r6, n6, s7);
   TC7_NORM(r7, n7, s7);
   TC7_NORM(r8, n8, s7);
   TC7_NORM(r9, n9, s7);
   TC7_NORM(r10, n10, s7);
   TC7_NORM(r11, n11, s7);
   TC7_NORM(r12, n12, s7);
   TC7_NORM(r13, n13, s7);

   /* Recomposition: accumulate coefficient k at offset k*sn limbs of
      rp (i.e. evaluate the interpolated polynomial at 2^(sn*B)),
      starting from the constant term r13 up to the leading term r1.
      tc7_copy maintains the running output length in *rpn. */
   *rpn = 0;
   tc7_copy(rp, rpn, 0, r13, n13);
   tc7_copy(rp, rpn, sn, r11, n11);
   tc7_copy(rp, rpn, 2*sn, r10, n10);
   tc7_copy(rp, rpn, 3*sn, r8, n8);
   tc7_copy(rp, rpn, 4*sn, r3, n3);
   tc7_copy(rp, rpn, 5*sn, r6, n6);
   tc7_copy(rp, rpn, 6*sn, r7, n7);
   tc7_copy(rp, rpn, 7*sn, r2, n2);
   tc7_copy(rp, rpn, 8*sn, r5, n5);
   tc7_copy(rp, rpn, 9*sn, r9, n9);
   tc7_copy(rp, rpn, 10*sn, r4, n4);
   tc7_copy(rp, rpn, 11*sn, r12, n12);
   tc7_copy(rp, rpn, 12*sn, r1, n1);
}
|
|
|
|
#if TC7_TEST
|
|
/* Compare mpn_toom7_mul_n against the generic mpn_mul_n on the n-limb
   operands {up, n} and {vp, n}.

   Returns 1 if all 2*n product limbs agree, 0 otherwise (reporting the
   index of the first mismatching limb).  Also returns 0 if the scratch
   products cannot be allocated.  The inputs are not modified. */
int tc7_test(mp_ptr up, mp_ptr vp, mp_size_t n)
{
   mp_limb_t * rp1 = malloc(2*n*sizeof(mp_limb_t));
   mp_limb_t * rp2 = malloc(2*n*sizeof(mp_limb_t));
   mp_size_t i;
   int res = 1;

   /* Don't dereference NULL on allocation failure; free(NULL) is a no-op. */
   if ((rp1 == NULL) || (rp2 == NULL))
   {
      printf("Unable to allocate memory for test products\n");
      free(rp1);
      free(rp2);
      return 0;
   }

   mpn_mul_n(rp1, up, vp, n);
   mpn_toom7_mul_n(rp2, up, vp, n);

   for (i = 0; i < 2*n; i++)
   {
      if (rp1[i] != rp2[i])
      {
         /* cast: mp_size_t may be wider than int, so %d would be UB */
         printf("First error in limb %ld\n", (long) i);
         res = 0;
         break;
      }
   }

   free(rp1);
   free(rp2);
   return res;
}
|
|
|
|
/* Deterministic pseudorandom size generator for the test driver.

   Uses a 64 bit linear congruential generator with modulus 4294967311
   (a prime just above 2^32), so the sequence is reproducible between
   runs.  With limit == 0 the raw state is returned; otherwise a value
   in [0, limit) is returned. */
mp_size_t randsize(mp_size_t limit)
{
   static uint64_t randval = 4035456057U;
   randval = ((uint64_t)randval*(uint64_t)1025416097U+(uint64_t)286824430U)%(uint64_t)4294967311U;

   if (limit == 0L) return (mp_size_t) randval;

   /* Reduce in 64 bits BEFORE narrowing: randval can exceed the range
      of a 32 bit signed mp_size_t, and casting first (the old
      behaviour) could make the modulo operate on a negative value and
      return a negative "size". */
   return (mp_size_t) (randval % (uint64_t) limit);
}
|
|
|
|
int main(void)
|
|
{
|
|
mp_limb_t * up = malloc(20000*sizeof(mp_limb_t));
|
|
mp_limb_t * vp = malloc(20000*sizeof(mp_limb_t));
|
|
gmp_randstate_t rands;
|
|
gmp_randinit_default(rands);
|
|
|
|
mp_size_t i, n;
|
|
for (i = 0; i < 20000; i++)
|
|
{
|
|
n = randsize(15000) + 500;
|
|
printf("n = %d\n", n);
|
|
mpn_rrandom(up, rands, n);
|
|
mpn_rrandom(vp, rands, n);
|
|
if (!tc7_test(up, vp, n)) break;
|
|
}
|
|
|
|
free(up);
|
|
free(vp);
|
|
|
|
return 0;
|
|
}
|
|
#endif
|
|
|
|
#if TC7_TIME
|
|
int main(void)
|
|
{
|
|
mp_limb_t * up = malloc(40096*sizeof(mp_limb_t));
|
|
mp_limb_t * vp = malloc(40096*sizeof(mp_limb_t));
|
|
mp_limb_t * rp = malloc(80192*sizeof(mp_limb_t));
|
|
|
|
mp_size_t i, n;
|
|
gmp_randstate_t rands;
|
|
gmp_randinit_default(rands);
|
|
n = 2048;
|
|
mpn_randomb(up, rands, n);
|
|
mpn_randomb(vp, rands, n);
|
|
for (i = 0; i < 50000; i++)
|
|
{
|
|
if ((i & 31) == 0)
|
|
{
|
|
mpn_randomb(up, rands, n);
|
|
mpn_randomb(vp, rands, n);
|
|
}
|
|
//mpn_mul_n(rp, up, vp, n);
|
|
mpn_toom7_mul_n(rp, up, vp, n);
|
|
}
|
|
|
|
free(up);
|
|
free(vp);
|
|
free(rp);
|
|
|
|
return 0;
|
|
}
|
|
#endif
|
|
|
|
/* Bill Hart's hand derived interpolation sequence (slightly slower than Bodrato's)
|
|
|
|
The following code verifies this sequence in Sage:
|
|
|
|
A=Matrix(ZZ,13)
|
|
A[0]=[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
|
|
A[1]=[16777216, 4194304, 1048576, 262144, 65536, 16384, 4096, 1024, 256, 64, 16, 4, 1]
|
|
A[2]=[16777216, -4194304, 1048576, -262144, 65536, -16384, 4096, -1024, 256, -64, 16, -4, 1]
|
|
A[3]=[531441, 177147, 59049, 19683, 6561, 2187, 729, 243, 81, 27, 9, 3, 1]
|
|
A[4]=[4096, 2048, 1024, 512, 256, 128, 64, 32, 16, 8, 4, 2, 1]
|
|
A[5]=[4096, -2048, 1024, -512, 256, -128, 64, -32, 16, -8, 4, -2, 1]
|
|
A[6]=[1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216]
|
|
A[7]=[1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096]
|
|
A[8]=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
|
|
A[9]=[1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1]
|
|
A[10]=[1, -2, 4, -8, 16, -32, 64, -128, 256, -512, 1024, -2048, 4096]
|
|
A[11]=[1, -4, 16, -64, 256, -1024, 4096, -16384, 65536, -262144, 1048576, -4194304, 16777216]
|
|
A[12]=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
|
|
|
|
A[1]=A[1]-A[2]
|
|
A[1]=A[1]/8
|
|
A[2]=A[2]-16777216*A[0]
|
|
A[4]=A[4]-A[5]
|
|
A[5]=A[5]-4096*A[0]
|
|
|
|
A[3]=A[3]-531441*A[0]
|
|
A[6]=A[6]-A[11]
|
|
A[11]=A[11]-16777216*A[12]
|
|
A[10]=A[10]-A[7]
|
|
|
|
A[7]=A[7]-4096*A[12]
|
|
A[8]=A[8]-A[9]
|
|
A[8]=A[8]/2
|
|
A[10]=A[10]/4
|
|
A[6]=A[6]+16384*A[10]
|
|
A[2]=A[2]+4*A[1]
|
|
|
|
A[6]=A[6]/8
|
|
A[11]=A[11]-4*A[6]
|
|
A[6]=A[6]-1024*A[10]
|
|
A[7]=A[7]+2*A[10]
|
|
A[11]=A[11]-1024*A[7]
|
|
A[9]=A[9]+A[8]
|
|
|
|
A[4]=A[4]+2*A[5]
|
|
A[4]=A[4]/2
|
|
A[5]=A[5]-A[4]
|
|
A[1]=A[1]+512*A[5]
|
|
A[5]=A[5]+2048*A[8]
|
|
A[10]=A[10]+A[8]
|
|
A[5]=A[5]+2*A[10]
|
|
A[5]=A[5]/90
|
|
A[11]=A[11]+1023*A[0]
|
|
|
|
A[7]=A[7]-A[0]
|
|
A[1]=A[1]-A[10]
|
|
A[1]=A[1]+11565*A[5]
|
|
|
|
A[9]=A[9]-A[0]
|
|
A[9]=A[9]-A[12]
|
|
A[4]=A[4]-A[12]
|
|
A[2]=A[2]-1024*A[4]
|
|
A[2]=A[2]-A[12]
|
|
|
|
A[10]=A[10]+1023*A[8]
|
|
A[11]=A[11]-8*A[10]
|
|
A[6]=A[6]+A[10]
|
|
A[6]=A[6]+11520*A[5]
|
|
A[6]=A[6]/11340
|
|
A[5]=A[5]-A[6]
|
|
A[11]=A[11]-92160*A[5]
|
|
A[1]=A[1]-11340*A[5]
|
|
A[1]=A[1]/170100
|
|
A[7]=A[7]-4*A[9]
|
|
|
|
A[5]=A[5]-A[1]
|
|
A[8]=A[8]-A[1]
|
|
A[10]=A[10]+15*A[5]
|
|
A[10]=A[10]-1023*A[8]
|
|
A[10]=A[10]+3*A[6]
|
|
A[10]=A[10]-1068*A[1]
|
|
|
|
A[2]=A[2]+4*A[7]
|
|
A[2]=A[2]/720
|
|
|
|
A[4]=A[4]-4*A[9]
|
|
A[11]=A[11]+4*A[4]
|
|
A[11]=A[11]-9360*A[2]
|
|
A[11]=A[11]-1440*A[6]
|
|
|
|
A[7]=A[7]-1020*A[9]
|
|
A[7]=A[7]+A[4]
|
|
A[7]=A[7]-36*A[2]
|
|
A[11]=A[11]-280*A[7]
|
|
A[11]=A[11]/129600
|
|
|
|
A[7]=A[7]/432
|
|
A[4]=A[4]-12*A[7]
|
|
A[2]=A[2]+13*A[7]
|
|
A[2]=A[2]+20*A[11]
|
|
A[2]=A[2]/21
|
|
A[2]=-A[2]
|
|
|
|
A[7]=A[7]-5*A[11]
|
|
A[7]=A[7]/21
|
|
|
|
A[9]=A[9]-A[11]
|
|
A[9]=A[9]-A[7]
|
|
A[6]=A[6]-16*A[1]
|
|
|
|
A[5]=A[5]-17*A[8]
|
|
A[5]=A[5]+A[6]
|
|
A[5]=A[5]-4*A[1]
|
|
|
|
A[4]=A[4]-12*A[2]
|
|
A[4]=A[4]/1020
|
|
|
|
A[9]=A[9]-A[2]
|
|
A[5]=A[5]/17
|
|
A[5]=-A[5]
|
|
A[8]=A[8]-A[5]
|
|
A[9]=A[9]-A[4]
|
|
|
|
A[3]=A[3]-A[12]
|
|
A[3]=A[3]-3*A[5]
|
|
A[3]=A[3]-9*A[9]
|
|
A[3]=A[3]-27*A[8]
|
|
A[3]=A[3]-81*A[2]
|
|
A[3]=A[3]-243*A[1]
|
|
A[3]=A[3]-729*A[11]
|
|
A[3]=A[3]-486*A[6]
|
|
A[3]=A[3]-6561*A[7]
|
|
A[10]=A[10]+1023*A[5]
|
|
A[3]=A[3]-59049*A[4]
|
|
A[3]=A[3]*31-A[10]*5368
|
|
A[3]=A[3]/95550
|
|
A[6]=A[6]-17*A[3]
|
|
A[10]=A[10]-48*A[3]
|
|
A[10]=A[10]/1023
|
|
A[6]=A[6]/4
|
|
A[1]=A[1]-A[6]
|
|
A[8]=A[8]-A[3]
|
|
A[5]=A[5]-A[10]
|
|
|
|
tc7_sub(r4, &n4, r4, n4, r9, n9);
|
|
tc7_rshift_inplace(r4, &n4, 3);
|
|
tc7_submul_1(r9, &n9, r1, n1, 16777216);
|
|
tc7_sub(r5, &n5, r5, n5, r2, n2);
|
|
tc7_submul_1(r2, &n2, r1, n1, 4096);
|
|
|
|
tc7_submul_1(r12, &n12, r1, n1, 531441);
|
|
tc7_sub(r10, &n10, r10, n10, r11, n11);
|
|
tc7_submul_1(r11, &n11, r13, n13, 16777216);
|
|
tc7_sub(r8, &n8, r8, n8, r3, n3);
|
|
|
|
tc7_submul_1(r3, &n3, r13, n13, 4096);
|
|
tc7_sub(r7, &n7, r7, n7, r6, n6);
|
|
tc7_rshift_inplace(r7, &n7, 1);
|
|
tc7_rshift_inplace(r8, &n8, 2);
|
|
tc7_addmul_1(r10, &n10, r8, n8, 16384);
|
|
tc7_addmul_1(r9, &n9, r4, n4, 4);
|
|
|
|
tc7_rshift_inplace(r10, &n10, 3);
|
|
tc7_submul_1(r11, &n11, r10, n10, 4);
|
|
tc7_submul_1(r10, &n10, r8, n8, 1024);
|
|
tc7_addmul_1(r3, &n3, r8, n8, 2);
|
|
tc7_submul_1(r11, &n11, r3, n3, 1024);
|
|
tc7_add(r6, &n6, r6, n6, r7, n7);
|
|
|
|
tc7_addmul_1(r5, &n5, r2, n2, 2);
|
|
tc7_rshift_inplace(r5, &n5, 1);
|
|
tc7_sub(r2, &n2, r2, n2, r5, n5);
|
|
tc7_addmul_1(r4, &n4, r2, n2, 512);
|
|
tc7_addmul_1(r2, &n2, r7, n7, 2048);
|
|
tc7_add(r8, &n8, r8, n8, r7, n7);
|
|
tc7_addmul_1(r2, &n2, r8, n8, 2);
|
|
tc7_divexact_1(r2, &n2, r2, n2, 90);
|
|
tc7_addmul_1(r11, &n11, r1, n1, 1023);
|
|
|
|
tc7_sub(r3, &n3, r3, n3, r1, n1);
|
|
tc7_sub(r4, &n4, r4, n4, r8, n8);
|
|
tc7_addmul_1(r4, &n4, r2, n2, 11565);
|
|
|
|
tc7_sub(r6, &n6, r6, n6, r1, n1);
|
|
tc7_sub(r6, &n6, r6, n6, r13, n13);
|
|
tc7_sub(r5, &n5, r5, n5, r13, n13);
|
|
tc7_submul_1(r9, &n9, r5, n5, 1024);
|
|
tc7_sub(r9, &n9, r9, n9, r13, n13);
|
|
|
|
tc7_addmul_1(r8, &n8, r7, n7, 1023);
|
|
tc7_submul_1(r11, &n11, r8, n8, 8);
|
|
tc7_add(r10, &n10, r10, n10, r8, n8);
|
|
tc7_addmul_1(r10, &n10, r2, n2, 11520);
|
|
tc7_divexact_1(r10, &n10, r10, n10, 11340);
|
|
tc7_sub(r2, &n2, r2, n2, r10, n10);
|
|
tc7_submul_1(r11, &n11, r2, n2, 92160);
|
|
tc7_submul_1(r4, &n4, r2, n2, 11340);
|
|
tc7_divexact_1(r4, &n4, r4, n4, 170100);
|
|
tc7_submul_1(r3, &n3, r6, n6, 4);
|
|
|
|
tc7_sub(r2, &n2, r2, n2, r4, n4);
|
|
tc7_sub(r7, &n7, r7, n7, r4, n4);
|
|
tc7_addmul_1(r8, &n8, r2, n2, 15);
|
|
tc7_submul_1(r8, &n8, r7, n7, 1023);
|
|
tc7_addmul_1(r8, &n8, r10, n10, 3);
|
|
tc7_submul_1(r8, &n8, r4, n4, 1068);
|
|
|
|
tc7_addmul_1(r9, &n9, r3, n3, 4);
|
|
tc7_divexact_1(r9, &n9, r9, n9, 720);
|
|
|
|
tc7_submul_1(r5, &n5, r6, n6, 4);
|
|
tc7_addmul_1(r11, &n11, r5, n5, 4);
|
|
tc7_submul_1(r11, &n11, r9, n9, 9360);
|
|
tc7_submul_1(r11, &n11, r10, n10, 1440);
|
|
|
|
tc7_submul_1(r3, &n3, r6, n6, 1020);
|
|
tc7_add(r3, &n3, r3, n3, r5, n5);
|
|
tc7_submul_1(r3, &n3, r9, n9, 36);
|
|
tc7_submul_1(r11, &n11, r3, n3, 280);
|
|
tc7_divexact_1(r11, &n11, r11, n11, 129600);
|
|
|
|
tc7_divexact_1(r3, &n3, r3, n3, 432);
|
|
tc7_submul_1(r5, &n5, r3, n3, 12);
|
|
tc7_addmul_1(r9, &n9, r3, n3, 13);
|
|
tc7_addmul_1(r9, &n9, r11, n11, 20);
|
|
tc7_divexact_1(r9, &n9, r9, n9, 21);
|
|
n9 = -n9;
|
|
|
|
tc7_submul_1(r3, &n3, r11, n11, 5);
|
|
tc7_divexact_1(r3, &n3, r3, n3, 21);
|
|
|
|
tc7_sub(r6, &n6, r6, n6, r11, n11);
|
|
tc7_sub(r6, &n6, r6, n6, r3, n3);
|
|
tc7_submul_1(r10, &n10, r4, n4, 16);
|
|
|
|
tc7_submul_1(r2, &n2, r7, n7, 17);
|
|
tc7_add(r2, &n2, r2, n2, r10, n10);
|
|
tc7_submul_1(r2, &n2, r4, n4, 4);
|
|
|
|
tc7_submul_1(r5, &n5, r9, n9, 12);
|
|
tc7_divexact_1(r5, &n5, r5, n5, 1020);
|
|
|
|
tc7_sub(r6, &n6, r6, n6, r9, n9);
|
|
tc7_divexact_1(r2, &n2, r2, n2, 17);
|
|
n2 = -n2;
|
|
tc7_sub(r7, &n7, r7, n7, r2, n2);
|
|
tc7_sub(r6, &n6, r6, n6, r5, n5);
|
|
|
|
tc7_sub(r12, &n12, r12, n12, r13, n13);
|
|
tc7_submul_1(r12, &n12, r2, n2, 3);
|
|
tc7_submul_1(r12, &n12, r6, n6, 9);
|
|
tc7_submul_1(r12, &n12, r7, n7, 27);
|
|
tc7_submul_1(r12, &n12, r9, n9, 81);
|
|
tc7_submul_1(r12, &n12, r4, n4, 243);
|
|
tc7_submul_1(r12, &n12, r11, n11, 729);
|
|
tc7_submul_1(r12, &n12, r10, n10, 486);
|
|
tc7_submul_1(r12, &n12, r3, n3, 6561);
|
|
tc7_addmul_1(r8, &n8, r2, n2, 1023);
|
|
tc7_submul_1(r12, &n12, r5, n5, 59049);
|
|
tc7_mul_1(r12, &n12, r12, n12, 31);
|
|
tc7_submul_1(r12, &n12, r8, n8, 5368);
|
|
tc7_divexact_1(r12, &n12, r12, n12, 95550);
|
|
tc7_submul_1(r10, &n10, r12, n12, 17);
|
|
tc7_submul_1(r8, &n8, r12, n12, 48);
|
|
tc7_divexact_1(r8, &n8, r8, n8, 1023);
|
|
tc7_rshift_inplace(r10, &n10, 2);
|
|
tc7_sub(r4, &n4, r4, n4, r10, n10);
|
|
tc7_sub(r7, &n7, r7, n7, r12, n12);
|
|
tc7_sub(r2, &n2, r2, n2, r8, n8);
|
|
|
|
rpn = 0;
|
|
tc7_copy(rp, &rpn, 0, r13, n13);
|
|
tc7_copy(rp, &rpn, sn, r2, n2);
|
|
tc7_copy(rp, &rpn, 2*sn, r6, n6);
|
|
tc7_copy(rp, &rpn, 3*sn, r7, n7);
|
|
tc7_copy(rp, &rpn, 4*sn, r9, n9);
|
|
tc7_copy(rp, &rpn, 5*sn, r4, n4);
|
|
tc7_copy(rp, &rpn, 6*sn, r11, n11);
|
|
tc7_copy(rp, &rpn, 7*sn, r10, n10);
|
|
tc7_copy(rp, &rpn, 8*sn, r3, n3);
|
|
tc7_copy(rp, &rpn, 9*sn, r12, n12);
|
|
tc7_copy(rp, &rpn, 10*sn, r5, n5);
|
|
tc7_copy(rp, &rpn, 11*sn, r8, n8);
|
|
tc7_copy(rp, &rpn, 12*sn, r1, n1);
|
|
|
|
*/
|