From 5d3cf5509db9ad405ad1578cb6f242e065db20a4 Mon Sep 17 00:00:00 2001
From: wbhart
Date: Thu, 23 Jul 2009 01:31:29 +0000
Subject: [PATCH] First hack at toom4_mul (unbalanced toom4).

---
 configure.in            |   4 +-
 mpn/Makefile.am         |   2 +-
 mpn/generic/mul.c       |   6 +-
 mpn/generic/toom4_mul.c | 899 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 907 insertions(+), 4 deletions(-)
 create mode 100644 mpn/generic/toom4_mul.c

diff --git a/configure.in b/configure.in
index a54e068a..b0a70515 100644
--- a/configure.in
+++ b/configure.in
@@ -2503,8 +2503,8 @@ gmp_mpn_functions="$extra_functions \
   mul mul_fft mul_n mul_basecase sqr_basecase random random2 pow_1 \
   rootrem sqrtrem get_str set_str scan0 scan1 popcount hamdist cmp perfsqr \
   bdivmod gcd gcd_1 gcdext tdiv_qr dc_divrem_n sb_divrem_mn jacbase get_d \
-  mullow_n mullow_basecase redc_basecase \
-  $gmp_mpn_functions_optional toom3_mul toom3_mul_n toom4_mul_n toom7_mul_n"
+  mullow_n mullow_basecase redc_basecase toom3_mul toom3_mul_n \
+  toom4_mul toom4_mul_n toom7_mul_n $gmp_mpn_functions_optional"
 
 define(GMP_MULFUNC_CHOICES,
 [# functions that can be provided by multi-function files
diff --git a/mpn/Makefile.am b/mpn/Makefile.am
index b367beac..81fee0c3 100644
--- a/mpn/Makefile.am
+++ b/mpn/Makefile.am
@@ -47,7 +47,7 @@ nodist_EXTRA_libmpn_la_SOURCES = \
   hamdist.c invert_limb.c lshift1.c rshift1.c lshift2.c rshift2.c \
   ior_n.c iorn_n.c jacbase.c lshift.c mod_1.c mod_34lsub1.c mode1o.c \
   mul.c mul_1.c mul_2.c mul_fft.c mul_n.c mul_basecase.c toom3_mul.c \
-  toom3_mul_n.c toom4_mul_n.c toom7_mul_n.c \
+  toom3_mul_n.c toom4_mul_n.c toom4_mul.c toom7_mul_n.c \
   mullow_n.c mullow_basecase.c nand_n.c nior_n.c perfsqr.c popcount.c \
   pre_divrem_1.c pre_mod_1.c pow_1.c random.c random2.c rshift.c \
   rootrem.c redc_basecase.c sb_divrem_mn.c scan0.c scan1.c set_str.c \
diff --git a/mpn/generic/mul.c b/mpn/generic/mul.c
index f2422e09..2cf856d2 100644
--- a/mpn/generic/mul.c
+++ b/mpn/generic/mul.c
@@ -146,7 +146,11 @@ mpn_mul (mp_ptr prodp,
 
   k = (un + 3)/4; // ceil(un/4)
 
-  if (ABOVE_THRESHOLD (un + vn, 2*MUL_TOOM3_THRESHOLD) && (vn > k))
+  if (ABOVE_THRESHOLD (un + vn, 2*MUL_TOOM4_THRESHOLD) && (vn > 3*k))
+  {
+     mpn_toom4_mul(prodp, up, un, vp, vn);
+     return prodp[un + vn - 1];
+  } else if (ABOVE_THRESHOLD (un + vn, 2*MUL_TOOM3_THRESHOLD) && (vn > k))
   {
      mp_ptr ws;
      TMP_DECL;
diff --git a/mpn/generic/toom4_mul.c b/mpn/generic/toom4_mul.c
new file mode 100644
index 00000000..0cc52d39
--- /dev/null
+++ b/mpn/generic/toom4_mul.c
@@ -0,0 +1,899 @@
+/* mpn_toom4_mul -- Internal routine to multiply two natural numbers
+   using unbalanced toom4.
+
+   THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.
+*/
+
+/* Implementation of the Bodrato-Zanoni algorithm for Toom-Cook 4-way.
+
+Copyright 2001, 2002, 2004, 2005, 2006 Free Software Foundation, Inc.
+Copyright 2009 William Hart
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+MA 02110-1301, USA. */
+
+/*
+   This implementation is based on that of Paul Zimmermann, which is available
+   for mpz_t's at http://www.loria.fr/~zimmerma/software/toom4.c
+*/
+
+#include "mpir.h"
+#include "gmp-impl.h"
+#include "longlong.h"
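+
+/* Editor's note: the tc4_* helpers below use a sign-magnitude convention:
+   a value is the limb vector {rp, ABS(rn)} and its sign is carried by the
+   sign of the mp_size_t length rn, with rn == 0 meaning zero.  A minimal
+   compiled-out sketch of the convention (illustrative only, not part of
+   the algorithm): */
+#if 0
+static void tc4_convention_demo(void)
+{
+   mp_limb_t a[1] = { 5 }, b[1] = { 7 }, r[2];
+   mp_size_t rn;
+
+   tc4_add(r, &rn, a, 1, b, 1);  /* 5 + 7 = 12:    rn == 1,  r[0] == 12 */
+   tc4_add(r, &rn, a, 1, b, -1); /* 5 + (-7) = -2: rn == -1, r[0] == 2  */
+}
+#endif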
+
+/* rp = r1 + r2 where inputs and output use the signed size convention
+   described above.  Assumes ABS(r1n) >= ABS(r2n); callers should use
+   tc4_add, which swaps the operands as needed. */
+void _tc4_add(mp_ptr rp, mp_size_t * rn, mp_srcptr r1, mp_size_t r1n, mp_srcptr r2, mp_size_t r2n)
+{
+   mp_limb_t cy;
+   mp_size_t s1 = ABS(r1n);
+   mp_size_t s2 = ABS(r2n);
+
+   if (!s1)
+   {
+      *rn = 0;
+   } else if (!s2)
+   {
+      if (rp != r1) MPN_COPY(rp, r1, s1);
+      *rn = r1n;
+   } else if ((r1n ^ r2n) >= 0) // both inputs are the same sign
+   {
+      *rn = r1n;
+      cy = mpn_add(rp, r1, s1, r2, s2);
+      if (cy)
+      {
+         rp[s1] = cy;
+         if ((*rn) < 0) (*rn)--;
+         else (*rn)++;
+      }
+   } else // inputs are different signs
+   {
+      mp_size_t ct;
+      if (s1 != s2) ct = 1;
+      else MPN_CMP(ct, r1, r2, s1);
+
+      if (!ct) *rn = 0;
+      else if (ct > 0)
+      {
+         mpn_sub(rp, r1, s1, r2, s2);
+         *rn = s1;
+         MPN_NORMALIZE(rp, (*rn));
+         if (r1n < 0) *rn = -(*rn);
+      }
+      else
+      {
+         mpn_sub_n(rp, r2, r1, s1);
+         *rn = s1;
+         MPN_NORMALIZE(rp, (*rn));
+         if (r1n > 0) *rn = -(*rn);
+      }
+   }
+}
+
+void tc4_add(mp_ptr rp, mp_size_t * rn, mp_srcptr r1, mp_size_t r1n, mp_srcptr r2, mp_size_t r2n)
+{
+   mp_size_t s1 = ABS(r1n);
+   mp_size_t s2 = ABS(r2n);
+
+   if (s1 < s2) _tc4_add(rp, rn, r2, r2n, r1, r1n);
+   else _tc4_add(rp, rn, r1, r1n, r2, r2n);
+}
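+
+/* Editor's note: the sumdiff routines below rely on the native
+   mpn_sumdiff_n primitive, which computes {sp, n} = {xp, n} + {yp, n}
+   and {dp, n} = {xp, n} - {yp, n} in a single pass.  As used here, bit 1
+   of its return value is the carry out of the sum and bit 0 is the
+   borrow out of the difference (hence the cy>>1 and cy&1 below).  A
+   compiled-out sketch of that contract (illustrative only): */
+#if 0
+static void sumdiff_demo(void)
+{
+   mp_limb_t x[1] = { 9 }, y[1] = { 4 }, s[1], d[1];
+   mp_limb_t cy = mpn_sumdiff_n(s, d, x, y, 1);
+   /* now s[0] == 13, d[0] == 5, cy>>1 == 0 (no carry), cy&1 == 0 (no borrow) */
+}
+#endif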
+
+#if HAVE_NATIVE_mpn_sumdiff_n
+/* Set rp = r1 + r2 and sp = r1 - r2 simultaneously, all in the signed
+   size convention. */
+void tc4_sumdiff(mp_ptr rp, mp_size_t * rn, mp_ptr sp, mp_size_t * sn, mp_ptr r1, mp_size_t r1n, mp_ptr r2, mp_size_t r2n)
+{
+   mp_limb_t cy, cy2;
+   mp_size_t s1 = ABS(r1n);
+   mp_size_t s2 = ABS(r2n);
+   int swapped = 0;
+
+   if (s1 < s2)
+   {
+      MPN_PTR_SWAP(r1, r1n, r2, r2n);
+      MP_SIZE_T_SWAP(s1, s2);
+      swapped = 1;
+   }
+
+   if (!s1)
+   {
+      *rn = 0;
+      *sn = 0;
+   } else if (!s2)
+   {
+      if (rp != r1) MPN_COPY(rp, r1, s1);
+      if (sp != r1) MPN_COPY(sp, r1, s1);
+      *rn = r1n;
+      *sn = (swapped ? -r1n : r1n);
+   } else
+   {
+      mp_size_t ct;
+      if (s1 != s2) ct = 1;
+      else MPN_CMP(ct, r1, r2, s1);
+
+      if (!ct) // operands have equal magnitude
+      {
+         if ((r1n ^ r2n) >= 0) // both inputs are same sign
+         {
+            *sn = 0;
+            *rn = r1n;
+            cy = mpn_lshift(rp, r1, s1, 1);
+            if (cy)
+            {
+               rp[s1] = cy;
+               if ((*rn) < 0) (*rn)--;
+               else (*rn)++;
+            }
+         } else
+         {
+            *rn = 0;
+            *sn = (swapped ? -r1n : r1n);
+            cy = mpn_lshift(sp, r1, s1, 1);
+            if (cy)
+            {
+               sp[s1] = cy;
+               if ((*sn) < 0) (*sn)--;
+               else (*sn)++;
+            }
+         }
+      } else if (ct > 0) // r1 is bigger than r2
+      {
+         if ((r1n ^ r2n) >= 0) // both inputs are same sign
+         {
+            cy = mpn_sumdiff_n(rp, sp, r1, r2, s2);
+            if (s1 > s2) // r1 has more limbs than r2
+            {
+               *rn = r1n;
+               cy2 = mpn_add_1(rp + s2, r1 + s2, s1 - s2, cy>>1);
+               if (cy2)
+               {
+                  rp[s1] = cy2;
+                  if ((*rn) < 0) (*rn)--;
+                  else (*rn)++;
+               }
+               mpn_sub_1(sp + s2, r1 + s2, s1 - s2, cy&1);
+            } else // both inputs have same number of limbs
+            {
+               *rn = r1n;
+               if (cy>>1)
+               {
+                  rp[s1] = 1;
+                  if ((*rn) < 0) (*rn)--;
+                  else (*rn)++;
+               }
+            }
+            *sn = s1;
+            MPN_NORMALIZE(sp, (*sn));
+            if ((r1n < 0) ^ swapped) *sn = -(*sn); // difference has r1's sign, flipped if operands were swapped
+         } else // inputs are different sign
+         {
+            cy = mpn_sumdiff_n(sp, rp, r1, r2, s2);
+            if (s1 > s2) // r1 has more limbs than r2
+            {
+               *sn = r1n;
+               cy2 = mpn_add_1(sp + s2, r1 + s2, s1 - s2, cy>>1);
+               if (cy2)
+               {
+                  sp[s1] = cy2;
+                  if ((*sn) < 0) (*sn)--;
+                  else (*sn)++;
+               }
+               mpn_sub_1(rp + s2, r1 + s2, s1 - s2, cy&1);
+            } else // both inputs have same number of limbs
+            {
+               *sn = r1n;
+               if (cy>>1)
+               {
+                  sp[s1] = 1;
+                  if ((*sn) < 0) (*sn)--;
+                  else (*sn)++;
+               }
+            }
+            if (swapped) *sn = -(*sn); // difference flips sign if operands were swapped
+            *rn = s1;
+            MPN_NORMALIZE(rp, (*rn));
+            if (r1n < 0) *rn = -(*rn); // sum takes the sign of the larger operand
+         }
+      } else // r2 is bigger than r1 (but same number of limbs)
+      {
+         if ((r1n ^ r2n) >= 0) // both inputs are same sign
+         {
+            cy = mpn_sumdiff_n(rp, sp, r2, r1, s1);
+            *rn = r1n;
+            if (cy>>1)
+            {
+               rp[s1] = 1;
+               if ((*rn) < 0) (*rn)--;
+               else (*rn)++;
+            }
+            *sn = s1;
+            MPN_NORMALIZE(sp, (*sn));
+            if ((r1n > 0) ^ swapped) *sn = -(*sn); // difference is opposite in sign to r1 here, flipped if swapped
+         } else // inputs are different sign
+         {
+            cy = mpn_sumdiff_n(sp, rp, r2, r1, s1);
+            *sn = r1n;
+            if (cy>>1)
+            {
+               sp[s1] = 1;
+               if ((*sn) < 0) (*sn)--;
+               else (*sn)++;
+            }
+            if (swapped) *sn = -(*sn); // difference flips sign if operands were swapped
+            *rn = s1;
+            MPN_NORMALIZE(rp, (*rn));
+            if (r1n > 0) *rn = -(*rn); // sum takes the sign of the larger operand r2
+         }
+      }
+   }
+}
+
+void tc4_sumdiff_unsigned(mp_ptr rp, mp_size_t * rn, mp_ptr sp, mp_size_t * sn, mp_ptr r1, mp_size_t r1n, mp_ptr r2, mp_size_t r2n)
+{
+   mp_limb_t cy, cy2;
+   mp_size_t s1 = ABS(r1n);
+   mp_size_t s2 = ABS(r2n);
+   int swapped = 0;
+
+   if (s1 < s2)
+   {
+      MPN_PTR_SWAP(r1, r1n, r2, r2n);
+      MP_SIZE_T_SWAP(s1, s2);
+      swapped = 1;
+   }
+
+   if (!s1)
+   {
+      *rn = 0;
+      *sn = 0;
+   } else if (!s2)
+   {
+      if (rp != r1) MPN_COPY(rp, r1, s1);
+      if (sp != r1) MPN_COPY(sp, r1, s1);
+      *rn = r1n;
+      *sn = (swapped ? -r1n : r1n);
+   } else
+   {
+      mp_size_t ct;
+      if (s1 != s2) ct = 1;
+      else MPN_CMP(ct, r1, r2, s1);
+
+      if (!ct)
+      {
+         *sn = 0;
+         *rn = r1n;
+         cy = mpn_lshift(rp, r1, s1, 1);
+         if (cy)
+         {
+            rp[s1] = cy;
+            (*rn)++;
+         }
+      } else if (ct > 0) // r1 is bigger than r2
+      {
+         cy = mpn_sumdiff_n(rp, sp, r1, r2, s2);
+         if (s1 > s2) // r1 has more limbs than r2
+         {
+            *rn = r1n;
+            cy2 = mpn_add_1(rp + s2, r1 + s2, s1 - s2, cy>>1);
+            if (cy2)
+            {
+               rp[s1] = cy2;
+               (*rn)++;
+            }
+            mpn_sub_1(sp + s2, r1 + s2, s1 - s2, cy&1);
+         } else // both inputs have same number of limbs
+         {
+            *rn = r1n;
+            if (cy>>1)
+            {
+               rp[s1] = 1;
+               (*rn)++;
+            }
+         }
+         *sn = s1;
+         MPN_NORMALIZE(sp, (*sn));
+         if (swapped) *sn = -(*sn);
+      } else // r2 is bigger than r1 (but same number of limbs)
+      {
+         cy = mpn_sumdiff_n(rp, sp, r2, r1, s1);
+         *rn = r1n;
+         if (cy>>1)
+         {
+            rp[s1] = 1;
+            (*rn)++;
+         }
+         *sn = s1;
+         MPN_NORMALIZE(sp, (*sn));
+         if (!swapped) *sn = -(*sn);
+      }
+   }
+}
+#endif
+
+void _tc4_add_unsigned(mp_ptr rp, mp_size_t * rn, mp_srcptr r1, mp_size_t r1n, mp_srcptr r2, mp_size_t r2n)
+{
+   mp_limb_t cy;
+   mp_size_t s1 = r1n;
+   mp_size_t s2 = r2n;
+
+   if (!s2)
+   {
+      if (!s1) *rn = 0;
+      else
+      {
+         if (rp != r1) MPN_COPY(rp, r1, s1);
+         *rn = r1n;
+      }
+   } else
+   {
+      *rn = r1n;
+      cy = mpn_add(rp, r1, s1, r2, s2);
+      if (cy)
+      {
+         rp[s1] = cy;
+         (*rn)++; // sizes are non-negative in the unsigned routine
+      }
+   }
+}
+
+void tc4_add_unsigned(mp_ptr rp, mp_size_t * rn, mp_srcptr r1, mp_size_t r1n, mp_srcptr r2, mp_size_t r2n)
+{
+   if (r1n < r2n) _tc4_add_unsigned(rp, rn, r2, r2n, r1, r1n);
+   else _tc4_add_unsigned(rp, rn, r1, r1n, r2, r2n);
+}
+
+void tc4_sub(mp_ptr rp, mp_size_t * rn, mp_ptr r1, mp_size_t r1n, mp_ptr r2, mp_size_t r2n)
+{
+   tc4_add(rp, rn, r1, r1n, r2, -r2n);
+}
+
+void tc4_lshift(mp_ptr rp, mp_size_t * rn, mp_srcptr xp, mp_size_t xn, mp_size_t bits)
+{
+   if (xn == 0) *rn = 0;
+   else
+   {
+      mp_size_t xu = ABS(xn);
+      mp_limb_t msl = mpn_lshift(rp, xp, xu, bits);
+      if (msl)
+      {
+         rp[xu] = msl;
+         *rn = (xn >= 0 ? xn + 1 : xn - 1);
+      } else
+         *rn = xn;
+   }
+}
+
+void tc4_rshift_inplace(mp_ptr rp, mp_size_t * rn, mp_size_t bits)
+{
+   if (*rn)
+   {
+      if ((*rn) > 0)
+      {
+         mpn_rshift(rp, rp, *rn, bits);
+         if (rp[(*rn) - 1] == CNST_LIMB(0)) (*rn)--;
+      } else
+      {
+         mpn_rshift(rp, rp, -(*rn), bits);
+         if (rp[-(*rn) - 1] == CNST_LIMB(0)) (*rn)++;
+      }
+   }
+}
+
+#if HAVE_NATIVE_mpn_addlsh1_n
+/* rp += 2*{xp, xn}, unsigned */
+void tc4_addlsh1_unsigned(mp_ptr rp, mp_size_t * rn, mp_srcptr xp, mp_size_t xn)
+{
+   if (xn)
+   {
+      if (xn >= *rn)
+      {
+         mp_limb_t cy;
+         if (xn > *rn) MPN_ZERO(rp + *rn, xn - *rn);
+         cy = mpn_addlsh1_n(rp, rp, xp, xn);
+         if (cy)
+         {
+            rp[xn] = cy;
+            *rn = xn + 1;
+         } else *rn = xn;
+      } else
+      {
+         mp_limb_t cy = mpn_addlsh1_n(rp, rp, xp, xn);
+         if (cy) cy = mpn_add_1(rp + xn, rp + xn, *rn - xn, cy);
+         if (cy)
+         {
+            rp[*rn] = cy;
+            (*rn)++;
+         }
+      }
+   }
+}
+#endif
+
+void tc4_divexact_ui(mp_ptr rp, mp_size_t * rn, mp_ptr x, mp_size_t xn, mp_limb_t c)
+{
+   mp_size_t abs_size;
+   if (xn == 0)
+   {
+      *rn = 0;
+      return;
+   }
+   abs_size = ABS (xn);
+
+   MPN_DIVREM_OR_DIVEXACT_1 (rp, x, abs_size, c);
+   abs_size -= (rp[abs_size-1] == 0);
+   *rn = (xn >= 0 ? abs_size : -abs_size);
+}
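+
+/* Editor's note: tc4_divexact_ui above and the helpers below exist because
+   the interpolation step divides by known small constants and powers of
+   two, where an exact division is cheaper than a general one.  A
+   compiled-out sketch (illustrative only): */
+#if 0
+static void divexact_demo(void)
+{
+   mp_limb_t x[1] = { 45 };
+   mp_size_t rn;
+   tc4_divexact_by3(x, &rn, x, 1); /* x[0] == 15, rn == 1 */
+}
+#endif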
+
+void tc4_divexact_by3(mp_ptr rp, mp_size_t * rn, mp_ptr x, mp_size_t xn)
+{
+   if (xn)
+   {
+      mp_size_t xu = ABS(xn);
+      mpn_divexact_by3(rp, x, xu);
+      if (xn > 0)
+      {
+         if (rp[xu - 1] == CNST_LIMB(0)) *rn = xn - 1;
+         else *rn = xn;
+      } else
+      {
+         if (rp[xu - 1] == CNST_LIMB(0)) *rn = xn + 1;
+         else *rn = xn;
+      }
+   } else *rn = 0;
+}
+
+#if HAVE_NATIVE_mpn_divexact_byBm1of
+void tc4_divexact_by15(mp_ptr rp, mp_size_t * rn, mp_ptr x, mp_size_t xn)
+{
+   if (xn)
+   {
+      mp_size_t xu = ABS(xn);
+      mpn_divexact_byBm1of(rp, x, xu, CNST_LIMB(15), CNST_LIMB((~0)/15)); // works for 32 and 64 bits
+      if (xn > 0)
+      {
+         if (rp[xu - 1] == CNST_LIMB(0)) *rn = xn - 1;
+         else *rn = xn;
+      } else
+      {
+         if (rp[xu - 1] == CNST_LIMB(0)) *rn = xn + 1;
+         else *rn = xn;
+      }
+   } else *rn = 0;
+}
+#endif
+
+/* w += x*y, where w and x use the signed size convention. */
+void tc4_addmul_1(mp_ptr wp, mp_size_t * wn, mp_srcptr xp, mp_size_t xn, mp_limb_t y)
+{
+   mp_size_t sign, wu, xu, ws, new_wn, min_size, dsize;
+   mp_limb_t cy;
+
+   /* w unaffected if x==0 or y==0 */
+   if (xn == 0 || y == 0)
+      return;
+
+   sign = xn;
+   xu = ABS (xn);
+
+   ws = *wn;
+   if (*wn == 0)
+   {
+      /* nothing to add to, just set x*y, "sign" gives the sign */
+      cy = mpn_mul_1 (wp, xp, xu, y);
+      if (cy)
+      {
+         wp[xu] = cy;
+         xu = xu + 1;
+      }
+      *wn = (sign >= 0 ? xu : -xu);
+      return;
+   }
+
+   sign ^= *wn;
+   wu = ABS (*wn);
+
+   new_wn = MAX (wu, xu);
+   min_size = MIN (wu, xu);
+
+   if (sign >= 0)
+   {
+      /* addmul of absolute values */
+
+      cy = mpn_addmul_1 (wp, xp, min_size, y);
+
+      dsize = xu - wu;
+#if HAVE_NATIVE_mpn_mul_1c
+      if (dsize > 0)
+         cy = mpn_mul_1c (wp + min_size, xp + min_size, dsize, y, cy);
+      else if (dsize < 0)
+      {
+         dsize = -dsize;
+         cy = mpn_add_1 (wp + min_size, wp + min_size, dsize, cy);
+      }
+#else
+      if (dsize != 0)
+      {
+         mp_limb_t cy2;
+         if (dsize > 0)
+            cy2 = mpn_mul_1 (wp + min_size, xp + min_size, dsize, y);
+         else
+         {
+            dsize = -dsize;
+            cy2 = 0;
+         }
+         cy = cy2 + mpn_add_1 (wp + min_size, wp + min_size, dsize, cy);
+      }
+#endif
+
+      if (cy)
+      {
+         wp[dsize + min_size] = cy;
+         new_wn ++;
+      }
+   } else
+   {
+      /* submul of absolute values */
+
+      cy = mpn_submul_1 (wp, xp, min_size, y);
+      if (wu >= xu)
+      {
+         /* if w bigger than x, then propagate borrow through it */
+         if (wu != xu)
+            cy = mpn_sub_1 (wp + xu, wp + xu, wu - xu, cy);
+
+         if (cy != 0)
+         {
+            /* Borrow out of w, take twos complement negative to get
+               absolute value, flip sign of w. */
+            wp[new_wn] = ~-cy;  /* extra limb is 0-cy */
+            mpn_com_n (wp, wp, new_wn);
+            new_wn++;
+            MPN_INCR_U (wp, new_wn, CNST_LIMB(1));
+            ws = -*wn;
+         }
+      } else /* wu < xu */
+      {
+         /* x bigger than w, so want x*y-w.  Submul has given w-x*y, so
+            take twos complement and use an mpn_mul_1 for the rest. */
+
+         mp_limb_t cy2;
+
+         /* -(-cy*b^n + w-x*y) = (cy-1)*b^n + ~(w-x*y) + 1 */
+         mpn_com_n (wp, wp, wu);
+         cy += mpn_add_1 (wp, wp, wu, CNST_LIMB(1));
+         cy -= 1;
+
+         /* If cy-1 == -1 then hold that -1 for later.  mpn_submul_1 never
+            returns cy==MP_LIMB_T_MAX so that value always indicates a -1. */
+         cy2 = (cy == MP_LIMB_T_MAX);
+         cy += cy2;
+         MPN_MUL_1C (cy, wp + wu, xp + wu, xu - wu, y, cy);
+         wp[new_wn] = cy;
+         new_wn += (cy != 0);
+
+         /* Apply any -1 from above.  The value at wp+wu is non-zero
+            because y!=0 and the high limb of x will be non-zero. */
+         if (cy2)
+            MPN_DECR_U (wp+wu, new_wn - wu, CNST_LIMB(1));
+
+         ws = -*wn;
+      }
+
+      /* submul can produce high zero limbs due to cancellation, both when w
+         has more limbs or x has more */
+      MPN_NORMALIZE (wp, new_wn);
+   }
+
+   *wn = (ws >= 0 ? new_wn : -new_wn);
+
+   ASSERT (new_wn == 0 || wp[new_wn - 1] != 0);
+}
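+
+/* Editor's note: the submul path of tc4_addmul_1 negates an n-limb value
+   using the two's complement identity -v = ~v + 1, applied limb-wise with
+   mpn_com_n and then an increment.  A compiled-out single-limb sketch
+   (illustrative only): */
+#if 0
+static void negate_demo(void)
+{
+   mp_limb_t v[1] = { 2 };
+   mpn_com_n (v, v, 1);                /* v[0] = ~2 */
+   mpn_add_1 (v, v, 1, CNST_LIMB(1));  /* v[0] = -2 mod 2^GMP_LIMB_BITS */
+}
+#endif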
+
+void tc4_submul_1(mp_ptr wp, mp_size_t * wn, mp_srcptr x, mp_size_t xn, mp_limb_t y)
+{
+   tc4_addmul_1(wp, wn, x, -xn, y);
+}
+
+/* y += x * 2^(offset*GMP_LIMB_BITS): add {xp, xn} into {yp, *yn} at a
+   shift of 'offset' limbs, updating *yn. */
+void tc4_copy (mp_ptr yp, mp_size_t * yn, mp_size_t offset, mp_srcptr xp, mp_size_t xn)
+{
+   mp_size_t yu = ABS(*yn);
+   mp_size_t xu = ABS(xn);
+   mp_limb_t cy = 0;
+
+   if (xn == 0)
+      return;
+
+   if (offset < yu) /* low part of x overlaps with y */
+   {
+      if (offset + xu <= yu) /* x entirely inside y */
+      {
+         cy = mpn_add_n (yp + offset, yp + offset, xp, xu);
+         if (offset + xu < yu)
+            cy = mpn_add_1 (yp + offset + xu, yp + offset + xu,
+                            yu - (offset + xu), cy);
+      } else
+         cy = mpn_add_n (yp + offset, yp + offset, xp, yu - offset);
+      /* now cy is the carry at yp + yu */
+      if (xu + offset > yu) /* high part of x exceeds y */
+      {
+         MPN_COPY (yp + yu, xp + yu - offset, xu + offset - yu);
+         cy = mpn_add_1 (yp + yu, yp + yu, xu + offset - yu, cy);
+         yu = xu + offset;
+      }
+      /* now cy is the carry at yp + yu (updated) */
+      if (cy)
+         yp[yu++] = cy;
+      MPN_NORMALIZE(yp, yu);
+      *yn = yu;
+   } else /* x does not overlap */
+   {
+      if (offset > yu)
+         MPN_ZERO (yp + yu, offset - yu);
+      MPN_COPY (yp + offset, xp, xu);
+      *yn = offset + xu;
+   }
+}
+
+#define MUL_TC4_UNSIGNED(r3xx, n3xx, r1xx, n1xx, r2xx, n2xx) \
+   do \
+   { \
+      if ((n1xx != 0) && (n2xx != 0)) \
+      { \
+         mp_size_t len; \
+         if (n1xx == n2xx) \
+         { \
+            if (n1xx > MUL_TOOM4_THRESHOLD) mpn_toom4_mul_n(r3xx, r1xx, r2xx, n1xx); \
+            else mpn_mul_n(r3xx, r1xx, r2xx, n1xx); \
+         } else if (n1xx > n2xx) \
+            mpn_mul(r3xx, r1xx, n1xx, r2xx, n2xx); \
+         else \
+            mpn_mul(r3xx, r2xx, n2xx, r1xx, n1xx); \
+         len = n1xx + n2xx; \
+         MPN_NORMALIZE(r3xx, len); \
+         n3xx = len; \
+      } else \
+         n3xx = 0; \
+   } while (0)
+
+#define MUL_TC4(r3xx, n3xx, r1xx, n1xx, r2xx, n2xx) \
+   do \
+   { \
+      mp_size_t sign = n1xx ^ n2xx; \
+      mp_size_t un1 = ABS(n1xx); \
+      mp_size_t un2 = ABS(n2xx); \
+      MUL_TC4_UNSIGNED(r3xx, n3xx, r1xx, un1, r2xx, un2); \
+      if (sign < 0) n3xx = -n3xx; \
+   } while (0)
+
+#define TC4_NORM(rxx, nxx, sxx) \
+   do \
+   { \
+      nxx = sxx; \
+      MPN_NORMALIZE(rxx, nxx); \
+   } while(0)
+
+/* Zero out limbs to end of integer */
+#define TC4_DENORM(rxx, nxx, sxx) \
+   do { \
+      MPN_ZERO(rxx + ABS(nxx), sxx - ABS(nxx)); \
+   } while (0)
+
+/* Two's complement divexact by power of 2 */
+#define TC4_DIVEXACT_2EXP(rxx, nxx, sxx) \
+   do { \
+      mp_limb_t sign = (LIMB_HIGHBIT_TO_MASK(rxx[nxx-1]) << (GMP_LIMB_BITS - sxx)); \
+      mpn_rshift(rxx, rxx, nxx, sxx); \
+      rxx[nxx-1] |= sign; \
+   } while (0)
+
+#if HAVE_NATIVE_mpn_rshift1
+#define TC4_RSHIFT1(rxx, nxx) \
+   do { \
+      mp_limb_t sign = (LIMB_HIGHBIT_TO_MASK(rxx[nxx-1]) << (GMP_LIMB_BITS - 1)); \
+      mpn_rshift1(rxx, rxx, nxx); \
+      rxx[nxx-1] |= sign; \
+   } while (0)
+#else
+#define TC4_RSHIFT1(rxx, nxx) \
+   do { \
+      mp_limb_t sign = (LIMB_HIGHBIT_TO_MASK(rxx[nxx-1]) << (GMP_LIMB_BITS - 1)); \
+      mpn_rshift(rxx, rxx, nxx, 1); \
+      rxx[nxx-1] |= sign; \
+   } while (0)
+#endif
+
+#define r1 (tp)
+#define r2 (tp + t4)
+#define r4 (tp + 2*t4)
+#define r6 (tp + 3*t4)
+
+#define r3 (rp + 4*sn)
+#define r5 (rp + 2*sn)
+#define r7 (rp)
+
+#define mpn_clearit(rxx, nxx) \
+   do { \
+      mp_size_t ind = 0; \
+      for ( ; ind < nxx; ind++) \
+         (rxx)[ind] = CNST_LIMB(0); \
+   } while (0)
+
+/* Multiply {up, un} by {vp, vn} and write the result to
+   {prodp, un + vn}.
+
+   Note that prodp gets un + vn limbs stored, even if the actual
+   result only needs un + vn - 1.
+*/
+void
+mpn_toom4_mul (mp_ptr rp, mp_srcptr up, mp_size_t un,
+               mp_srcptr vp, mp_size_t vn)
+{
+   mp_size_t ind;
+   mp_limb_t r30, r31;
+   mp_ptr tp;
+   mp_size_t a0n, a1n, a2n, a3n, b0n, b1n, b2n, b3n, sn, n1, n2, n3, n4, n5, n6, n7, n8, rpn, t4;
+
+   sn = (un + 3) / 4;
+
+   ASSERT (vn > 3*sn);
+
+#define a0 (up)
+#define a1 (up + sn)
+#define a2 (up + 2*sn)
+#define a3 (up + 3*sn)
+#define b0 (vp)
+#define b1 (vp + sn)
+#define b2 (vp + 2*sn)
+#define b3 (vp + 3*sn)
+
+   TC4_NORM(a0, a0n, sn);
+   TC4_NORM(a1, a1n, sn);
+   TC4_NORM(a2, a2n, sn);
+   TC4_NORM(a3, a3n, un - 3*sn);
+   TC4_NORM(b0, b0n, sn);
+   TC4_NORM(b1, b1n, sn);
+   TC4_NORM(b2, b2n, sn);
+   TC4_NORM(b3, b3n, vn - 3*sn);
+
+   t4 = 2*sn+2; // allows mult of 2 integers of sn + 1 limbs
+
+   tp = __GMP_ALLOCATE_FUNC_LIMBS(4*t4 + 5*(sn + 1));
+
+#define u2 (tp + 4*t4)
+#define u3 (tp + 4*t4 + (sn+1))
+#define u4 (tp + 4*t4 + 2*(sn+1))
+#define u5 (tp + 4*t4 + 3*(sn+1))
+#define u6 (tp + 4*t4 + 4*(sn+1))
+
+   tc4_add_unsigned(u6, &n6, a3, a3n, a1, a1n); // u6 = a3 + a1
+   tc4_add_unsigned(u5, &n5, a2, a2n, a0, a0n); // u5 = a2 + a0
+#if HAVE_NATIVE_mpn_sumdiff_n
+   tc4_sumdiff_unsigned(u3, &n3, u4, &n4, u5, n5, u6, n6); // u3 = A(1), u4 = A(-1)
+#else
+   tc4_add_unsigned(u3, &n3, u5, n5, u6, n6);
+   tc4_sub(u4, &n4, u5, n5, u6, n6);
+#endif
+   tc4_add_unsigned(u6, &n6, b3, b3n, b1, b1n);
+   tc4_add_unsigned(u5, &n5, b2, b2n, b0, b0n);
+#if HAVE_NATIVE_mpn_sumdiff_n
+   tc4_sumdiff_unsigned(r2, &n8, u5, &n5, u5, n5, u6, n6); // r2 = B(1), u5 = B(-1)
+#else
+   tc4_add_unsigned(r2, &n8, u5, n5, u6, n6);
+   tc4_sub(u5, &n5, u5, n5, u6, n6);
+#endif
+
+   MUL_TC4_UNSIGNED(r3, n3, u3, n3, r2, n8); // r3 = p(1)
+   MUL_TC4(r4, n4, u4, n4, u5, n5); // r4 = p(-1)
+
+   tc4_lshift(r1, &n1, a0, a0n, 3);
+#if HAVE_NATIVE_mpn_addlsh1_n
+   tc4_addlsh1_unsigned(r1, &n1, a2, a2n);
+#else
+   tc4_addmul_1(r1, &n1, a2, a2n, 2);
+#endif
+   tc4_lshift(r2, &n8, a1, a1n, 2);
+   tc4_add(r2, &n8, r2, n8, a3, a3n);
+   tc4_add(u5, &n5, r1, n1, r2, n8); // u5 = 8*A(1/2)
+   tc4_sub(u6, &n6, r1, n1, r2, n8); // u6 = 8*A(-1/2)
+   tc4_lshift(r1, &n1, b0, b0n, 3);
+#if HAVE_NATIVE_mpn_addlsh1_n
+   tc4_addlsh1_unsigned(r1, &n1, b2, b2n);
+#else
+   tc4_addmul_1(r1, &n1, b2, b2n, 2);
+#endif
+   tc4_lshift(r2, &n8, b1, b1n, 2);
+   tc4_add(r2, &n8, r2, n8, b3, b3n);
+   tc4_add(u2, &n2, r1, n1, r2, n8); // u2 = 8*B(1/2)
+   tc4_sub(r2, &n8, r1, n1, r2, n8); // r2 = 8*B(-1/2)
+
+   r30 = r3[0]; // save the low limbs of r3, which the top of r5 will overwrite
+   if (!n3) r30 = CNST_LIMB(0);
+   r31 = r3[1];
+   MUL_TC4_UNSIGNED(r5, n5, u5, n5, u2, n2); // r5 = 2^6*p(1/2)
+   MUL_TC4(r6, n6, u6, n6, r2, n8); // r6 = 2^6*p(-1/2)
+   r3[1] = r31;
+
+   tc4_lshift(u2, &n2, a3, a3n, 3);
+   tc4_addmul_1(u2, &n2, a2, a2n, 4);
+#if HAVE_NATIVE_mpn_addlsh1_n
+   tc4_addlsh1_unsigned(u2, &n2, a1, a1n);
+#else
+   tc4_addmul_1(u2, &n2, a1, a1n, 2);
+#endif
+   tc4_add(u2, &n2, u2, n2, a0, a0n); // u2 = A(2)
+   tc4_lshift(r1, &n1, b3, b3n, 3);
+   tc4_addmul_1(r1, &n1, b2, b2n, 4);
+#if HAVE_NATIVE_mpn_addlsh1_n
+   tc4_addlsh1_unsigned(r1, &n1, b1, b1n);
+#else
+   tc4_addmul_1(r1, &n1, b1, b1n, 2);
+#endif
+   tc4_add(r1, &n1, r1, n1, b0, b0n); // r1 = B(2)
+
+   MUL_TC4_UNSIGNED(r2, n2, u2, n2, r1, n1); // r2 = p(2)
+   MUL_TC4_UNSIGNED(r1, n1, a3, a3n, b3, b3n); // r1 = p(oo)
+   MUL_TC4_UNSIGNED(r7, n7, a0, a0n, b0, b0n); // r7 = p(0)
+
+   TC4_DENORM(r1, n1, t4 - 1);
+   TC4_DENORM(r2, n2, t4 - 1);
+   if (n3)
+      TC4_DENORM(r3, n3, t4 - 1);
+   else {
+      /* MPN_ZERO defeats gcc 4.1.2 here, hence the explicit for loop */
+      for (ind = 1; ind < t4 - 1; ind++)
+         (r3)[ind] = CNST_LIMB(0);
+   }
+   TC4_DENORM(r4, n4, t4 - 1);
+   TC4_DENORM(r5, n5, t4 - 1);
+   TC4_DENORM(r6, n6, t4 - 1);
+   TC4_DENORM(r7, n7, t4 - 2); // we treat r7 differently (it cannot exceed t4-2 in length)
+
+/* rp rp1 rp2 rp3 rp4 rp5 rp6 rp7
+<----------- r7-----------><------------r5-------------->
+ <-------------r3------------->
+
+ <-------------r6-------------> <-----------r2------------>
+ <-------------r4--------------> <--------------r1---->
+*/
+
+   toom4_interpolate(rp, &rpn, sn, tp, t4 - 1, n4, n6, r30);
+
+   if (rpn != un + vn)
+   {
+      MPN_ZERO((rp + rpn), un + vn - rpn);
+   }
+
+   __GMP_FREE_FUNC_LIMBS (tp, 4*t4 + 5*(sn+1));
+}
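+
+/* Editor's note: mpn_mul (see the mul.c hunk above) only dispatches to
+   this routine when the operands are large and not too unbalanced: with
+   k = (un + 3)/4 = ceil(un/4), it requires un + vn to be above
+   2*MUL_TOOM4_THRESHOLD and vn > 3*k, which is the condition the
+   ASSERT (vn > 3*sn) above enforces.  A compiled-out restatement of the
+   same test (illustrative only): */
+#if 0
+static int toom4_applies(mp_size_t un, mp_size_t vn)
+{
+   mp_size_t k = (un + 3)/4; /* ceil(un/4), the evaluation block size */
+   return ABOVE_THRESHOLD (un + vn, 2*MUL_TOOM4_THRESHOLD) && (vn > 3*k);
+}
+#endif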
+
+/*
+   Toom 4 interpolation.  Interpolates the value at 2^(sn*B) of a
+   polynomial p(x) with 7 coefficients, given the values
+   p(oo), p(2), p(1), p(-1), 2^6*p(1/2), 2^6*p(-1/2), p(0).
+   The output is placed in rp and the final number of limbs of the
+   output is given in rpn.
+   The 4th and 6th values may be negative; if so, n4 and n6
+   respectively should be set negative.
+   To save space we pass r3, r5, r7 in place in the output rp.
+   The other r's are stored separately in the scratch space tp.
+   The low limb of r3 is stored in r30, as it will be overwritten
+   by the high limb of r5.
+
+rp rp1 rp2 rp3 rp4 rp5 rp6 rp7
+<----------- r7-----------><------------r5-------------->
+ <-------------r3------------->
+
+   We assume that r1 is stored at tp, r2 at (tp + t4), r4 at (tp + 2*t4)
+   and r6 at (tp + 3*t4).  Each of these r's has t4 = s4 + 1 limbs allocated.
+*/
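+
+/* Editor's note: a minimal sanity check one might run against this routine,
+   assuming the usual MPIR test conventions (mpn_random for random operands,
+   mpn_mul_basecase as the reference product).  Hypothetical harness code,
+   not part of the patch: */
+#if 0
+static void check_toom4_mul(void)
+{
+   mp_size_t un = 400, vn = 350; /* sizes chosen so that vn > 3*ceil(un/4) */
+   mp_ptr up = __GMP_ALLOCATE_FUNC_LIMBS (un);
+   mp_ptr vp = __GMP_ALLOCATE_FUNC_LIMBS (vn);
+   mp_ptr rp = __GMP_ALLOCATE_FUNC_LIMBS (un + vn);
+   mp_ptr refp = __GMP_ALLOCATE_FUNC_LIMBS (un + vn);
+
+   mpn_random (up, un);
+   mpn_random (vp, vn);
+
+   mpn_toom4_mul (rp, up, un, vp, vn);
+   mpn_mul_basecase (refp, up, un, vp, vn);
+   ASSERT_ALWAYS (mpn_cmp (rp, refp, un + vn) == 0);
+
+   __GMP_FREE_FUNC_LIMBS (rp, un + vn);
+   __GMP_FREE_FUNC_LIMBS (refp, un + vn);
+   __GMP_FREE_FUNC_LIMBS (up, un);
+   __GMP_FREE_FUNC_LIMBS (vp, vn);
+}
+#endif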