mpir/mpn/generic/toom4_mul.c

/* mpn_toom4_mul -- Internal routine to multiply two natural numbers
   using unbalanced toom4.

   THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.
*/

/* Implementation of the Bodrato-Zanoni algorithm for Toom-Cook 4-way.

Copyright 2001, 2002, 2004, 2005, 2006 Free Software Foundation, Inc.
Copyright 2009 William Hart

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA. */

/*
   This implementation is based on that of Paul Zimmmermann, which is available
	for mpz_t's at http://www.loria.fr/~zimmerma/software/toom4.c
*/

#include "mpir.h"
#include "gmp-impl.h"
#include "longlong.h"

#define MUL_TC4_UNSIGNED(r3xx, n3xx, r1xx, n1xx, r2xx, n2xx) \
   do \
   { \
      if ((n1xx != 0) && (n2xx != 0)) \
      { mp_size_t len; \
	      if (n1xx == n2xx) \
		   { \
			   if (n1xx > MUL_TOOM4_THRESHOLD) mpn_toom4_mul_n(r3xx, r1xx, r2xx, n1xx); \
            else mpn_mul_n(r3xx, r1xx, r2xx, n1xx); \
		   } else if (n1xx > n2xx) \
		      mpn_mul(r3xx, r1xx, n1xx, r2xx, n2xx); \
		   else \
		      mpn_mul(r3xx, r2xx, n2xx, r1xx, n1xx); \
	      len = n1xx + n2xx; \
		   MPN_NORMALIZE(r3xx, len); \
		   n3xx = len; \
      } else \
         n3xx = 0; \
   } while (0)

#define MUL_TC4(r3xx, n3xx, r1xx, n1xx, r2xx, n2xx) \
	do \
	{ \
	   mp_size_t sign = n1xx ^ n2xx; \
	   mp_size_t un1 = ABS(n1xx); \
	   mp_size_t un2 = ABS(n2xx); \
		MUL_TC4_UNSIGNED(r3xx, n3xx, r1xx, un1, r2xx, un2); \
		if (sign < 0) n3xx = -n3xx; \
	} while (0)

#define TC4_NORM(rxx, nxx, sxx) \
	do \
	{ \
	   nxx = sxx; \
	   MPN_NORMALIZE(rxx, nxx); \
	} while(0)

/* Zero out limbs to end of integer */
#define TC4_DENORM(rxx, nxx, sxx) \
	do { \
	MPN_ZERO(rxx + ABS(nxx), sxx - ABS(nxx)); \
	} while (0)

/* Two's complement divexact by power of 2 */
#define TC4_DIVEXACT_2EXP(rxx, nxx, sxx) \
	do { \
	   mp_limb_t sign = (LIMB_HIGHBIT_TO_MASK(rxx[nxx-1]) << (GMP_LIMB_BITS - sxx)); \
      mpn_rshift(rxx, rxx, nxx, sxx); \
	   rxx[nxx-1] |= sign; \
	} while (0)

#if HAVE_NATIVE_mpn_rshift1
#define TC4_RSHIFT1(rxx, nxx) \
	do { \
	   mp_limb_t sign = (LIMB_HIGHBIT_TO_MASK(rxx[nxx-1]) << (GMP_LIMB_BITS - 1)); \
       mpn_rshift1(rxx, rxx, nxx); \
	   rxx[nxx-1] |= sign; \
	} while (0)
#else
#define TC4_RSHIFT1(rxx, nxx) \
	do { \
	   mp_limb_t sign = (LIMB_HIGHBIT_TO_MASK(rxx[nxx-1]) << (GMP_LIMB_BITS - 1)); \
       mpn_rshift(rxx, rxx, nxx, 1); \
	   rxx[nxx-1] |= sign; \
	} while (0)
#endif

#define r1 (tp)
#define r2 (tp + t4)
#define r4 (tp + 2*t4)
#define r6 (tp + 3*t4)

#define r3 (rp + 4*sn)
#define r5 (rp + 2*sn)
#define r7 (rp)

#define mpn_clearit(rxx, nxx) \
  do { \
     mp_size_t ind = 0; \
     for ( ; ind < nxx; ind++) \
        (rxx)[ind] = CNST_LIMB(0); \
  } while (0)

/* Multiply {up, un} by {vp, vn} and write the result to
   {prodp, un + vn} assuming vn > 3*ceil(un/4).

   Note that prodp gets un + vn limbs stored, even if the actual 
   result only needs un + vn - 1.
*/
void
mpn_toom4_mul (mp_ptr rp, mp_srcptr up, mp_size_t un,
		          mp_srcptr vp, mp_size_t vn)
{
  mp_size_t ind;
  mp_limb_t cy, r30, r31;
  mp_ptr tp;
  mp_size_t a0n, a1n, a2n, a3n, b0n, b1n, b2n, b3n, sn, n1, n2, n3, n4, n5, n6, n7, n8, rpn, t4;

  sn = (un + 3) / 4;

  ASSERT (vn > 3*sn);
  
#define a0 (up)
#define a1 (up + sn)
#define a2 (up + 2*sn)
#define a3 (up + 3*sn)
#define b0 (vp)
#define b1 (vp + sn)
#define b2 (vp + 2*sn)
#define b3 (vp + 3*sn)

   TC4_NORM(a0, a0n, sn);
   TC4_NORM(a1, a1n, sn);
   TC4_NORM(a2, a2n, sn);
   TC4_NORM(a3, a3n, un - 3*sn); 
   TC4_NORM(b0, b0n, sn);
   TC4_NORM(b1, b1n, sn);
   TC4_NORM(b2, b2n, sn);
   TC4_NORM(b3, b3n, vn - 3*sn); 

   t4 = 2*sn+2; // allows mult of 2 integers of sn + 1 limbs

   tp = __GMP_ALLOCATE_FUNC_LIMBS(4*t4 + 5*(sn + 1));

#define u2 (tp + 4*t4)
#define u3 (tp + 4*t4 + (sn+1))
#define u4 (tp + 4*t4 + 2*(sn+1))
#define u5 (tp + 4*t4 + 3*(sn+1))
#define u6 (tp + 4*t4 + 4*(sn+1))

   tc4_add_unsigned(u6, &n6, a3, a3n, a1, a1n); 
   tc4_add_unsigned(u5, &n5, a2, a2n, a0, a0n); 
   tc4_add_unsigned(u3, &n3, u5, n5, u6, n6); 
   tc4_sub(u4, &n4, u5, n5, u6, n6);
   tc4_add_unsigned(u6, &n6, b3, b3n, b1, b1n);
   tc4_add_unsigned(u5, &n5, b2, b2n, b0, b0n);
   tc4_add_unsigned(r2, &n8, u5, n5, u6, n6); 
   tc4_sub(u5, &n5, u5, n5, u6, n6);

   MUL_TC4_UNSIGNED(r3, n3, u3, n3, r2, n8);
   MUL_TC4(r4, n4, u4, n4, u5, n5);
   
   tc4_lshift(r1, &n1, a0, a0n, 3);
   tc4_addlsh1_unsigned(r1, &n1, a2, a2n);
   tc4_lshift(r2, &n8, a1, a1n, 2);
   tc4_add(r2, &n8, r2, n8, a3, a3n);
   tc4_add(u5, &n5, r1, n1, r2, n8);
   tc4_sub(u6, &n6, r1, n1, r2, n8);
   tc4_lshift(r1, &n1, b0, b0n, 3);
   tc4_addlsh1_unsigned(r1, &n1, b2, b2n);
   tc4_lshift(r2, &n8, b1, b1n, 2);
   tc4_add(r2, &n8, r2, n8, b3, b3n);
   tc4_add(u2, &n2, r1, n1, r2, n8);
   tc4_sub(r2, &n8, r1, n1, r2, n8);
   
   r30 = r3[0];
   if (!n3) r30 = CNST_LIMB(0);
   r31 = r3[1];
   MUL_TC4_UNSIGNED(r5, n5, u5, n5, u2, n2);
   MUL_TC4(r6, n6, u6, n6, r2, n8);
   r3[1] = r31;

   tc4_lshift(u2, &n2, a3, a3n, 3);
   tc4_addmul_1(u2, &n2, a2, a2n, 4);
   tc4_addlsh1_unsigned(u2, &n2, a1, a1n);
   tc4_add(u2, &n2, u2, n2, a0, a0n);
   tc4_lshift(r1, &n1, b3, b3n, 3);
   tc4_addmul_1(r1, &n1, b2, b2n, 4);
   tc4_addlsh1_unsigned(r1, &n1, b1, b1n);
   tc4_add(r1, &n1, r1, n1, b0, b0n);
   
   MUL_TC4_UNSIGNED(r2, n2, u2, n2, r1, n1);
   MUL_TC4_UNSIGNED(r1, n1, a3, a3n, b3, b3n);
   MUL_TC4_UNSIGNED(r7, n7, a0, a0n, b0, b0n);

   TC4_DENORM(r1, n1,  t4 - 1);
   TC4_DENORM(r2, n2,  t4 - 1);
   if (n3)
     TC4_DENORM(r3, n3,  t4 - 1); 
   else {
     /* MPN_ZERO defeats gcc 4.1.2 here, hence the explicit for loop */
     for (ind = 1 ; ind < t4 - 1; ind++) 
        (r3)[ind] = CNST_LIMB(0); 
   }
   TC4_DENORM(r4, n4,  t4 - 1);
   TC4_DENORM(r5, n5,  t4 - 1);
   TC4_DENORM(r6, n6,  t4 - 1);
   TC4_DENORM(r7, n7,  t4 - 2); // we treat r7 differently (it cannot exceed t4-2 in length)

/*	rp        rp1          rp2           rp3          rp4           rp5         rp6           rp7
<----------- r7-----------><------------r5-------------->            
                                                       <-------------r3------------->

              <-------------r6------------->                        < -----------r2------------>{           }
                                         <-------------r4-------------->         <--------------r1---->
*/

   mpn_toom4_interpolate(rp, &rpn, sn, tp, t4 - 1, n4, n6, r30);

   if (rpn != un + vn) 
   {
	  MPN_ZERO((rp + rpn), un + vn - rpn);
   }

   __GMP_FREE_FUNC_LIMBS (tp, 4*t4 + 5*(sn+1));
}

/* Multiply {up, un} by {vp, vn} and write the result to
   {prodp, un + vn} assuming vn > 2*ceil(un/5).

   Note that prodp gets un + vn limbs stored, even if the actual 
   result only needs un + vn - 1.
*/
void
mpn_toom53_mul (mp_ptr rp, mp_srcptr up, mp_size_t un,
		          mp_srcptr vp, mp_size_t vn)
{
  mp_size_t ind;
  mp_limb_t cy, r30, r31;
  mp_ptr tp;
  mp_size_t a0n, a1n, a2n, a3n, a4n, b0n, b1n, b2n, sn, n1, n2, n3, n4, n5, n6, n7, n8, n9, n10, rpn, t4;

  sn = (un + 4) / 5;

  ASSERT (vn > 2*sn);
  
#define a0 (up)
#define a1 (up + sn)
#define a2 (up + 2*sn)
#define a3 (up + 3*sn)
#define a4 (up + 4*sn)
#define b0 (vp)
#define b1 (vp + sn)
#define b2 (vp + 2*sn)

   TC4_NORM(a0, a0n, sn);
   TC4_NORM(a1, a1n, sn);
   TC4_NORM(a2, a2n, sn);
   TC4_NORM(a3, a3n, sn);
   TC4_NORM(a4, a4n, un - 4*sn); 
   TC4_NORM(b0, b0n, sn);
   TC4_NORM(b1, b1n, sn);
   TC4_NORM(b2, b2n, vn - 2*sn); 

   t4 = 2*sn+2; // allows mult of 2 integers of sn + 1 limbs

   tp = __GMP_ALLOCATE_FUNC_LIMBS(4*t4 + 4*(sn + 1));

#define u2 (tp + 4*t4)
#define u3 (tp + 4*t4 + (sn+1))
#define u4 (tp + 4*t4 + 2*(sn+1))
#define u5 (tp + 4*t4 + 3*(sn+1))

   tc4_add_unsigned(u2, &n2, a3, a3n, a1, a1n); 
   tc4_add_unsigned(u5, &n5, a2, a2n, a0, a0n); 
   tc4_add_unsigned(u5, &n5, u5, n5, a4, a4n); 
   tc4_add_unsigned(u3, &n3, u5, n5, u2, n2); 
   tc4_sub(u4, &n4, u5, n5, u2, n2);

   tc4_add_unsigned(u5, &n5, b2, b2n, b0, b0n);
   tc4_add_unsigned(r2, &n8, u5, n5, b1, b1n); 
   tc4_sub(u5, &n5, u5, n5, b1, b1n);

   MUL_TC4_UNSIGNED(r3, n3, u3, n3, r2, n8); /* 1 */
   MUL_TC4(r4, n4, u4, n4, u5, n5); /* -1 */
   
   tc4_lshift(r1, &n1, a0, a0n, 4);
   tc4_lshift(u3, &n9, a2, a2n, 2);
   tc4_add_unsigned(r1, &n1, r1, n1, u3, n9);
   tc4_add_unsigned(r1, &n1, r1, n1, a4, a4n);
   tc4_lshift(r2, &n8, a1, a1n, 3);
   tc4_addlsh1_unsigned(r2, &n8, a3, a3n);
   tc4_add_unsigned(u5, &n5, r1, n1, r2, n8);
   tc4_sub(u3, &n9, r1, n1, r2, n8);

   tc4_lshift(r1, &n1, b0, b0n, 2);
   tc4_add_unsigned(r1, &n1, r1, n1, b2, b2n);
   tc4_lshift(u4, &n10, b1, b1n, 1);
   tc4_add_unsigned(u2, &n2, r1, n1, u4, n10);
   tc4_sub(r2, &n8, r1, n1, u4, n10);
   
   r30 = r3[0];
   if (!n3) r30 = CNST_LIMB(0);
   r31 = r3[1];
   MUL_TC4_UNSIGNED(r5, n5, u5, n5, u2, n2); /* 1/2 */
   MUL_TC4(r6, n6, u3, n9, r2, n8); /* -1/2 */
   r3[1] = r31;

   tc4_lshift(u2, &n2, a4, a4n, 4);
   tc4_addmul_1(u2, &n2, a3, a3n, 8);
   tc4_addmul_1(u2, &n2, a2, a2n, 4);
   tc4_addlsh1_unsigned(u2, &n2, a1, a1n);
   tc4_add(u2, &n2, u2, n2, a0, a0n);

   tc4_lshift(r1, &n1, b2, b2n, 2);
   tc4_addlsh1_unsigned(r1, &n1, b1, b1n);
   tc4_add(r1, &n1, r1, n1, b0, b0n);
   
   MUL_TC4_UNSIGNED(r2, n2, u2, n2, r1, n1); /* 2 */

   MUL_TC4_UNSIGNED(r1, n1, a4, a4n, b2, b2n); /* oo */
   MUL_TC4_UNSIGNED(r7, n7, a0, a0n, b0, b0n); /* 0 */

   TC4_DENORM(r1, n1,  t4 - 1);
   TC4_DENORM(r2, n2,  t4 - 1);
   if (n3)
     TC4_DENORM(r3, n3,  t4 - 1); 
   else {
     /* MPN_ZERO defeats gcc 4.1.2 here, hence the explicit for loop */
     for (ind = 1 ; ind < t4 - 1; ind++) 
        (r3)[ind] = CNST_LIMB(0); 
   }
   TC4_DENORM(r4, n4,  t4 - 1);
   TC4_DENORM(r5, n5,  t4 - 1);
   TC4_DENORM(r6, n6,  t4 - 1);
   TC4_DENORM(r7, n7,  t4 - 2); // we treat r7 differently (it cannot exceed t4-2 in length)

/*	rp        rp1          rp2           rp3          rp4           rp5         rp6           rp7
<----------- r7-----------><------------r5-------------->            
                                                       <-------------r3------------->

              <-------------r6------------->                        < -----------r2------------>{           }
                                         <-------------r4-------------->         <--------------r1---->
*/

   mpn_toom4_interpolate(rp, &rpn, sn, tp, t4 - 1, n4, n6, r30);

   if (rpn != un + vn) 
   {
	  MPN_ZERO((rp + rpn), un + vn - rpn);
   }

   __GMP_FREE_FUNC_LIMBS (tp, 4*t4 + 4*(sn+1));
}

/*
   Toom 4 interpolation. Interpolates the value at 2^(sn*B) of a 
	polynomial p(x) with 7 coefficients given the values 
	p(oo), p(2), p(1), p(-1), 2^6*p(1/2), 2^6*p(-1/2), p(0).
	The output is placed in rp and the final number of limbs of the
	output is given in rpn.
	The 4th and 6th values may be negative, and if so, n4 and n6 
	should be set to a negative value respectively.
   To save space we pass r3, r5, r7 in place in the output rp.
	The other r's are stored separately in space tp.
	The low limb of r3 is stored in r30, as it will be overwritten
	by the high limb of r5.

rp          rp1          rp2           rp3          rp4           rp5         rp6           rp7
<----------- r7-----------><------------r5-------------->            
                                                      <-------------r3------------->

   We assume that r1 is stored at tp, r2 at (tp + t4), r4 at (tp + 2*t4) 
	and r6 (tp + 3*t4). Each of these r's has t4 = s4 + 1 limbs allocated.
*/
First hack at toom4_mul (unbalanced toom4). 2009-07-22 21:31:29 -04:00			`/* mpn_toom4_mul -- Internal routine to multiply two natural numbers`
			`using unbalanced toom4.`

			`THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY`
			`SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.`
			`*/`

			`/* Implementation of the Bodrato-Zanoni algorithm for Toom-Cook 4-way.`

			`Copyright 2001, 2002, 2004, 2005, 2006 Free Software Foundation, Inc.`
			`Copyright 2009 William Hart`

			`This file is part of the GNU MP Library.`

			`The GNU MP Library is free software; you can redistribute it and/or modify`
			`it under the terms of the GNU Lesser General Public License as published by`
			`the Free Software Foundation; either version 2.1 of the License, or (at your`
			`option) any later version.`

			`The GNU MP Library is distributed in the hope that it will be useful, but`
			`WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY`
			`or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public`
			`License for more details.`

			`You should have received a copy of the GNU Lesser General Public License`
			`along with the GNU MP Library; see the file COPYING.LIB. If not, write to`
			`the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,`
			`MA 02110-1301, USA. */`

			`/*`
			`This implementation is based on that of Paul Zimmmermann, which is available`
			`for mpz_t's at http://www.loria.fr/~zimmerma/software/toom4.c`
			`*/`

			`#include "mpir.h"`
			`#include "gmp-impl.h"`
			`#include "longlong.h"`

			`#define MUL_TC4_UNSIGNED(r3xx, n3xx, r1xx, n1xx, r2xx, n2xx) \`
			`do \`
			`{ \`
			`if ((n1xx != 0) && (n2xx != 0)) \`
			`{ mp_size_t len; \`
			`if (n1xx == n2xx) \`
			`{ \`
			`if (n1xx > MUL_TOOM4_THRESHOLD) mpn_toom4_mul_n(r3xx, r1xx, r2xx, n1xx); \`
			`else mpn_mul_n(r3xx, r1xx, r2xx, n1xx); \`
			`} else if (n1xx > n2xx) \`
			`mpn_mul(r3xx, r1xx, n1xx, r2xx, n2xx); \`
			`else \`
			`mpn_mul(r3xx, r2xx, n2xx, r1xx, n1xx); \`
			`len = n1xx + n2xx; \`
			`MPN_NORMALIZE(r3xx, len); \`
			`n3xx = len; \`
			`} else \`
			`n3xx = 0; \`
			`} while (0)`

			`#define MUL_TC4(r3xx, n3xx, r1xx, n1xx, r2xx, n2xx) \`
			`do \`
			`{ \`
			`mp_size_t sign = n1xx ^ n2xx; \`
			`mp_size_t un1 = ABS(n1xx); \`
			`mp_size_t un2 = ABS(n2xx); \`
			`MUL_TC4_UNSIGNED(r3xx, n3xx, r1xx, un1, r2xx, un2); \`
			`if (sign < 0) n3xx = -n3xx; \`
			`} while (0)`

			`#define TC4_NORM(rxx, nxx, sxx) \`
			`do \`
			`{ \`
			`nxx = sxx; \`
			`MPN_NORMALIZE(rxx, nxx); \`
			`} while(0)`

			`/* Zero out limbs to end of integer */`
			`#define TC4_DENORM(rxx, nxx, sxx) \`
			`do { \`
			`MPN_ZERO(rxx + ABS(nxx), sxx - ABS(nxx)); \`
			`} while (0)`

			`/* Two's complement divexact by power of 2 */`
			`#define TC4_DIVEXACT_2EXP(rxx, nxx, sxx) \`
			`do { \`
			`mp_limb_t sign = (LIMB_HIGHBIT_TO_MASK(rxx[nxx-1]) << (GMP_LIMB_BITS - sxx)); \`
			`mpn_rshift(rxx, rxx, nxx, sxx); \`
			`rxx[nxx-1] \|= sign; \`
			`} while (0)`

			`#if HAVE_NATIVE_mpn_rshift1`
			`#define TC4_RSHIFT1(rxx, nxx) \`
			`do { \`
			`mp_limb_t sign = (LIMB_HIGHBIT_TO_MASK(rxx[nxx-1]) << (GMP_LIMB_BITS - 1)); \`
			`mpn_rshift1(rxx, rxx, nxx); \`
			`rxx[nxx-1] \|= sign; \`
			`} while (0)`
			`#else`
			`#define TC4_RSHIFT1(rxx, nxx) \`
			`do { \`
			`mp_limb_t sign = (LIMB_HIGHBIT_TO_MASK(rxx[nxx-1]) << (GMP_LIMB_BITS - 1)); \`
			`mpn_rshift(rxx, rxx, nxx, 1); \`
			`rxx[nxx-1] \|= sign; \`
			`} while (0)`
			`#endif`

			`#define r1 (tp)`
			`#define r2 (tp + t4)`
			`#define r4 (tp + 2*t4)`
			`#define r6 (tp + 3*t4)`

			`#define r3 (rp + 4*sn)`
			`#define r5 (rp + 2*sn)`
			`#define r7 (rp)`

			`#define mpn_clearit(rxx, nxx) \`
			`do { \`
			`mp_size_t ind = 0; \`
			`for ( ; ind < nxx; ind++) \`
			`(rxx)[ind] = CNST_LIMB(0); \`
			`} while (0)`

			`/* Multiply {up, un} by {vp, vn} and write the result to`
Added toom53 and fiddled with the toom4 cutoff on penryn. 2009-07-23 03:48:34 -04:00			`{prodp, un + vn} assuming vn > 3*ceil(un/4).`
First hack at toom4_mul (unbalanced toom4). 2009-07-22 21:31:29 -04:00
			`Note that prodp gets un + vn limbs stored, even if the actual`
			`result only needs un + vn - 1.`
			`*/`
			`void`
Fixed all the bugs I just implemented. 2009-07-22 22:10:46 -04:00			`mpn_toom4_mul (mp_ptr rp, mp_srcptr up, mp_size_t un,`
First hack at toom4_mul (unbalanced toom4). 2009-07-22 21:31:29 -04:00			`mp_srcptr vp, mp_size_t vn)`
			`{`
			`mp_size_t ind;`
			`mp_limb_t cy, r30, r31;`
			`mp_ptr tp;`
			`mp_size_t a0n, a1n, a2n, a3n, b0n, b1n, b2n, b3n, sn, n1, n2, n3, n4, n5, n6, n7, n8, rpn, t4;`

			`sn = (un + 3) / 4;`

			`ASSERT (vn > 3*sn);`

			`#define a0 (up)`
			`#define a1 (up + sn)`
			`#define a2 (up + 2*sn)`
			`#define a3 (up + 3*sn)`
			`#define b0 (vp)`
			`#define b1 (vp + sn)`
			`#define b2 (vp + 2*sn)`
			`#define b3 (vp + 3*sn)`

			`TC4_NORM(a0, a0n, sn);`
			`TC4_NORM(a1, a1n, sn);`
			`TC4_NORM(a2, a2n, sn);`
			`TC4_NORM(a3, a3n, un - 3*sn);`
			`TC4_NORM(b0, b0n, sn);`
			`TC4_NORM(b1, b1n, sn);`
			`TC4_NORM(b2, b2n, sn);`
			`TC4_NORM(b3, b3n, vn - 3*sn);`

			`t4 = 2*sn+2; // allows mult of 2 integers of sn + 1 limbs`

			`tp = __GMP_ALLOCATE_FUNC_LIMBS(4t4 + 5(sn + 1));`

			`#define u2 (tp + 4*t4)`
			`#define u3 (tp + 4*t4 + (sn+1))`
			`#define u4 (tp + 4t4 + 2(sn+1))`
			`#define u5 (tp + 4t4 + 3(sn+1))`
			`#define u6 (tp + 4t4 + 4(sn+1))`

			`tc4_add_unsigned(u6, &n6, a3, a3n, a1, a1n);`
			`tc4_add_unsigned(u5, &n5, a2, a2n, a0, a0n);`
			`tc4_add_unsigned(u3, &n3, u5, n5, u6, n6);`
			`tc4_sub(u4, &n4, u5, n5, u6, n6);`
			`tc4_add_unsigned(u6, &n6, b3, b3n, b1, b1n);`
			`tc4_add_unsigned(u5, &n5, b2, b2n, b0, b0n);`
			`tc4_add_unsigned(r2, &n8, u5, n5, u6, n6);`
			`tc4_sub(u5, &n5, u5, n5, u6, n6);`

			`MUL_TC4_UNSIGNED(r3, n3, u3, n3, r2, n8);`
			`MUL_TC4(r4, n4, u4, n4, u5, n5);`

			`tc4_lshift(r1, &n1, a0, a0n, 3);`
			`tc4_addlsh1_unsigned(r1, &n1, a2, a2n);`
			`tc4_lshift(r2, &n8, a1, a1n, 2);`
			`tc4_add(r2, &n8, r2, n8, a3, a3n);`
			`tc4_add(u5, &n5, r1, n1, r2, n8);`
			`tc4_sub(u6, &n6, r1, n1, r2, n8);`
			`tc4_lshift(r1, &n1, b0, b0n, 3);`
			`tc4_addlsh1_unsigned(r1, &n1, b2, b2n);`
			`tc4_lshift(r2, &n8, b1, b1n, 2);`
			`tc4_add(r2, &n8, r2, n8, b3, b3n);`
			`tc4_add(u2, &n2, r1, n1, r2, n8);`
			`tc4_sub(r2, &n8, r1, n1, r2, n8);`

			`r30 = r3[0];`
			`if (!n3) r30 = CNST_LIMB(0);`
			`r31 = r3[1];`
			`MUL_TC4_UNSIGNED(r5, n5, u5, n5, u2, n2);`
			`MUL_TC4(r6, n6, u6, n6, r2, n8);`
			`r3[1] = r31;`

			`tc4_lshift(u2, &n2, a3, a3n, 3);`
			`tc4_addmul_1(u2, &n2, a2, a2n, 4);`
			`tc4_addlsh1_unsigned(u2, &n2, a1, a1n);`
			`tc4_add(u2, &n2, u2, n2, a0, a0n);`
			`tc4_lshift(r1, &n1, b3, b3n, 3);`
			`tc4_addmul_1(r1, &n1, b2, b2n, 4);`
			`tc4_addlsh1_unsigned(r1, &n1, b1, b1n);`
			`tc4_add(r1, &n1, r1, n1, b0, b0n);`

			`MUL_TC4_UNSIGNED(r2, n2, u2, n2, r1, n1);`
			`MUL_TC4_UNSIGNED(r1, n1, a3, a3n, b3, b3n);`
			`MUL_TC4_UNSIGNED(r7, n7, a0, a0n, b0, b0n);`

			`TC4_DENORM(r1, n1, t4 - 1);`
			`TC4_DENORM(r2, n2, t4 - 1);`
			`if (n3)`
			`TC4_DENORM(r3, n3, t4 - 1);`
			`else {`
			`/* MPN_ZERO defeats gcc 4.1.2 here, hence the explicit for loop */`
			`for (ind = 1 ; ind < t4 - 1; ind++)`
			`(r3)[ind] = CNST_LIMB(0);`
			`}`
			`TC4_DENORM(r4, n4, t4 - 1);`
			`TC4_DENORM(r5, n5, t4 - 1);`
			`TC4_DENORM(r6, n6, t4 - 1);`
			`TC4_DENORM(r7, n7, t4 - 2); // we treat r7 differently (it cannot exceed t4-2 in length)`

			`/* rp rp1 rp2 rp3 rp4 rp5 rp6 rp7`
			`<----------- r7-----------><------------r5-------------->`
			`<-------------r3------------->`

			`<-------------r6-------------> < -----------r2------------>{ }`
			`<-------------r4--------------> <--------------r1---->`
			`*/`

Fixed all the bugs I just implemented. 2009-07-22 22:10:46 -04:00			`mpn_toom4_interpolate(rp, &rpn, sn, tp, t4 - 1, n4, n6, r30);`
First hack at toom4_mul (unbalanced toom4). 2009-07-22 21:31:29 -04:00
			`if (rpn != un + vn)`
			`{`
			`MPN_ZERO((rp + rpn), un + vn - rpn);`
			`}`

			`__GMP_FREE_FUNC_LIMBS (tp, 4t4 + 5(sn+1));`
			`}`

Added toom53 and fiddled with the toom4 cutoff on penryn. 2009-07-23 03:48:34 -04:00			`/* Multiply {up, un} by {vp, vn} and write the result to`
			`{prodp, un + vn} assuming vn > 2*ceil(un/5).`

			`Note that prodp gets un + vn limbs stored, even if the actual`
			`result only needs un + vn - 1.`
			`*/`
			`void`
			`mpn_toom53_mul (mp_ptr rp, mp_srcptr up, mp_size_t un,`
			`mp_srcptr vp, mp_size_t vn)`
			`{`
			`mp_size_t ind;`
			`mp_limb_t cy, r30, r31;`
			`mp_ptr tp;`
			`mp_size_t a0n, a1n, a2n, a3n, a4n, b0n, b1n, b2n, sn, n1, n2, n3, n4, n5, n6, n7, n8, n9, n10, rpn, t4;`

			`sn = (un + 4) / 5;`

			`ASSERT (vn > 2*sn);`

			`#define a0 (up)`
			`#define a1 (up + sn)`
			`#define a2 (up + 2*sn)`
			`#define a3 (up + 3*sn)`
			`#define a4 (up + 4*sn)`
			`#define b0 (vp)`
			`#define b1 (vp + sn)`
			`#define b2 (vp + 2*sn)`

			`TC4_NORM(a0, a0n, sn);`
			`TC4_NORM(a1, a1n, sn);`
			`TC4_NORM(a2, a2n, sn);`
			`TC4_NORM(a3, a3n, sn);`
			`TC4_NORM(a4, a4n, un - 4*sn);`
			`TC4_NORM(b0, b0n, sn);`
			`TC4_NORM(b1, b1n, sn);`
			`TC4_NORM(b2, b2n, vn - 2*sn);`

			`t4 = 2*sn+2; // allows mult of 2 integers of sn + 1 limbs`

			`tp = __GMP_ALLOCATE_FUNC_LIMBS(4t4 + 4(sn + 1));`

			`#define u2 (tp + 4*t4)`
			`#define u3 (tp + 4*t4 + (sn+1))`
			`#define u4 (tp + 4t4 + 2(sn+1))`
			`#define u5 (tp + 4t4 + 3(sn+1))`

			`tc4_add_unsigned(u2, &n2, a3, a3n, a1, a1n);`
			`tc4_add_unsigned(u5, &n5, a2, a2n, a0, a0n);`
			`tc4_add_unsigned(u5, &n5, u5, n5, a4, a4n);`
			`tc4_add_unsigned(u3, &n3, u5, n5, u2, n2);`
			`tc4_sub(u4, &n4, u5, n5, u2, n2);`

			`tc4_add_unsigned(u5, &n5, b2, b2n, b0, b0n);`
			`tc4_add_unsigned(r2, &n8, u5, n5, b1, b1n);`
			`tc4_sub(u5, &n5, u5, n5, b1, b1n);`

			`MUL_TC4_UNSIGNED(r3, n3, u3, n3, r2, n8); /* 1 */`
			`MUL_TC4(r4, n4, u4, n4, u5, n5); /* -1 */`

			`tc4_lshift(r1, &n1, a0, a0n, 4);`
			`tc4_lshift(u3, &n9, a2, a2n, 2);`
			`tc4_add_unsigned(r1, &n1, r1, n1, u3, n9);`
			`tc4_add_unsigned(r1, &n1, r1, n1, a4, a4n);`
			`tc4_lshift(r2, &n8, a1, a1n, 3);`
			`tc4_addlsh1_unsigned(r2, &n8, a3, a3n);`
			`tc4_add_unsigned(u5, &n5, r1, n1, r2, n8);`
			`tc4_sub(u3, &n9, r1, n1, r2, n8);`

			`tc4_lshift(r1, &n1, b0, b0n, 2);`
			`tc4_add_unsigned(r1, &n1, r1, n1, b2, b2n);`
			`tc4_lshift(u4, &n10, b1, b1n, 1);`
			`tc4_add_unsigned(u2, &n2, r1, n1, u4, n10);`
			`tc4_sub(r2, &n8, r1, n1, u4, n10);`

			`r30 = r3[0];`
			`if (!n3) r30 = CNST_LIMB(0);`
			`r31 = r3[1];`
			`MUL_TC4_UNSIGNED(r5, n5, u5, n5, u2, n2); /* 1/2 */`
			`MUL_TC4(r6, n6, u3, n9, r2, n8); /* -1/2 */`
			`r3[1] = r31;`

			`tc4_lshift(u2, &n2, a4, a4n, 4);`
			`tc4_addmul_1(u2, &n2, a3, a3n, 8);`
			`tc4_addmul_1(u2, &n2, a2, a2n, 4);`
			`tc4_addlsh1_unsigned(u2, &n2, a1, a1n);`
			`tc4_add(u2, &n2, u2, n2, a0, a0n);`

			`tc4_lshift(r1, &n1, b2, b2n, 2);`
			`tc4_addlsh1_unsigned(r1, &n1, b1, b1n);`
			`tc4_add(r1, &n1, r1, n1, b0, b0n);`

			`MUL_TC4_UNSIGNED(r2, n2, u2, n2, r1, n1); /* 2 */`

			`MUL_TC4_UNSIGNED(r1, n1, a4, a4n, b2, b2n); /* oo */`
			`MUL_TC4_UNSIGNED(r7, n7, a0, a0n, b0, b0n); /* 0 */`

			`TC4_DENORM(r1, n1, t4 - 1);`
			`TC4_DENORM(r2, n2, t4 - 1);`
			`if (n3)`
			`TC4_DENORM(r3, n3, t4 - 1);`
			`else {`
			`/* MPN_ZERO defeats gcc 4.1.2 here, hence the explicit for loop */`
			`for (ind = 1 ; ind < t4 - 1; ind++)`
			`(r3)[ind] = CNST_LIMB(0);`
			`}`
			`TC4_DENORM(r4, n4, t4 - 1);`
			`TC4_DENORM(r5, n5, t4 - 1);`
			`TC4_DENORM(r6, n6, t4 - 1);`
			`TC4_DENORM(r7, n7, t4 - 2); // we treat r7 differently (it cannot exceed t4-2 in length)`

			`/* rp rp1 rp2 rp3 rp4 rp5 rp6 rp7`
			`<----------- r7-----------><------------r5-------------->`
			`<-------------r3------------->`

			`<-------------r6-------------> < -----------r2------------>{ }`
			`<-------------r4--------------> <--------------r1---->`
			`*/`

			`mpn_toom4_interpolate(rp, &rpn, sn, tp, t4 - 1, n4, n6, r30);`

			`if (rpn != un + vn)`
			`{`
			`MPN_ZERO((rp + rpn), un + vn - rpn);`
			`}`

			`__GMP_FREE_FUNC_LIMBS (tp, 4t4 + 4(sn+1));`
			`}`

First hack at toom4_mul (unbalanced toom4). 2009-07-22 21:31:29 -04:00			`/*`
			`Toom 4 interpolation. Interpolates the value at 2^(sn*B) of a`
			`polynomial p(x) with 7 coefficients given the values`
			`p(oo), p(2), p(1), p(-1), 2^6p(1/2), 2^6p(-1/2), p(0).`
			`The output is placed in rp and the final number of limbs of the`
			`output is given in rpn.`
			`The 4th and 6th values may be negative, and if so, n4 and n6`
			`should be set to a negative value respectively.`
			`To save space we pass r3, r5, r7 in place in the output rp.`
			`The other r's are stored separately in space tp.`
			`The low limb of r3 is stored in r30, as it will be overwritten`
			`by the high limb of r5.`

			`rp rp1 rp2 rp3 rp4 rp5 rp6 rp7`
			`<----------- r7-----------><------------r5-------------->`
			`<-------------r3------------->`

			`We assume that r1 is stored at tp, r2 at (tp + t4), r4 at (tp + 2*t4)`
			`and r6 (tp + 3*t4). Each of these r's has t4 = s4 + 1 limbs allocated.`
			`*/`