From 5d3cf5509db9ad405ad1578cb6f242e065db20a4 Mon Sep 17 00:00:00 2001
From: wbhart
Date: Thu, 23 Jul 2009 01:31:29 +0000
Subject: [PATCH] First hack at toom4_mul (unbalanced toom4).

---
 configure.in            |   4 +-
 mpn/Makefile.am         |   2 +-
 mpn/generic/mul.c       |   6 +-
 mpn/generic/toom4_mul.c | 899 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 907 insertions(+), 4 deletions(-)
 create mode 100644 mpn/generic/toom4_mul.c

diff --git a/configure.in b/configure.in
index a54e068a..b0a70515 100644
--- a/configure.in
+++ b/configure.in
@@ -2503,8 +2503,8 @@ gmp_mpn_functions="$extra_functions \
   mul mul_fft mul_n mul_basecase sqr_basecase random random2 pow_1 \
   rootrem sqrtrem get_str set_str scan0 scan1 popcount hamdist cmp perfsqr \
   bdivmod gcd gcd_1 gcdext tdiv_qr dc_divrem_n sb_divrem_mn jacbase get_d \
-  mullow_n mullow_basecase redc_basecase \
-  $gmp_mpn_functions_optional toom3_mul toom3_mul_n toom4_mul_n toom7_mul_n"
+  mullow_n mullow_basecase redc_basecase toom3_mul toom3_mul_n \
+  toom4_mul toom4_mul_n toom7_mul_n $gmp_mpn_functions_optional"
 
 define(GMP_MULFUNC_CHOICES,
 [# functions that can be provided by multi-function files
diff --git a/mpn/Makefile.am b/mpn/Makefile.am
index b367beac..81fee0c3 100644
--- a/mpn/Makefile.am
+++ b/mpn/Makefile.am
@@ -47,7 +47,7 @@ nodist_EXTRA_libmpn_la_SOURCES = \
   hamdist.c invert_limb.c lshift1.c rshift1.c lshift2.c rshift2.c \
   ior_n.c iorn_n.c jacbase.c lshift.c mod_1.c mod_34lsub1.c mode1o.c \
   mul.c mul_1.c mul_2.c mul_fft.c mul_n.c mul_basecase.c toom3_mul.c \
-  toom3_mul_n.c toom4_mul_n.c toom7_mul_n.c \
+  toom3_mul_n.c toom4_mul_n.c toom4_mul.c toom7_mul_n.c \
   mullow_n.c mullow_basecase.c nand_n.c nior_n.c perfsqr.c popcount.c \
   pre_divrem_1.c pre_mod_1.c pow_1.c random.c random2.c rshift.c \
   rootrem.c redc_basecase.c sb_divrem_mn.c scan0.c scan1.c set_str.c \
diff --git a/mpn/generic/mul.c b/mpn/generic/mul.c
index f2422e09..2cf856d2 100644
--- a/mpn/generic/mul.c
+++ b/mpn/generic/mul.c
@@ -146,7 +146,11 @@ mpn_mul (mp_ptr prodp,
 
   k = (un + 3)/4; // ceil(un/4)
 
-  if (ABOVE_THRESHOLD (un + vn, 2*MUL_TOOM3_THRESHOLD) && (vn > k))
+  if (ABOVE_THRESHOLD (un + vn, 2*MUL_TOOM4_THRESHOLD) && (vn > 3*k))
+  {
+     mpn_toom4_mul(prodp, up, un, vp, vn);
+     return prodp[un + vn - 1];
+  } else if (ABOVE_THRESHOLD (un + vn, 2*MUL_TOOM3_THRESHOLD) && (vn > k))
   {
      mp_ptr ws;
      TMP_DECL;
diff --git a/mpn/generic/toom4_mul.c b/mpn/generic/toom4_mul.c
new file mode 100644
index 00000000..0cc52d39
--- /dev/null
+++ b/mpn/generic/toom4_mul.c
@@ -0,0 +1,899 @@
+/* mpn_toom4_mul -- Internal routine to multiply two natural numbers
+   using unbalanced toom4.
+
+   THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.
+*/
+
+/* Implementation of the Bodrato-Zanoni algorithm for Toom-Cook 4-way.
+
+Copyright 2001, 2002, 2004, 2005, 2006 Free Software Foundation, Inc.
+Copyright 2009 William Hart
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+MA 02110-1301, USA. */
+
+/*
+   This implementation is based on that of Paul Zimmermann, which is available
+   for mpz_t's at http://www.loria.fr/~zimmerma/software/toom4.c
+*/
+
+#include "mpir.h"
+#include "gmp-impl.h"
+#include "longlong.h"
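+
+/* Editor's note: the tc4_* helpers below use a sign-magnitude convention:
+   a value is the limb vector {rp, ABS(rn)} and its sign is carried by the
+   sign of the mp_size_t length rn, with rn == 0 meaning zero.  A minimal
+   compiled-out sketch of the convention (illustrative only, not part of
+   the algorithm): */
+#if 0
+static void tc4_convention_demo(void)
+{
+   mp_limb_t a[1] = { 5 }, b[1] = { 7 }, r[2];
+   mp_size_t rn;
+
+   tc4_add(r, &rn, a, 1, b, 1);  /* 5 + 7 = 12:    rn == 1,  r[0] == 12 */
+   tc4_add(r, &rn, a, 1, b, -1); /* 5 + (-7) = -2: rn == -1, r[0] == 2  */
+}
+#endif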
+
+/* rp = r1 + r2 where inputs and output use the signed size convention
+   described above.  Assumes ABS(r1n) >= ABS(r2n); callers should use
+   tc4_add, which swaps the operands as needed. */
+void _tc4_add(mp_ptr rp, mp_size_t * rn, mp_srcptr r1, mp_size_t r1n, mp_srcptr r2, mp_size_t r2n)
+{
+   mp_limb_t cy;
+   mp_size_t s1 = ABS(r1n);
+   mp_size_t s2 = ABS(r2n);
+
+   if (!s1)
+   {
+      *rn = 0;
+   } else if (!s2)
+   {
+      if (rp != r1) MPN_COPY(rp, r1, s1);
+      *rn = r1n;
+   } else if ((r1n ^ r2n) >= 0) // both inputs are the same sign
+   {
+      *rn = r1n;
+      cy = mpn_add(rp, r1, s1, r2, s2);
+      if (cy)
+      {
+         rp[s1] = cy;
+         if ((*rn) < 0) (*rn)--;
+         else (*rn)++;
+      }
+   } else // inputs are different signs
+   {
+      mp_size_t ct;
+      if (s1 != s2) ct = 1;
+      else MPN_CMP(ct, r1, r2, s1);
+
+      if (!ct) *rn = 0;
+      else if (ct > 0)
+      {
+         mpn_sub(rp, r1, s1, r2, s2);
+         *rn = s1;
+         MPN_NORMALIZE(rp, (*rn));
+         if (r1n < 0) *rn = -(*rn);
+      }
+      else
+      {
+         mpn_sub_n(rp, r2, r1, s1);
+         *rn = s1;
+         MPN_NORMALIZE(rp, (*rn));
+         if (r1n > 0) *rn = -(*rn);
+      }
+   }
+}
+
+void tc4_add(mp_ptr rp, mp_size_t * rn, mp_srcptr r1, mp_size_t r1n, mp_srcptr r2, mp_size_t r2n)
+{
+   mp_size_t s1 = ABS(r1n);
+   mp_size_t s2 = ABS(r2n);
+
+   if (s1 < s2) _tc4_add(rp, rn, r2, r2n, r1, r1n);
+   else _tc4_add(rp, rn, r1, r1n, r2, r2n);
+}
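+
+/* Editor's note: the sumdiff routines below rely on the native
+   mpn_sumdiff_n primitive, which computes {sp, n} = {xp, n} + {yp, n}
+   and {dp, n} = {xp, n} - {yp, n} in a single pass.  As used here, bit 1
+   of its return value is the carry out of the sum and bit 0 is the
+   borrow out of the difference (hence the cy>>1 and cy&1 below).  A
+   compiled-out sketch of that contract (illustrative only): */
+#if 0
+static void sumdiff_demo(void)
+{
+   mp_limb_t x[1] = { 9 }, y[1] = { 4 }, s[1], d[1];
+   mp_limb_t cy = mpn_sumdiff_n(s, d, x, y, 1);
+   /* now s[0] == 13, d[0] == 5, cy>>1 == 0 (no carry), cy&1 == 0 (no borrow) */
+}
+#endif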
+
+#if HAVE_NATIVE_mpn_sumdiff_n
+/* Set rp = r1 + r2 and sp = r1 - r2 simultaneously, all in the signed
+   size convention. */
+void tc4_sumdiff(mp_ptr rp, mp_size_t * rn, mp_ptr sp, mp_size_t * sn, mp_ptr r1, mp_size_t r1n, mp_ptr r2, mp_size_t r2n)
+{
+   mp_limb_t cy, cy2;
+   mp_size_t s1 = ABS(r1n);
+   mp_size_t s2 = ABS(r2n);
+   int swapped = 0;
+
+   if (s1 < s2)
+   {
+      MPN_PTR_SWAP(r1, r1n, r2, r2n);
+      MP_SIZE_T_SWAP(s1, s2);
+      swapped = 1;
+   }
+
+   if (!s1)
+   {
+      *rn = 0;
+      *sn = 0;
+   } else if (!s2)
+   {
+      if (rp != r1) MPN_COPY(rp, r1, s1);
+      if (sp != r1) MPN_COPY(sp, r1, s1);
+      *rn = r1n;
+      *sn = (swapped ? -r1n : r1n);
+   } else
+   {
+      mp_size_t ct;
+      if (s1 != s2) ct = 1;
+      else MPN_CMP(ct, r1, r2, s1);
+
+      if (!ct) // operands have equal magnitude
+      {
+         if ((r1n ^ r2n) >= 0) // both inputs are same sign
+         {
+            *sn = 0;
+            *rn = r1n;
+            cy = mpn_lshift(rp, r1, s1, 1);
+            if (cy)
+            {
+               rp[s1] = cy;
+               if ((*rn) < 0) (*rn)--;
+               else (*rn)++;
+            }
+         } else
+         {
+            *rn = 0;
+            *sn = (swapped ? -r1n : r1n);
+            cy = mpn_lshift(sp, r1, s1, 1);
+            if (cy)
+            {
+               sp[s1] = cy;
+               if ((*sn) < 0) (*sn)--;
+               else (*sn)++;
+            }
+         }
+      } else if (ct > 0) // r1 is bigger than r2
+      {
+         if ((r1n ^ r2n) >= 0) // both inputs are same sign
+         {
+            cy = mpn_sumdiff_n(rp, sp, r1, r2, s2);
+            if (s1 > s2) // r1 has more limbs than r2
+            {
+               *rn = r1n;
+               cy2 = mpn_add_1(rp + s2, r1 + s2, s1 - s2, cy>>1);
+               if (cy2)
+               {
+                  rp[s1] = cy2;
+                  if ((*rn) < 0) (*rn)--;
+                  else (*rn)++;
+               }
+               mpn_sub_1(sp + s2, r1 + s2, s1 - s2, cy&1);
+            } else // both inputs have same number of limbs
+            {
+               *rn = r1n;
+               if (cy>>1)
+               {
+                  rp[s1] = 1;
+                  if ((*rn) < 0) (*rn)--;
+                  else (*rn)++;
+               }
+            }
+            *sn = s1;
+            MPN_NORMALIZE(sp, (*sn));
+            if ((r1n < 0) ^ swapped) *sn = -(*sn); // difference has r1's sign, flipped if operands were swapped
+         } else // inputs are different sign
+         {
+            cy = mpn_sumdiff_n(sp, rp, r1, r2, s2);
+            if (s1 > s2) // r1 has more limbs than r2
+            {
+               *sn = r1n;
+               cy2 = mpn_add_1(sp + s2, r1 + s2, s1 - s2, cy>>1);
+               if (cy2)
+               {
+                  sp[s1] = cy2;
+                  if ((*sn) < 0) (*sn)--;
+                  else (*sn)++;
+               }
+               mpn_sub_1(rp + s2, r1 + s2, s1 - s2, cy&1);
+            } else // both inputs have same number of limbs
+            {
+               *sn = r1n;
+               if (cy>>1)
+               {
+                  sp[s1] = 1;
+                  if ((*sn) < 0) (*sn)--;
+                  else (*sn)++;
+               }
+            }
+            if (swapped) *sn = -(*sn); // difference flips sign if operands were swapped
+            *rn = s1;
+            MPN_NORMALIZE(rp, (*rn));
+            if (r1n < 0) *rn = -(*rn); // sum takes the sign of the larger operand
+         }
+      } else // r2 is bigger than r1 (but same number of limbs)
+      {
+         if ((r1n ^ r2n) >= 0) // both inputs are same sign
+         {
+            cy = mpn_sumdiff_n(rp, sp, r2, r1, s1);
+            *rn = r1n;
+            if (cy>>1)
+            {
+               rp[s1] = 1;
+               if ((*rn) < 0) (*rn)--;
+               else (*rn)++;
+            }
+            *sn = s1;
+            MPN_NORMALIZE(sp, (*sn));
+            if ((r1n > 0) ^ swapped) *sn = -(*sn); // difference is opposite in sign to r1 here, flipped if swapped
+         } else // inputs are different sign
+         {
+            cy = mpn_sumdiff_n(sp, rp, r2, r1, s1);
+            *sn = r1n;
+            if (cy>>1)
+            {
+               sp[s1] = 1;
+               if ((*sn) < 0) (*sn)--;
+               else (*sn)++;
+            }
+            if (swapped) *sn = -(*sn); // difference flips sign if operands were swapped
+            *rn = s1;
+            MPN_NORMALIZE(rp, (*rn));
+            if (r1n > 0) *rn = -(*rn); // sum takes the sign of the larger operand r2
+         }
+      }
+   }
+}
+
+void tc4_sumdiff_unsigned(mp_ptr rp, mp_size_t * rn, mp_ptr sp, mp_size_t * sn, mp_ptr r1, mp_size_t r1n, mp_ptr r2, mp_size_t r2n)
+{
+   mp_limb_t cy, cy2;
+   mp_size_t s1 = ABS(r1n);
+   mp_size_t s2 = ABS(r2n);
+   int swapped = 0;
+
+   if (s1 < s2)
+   {
+      MPN_PTR_SWAP(r1, r1n, r2, r2n);
+      MP_SIZE_T_SWAP(s1, s2);
+      swapped = 1;
+   }
+
+   if (!s1)
+   {
+      *rn = 0;
+      *sn = 0;
+   } else if (!s2)
+   {
+      if (rp != r1) MPN_COPY(rp, r1, s1);
+      if (sp != r1) MPN_COPY(sp, r1, s1);
+      *rn = r1n;
+      *sn = (swapped ? -r1n : r1n);
+   } else
+   {
+      mp_size_t ct;
+      if (s1 != s2) ct = 1;
+      else MPN_CMP(ct, r1, r2, s1);
+
+      if (!ct)
+      {
+         *sn = 0;
+         *rn = r1n;
+         cy = mpn_lshift(rp, r1, s1, 1);
+         if (cy)
+         {
+            rp[s1] = cy;
+            (*rn)++;
+         }
+      } else if (ct > 0) // r1 is bigger than r2
+      {
+         cy = mpn_sumdiff_n(rp, sp, r1, r2, s2);
+         if (s1 > s2) // r1 has more limbs than r2
+         {
+            *rn = r1n;
+            cy2 = mpn_add_1(rp + s2, r1 + s2, s1 - s2, cy>>1);
+            if (cy2)
+            {
+               rp[s1] = cy2;
+               (*rn)++;
+            }
+            mpn_sub_1(sp + s2, r1 + s2, s1 - s2, cy&1);
+         } else // both inputs have same number of limbs
+         {
+            *rn = r1n;
+            if (cy>>1)
+            {
+               rp[s1] = 1;
+               (*rn)++;
+            }
+         }
+         *sn = s1;
+         MPN_NORMALIZE(sp, (*sn));
+         if (swapped) *sn = -(*sn);
+      } else // r2 is bigger than r1 (but same number of limbs)
+      {
+         cy = mpn_sumdiff_n(rp, sp, r2, r1, s1);
+         *rn = r1n;
+         if (cy>>1)
+         {
+            rp[s1] = 1;
+            (*rn)++;
+         }
+         *sn = s1;
+         MPN_NORMALIZE(sp, (*sn));
+         if (!swapped) *sn = -(*sn);
+      }
+   }
+}
+#endif
+
+void _tc4_add_unsigned(mp_ptr rp, mp_size_t * rn, mp_srcptr r1, mp_size_t r1n, mp_srcptr r2, mp_size_t r2n)
+{
+   mp_limb_t cy;
+   mp_size_t s1 = r1n;
+   mp_size_t s2 = r2n;
+
+   if (!s2)
+   {
+      if (!s1) *rn = 0;
+      else
+      {
+         if (rp != r1) MPN_COPY(rp, r1, s1);
+         *rn = r1n;
+      }
+   } else
+   {
+      *rn = r1n;
+      cy = mpn_add(rp, r1, s1, r2, s2);
+      if (cy)
+      {
+         rp[s1] = cy;
+         (*rn)++; // sizes are non-negative in the unsigned routine
+      }
+   }
+}
+
+void tc4_add_unsigned(mp_ptr rp, mp_size_t * rn, mp_srcptr r1, mp_size_t r1n, mp_srcptr r2, mp_size_t r2n)
+{
+   if (r1n < r2n) _tc4_add_unsigned(rp, rn, r2, r2n, r1, r1n);
+   else _tc4_add_unsigned(rp, rn, r1, r1n, r2, r2n);
+}
+
+void tc4_sub(mp_ptr rp, mp_size_t * rn, mp_ptr r1, mp_size_t r1n, mp_ptr r2, mp_size_t r2n)
+{
+   tc4_add(rp, rn, r1, r1n, r2, -r2n);
+}
+
+void tc4_lshift(mp_ptr rp, mp_size_t * rn, mp_srcptr xp, mp_size_t xn, mp_size_t bits)
+{
+   if (xn == 0) *rn = 0;
+   else
+   {
+      mp_size_t xu = ABS(xn);
+      mp_limb_t msl = mpn_lshift(rp, xp, xu, bits);
+      if (msl)
+      {
+         rp[xu] = msl;
+         *rn = (xn >= 0 ? xn + 1 : xn - 1);
+      } else
+         *rn = xn;
+   }
+}
+
+void tc4_rshift_inplace(mp_ptr rp, mp_size_t * rn, mp_size_t bits)
+{
+   if (*rn)
+   {
+      if ((*rn) > 0)
+      {
+         mpn_rshift(rp, rp, *rn, bits);
+         if (rp[(*rn) - 1] == CNST_LIMB(0)) (*rn)--;
+      } else
+      {
+         mpn_rshift(rp, rp, -(*rn), bits);
+         if (rp[-(*rn) - 1] == CNST_LIMB(0)) (*rn)++;
+      }
+   }
+}
+
+#if HAVE_NATIVE_mpn_addlsh1_n
+/* rp += 2*{xp, xn}, unsigned */
+void tc4_addlsh1_unsigned(mp_ptr rp, mp_size_t * rn, mp_srcptr xp, mp_size_t xn)
+{
+   if (xn)
+   {
+      if (xn >= *rn)
+      {
+         mp_limb_t cy;
+         if (xn > *rn) MPN_ZERO(rp + *rn, xn - *rn);
+         cy = mpn_addlsh1_n(rp, rp, xp, xn);
+         if (cy)
+         {
+            rp[xn] = cy;
+            *rn = xn + 1;
+         } else *rn = xn;
+      } else
+      {
+         mp_limb_t cy = mpn_addlsh1_n(rp, rp, xp, xn);
+         if (cy) cy = mpn_add_1(rp + xn, rp + xn, *rn - xn, cy);
+         if (cy)
+         {
+            rp[*rn] = cy;
+            (*rn)++;
+         }
+      }
+   }
+}
+#endif
+
+void tc4_divexact_ui(mp_ptr rp, mp_size_t * rn, mp_ptr x, mp_size_t xn, mp_limb_t c)
+{
+   mp_size_t abs_size;
+   if (xn == 0)
+   {
+      *rn = 0;
+      return;
+   }
+   abs_size = ABS (xn);
+
+   MPN_DIVREM_OR_DIVEXACT_1 (rp, x, abs_size, c);
+   abs_size -= (rp[abs_size-1] == 0);
+   *rn = (xn >= 0 ? abs_size : -abs_size);
+}
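+
+/* Editor's note: tc4_divexact_ui above and the helpers below exist because
+   the interpolation step divides by known small constants and powers of
+   two, where an exact division is cheaper than a general one.  A
+   compiled-out sketch (illustrative only): */
+#if 0
+static void divexact_demo(void)
+{
+   mp_limb_t x[1] = { 45 };
+   mp_size_t rn;
+   tc4_divexact_by3(x, &rn, x, 1); /* x[0] == 15, rn == 1 */
+}
+#endif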
+
+void tc4_divexact_by3(mp_ptr rp, mp_size_t * rn, mp_ptr x, mp_size_t xn)
+{
+   if (xn)
+   {
+      mp_size_t xu = ABS(xn);
+      mpn_divexact_by3(rp, x, xu);
+      if (xn > 0)
+      {
+         if (rp[xu - 1] == CNST_LIMB(0)) *rn = xn - 1;
+         else *rn = xn;
+      } else
+      {
+         if (rp[xu - 1] == CNST_LIMB(0)) *rn = xn + 1;
+         else *rn = xn;
+      }
+   } else *rn = 0;
+}
+
+#if HAVE_NATIVE_mpn_divexact_byBm1of
+void tc4_divexact_by15(mp_ptr rp, mp_size_t * rn, mp_ptr x, mp_size_t xn)
+{
+   if (xn)
+   {
+      mp_size_t xu = ABS(xn);
+      mpn_divexact_byBm1of(rp, x, xu, CNST_LIMB(15), CNST_LIMB((~0)/15)); // works for 32 and 64 bits
+      if (xn > 0)
+      {
+         if (rp[xu - 1] == CNST_LIMB(0)) *rn = xn - 1;
+         else *rn = xn;
+      } else
+      {
+         if (rp[xu - 1] == CNST_LIMB(0)) *rn = xn + 1;
+         else *rn = xn;
+      }
+   } else *rn = 0;
+}
+#endif
+
+/* w += x*y, where w and x use the signed size convention. */
+void tc4_addmul_1(mp_ptr wp, mp_size_t * wn, mp_srcptr xp, mp_size_t xn, mp_limb_t y)
+{
+   mp_size_t sign, wu, xu, ws, new_wn, min_size, dsize;
+   mp_limb_t cy;
+
+   /* w unaffected if x==0 or y==0 */
+   if (xn == 0 || y == 0)
+      return;
+
+   sign = xn;
+   xu = ABS (xn);
+
+   ws = *wn;
+   if (*wn == 0)
+   {
+      /* nothing to add to, just set x*y, "sign" gives the sign */
+      cy = mpn_mul_1 (wp, xp, xu, y);
+      if (cy)
+      {
+         wp[xu] = cy;
+         xu = xu + 1;
+      }
+      *wn = (sign >= 0 ? xu : -xu);
+      return;
+   }
+
+   sign ^= *wn;
+   wu = ABS (*wn);
+
+   new_wn = MAX (wu, xu);
+   min_size = MIN (wu, xu);
+
+   if (sign >= 0)
+   {
+      /* addmul of absolute values */
+
+      cy = mpn_addmul_1 (wp, xp, min_size, y);
+
+      dsize = xu - wu;
+#if HAVE_NATIVE_mpn_mul_1c
+      if (dsize > 0)
+         cy = mpn_mul_1c (wp + min_size, xp + min_size, dsize, y, cy);
+      else if (dsize < 0)
+      {
+         dsize = -dsize;
+         cy = mpn_add_1 (wp + min_size, wp + min_size, dsize, cy);
+      }
+#else
+      if (dsize != 0)
+      {
+         mp_limb_t cy2;
+         if (dsize > 0)
+            cy2 = mpn_mul_1 (wp + min_size, xp + min_size, dsize, y);
+         else
+         {
+            dsize = -dsize;
+            cy2 = 0;
+         }
+         cy = cy2 + mpn_add_1 (wp + min_size, wp + min_size, dsize, cy);
+      }
+#endif
+
+      if (cy)
+      {
+         wp[dsize + min_size] = cy;
+         new_wn ++;
+      }
+   } else
+   {
+      /* submul of absolute values */
+
+      cy = mpn_submul_1 (wp, xp, min_size, y);
+      if (wu >= xu)
+      {
+         /* if w bigger than x, then propagate borrow through it */
+         if (wu != xu)
+            cy = mpn_sub_1 (wp + xu, wp + xu, wu - xu, cy);
+
+         if (cy != 0)
+         {
+            /* Borrow out of w, take twos complement negative to get
+               absolute value, flip sign of w. */
+            wp[new_wn] = ~-cy;  /* extra limb is 0-cy */
+            mpn_com_n (wp, wp, new_wn);
+            new_wn++;
+            MPN_INCR_U (wp, new_wn, CNST_LIMB(1));
+            ws = -*wn;
+         }
+      } else /* wu < xu */
+      {
+         /* x bigger than w, so want x*y-w.  Submul has given w-x*y, so
+            take twos complement and use an mpn_mul_1 for the rest. */
+
+         mp_limb_t cy2;
+
+         /* -(-cy*b^n + w-x*y) = (cy-1)*b^n + ~(w-x*y) + 1 */
+         mpn_com_n (wp, wp, wu);
+         cy += mpn_add_1 (wp, wp, wu, CNST_LIMB(1));
+         cy -= 1;
+
+         /* If cy-1 == -1 then hold that -1 for later.  mpn_submul_1 never
+            returns cy==MP_LIMB_T_MAX so that value always indicates a -1. */
+         cy2 = (cy == MP_LIMB_T_MAX);
+         cy += cy2;
+         MPN_MUL_1C (cy, wp + wu, xp + wu, xu - wu, y, cy);
+         wp[new_wn] = cy;
+         new_wn += (cy != 0);
+
+         /* Apply any -1 from above.  The value at wp+wu is non-zero
+            because y!=0 and the high limb of x will be non-zero. */
+         if (cy2)
+            MPN_DECR_U (wp+wu, new_wn - wu, CNST_LIMB(1));
+
+         ws = -*wn;
+      }
+
+      /* submul can produce high zero limbs due to cancellation, both when w
+         has more limbs or x has more */
+      MPN_NORMALIZE (wp, new_wn);
+   }
+
+   *wn = (ws >= 0 ? new_wn : -new_wn);
+
+   ASSERT (new_wn == 0 || wp[new_wn - 1] != 0);
+}
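+
+/* Editor's note: the submul path of tc4_addmul_1 negates an n-limb value
+   using the two's complement identity -v = ~v + 1, applied limb-wise with
+   mpn_com_n and then an increment.  A compiled-out single-limb sketch
+   (illustrative only): */
+#if 0
+static void negate_demo(void)
+{
+   mp_limb_t v[1] = { 2 };
+   mpn_com_n (v, v, 1);                /* v[0] = ~2 */
+   mpn_add_1 (v, v, 1, CNST_LIMB(1));  /* v[0] = -2 mod 2^GMP_LIMB_BITS */
+}
+#endif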
+
+void tc4_submul_1(mp_ptr wp, mp_size_t * wn, mp_srcptr x, mp_size_t xn, mp_limb_t y)
+{
+   tc4_addmul_1(wp, wn, x, -xn, y);
+}
+
+/* y += x * 2^(offset*GMP_LIMB_BITS): add {xp, xn} into {yp, *yn} at a
+   shift of 'offset' limbs, updating *yn. */
+void tc4_copy (mp_ptr yp, mp_size_t * yn, mp_size_t offset, mp_srcptr xp, mp_size_t xn)
+{
+   mp_size_t yu = ABS(*yn);
+   mp_size_t xu = ABS(xn);
+   mp_limb_t cy = 0;
+
+   if (xn == 0)
+      return;
+
+   if (offset < yu) /* low part of x overlaps with y */
+   {
+      if (offset + xu <= yu) /* x entirely inside y */
+      {
+         cy = mpn_add_n (yp + offset, yp + offset, xp, xu);
+         if (offset + xu < yu)
+            cy = mpn_add_1 (yp + offset + xu, yp + offset + xu,
+                            yu - (offset + xu), cy);
+      } else
+         cy = mpn_add_n (yp + offset, yp + offset, xp, yu - offset);
+      /* now cy is the carry at yp + yu */
+      if (xu + offset > yu) /* high part of x exceeds y */
+      {
+         MPN_COPY (yp + yu, xp + yu - offset, xu + offset - yu);
+         cy = mpn_add_1 (yp + yu, yp + yu, xu + offset - yu, cy);
+         yu = xu + offset;
+      }
+      /* now cy is the carry at yp + yu (updated) */
+      if (cy)
+         yp[yu++] = cy;
+      MPN_NORMALIZE(yp, yu);
+      *yn = yu;
+   } else /* x does not overlap */
+   {
+      if (offset > yu)
+         MPN_ZERO (yp + yu, offset - yu);
+      MPN_COPY (yp + offset, xp, xu);
+      *yn = offset + xu;
+   }
+}
+
+#define MUL_TC4_UNSIGNED(r3xx, n3xx, r1xx, n1xx, r2xx, n2xx) \
+   do \
+   { \
+      if ((n1xx != 0) && (n2xx != 0)) \
+      { \
+         mp_size_t len; \
+         if (n1xx == n2xx) \
+         { \
+            if (n1xx > MUL_TOOM4_THRESHOLD) mpn_toom4_mul_n(r3xx, r1xx, r2xx, n1xx); \
+            else mpn_mul_n(r3xx, r1xx, r2xx, n1xx); \
+         } else if (n1xx > n2xx) \
+            mpn_mul(r3xx, r1xx, n1xx, r2xx, n2xx); \
+         else \
+            mpn_mul(r3xx, r2xx, n2xx, r1xx, n1xx); \
+         len = n1xx + n2xx; \
+         MPN_NORMALIZE(r3xx, len); \
+         n3xx = len; \
+      } else \
+         n3xx = 0; \
+   } while (0)
+
+#define MUL_TC4(r3xx, n3xx, r1xx, n1xx, r2xx, n2xx) \
+   do \
+   { \
+      mp_size_t sign = n1xx ^ n2xx; \
+      mp_size_t un1 = ABS(n1xx); \
+      mp_size_t un2 = ABS(n2xx); \
+      MUL_TC4_UNSIGNED(r3xx, n3xx, r1xx, un1, r2xx, un2); \
+      if (sign < 0) n3xx = -n3xx; \
+   } while (0)
+
+#define TC4_NORM(rxx, nxx, sxx) \
+   do \
+   { \
+      nxx = sxx; \
+      MPN_NORMALIZE(rxx, nxx); \
+   } while(0)
+
+/* Zero out limbs to end of integer */
+#define TC4_DENORM(rxx, nxx, sxx) \
+   do { \
+      MPN_ZERO(rxx + ABS(nxx), sxx - ABS(nxx)); \
+   } while (0)
+
+/* Two's complement divexact by power of 2 */
+#define TC4_DIVEXACT_2EXP(rxx, nxx, sxx) \
+   do { \
+      mp_limb_t sign = (LIMB_HIGHBIT_TO_MASK(rxx[nxx-1]) << (GMP_LIMB_BITS - sxx)); \
+      mpn_rshift(rxx, rxx, nxx, sxx); \
+      rxx[nxx-1] |= sign; \
+   } while (0)
+
+#if HAVE_NATIVE_mpn_rshift1
+#define TC4_RSHIFT1(rxx, nxx) \
+   do { \
+      mp_limb_t sign = (LIMB_HIGHBIT_TO_MASK(rxx[nxx-1]) << (GMP_LIMB_BITS - 1)); \
+      mpn_rshift1(rxx, rxx, nxx); \
+      rxx[nxx-1] |= sign; \
+   } while (0)
+#else
+#define TC4_RSHIFT1(rxx, nxx) \
+   do { \
+      mp_limb_t sign = (LIMB_HIGHBIT_TO_MASK(rxx[nxx-1]) << (GMP_LIMB_BITS - 1)); \
+      mpn_rshift(rxx, rxx, nxx, 1); \
+      rxx[nxx-1] |= sign; \
+   } while (0)
+#endif
+
+#define r1 (tp)
+#define r2 (tp + t4)
+#define r4 (tp + 2*t4)
+#define r6 (tp + 3*t4)
+
+#define r3 (rp + 4*sn)
+#define r5 (rp + 2*sn)
+#define r7 (rp)
+
+#define mpn_clearit(rxx, nxx) \
+   do { \
+      mp_size_t ind = 0; \
+      for ( ; ind < nxx; ind++) \
+         (rxx)[ind] = CNST_LIMB(0); \
+   } while (0)
+
+/* Multiply {up, un} by {vp, vn} and write the result to
+   {prodp, un + vn}.
+
+   Note that prodp gets un + vn limbs stored, even if the actual
+   result only needs un + vn - 1.
+*/
+void
+mpn_toom4_mul (mp_ptr rp, mp_srcptr up, mp_size_t un,
+               mp_srcptr vp, mp_size_t vn)
+{
+   mp_size_t ind;
+   mp_limb_t r30, r31;
+   mp_ptr tp;
+   mp_size_t a0n, a1n, a2n, a3n, b0n, b1n, b2n, b3n, sn, n1, n2, n3, n4, n5, n6, n7, n8, rpn, t4;
+
+   sn = (un + 3) / 4;
+
+   ASSERT (vn > 3*sn);
+
+#define a0 (up)
+#define a1 (up + sn)
+#define a2 (up + 2*sn)
+#define a3 (up + 3*sn)
+#define b0 (vp)
+#define b1 (vp + sn)
+#define b2 (vp + 2*sn)
+#define b3 (vp + 3*sn)
+
+   TC4_NORM(a0, a0n, sn);
+   TC4_NORM(a1, a1n, sn);
+   TC4_NORM(a2, a2n, sn);
+   TC4_NORM(a3, a3n, un - 3*sn);
+   TC4_NORM(b0, b0n, sn);
+   TC4_NORM(b1, b1n, sn);
+   TC4_NORM(b2, b2n, sn);
+   TC4_NORM(b3, b3n, vn - 3*sn);
+
+   t4 = 2*sn+2; // allows mult of 2 integers of sn + 1 limbs
+
+   tp = __GMP_ALLOCATE_FUNC_LIMBS(4*t4 + 5*(sn + 1));
+
+#define u2 (tp + 4*t4)
+#define u3 (tp + 4*t4 + (sn+1))
+#define u4 (tp + 4*t4 + 2*(sn+1))
+#define u5 (tp + 4*t4 + 3*(sn+1))
+#define u6 (tp + 4*t4 + 4*(sn+1))
+
+   tc4_add_unsigned(u6, &n6, a3, a3n, a1, a1n); // u6 = a3 + a1
+   tc4_add_unsigned(u5, &n5, a2, a2n, a0, a0n); // u5 = a2 + a0
+#if HAVE_NATIVE_mpn_sumdiff_n
+   tc4_sumdiff_unsigned(u3, &n3, u4, &n4, u5, n5, u6, n6); // u3 = A(1), u4 = A(-1)
+#else
+   tc4_add_unsigned(u3, &n3, u5, n5, u6, n6);
+   tc4_sub(u4, &n4, u5, n5, u6, n6);
+#endif
+   tc4_add_unsigned(u6, &n6, b3, b3n, b1, b1n);
+   tc4_add_unsigned(u5, &n5, b2, b2n, b0, b0n);
+#if HAVE_NATIVE_mpn_sumdiff_n
+   tc4_sumdiff_unsigned(r2, &n8, u5, &n5, u5, n5, u6, n6); // r2 = B(1), u5 = B(-1)
+#else
+   tc4_add_unsigned(r2, &n8, u5, n5, u6, n6);
+   tc4_sub(u5, &n5, u5, n5, u6, n6);
+#endif
+
+   MUL_TC4_UNSIGNED(r3, n3, u3, n3, r2, n8); // r3 = p(1)
+   MUL_TC4(r4, n4, u4, n4, u5, n5); // r4 = p(-1)
+
+   tc4_lshift(r1, &n1, a0, a0n, 3);
+#if HAVE_NATIVE_mpn_addlsh1_n
+   tc4_addlsh1_unsigned(r1, &n1, a2, a2n);
+#else
+   tc4_addmul_1(r1, &n1, a2, a2n, 2);
+#endif
+   tc4_lshift(r2, &n8, a1, a1n, 2);
+   tc4_add(r2, &n8, r2, n8, a3, a3n);
+   tc4_add(u5, &n5, r1, n1, r2, n8); // u5 = 8*A(1/2)
+   tc4_sub(u6, &n6, r1, n1, r2, n8); // u6 = 8*A(-1/2)
+   tc4_lshift(r1, &n1, b0, b0n, 3);
+#if HAVE_NATIVE_mpn_addlsh1_n
+   tc4_addlsh1_unsigned(r1, &n1, b2, b2n);
+#else
+   tc4_addmul_1(r1, &n1, b2, b2n, 2);
+#endif
+   tc4_lshift(r2, &n8, b1, b1n, 2);
+   tc4_add(r2, &n8, r2, n8, b3, b3n);
+   tc4_add(u2, &n2, r1, n1, r2, n8); // u2 = 8*B(1/2)
+   tc4_sub(r2, &n8, r1, n1, r2, n8); // r2 = 8*B(-1/2)
+
+   r30 = r3[0]; // save the low limbs of r3, which the top of r5 will overwrite
+   if (!n3) r30 = CNST_LIMB(0);
+   r31 = r3[1];
+   MUL_TC4_UNSIGNED(r5, n5, u5, n5, u2, n2); // r5 = 2^6*p(1/2)
+   MUL_TC4(r6, n6, u6, n6, r2, n8); // r6 = 2^6*p(-1/2)
+   r3[1] = r31;
+
+   tc4_lshift(u2, &n2, a3, a3n, 3);
+   tc4_addmul_1(u2, &n2, a2, a2n, 4);
+#if HAVE_NATIVE_mpn_addlsh1_n
+   tc4_addlsh1_unsigned(u2, &n2, a1, a1n);
+#else
+   tc4_addmul_1(u2, &n2, a1, a1n, 2);
+#endif
+   tc4_add(u2, &n2, u2, n2, a0, a0n); // u2 = A(2)
+   tc4_lshift(r1, &n1, b3, b3n, 3);
+   tc4_addmul_1(r1, &n1, b2, b2n, 4);
+#if HAVE_NATIVE_mpn_addlsh1_n
+   tc4_addlsh1_unsigned(r1, &n1, b1, b1n);
+#else
+   tc4_addmul_1(r1, &n1, b1, b1n, 2);
+#endif
+   tc4_add(r1, &n1, r1, n1, b0, b0n); // r1 = B(2)
+
+   MUL_TC4_UNSIGNED(r2, n2, u2, n2, r1, n1); // r2 = p(2)
+   MUL_TC4_UNSIGNED(r1, n1, a3, a3n, b3, b3n); // r1 = p(oo)
+   MUL_TC4_UNSIGNED(r7, n7, a0, a0n, b0, b0n); // r7 = p(0)
+
+   TC4_DENORM(r1, n1, t4 - 1);
+   TC4_DENORM(r2, n2, t4 - 1);
+   if (n3)
+      TC4_DENORM(r3, n3, t4 - 1);
+   else {
+      /* MPN_ZERO defeats gcc 4.1.2 here, hence the explicit for loop */
+      for (ind = 1; ind < t4 - 1; ind++)
+         (r3)[ind] = CNST_LIMB(0);
+   }
+   TC4_DENORM(r4, n4, t4 - 1);
+   TC4_DENORM(r5, n5, t4 - 1);
+   TC4_DENORM(r6, n6, t4 - 1);
+   TC4_DENORM(r7, n7, t4 - 2); // we treat r7 differently (it cannot exceed t4-2 in length)
+
+/* rp rp1 rp2 rp3 rp4 rp5 rp6 rp7
+<----------- r7-----------><------------r5-------------->
+ <-------------r3------------->
+
+ <-------------r6-------------> <-----------r2------------>
+ <-------------r4--------------> <--------------r1---->
+*/
+
+   toom4_interpolate(rp, &rpn, sn, tp, t4 - 1, n4, n6, r30);
+
+   if (rpn != un + vn)
+   {
+      MPN_ZERO((rp + rpn), un + vn - rpn);
+   }
+
+   __GMP_FREE_FUNC_LIMBS (tp, 4*t4 + 5*(sn+1));
+}
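+
+/* Editor's note: mpn_mul (see the mul.c hunk above) only dispatches to
+   this routine when the operands are large and not too unbalanced: with
+   k = (un + 3)/4 = ceil(un/4), it requires un + vn to be above
+   2*MUL_TOOM4_THRESHOLD and vn > 3*k, which is the condition the
+   ASSERT (vn > 3*sn) above enforces.  A compiled-out restatement of the
+   same test (illustrative only): */
+#if 0
+static int toom4_applies(mp_size_t un, mp_size_t vn)
+{
+   mp_size_t k = (un + 3)/4; /* ceil(un/4), the evaluation block size */
+   return ABOVE_THRESHOLD (un + vn, 2*MUL_TOOM4_THRESHOLD) && (vn > 3*k);
+}
+#endif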
+
+/*
+   Toom 4 interpolation.  Interpolates the value at 2^(sn*B) of a
+   polynomial p(x) with 7 coefficients, given the values
+   p(oo), p(2), p(1), p(-1), 2^6*p(1/2), 2^6*p(-1/2), p(0).
+   The output is placed in rp and the final number of limbs of the
+   output is given in rpn.
+   The 4th and 6th values may be negative; if so, n4 and n6
+   respectively should be set negative.
+   To save space we pass r3, r5, r7 in place in the output rp.
+   The other r's are stored separately in the scratch space tp.
+   The low limb of r3 is stored in r30, as it will be overwritten
+   by the high limb of r5.
+
+rp rp1 rp2 rp3 rp4 rp5 rp6 rp7
+<----------- r7-----------><------------r5-------------->
+ <-------------r3------------->
+
+   We assume that r1 is stored at tp, r2 at (tp + t4), r4 at (tp + 2*t4)
+   and r6 at (tp + 3*t4).  Each of these r's has t4 = s4 + 1 limbs allocated.
+*/
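+
+/* Editor's note: a minimal sanity check one might run against this routine,
+   assuming the usual MPIR test conventions (mpn_random for random operands,
+   mpn_mul_basecase as the reference product).  Hypothetical harness code,
+   not part of the patch: */
+#if 0
+static void check_toom4_mul(void)
+{
+   mp_size_t un = 400, vn = 350; /* sizes chosen so that vn > 3*ceil(un/4) */
+   mp_ptr up = __GMP_ALLOCATE_FUNC_LIMBS (un);
+   mp_ptr vp = __GMP_ALLOCATE_FUNC_LIMBS (vn);
+   mp_ptr rp = __GMP_ALLOCATE_FUNC_LIMBS (un + vn);
+   mp_ptr refp = __GMP_ALLOCATE_FUNC_LIMBS (un + vn);
+
+   mpn_random (up, un);
+   mpn_random (vp, vn);
+
+   mpn_toom4_mul (rp, up, un, vp, vn);
+   mpn_mul_basecase (refp, up, un, vp, vn);
+   ASSERT_ALWAYS (mpn_cmp (rp, refp, un + vn) == 0);
+
+   __GMP_FREE_FUNC_LIMBS (rp, un + vn);
+   __GMP_FREE_FUNC_LIMBS (refp, un + vn);
+   __GMP_FREE_FUNC_LIMBS (up, un);
+   __GMP_FREE_FUNC_LIMBS (vp, vn);
+}
+#endif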