From f444a2bf6c9e57e9b14330b558559be3ab4cfb9f Mon Sep 17 00:00:00 2001
From: "(no author)" <(no author)>
Date: Fri, 19 Feb 2010 12:54:56 +0000
Subject: [PATCH] Attempt to tune some of the division functions.

---
 gmp-impl.h                          | 104 +++++------
 mpn/generic/dc_div_qr.c             |   2 -
 mpn/generic/dc_divappr_q.c          |   6 +-
 mpn/generic/dc_divappr_q_n.c        |   2 -
 mpn/generic/dc_divrem_n.c           |   2 +
 mpn/generic/inv_div_qr.c            |   3 -
 mpn/generic/inv_divappr_q.c         |   8 +-
 mpn/generic/tdiv_q.c                |   5 -
 mpn/generic/tdiv_qr.c               |   3 -
 mpn/x86_64/k8/k10/k102/gmp-mparam.h |   5 +-
 tune/Makefile.am                    |   3 +-
 tune/common.c                       |  20 +++
 tune/speed.h                        | 261 ++++++++++++++++++++++++++++
 tune/tuneup.c                       | 147 ++++++++++++----
 14 files changed, 447 insertions(+), 124 deletions(-)

diff --git a/gmp-impl.h b/gmp-impl.h
index f4866105..9950df12 100644
--- a/gmp-impl.h
+++ b/gmp-impl.h
@@ -584,51 +584,6 @@ void  __gmp_tmp_debug_free  _PROTO ((const char *, int, int,
 #define GMP_NAIL_LOWBIT   (CNST_LIMB(1) << GMP_NUMB_BITS)
 #endif
 
-#if GMP_NAIL_BITS != 0
-/* Set various *_THRESHOLD values to be used for nails.  Thus we avoid using
-   code that has not yet been qualified.  */
-
-#undef DIV_SB_PREINV_THRESHOLD
-#undef DIV_DC_THRESHOLD
-#undef POWM_THRESHOLD
-#define DIV_SB_PREINV_THRESHOLD           MP_SIZE_T_MAX
-#define DIV_DC_THRESHOLD                 50
-#define POWM_THRESHOLD                    0
-
-#undef GCD_ACCEL_THRESHOLD
-#define GCD_ACCEL_THRESHOLD               3
-
-#undef DIVREM_1_NORM_THRESHOLD
-#undef DIVREM_1_UNNORM_THRESHOLD
-#undef MOD_1_NORM_THRESHOLD
-#undef MOD_1_UNNORM_THRESHOLD
-#undef USE_PREINV_DIVREM_1
-#undef USE_PREINV_MOD_1
-#undef DIVREM_2_THRESHOLD
-#undef DIVEXACT_1_THRESHOLD
-#undef MODEXACT_1_ODD_THRESHOLD
-#define DIVREM_1_NORM_THRESHOLD           MP_SIZE_T_MAX  /* no preinv */
-#define DIVREM_1_UNNORM_THRESHOLD         MP_SIZE_T_MAX  /* no preinv */
-#define MOD_1_NORM_THRESHOLD              MP_SIZE_T_MAX  /* no preinv */
-#define MOD_1_UNNORM_THRESHOLD            MP_SIZE_T_MAX  /* no preinv */
-#define USE_PREINV_DIVREM_1               0  /* no preinv */
-#define USE_PREINV_MOD_1                  0  /* no preinv */
-#define DIVREM_2_THRESHOLD                MP_SIZE_T_MAX  /* no preinv */
-
-#undef GET_STR_DC_THRESHOLD
-#undef GET_STR_PRECOMPUTE_THRESHOLD
-#undef SET_STR_THRESHOLD
-#define GET_STR_DC_THRESHOLD             22
-#define GET_STR_PRECOMPUTE_THRESHOLD     42
-#define SET_STR_THRESHOLD              3259
-
-/* mpn/generic/mul_fft.c is not nails-capable. */
-#undef  MUL_FFT_THRESHOLD
-#undef  SQR_FFT_THRESHOLD
-#define MUL_FFT_THRESHOLD                MP_SIZE_T_MAX
-#define SQR_FFT_THRESHOLD                MP_SIZE_T_MAX
-#endif
-
 /* Swap macros. */
 
 #define MP_LIMB_T_SWAP(x, y)                    \
@@ -1270,6 +1225,8 @@ void mpn_mul_fft_full _PROTO ((mp_ptr op,
 #define   mpn_fft_next_size __MPN(fft_next_size)
 mp_size_t mpn_fft_next_size _PROTO ((mp_size_t pl, int k)) ATTRIBUTE_CONST;
 
+#define DC_DIVAPPR_Q_N_ITCH(n) (n*10)
+
 #define mpn_sb_divrem_mn  __MPN(sb_divrem_mn)
 mp_limb_t mpn_sb_divrem_mn _PROTO ((mp_ptr, mp_ptr, mp_size_t,
                                     mp_srcptr, mp_size_t));
@@ -1790,18 +1747,37 @@ __GMP_DECLSPEC extern const mp_limb_t __gmp_fib_table[];
 
 #define MPN_FFT_TABLE_SIZE  16
 
-
-/* mpn_dc_divrem_n(n) calls 2*mul(n/2)+2*div(n/2), thus to be faster than
-   div(n) = 4*div(n/2), we need mul(n/2) to be faster than the classic way,
-   i.e. n/2 >= MUL_KARATSUBA_THRESHOLD
-
-   Measured values are between 2 and 4 times MUL_KARATSUBA_THRESHOLD, so go
-   for 3 as an average.  */
-
-#ifndef DIV_DC_THRESHOLD
-#define DIV_DC_THRESHOLD    (3 * MUL_KARATSUBA_THRESHOLD)
+#ifndef DC_DIV_QR_THRESHOLD
+#define DC_DIV_QR_THRESHOLD    (3 * MUL_KARATSUBA_THRESHOLD)
 #endif
 
+#ifndef DC_DIVAPPR_Q_N_THRESHOLD
+#define DC_DIVAPPR_Q_N_THRESHOLD    (3 * MUL_KARATSUBA_THRESHOLD)
+#endif
+
+#ifndef INV_DIV_QR_THRESHOLD
+#define INV_DIV_QR_THRESHOLD    (MUL_FFT_THRESHOLD/3)
+#endif
+
+#ifndef INV_DIVAPPR_Q_N_THRESHOLD
+#define INV_DIVAPPR_Q_N_THRESHOLD    (MUL_FFT_THRESHOLD/3)
+#endif
+
+#ifndef DC_DIV_Q_THRESHOLD
+#define DC_DIV_Q_THRESHOLD    (3 * MUL_KARATSUBA_THRESHOLD)
+#endif
+
+#ifndef INV_DIV_Q_THRESHOLD
+#define INV_DIV_Q_THRESHOLD    (MUL_FFT_THRESHOLD/3)
+#endif
+
+#ifndef DC_DIVAPPR_Q_THRESHOLD
+#define DC_DIVAPPR_Q_THRESHOLD    (3 * MUL_TOOM3_THRESHOLD)
+#endif
+
+#ifndef INV_DIVAPPR_Q_THRESHOLD
+#define INV_DIVAPPR_Q_THRESHOLD    (MUL_FFT_THRESHOLD/2)
+#endif
 
 /* Return non-zero if xp,xsize and yp,ysize overlap.
    If xp+xsize<=yp there's no overlap, or if yp+ysize<=xp there's no
@@ -4180,9 +4156,21 @@ extern mp_size_t                     mulmod_2expm1_threshold;
 extern mp_size_t                     div_sb_preinv_threshold;
 #endif
 
-#undef  DIV_DC_THRESHOLD
-#define DIV_DC_THRESHOLD             div_dc_threshold
-extern mp_size_t                     div_dc_threshold;
+#undef  DC_DIV_QR_THRESHOLD
+#define DC_DIV_QR_THRESHOLD          dc_div_qr_threshold
+extern mp_size_t                     dc_div_qr_threshold;
+
+#undef  DC_DIVAPPR_Q_N_THRESHOLD
+#define DC_DIVAPPR_Q_N_THRESHOLD     dc_divappr_q_n_threshold
+extern mp_size_t                     dc_divappr_q_n_threshold;
+
+#undef  INV_DIV_QR_THRESHOLD
+#define INV_DIV_QR_THRESHOLD         inv_div_qr_threshold
+extern mp_size_t                     inv_div_qr_threshold;
+
+#undef  INV_DIVAPPR_Q_N_THRESHOLD
+#define INV_DIVAPPR_Q_N_THRESHOLD    inv_divappr_q_n_threshold
+extern mp_size_t                     inv_divappr_q_n_threshold;
 
 #undef  POWM_THRESHOLD
 #define POWM_THRESHOLD               powm_threshold
diff --git a/mpn/generic/dc_div_qr.c b/mpn/generic/dc_div_qr.c
index e7ab824e..7c6e67c8 100644
--- a/mpn/generic/dc_div_qr.c
+++ b/mpn/generic/dc_div_qr.c
@@ -28,8 +28,6 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #include "gmp-impl.h"
 #include "longlong.h"
 
-#define DC_DIV_QR_THRESHOLD 56 
-
 mp_limb_t
 mpn_dc_div_qr_n (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n,
 		    mp_limb_t dinv, mp_ptr tp)
diff --git a/mpn/generic/dc_divappr_q.c b/mpn/generic/dc_divappr_q.c
index dba5acd5..4e174f9f 100644
--- a/mpn/generic/dc_divappr_q.c
+++ b/mpn/generic/dc_divappr_q.c
@@ -28,10 +28,6 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #include "gmp-impl.h"
 #include "longlong.h"
 
-#define DC_DIVAPPR_Q_THRESHOLD 141 /*FIXME: tune these */
-#define DC_DIVAPPR_QR_THRESHOLD 145 
-#define DC_DIV_QR_THRESHOLD 56 
-
 mp_limb_t
 mpn_dc_divappr_q (mp_ptr qp, mp_ptr np, mp_size_t nn,
 		     mp_srcptr dp, mp_size_t dn, mp_limb_t dinv)
@@ -187,7 +183,7 @@ mpn_dc_divappr_q (mp_ptr qp, mp_ptr np, mp_size_t nn,
 
       q2p = TMP_SALLOC_LIMBS (qn + 1);
       
-      if (BELOW_THRESHOLD (qn, DC_DIVAPPR_Q_THRESHOLD))
+      if (BELOW_THRESHOLD (qn, DC_DIVAPPR_Q_N_THRESHOLD))
 	{
 	  qh = mpn_sb_divappr_q (q2p, np - qn - 2, 2 * (qn + 1),
 				    dp - (qn + 1), qn + 1, dinv);
diff --git a/mpn/generic/dc_divappr_q_n.c b/mpn/generic/dc_divappr_q_n.c
index fad27eda..62a68d0c 100644
--- a/mpn/generic/dc_divappr_q_n.c
+++ b/mpn/generic/dc_divappr_q_n.c
@@ -64,8 +64,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   has that description.
 */
 
-#define DC_DIVAPPR_Q_N_THRESHOLD 56 
-
 mp_limb_t
 mpn_dc_divappr_q_n (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n, 
 		    mp_limb_t dip, mp_ptr tp)
diff --git a/mpn/generic/dc_divrem_n.c b/mpn/generic/dc_divrem_n.c
index a73d136b..0528e53f 100644
--- a/mpn/generic/dc_divrem_n.c
+++ b/mpn/generic/dc_divrem_n.c
@@ -29,6 +29,8 @@ MA 02110-1301, USA. */
 #include "mpir.h"
 #include "gmp-impl.h"
 
+#define DIV_DC_THRESHOLD 16
+
 /*
 [1] Fast Recursive Division, by Christoph Burnikel and Joachim Ziegler,
     Technical report MPI-I-98-1-022, october 1998.
diff --git a/mpn/generic/inv_div_qr.c b/mpn/generic/inv_div_qr.c
index c077d600..040d70bc 100644
--- a/mpn/generic/inv_div_qr.c
+++ b/mpn/generic/inv_div_qr.c
@@ -30,9 +30,6 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #include "gmp-impl.h"
 #include "longlong.h"
 
-#define DC_DIV_QR_THRESHOLD 56 
-#define INV_DIV_QR_THRESHOLD 1200 
-
 mp_limb_t
 mpn_inv_div_qr (mp_ptr qp,
 		  mp_ptr np, mp_size_t nn,
diff --git a/mpn/generic/inv_divappr_q.c b/mpn/generic/inv_divappr_q.c
index bc6affc9..6a50aeb0 100644
--- a/mpn/generic/inv_divappr_q.c
+++ b/mpn/generic/inv_divappr_q.c
@@ -28,12 +28,6 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #include "gmp-impl.h"
 #include "longlong.h"
 
-#define DC_DIVAPPR_Q_THRESHOLD 141 /*FIXME: tune these */
-#define INV_DIVAPPR_Q_THRESHOLD 1000 
-#define INV_DIV_QR_THRESHOLD 1200 
-#define DC_DIVAPPR_QR_THRESHOLD 145 
-#define DC_DIV_QR_THRESHOLD 56 
-
 mp_limb_t
 mpn_inv_divappr_q (mp_ptr qp, mp_ptr np, mp_size_t nn,
 		     mp_srcptr dp, mp_size_t dn, mp_srcptr dinv)
@@ -190,7 +184,7 @@ mpn_inv_divappr_q (mp_ptr qp, mp_ptr np, mp_size_t nn,
 	  qh = mpn_sb_divappr_q (q2p, np - qn - 2, 2 * (qn + 1),
 				    dp - (qn + 1), qn + 1, dinv2);
 	}
-      else if (BELOW_THRESHOLD (qn, INV_DIVAPPR_Q_THRESHOLD))
+      else if (BELOW_THRESHOLD (qn, INV_DIVAPPR_Q_N_THRESHOLD))
 	{
 	  /* It is tempting to use qp for recursive scratch and put quotient in
 	     tp, but the recursive scratch needs one limb too many.  */
diff --git a/mpn/generic/tdiv_q.c b/mpn/generic/tdiv_q.c
index 61ddb761..eb6c1cb5 100644
--- a/mpn/generic/tdiv_q.c
+++ b/mpn/generic/tdiv_q.c
@@ -81,11 +81,6 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
    for the code to be correct.  */
 #define FUDGE 5			/* FIXME: tune this */
 
-#define DC_DIV_Q_THRESHOLD 156 /* FIXME: tune this */    
-#define DC_DIVAPPR_Q_THRESHOLD 156 /* FIXME: tune this */
-#define INV_DIV_Q_THRESHOLD 1000 
-#define INV_DIVAPPR_Q_THRESHOLD 2000
-
 void
 mpn_tdiv_q (mp_ptr qp,
 	   mp_srcptr np, mp_size_t nn,
diff --git a/mpn/generic/tdiv_qr.c b/mpn/generic/tdiv_qr.c
index 6e745bb1..e556d40e 100644
--- a/mpn/generic/tdiv_qr.c
+++ b/mpn/generic/tdiv_qr.c
@@ -35,9 +35,6 @@ along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
 #include "gmp-impl.h"
 #include "longlong.h"
 
-#define DC_DIV_QR_THRESHOLD 152 /*FIXME: tune this!*/
-#define INV_DIV_QR_THRESHOLD 1000
-
 void
 mpn_tdiv_qr (mp_ptr qp, mp_ptr rp, mp_size_t qxn,
 	     mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn)
diff --git a/mpn/x86_64/k8/k10/k102/gmp-mparam.h b/mpn/x86_64/k8/k10/k102/gmp-mparam.h
index 98bc4c34..00be330b 100644
--- a/mpn/x86_64/k8/k10/k102/gmp-mparam.h
+++ b/mpn/x86_64/k8/k10/k102/gmp-mparam.h
@@ -23,7 +23,10 @@
 #define MULMOD_2EXPM1_THRESHOLD          18
 
 #define DIV_SB_PREINV_THRESHOLD           0  /* always */
-#define DIV_DC_THRESHOLD                372
+#define DC_DIV_QR_THRESHOLD              14
+#define DC_DIVAPPR_Q_N_THRESHOLD         10
+#define INV_DIV_QR_THRESHOLD            926 
+#define INV_DIVAPPR_Q_N_THRESHOLD       926
 #define POWM_THRESHOLD                  154
 #define FAC_UI_THRESHOLD              17010
 
diff --git a/tune/Makefile.am b/tune/Makefile.am
index 198b384f..f68d5435 100644
--- a/tune/Makefile.am
+++ b/tune/Makefile.am
@@ -129,7 +129,8 @@ TUNE_MPN_SRCS = $(TUNE_MPN_SRCS_BASIC) divrem_1.c mod_1.c
 TUNE_MPN_SRCS_BASIC = dc_divrem_n.c divrem_2.c gcd.c gcdext.c get_str.c \
   mul_n.c mullow_n.c mulhigh_n.c mul_fft.c mul.c sb_divrem_mn.c tdiv_qr.c toom4_mul_n.c \
   toom7_mul_n.c mulmod_2expm1.c rootrem.c divrem_euclidean_r_1.c divrem_hensel_qr_1.c \
-  rsh_divrem_hensel_qr_1.c 
+  rsh_divrem_hensel_qr_1.c dc_divappr_q.c dc_div_qr.c inv_divappr_q.c \
+  inv_div_qr.c 
 
 $(TUNE_MPN_SRCS_BASIC):
 	for i in $(TUNE_MPN_SRCS_BASIC); do \
diff --git a/tune/common.c b/tune/common.c
index 5f41e2d5..641e205f 100644
--- a/tune/common.c
+++ b/tune/common.c
@@ -874,6 +874,26 @@ speed_mpn_sb_divrem_m3_inv (struct speed_params *s)
 {
   SPEED_ROUTINE_MPN_SB_DIVREM_M3 (mpn_sb_divrem_mn_inv);
 }
+double
+speed_mpn_dc_div_qr_n (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_DC_DIV_N_TSIZE (mpn_dc_div_qr_n, DC_DIVAPPR_Q_N_ITCH(s->size));
+}
+double
+speed_mpn_dc_divappr_q (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_DC_DIV_SMALL_Q (mpn_dc_divappr_q);
+}
+double
+speed_mpn_inv_div_qr (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_INV_DIV (mpn_inv_div_qr);
+}
+double
+speed_mpn_inv_divappr_q (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_INV_DIV_SMALL_Q (mpn_inv_divappr_q);
+}
 
 double
 speed_mpz_mod (struct speed_params *s)
diff --git a/tune/speed.h b/tune/speed.h
index 668a4b8b..9c2c58cb 100644
--- a/tune/speed.h
+++ b/tune/speed.h
@@ -178,6 +178,10 @@ double speed_mpn_dc_divrem_sb _PROTO ((struct speed_params *s));
 double speed_mpn_dc_divrem_sb_div _PROTO ((struct speed_params *s));
 double speed_mpn_dc_divrem_sb_inv _PROTO ((struct speed_params *s));
 double speed_mpn_dc_tdiv_qr _PROTO ((struct speed_params *s));
+double speed_mpn_dc_div_qr_n _PROTO ((struct speed_params *s));
+double speed_mpn_dc_divappr_q _PROTO ((struct speed_params *s));
+double speed_mpn_inv_div_qr _PROTO ((struct speed_params *s));
+double speed_mpn_inv_divappr_q _PROTO ((struct speed_params *s));
 double speed_MPN_COPY _PROTO ((struct speed_params *s));
 double speed_MPN_COPY_DECR _PROTO ((struct speed_params *s));
 double speed_MPN_COPY_INCR _PROTO ((struct speed_params *s));
@@ -1499,6 +1503,263 @@ int speed_routine_count_zeros_setup _PROTO ((struct speed_params *s,
     return t;								\
   }
 
+#define SPEED_ROUTINE_MPN_DC_DIV_N_TSIZE(function, tsize)				\
+  {									\
+    unsigned  i;							\
+    mp_ptr    a, d, q, tmp;						\
+    mp_limb_t inv;                                             \
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 2);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (a, 2*s->size, s->align_xp);			\
+    SPEED_TMP_ALLOC_LIMBS (d, s->size,   s->align_yp);			\
+    SPEED_TMP_ALLOC_LIMBS (q, s->size+1, s->align_wp);			\
+    SPEED_TMP_ALLOC_LIMBS (tmp, tsize,   s->align_wp2);			\
+    								\
+    MPN_COPY (a, s->xp, s->size);					\
+    MPN_COPY (a+s->size, s->xp, s->size);				\
+									\
+    MPN_COPY (d, s->yp, s->size);					\
+									\
+    /* normalize the data */						\
+    d[s->size-1] |= GMP_NUMB_HIGHBIT;					\
+    a[2*s->size-1] = d[s->size-1] - 1;					\
+									\
+    speed_operand_src (s, a, 2*s->size);				\
+    speed_operand_src (s, d, s->size);					\
+    speed_operand_dst (s, q, s->size+1);				\
+    speed_cache_fill (s);						\
+	                                                         \
+    invert_1(inv, d[s->size-1], d[s->size-2]);                        \
+								\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function(q, a, d, s->size, inv, tmp);								\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_DC_DIV(function)				\
+  {									\
+    unsigned  i;							\
+    mp_ptr    a, d, q;						\
+    mp_limb_t inv;                                             \
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 2);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (a, 2*s->size, s->align_xp);			\
+    SPEED_TMP_ALLOC_LIMBS (d, s->size,   s->align_yp);			\
+    SPEED_TMP_ALLOC_LIMBS (q, s->size+1, s->align_wp);			\
+    								\
+    MPN_COPY (a, s->xp, s->size);					\
+    MPN_COPY (a+s->size, s->xp, s->size);				\
+									\
+    MPN_COPY (d, s->yp, s->size);					\
+									\
+    /* normalize the data */						\
+    d[s->size-1] |= GMP_NUMB_HIGHBIT;					\
+    a[2*s->size-1] = d[s->size-1] - 1;					\
+									\
+    speed_operand_src (s, a, 2*s->size);				\
+    speed_operand_src (s, d, s->size);					\
+    speed_operand_dst (s, q, s->size+1);				\
+    speed_cache_fill (s);						\
+	                                                         \
+    invert_1(inv, d[s->size-1], d[s->size-2]);                        \
+								\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function(q, a, 2*s->size, d, s->size, inv);								\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_DC_DIV_SMALL_Q(function)				\
+  {									\
+    unsigned  i;							\
+    mp_ptr    a, d, q;						\
+    mp_limb_t inv;                                             \
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 2);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (a, 3*s->size, s->align_xp);			\
+    SPEED_TMP_ALLOC_LIMBS (d, 2*s->size,   s->align_yp);			\
+    SPEED_TMP_ALLOC_LIMBS (q, s->size+1, s->align_wp);			\
+    								\
+    MPN_COPY (a, s->xp, s->size);					\
+    MPN_COPY (a+s->size, s->xp, s->size);				\
+    MPN_COPY (a+2*s->size, s->xp, s->size);				\
+									\
+    MPN_COPY (d, s->yp, s->size);					\
+    MPN_COPY (d+s->size, s->yp, s->size);					\
+									\
+    /* normalize the data */						\
+    d[2*s->size-1] |= GMP_NUMB_HIGHBIT;					\
+    a[3*s->size-1] = d[2*s->size-1] - 1;					\
+									\
+    speed_operand_src (s, a, 3*s->size);				\
+    speed_operand_src (s, d, 2*s->size);					\
+    speed_operand_dst (s, q, s->size+1);				\
+    speed_cache_fill (s);						\
+	                                                         \
+    invert_1(inv, d[2*s->size-1], d[2*s->size-2]);                        \
+								\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function(q, a, 3*s->size, d, 2*s->size, inv);								\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_INV_DIV_N(function)				\
+  {									\
+    unsigned  i;							\
+    mp_ptr    a, d, q, inv;						\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 2);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (a, 2*s->size, s->align_xp);			\
+    SPEED_TMP_ALLOC_LIMBS (d, s->size,   s->align_yp);			\
+    SPEED_TMP_ALLOC_LIMBS (q, s->size+1, s->align_wp);			\
+    SPEED_TMP_ALLOC_LIMBS (inv, s->size, s->align_wp2);			\
+    								\
+    MPN_COPY (a, s->xp, s->size);					\
+    MPN_COPY (a+s->size, s->xp, s->size);				\
+									\
+    MPN_COPY (d, s->yp, s->size);					\
+									\
+    /* normalize the data */						\
+    d[s->size-1] |= GMP_NUMB_HIGHBIT;					\
+    a[2*s->size-1] = d[s->size-1] - 1;					\
+									\
+    speed_operand_src (s, a, 2*s->size);				\
+    speed_operand_src (s, d, s->size);					\
+    speed_operand_dst (s, q, s->size+1);				\
+    speed_cache_fill (s);						\
+	                                                         \
+    mpn_invert(inv, d, s->size);                        \
+								\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function(q, a, d, s->size, inv);								\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_INV_DIV(function)				\
+  {									\
+    unsigned  i;							\
+    mp_ptr    a, d, q, inv;						\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 2);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (a, 2*s->size, s->align_xp);			\
+    SPEED_TMP_ALLOC_LIMBS (d, s->size,   s->align_yp);			\
+    SPEED_TMP_ALLOC_LIMBS (q, s->size+1, s->align_wp);			\
+    SPEED_TMP_ALLOC_LIMBS (inv, s->size, s->align_wp2);			\
+    								\
+    MPN_COPY (a, s->xp, s->size);					\
+    MPN_COPY (a+s->size, s->xp, s->size);				\
+									\
+    MPN_COPY (d, s->yp, s->size);					\
+									\
+    /* normalize the data */						\
+    d[s->size-1] |= GMP_NUMB_HIGHBIT;					\
+    a[2*s->size-1] = d[s->size-1] - 1;					\
+									\
+    speed_operand_src (s, a, 2*s->size);				\
+    speed_operand_src (s, d, s->size);					\
+    speed_operand_dst (s, q, s->size+1);				\
+    speed_cache_fill (s);						\
+	                                                         \
+    mpn_invert(inv, d, s->size);                        \
+								\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function(q, a, 2*s->size, d, s->size, inv);								\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
+#define SPEED_ROUTINE_MPN_INV_DIV_SMALL_Q(function)				\
+  {									\
+    unsigned  i;							\
+    mp_ptr    a, d, q, inv;						\
+    double    t;							\
+    TMP_DECL;								\
+									\
+    SPEED_RESTRICT_COND (s->size >= 2);					\
+									\
+    TMP_MARK;								\
+    SPEED_TMP_ALLOC_LIMBS (a, 3*s->size, s->align_xp);			\
+    SPEED_TMP_ALLOC_LIMBS (d, 2*s->size,   s->align_yp);			\
+    SPEED_TMP_ALLOC_LIMBS (q, s->size+1, s->align_wp);			\
+    SPEED_TMP_ALLOC_LIMBS (inv, 2*s->size, s->align_wp2);			\
+    								\
+    MPN_COPY (a, s->xp, s->size);					\
+    MPN_COPY (a+s->size, s->xp, s->size);				\
+    MPN_COPY (a+2*s->size, s->xp, s->size);				\
+									\
+    MPN_COPY (d, s->yp, s->size);					\
+    MPN_COPY (d+s->size, s->yp, s->size);					\
+									\
+    /* normalize the data */						\
+    d[2*s->size-1] |= GMP_NUMB_HIGHBIT;					\
+    a[3*s->size-1] = d[2*s->size-1] - 1;					\
+									\
+    speed_operand_src (s, a, 3*s->size);				\
+    speed_operand_src (s, d, 2*s->size);					\
+    speed_operand_dst (s, q, s->size+1);				\
+    speed_cache_fill (s);						\
+	                                                         \
+    mpn_invert(inv, d, 2*s->size);                        \
+								\
+    speed_starttime ();							\
+    i = s->reps;							\
+    do									\
+      function(q, a, 3*s->size, d, 2*s->size, inv);								\
+    while (--i != 0);							\
+    t = speed_endtime ();						\
+									\
+    TMP_FREE;								\
+    return t;								\
+  }
+
 #define SPEED_ROUTINE_MPN_DC_DIVREM_N(function)				\
   SPEED_ROUTINE_MPN_DC_DIVREM_CALL((*function) (q, a, d, s->size))
 
diff --git a/tune/tuneup.c b/tune/tuneup.c
index e2177223..1186af96 100644
--- a/tune/tuneup.c
+++ b/tune/tuneup.c
@@ -24,40 +24,16 @@ MA 02110-1301, USA. */
 
    -t turns on some diagnostic traces, a second -t turns on more traces.
 
-   Notes:
-
-   The code here isn't a vision of loveliness, mainly because it's subject
-   to ongoing changes according to new things wanting to be tuned, and
-   practical requirements of systems tested.
-
-   Sometimes running the program twice produces slightly different results.
-   This is probably because there's so little separating algorithms near
-   their crossover, and on that basis it should make little or no difference
-   to the final speed of the relevant routines, but nothing has been done to
-   check that carefully.
-
    Algorithm:
 
-   The thresholds are determined as follows.  A crossover may not be a
-   single size but rather a range where it oscillates between method A or
-   method B faster.  If the threshold is set making B used where A is faster
-   (or vice versa) that's bad.  Badness is the percentage time lost and
-   total badness is the sum of this over all sizes measured.  The threshold
-   is set to minimize total badness.
+   The thresholds are determined as follows: two algorithms A and B are 
+   compared over a range of sizes. At each point, we define "badness" to
+   be the percentage time lost if algorithm B is chosen over algorithm A.
+   Then total badness is the sum of this over all sizes measured.  The 
+   threshold is set to minimize total badness.
 
-   Suppose, as sizes increase, method B becomes faster than method A.  The
-   effect of the rule is that, as you look at increasing sizes, isolated
-   points where B is faster are ignored, but when it's consistently faster,
-   or faster on balance, then the threshold is set there.  The same result
-   is obtained thinking in the other direction of A becoming faster at
-   smaller sizes.
-
-   In practice the thresholds tend to be chosen to bring on the next
-   algorithm fairly quickly.
-
-   This rule is attractive because it's got a basis in reason and is fairly
-   easy to implement, but no work has been done to actually compare it in
-   absolute terms to other possibilities.
+   In practice the thresholds tend to be chosen to bring on algorithm B 
+   fairly quickly.
 
    Implementation:
 
@@ -97,6 +73,57 @@ MA 02110-1301, USA. */
    so each k is probably being brought on a touch early.  This isn't likely
    to make a difference, and the simpler probing means fewer tests.
 
+   Code: 
+     - main : checks for various command line options and calls all()
+     - all : prints the tuneup message, date and compiler, then calls
+             each of the individual tuning functions in turn, e.g. 
+             tune_mul()
+     - tune_blah() : tunes function of type blah, e.g. tune_mul() tunes the
+             karatsuba and toom cutoffs. It sets up a param struct with the
+             following parameters:
+               a) name : the name of the threshold being tuned, e.g.
+                  MUL_TOOM3_THRESHOLD
+               b) function : the first function being compared (this must be
+                  of the form speed_blah and the function speed_blah will 
+                  exist in speed.h and speed.c
+               c) function2 : the second function being compared (if set to
+                  NULL, this is automatically set to equal function
+               d) step_factor : the size of the step between sizes, 
+                  set to 0.01 by default, i.e. 1% increments
+               e) function_fudge : multiplier for the speed of function, used
+                  to adjust for overheads, by default set to 1.0
+               f) stop_since_change is a stop condition. If the threshold
+                  has not changed for this many iterations, then stop. This
+                  is set to 80 iterations by default.
+               g) stop_factor : this is another stop factor. If method B 
+                  becomes faster by at least this factor, then stop. By
+                  default this is set to 1.2, i.e. 20% faster.
+               h) min_size : the minimum size to start comparing from.
+               i) min_is_always : if this is set to 1, then if the threshold
+                  just ends up being min_size, then the threshold is actually
+                  set to 0, i.e. algorithm B is always used.
+               j) max_size : the maximum size to compare up to. By default this
+                  is set to DEFAULT_MAX_SIZE which is 1000 limbs.
+               h) check_size : if set, will check that the given starting size
+                  is valid for both algorithms and that algorithm B is at least
+                  4% slower than algorithm A at that point.
+               i) size_extra : this is a bias added to each size when doing 
+                  measurements. It is subtracted off after each measurement. 
+                  It is basically used for shifting a threshold from the
+                  measured value.
+               j) data_high : if set to 1, the high limb of xp and yp are set to
+                  be less than s->r, if set to 2, the high limb of xp and yp are
+                  set to be greater than or equal to s->r
+               k) noprint : if set, the threshold is computed but not printed.
+             
+             After setting all the appropriate parameters, the function one() is
+             called. It takes a reference to a parameter, e.g. mul_toom3_threshold
+             which is defined in a table below. That threshold will have been given
+             some initial value (usually MP_SIZE_T_MAX) in the table. It also takes
+             a reference to the param struct.
+        - one() : does repeated timings over the given range of sizes always setting
+            the threshold to size+1 for function and size for function2.
+
 */
 
 #define TUNE_PROGRAM_BUILD  1   /* for gmp-impl.h */
@@ -172,7 +199,10 @@ mp_size_t  mulhigh_basecase_threshold   = MP_SIZE_T_MAX;
 mp_size_t  mulhigh_dc_threshold         = MP_SIZE_T_MAX;
 mp_size_t  mulhigh_mul_threshold        = MP_SIZE_T_MAX;
 mp_size_t  div_sb_preinv_threshold      = MP_SIZE_T_MAX;
-mp_size_t  div_dc_threshold             = MP_SIZE_T_MAX;
+mp_size_t  dc_div_qr_threshold          = MP_SIZE_T_MAX;
+mp_size_t  dc_divappr_q_n_threshold     = MP_SIZE_T_MAX;
+mp_size_t  inv_div_qr_threshold          = MP_SIZE_T_MAX;
+mp_size_t  inv_divappr_q_n_threshold     = MP_SIZE_T_MAX;
 mp_size_t  powm_threshold               = MP_SIZE_T_MAX;
 mp_size_t  fac_ui_threshold             = MP_SIZE_T_MAX;
 mp_size_t  gcd_accel_threshold          = MP_SIZE_T_MAX;
@@ -1130,13 +1160,45 @@ tune_sb_preinv (gmp_randstate_t rands)
 
 
 void
-tune_dc (gmp_randstate_t rands)
+tune_dc_div (gmp_randstate_t rands)
 {
+  {
   static struct param_t  param;
-  param.name = "DIV_DC_THRESHOLD";
-  param.function = speed_mpn_dc_tdiv_qr;
+  param.name = "DC_DIV_QR_THRESHOLD";
+  param.function = speed_mpn_dc_div_qr_n;
+  param.stop_factor = 1.1;
   param.step_factor = 0.02;
-  one (&div_dc_threshold, rands, &param);
+  one (&dc_div_qr_threshold, rands, &param);
+  }
+
+  {
+  static struct param_t  param;
+  param.name = "DC_DIVAPPR_Q_N_THRESHOLD";
+  param.function = speed_mpn_dc_divappr_q;
+  param.stop_factor = 1.1;
+  param.step_factor = 0.02;
+  one (&dc_divappr_q_n_threshold, rands, &param);
+  }
+
+  {
+  static struct param_t  param;
+  param.name = "INV_DIV_QR_THRESHOLD";
+  param.function = speed_mpn_inv_div_qr;
+  param.min_size = dc_div_qr_threshold*4;
+  param.stop_factor = 2.0;
+  param.step_factor = 0.2;
+  one (&inv_div_qr_threshold, rands, &param);
+  }
+  
+  {
+  static struct param_t  param;
+  param.name = "INV_DIVAPPR_Q_N_THRESHOLD";
+  param.function = speed_mpn_inv_divappr_q;
+  param.min_size = dc_divappr_q_n_threshold*4;
+  param.stop_factor = 2.0;
+  param.step_factor = 0.2;
+  one (&inv_divappr_q_n_threshold, rands, &param);
+  }
 }
 
 
@@ -1866,7 +1928,18 @@ all (gmp_randstate_t rands)
   printf("\n");
   
   tune_sb_preinv (rands);
-  tune_dc (rands);
+
+  /* dc_div_qr_n, dc_divappr_q, inv_div_qr, inv_divappr_q */
+  tune_dc_div (rands);
+  
+  /* mpn_tdiv_q : balanced 
+  tune_dc_div_q (rands);
+  tune_inv_div_q (rands);
+
+  /* mpn_tdiv_q : small quotient 
+  tune_dc_divappr_q (rands);
+  tune_inv_divappr_q (rands); */
+  
   tune_powm (rands);
   tune_fac_ui(rands);
   printf("\n");