From f444a2bf6c9e57e9b14330b558559be3ab4cfb9f Mon Sep 17 00:00:00 2001 From: "(no author)" <(no author)> Date: Fri, 19 Feb 2010 12:54:56 +0000 Subject: [PATCH] Attempt to tune some of the division functions. --- gmp-impl.h | 104 +++++------ mpn/generic/dc_div_qr.c | 2 - mpn/generic/dc_divappr_q.c | 6 +- mpn/generic/dc_divappr_q_n.c | 2 - mpn/generic/dc_divrem_n.c | 2 + mpn/generic/inv_div_qr.c | 3 - mpn/generic/inv_divappr_q.c | 8 +- mpn/generic/tdiv_q.c | 5 - mpn/generic/tdiv_qr.c | 3 - mpn/x86_64/k8/k10/k102/gmp-mparam.h | 5 +- tune/Makefile.am | 3 +- tune/common.c | 20 +++ tune/speed.h | 261 ++++++++++++++++++++++++++++ tune/tuneup.c | 147 ++++++++++++---- 14 files changed, 447 insertions(+), 124 deletions(-) diff --git a/gmp-impl.h b/gmp-impl.h index f4866105..9950df12 100644 --- a/gmp-impl.h +++ b/gmp-impl.h @@ -584,51 +584,6 @@ void __gmp_tmp_debug_free _PROTO ((const char *, int, int, #define GMP_NAIL_LOWBIT (CNST_LIMB(1) << GMP_NUMB_BITS) #endif -#if GMP_NAIL_BITS != 0 -/* Set various *_THRESHOLD values to be used for nails. Thus we avoid using - code that has not yet been qualified. */ - -#undef DIV_SB_PREINV_THRESHOLD -#undef DIV_DC_THRESHOLD -#undef POWM_THRESHOLD -#define DIV_SB_PREINV_THRESHOLD MP_SIZE_T_MAX -#define DIV_DC_THRESHOLD 50 -#define POWM_THRESHOLD 0 - -#undef GCD_ACCEL_THRESHOLD -#define GCD_ACCEL_THRESHOLD 3 - -#undef DIVREM_1_NORM_THRESHOLD -#undef DIVREM_1_UNNORM_THRESHOLD -#undef MOD_1_NORM_THRESHOLD -#undef MOD_1_UNNORM_THRESHOLD -#undef USE_PREINV_DIVREM_1 -#undef USE_PREINV_MOD_1 -#undef DIVREM_2_THRESHOLD -#undef DIVEXACT_1_THRESHOLD -#undef MODEXACT_1_ODD_THRESHOLD -#define DIVREM_1_NORM_THRESHOLD MP_SIZE_T_MAX /* no preinv */ -#define DIVREM_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* no preinv */ -#define MOD_1_NORM_THRESHOLD MP_SIZE_T_MAX /* no preinv */ -#define MOD_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* no preinv */ -#define USE_PREINV_DIVREM_1 0 /* no preinv */ -#define USE_PREINV_MOD_1 0 /* no preinv */ -#define DIVREM_2_THRESHOLD MP_SIZE_T_MAX /* no preinv */ - -#undef GET_STR_DC_THRESHOLD -#undef GET_STR_PRECOMPUTE_THRESHOLD -#undef SET_STR_THRESHOLD -#define GET_STR_DC_THRESHOLD 22 -#define GET_STR_PRECOMPUTE_THRESHOLD 42 -#define SET_STR_THRESHOLD 3259 - -/* mpn/generic/mul_fft.c is not nails-capable. */ -#undef MUL_FFT_THRESHOLD -#undef SQR_FFT_THRESHOLD -#define MUL_FFT_THRESHOLD MP_SIZE_T_MAX -#define SQR_FFT_THRESHOLD MP_SIZE_T_MAX -#endif - /* Swap macros. */ #define MP_LIMB_T_SWAP(x, y) \ @@ -1270,6 +1225,8 @@ void mpn_mul_fft_full _PROTO ((mp_ptr op, #define mpn_fft_next_size __MPN(fft_next_size) mp_size_t mpn_fft_next_size _PROTO ((mp_size_t pl, int k)) ATTRIBUTE_CONST; +#define DC_DIVAPPR_Q_N_ITCH(n) (n*10) + #define mpn_sb_divrem_mn __MPN(sb_divrem_mn) mp_limb_t mpn_sb_divrem_mn _PROTO ((mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t)); @@ -1790,18 +1747,37 @@ __GMP_DECLSPEC extern const mp_limb_t __gmp_fib_table[]; #define MPN_FFT_TABLE_SIZE 16 - -/* mpn_dc_divrem_n(n) calls 2*mul(n/2)+2*div(n/2), thus to be faster than - div(n) = 4*div(n/2), we need mul(n/2) to be faster than the classic way, - i.e. n/2 >= MUL_KARATSUBA_THRESHOLD - - Measured values are between 2 and 4 times MUL_KARATSUBA_THRESHOLD, so go - for 3 as an average. */ - -#ifndef DIV_DC_THRESHOLD -#define DIV_DC_THRESHOLD (3 * MUL_KARATSUBA_THRESHOLD) +#ifndef DC_DIV_QR_THRESHOLD +#define DC_DIV_QR_THRESHOLD (3 * MUL_KARATSUBA_THRESHOLD) #endif +#ifndef DC_DIVAPPR_Q_N_THRESHOLD +#define DC_DIVAPPR_Q_N_THRESHOLD (3 * MUL_KARATSUBA_THRESHOLD) +#endif + +#ifndef INV_DIV_QR_THRESHOLD +#define INV_DIV_QR_THRESHOLD (MUL_FFT_THRESHOLD/3) +#endif + +#ifndef INV_DIVAPPR_Q_N_THRESHOLD +#define INV_DIVAPPR_Q_N_THRESHOLD (MUL_FFT_THRESHOLD/3) +#endif + +#ifndef DC_DIV_Q_THRESHOLD +#define DC_DIV_Q_THRESHOLD (3 * MUL_KARATSUBA_THRESHOLD) +#endif + +#ifndef INV_DIV_Q_THRESHOLD +#define INV_DIV_Q_THRESHOLD (MUL_FFT_THRESHOLD/3) +#endif + +#ifndef DC_DIVAPPR_Q_THRESHOLD +#define DC_DIVAPPR_Q_THRESHOLD (3 * MUL_TOOM3_THRESHOLD) +#endif + +#ifndef INV_DIVAPPR_Q_THRESHOLD +#define INV_DIVAPPR_Q_THRESHOLD (MUL_FFT_THRESHOLD/2) +#endif /* Return non-zero if xp,xsize and yp,ysize overlap. If xp+xsize<=yp there's no overlap, or if yp+ysize<=xp there's no @@ -4180,9 +4156,21 @@ extern mp_size_t mulmod_2expm1_threshold; extern mp_size_t div_sb_preinv_threshold; #endif -#undef DIV_DC_THRESHOLD -#define DIV_DC_THRESHOLD div_dc_threshold -extern mp_size_t div_dc_threshold; +#undef DC_DIV_QR_THRESHOLD +#define DC_DIV_QR_THRESHOLD dc_div_qr_threshold +extern mp_size_t dc_div_qr_threshold; + +#undef DC_DIVAPPR_Q_N_THRESHOLD +#define DC_DIVAPPR_Q_N_THRESHOLD dc_divappr_q_n_threshold +extern mp_size_t dc_divappr_q_n_threshold; + +#undef INV_DIV_QR_THRESHOLD +#define INV_DIV_QR_THRESHOLD inv_div_qr_threshold +extern mp_size_t inv_div_qr_threshold; + +#undef INV_DIVAPPR_Q_N_THRESHOLD +#define INV_DIVAPPR_Q_N_THRESHOLD inv_divappr_q_n_threshold +extern mp_size_t inv_divappr_q_n_threshold; #undef POWM_THRESHOLD #define POWM_THRESHOLD powm_threshold diff --git a/mpn/generic/dc_div_qr.c b/mpn/generic/dc_div_qr.c index e7ab824e..7c6e67c8 100644 --- a/mpn/generic/dc_div_qr.c +++ b/mpn/generic/dc_div_qr.c @@ -28,8 +28,6 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #include "gmp-impl.h" #include "longlong.h" -#define DC_DIV_QR_THRESHOLD 56 - mp_limb_t mpn_dc_div_qr_n (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n, mp_limb_t dinv, mp_ptr tp) diff --git a/mpn/generic/dc_divappr_q.c b/mpn/generic/dc_divappr_q.c index dba5acd5..4e174f9f 100644 --- a/mpn/generic/dc_divappr_q.c +++ b/mpn/generic/dc_divappr_q.c @@ -28,10 +28,6 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #include "gmp-impl.h" #include "longlong.h" -#define DC_DIVAPPR_Q_THRESHOLD 141 /*FIXME: tune these */ -#define DC_DIVAPPR_QR_THRESHOLD 145 -#define DC_DIV_QR_THRESHOLD 56 - mp_limb_t mpn_dc_divappr_q (mp_ptr qp, mp_ptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn, mp_limb_t dinv) @@ -187,7 +183,7 @@ mpn_dc_divappr_q (mp_ptr qp, mp_ptr np, mp_size_t nn, q2p = TMP_SALLOC_LIMBS (qn + 1); - if (BELOW_THRESHOLD (qn, DC_DIVAPPR_Q_THRESHOLD)) + if (BELOW_THRESHOLD (qn, DC_DIVAPPR_Q_N_THRESHOLD)) { qh = mpn_sb_divappr_q (q2p, np - qn - 2, 2 * (qn + 1), dp - (qn + 1), qn + 1, dinv); diff --git a/mpn/generic/dc_divappr_q_n.c b/mpn/generic/dc_divappr_q_n.c index fad27eda..62a68d0c 100644 --- a/mpn/generic/dc_divappr_q_n.c +++ b/mpn/generic/dc_divappr_q_n.c @@ -64,8 +64,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. has that description. */ -#define DC_DIVAPPR_Q_N_THRESHOLD 56 - mp_limb_t mpn_dc_divappr_q_n (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n, mp_limb_t dip, mp_ptr tp) diff --git a/mpn/generic/dc_divrem_n.c b/mpn/generic/dc_divrem_n.c index a73d136b..0528e53f 100644 --- a/mpn/generic/dc_divrem_n.c +++ b/mpn/generic/dc_divrem_n.c @@ -29,6 +29,8 @@ MA 02110-1301, USA. */ #include "mpir.h" #include "gmp-impl.h" +#define DIV_DC_THRESHOLD 16 + /* [1] Fast Recursive Division, by Christoph Burnikel and Joachim Ziegler, Technical report MPI-I-98-1-022, october 1998. diff --git a/mpn/generic/inv_div_qr.c b/mpn/generic/inv_div_qr.c index c077d600..040d70bc 100644 --- a/mpn/generic/inv_div_qr.c +++ b/mpn/generic/inv_div_qr.c @@ -30,9 +30,6 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #include "gmp-impl.h" #include "longlong.h" -#define DC_DIV_QR_THRESHOLD 56 -#define INV_DIV_QR_THRESHOLD 1200 - mp_limb_t mpn_inv_div_qr (mp_ptr qp, mp_ptr np, mp_size_t nn, diff --git a/mpn/generic/inv_divappr_q.c b/mpn/generic/inv_divappr_q.c index bc6affc9..6a50aeb0 100644 --- a/mpn/generic/inv_divappr_q.c +++ b/mpn/generic/inv_divappr_q.c @@ -28,12 +28,6 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #include "gmp-impl.h" #include "longlong.h" -#define DC_DIVAPPR_Q_THRESHOLD 141 /*FIXME: tune these */ -#define INV_DIVAPPR_Q_THRESHOLD 1000 -#define INV_DIV_QR_THRESHOLD 1200 -#define DC_DIVAPPR_QR_THRESHOLD 145 -#define DC_DIV_QR_THRESHOLD 56 - mp_limb_t mpn_inv_divappr_q (mp_ptr qp, mp_ptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn, mp_srcptr dinv) @@ -190,7 +184,7 @@ mpn_inv_divappr_q (mp_ptr qp, mp_ptr np, mp_size_t nn, qh = mpn_sb_divappr_q (q2p, np - qn - 2, 2 * (qn + 1), dp - (qn + 1), qn + 1, dinv2); } - else if (BELOW_THRESHOLD (qn, INV_DIVAPPR_Q_THRESHOLD)) + else if (BELOW_THRESHOLD (qn, INV_DIVAPPR_Q_N_THRESHOLD)) { /* It is tempting to use qp for recursive scratch and put quotient in tp, but the recursive scratch needs one limb too many. */ diff --git a/mpn/generic/tdiv_q.c b/mpn/generic/tdiv_q.c index 61ddb761..eb6c1cb5 100644 --- a/mpn/generic/tdiv_q.c +++ b/mpn/generic/tdiv_q.c @@ -81,11 +81,6 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ for the code to be correct. */ #define FUDGE 5 /* FIXME: tune this */ -#define DC_DIV_Q_THRESHOLD 156 /* FIXME: tune this */ -#define DC_DIVAPPR_Q_THRESHOLD 156 /* FIXME: tune this */ -#define INV_DIV_Q_THRESHOLD 1000 -#define INV_DIVAPPR_Q_THRESHOLD 2000 - void mpn_tdiv_q (mp_ptr qp, mp_srcptr np, mp_size_t nn, diff --git a/mpn/generic/tdiv_qr.c b/mpn/generic/tdiv_qr.c index 6e745bb1..e556d40e 100644 --- a/mpn/generic/tdiv_qr.c +++ b/mpn/generic/tdiv_qr.c @@ -35,9 +35,6 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ #include "gmp-impl.h" #include "longlong.h" -#define DC_DIV_QR_THRESHOLD 152 /*FIXME: tune this!*/ -#define INV_DIV_QR_THRESHOLD 1000 - void mpn_tdiv_qr (mp_ptr qp, mp_ptr rp, mp_size_t qxn, mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn) diff --git a/mpn/x86_64/k8/k10/k102/gmp-mparam.h b/mpn/x86_64/k8/k10/k102/gmp-mparam.h index 98bc4c34..00be330b 100644 --- a/mpn/x86_64/k8/k10/k102/gmp-mparam.h +++ b/mpn/x86_64/k8/k10/k102/gmp-mparam.h @@ -23,7 +23,10 @@ #define MULMOD_2EXPM1_THRESHOLD 18 #define DIV_SB_PREINV_THRESHOLD 0 /* always */ -#define DIV_DC_THRESHOLD 372 +#define DC_DIV_QR_THRESHOLD 14 +#define DC_DIVAPPR_Q_N_THRESHOLD 10 +#define INV_DIV_QR_THRESHOLD 926 +#define INV_DIVAPPR_Q_N_THRESHOLD 926 #define POWM_THRESHOLD 154 #define FAC_UI_THRESHOLD 17010 diff --git a/tune/Makefile.am b/tune/Makefile.am index 198b384f..f68d5435 100644 --- a/tune/Makefile.am +++ b/tune/Makefile.am @@ -129,7 +129,8 @@ TUNE_MPN_SRCS = $(TUNE_MPN_SRCS_BASIC) divrem_1.c mod_1.c TUNE_MPN_SRCS_BASIC = dc_divrem_n.c divrem_2.c gcd.c gcdext.c get_str.c \ mul_n.c mullow_n.c mulhigh_n.c mul_fft.c mul.c sb_divrem_mn.c tdiv_qr.c toom4_mul_n.c \ toom7_mul_n.c mulmod_2expm1.c rootrem.c divrem_euclidean_r_1.c divrem_hensel_qr_1.c \ - rsh_divrem_hensel_qr_1.c + rsh_divrem_hensel_qr_1.c dc_divappr_q.c dc_div_qr.c inv_divappr_q.c \ + inv_div_qr.c $(TUNE_MPN_SRCS_BASIC): for i in $(TUNE_MPN_SRCS_BASIC); do \ diff --git a/tune/common.c b/tune/common.c index 5f41e2d5..641e205f 100644 --- a/tune/common.c +++ b/tune/common.c @@ -874,6 +874,26 @@ speed_mpn_sb_divrem_m3_inv (struct speed_params *s) { SPEED_ROUTINE_MPN_SB_DIVREM_M3 (mpn_sb_divrem_mn_inv); } +double +speed_mpn_dc_div_qr_n (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_DC_DIV_N_TSIZE (mpn_dc_div_qr_n, DC_DIVAPPR_Q_N_ITCH(s->size)); +} +double +speed_mpn_dc_divappr_q (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_DC_DIV_SMALL_Q (mpn_dc_divappr_q); +} +double +speed_mpn_inv_div_qr (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_INV_DIV (mpn_inv_div_qr); +} +double +speed_mpn_inv_divappr_q (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_INV_DIV_SMALL_Q (mpn_inv_divappr_q); +} double speed_mpz_mod (struct speed_params *s) diff --git a/tune/speed.h b/tune/speed.h index 668a4b8b..9c2c58cb 100644 --- a/tune/speed.h +++ b/tune/speed.h @@ -178,6 +178,10 @@ double speed_mpn_dc_divrem_sb _PROTO ((struct speed_params *s)); double speed_mpn_dc_divrem_sb_div _PROTO ((struct speed_params *s)); double speed_mpn_dc_divrem_sb_inv _PROTO ((struct speed_params *s)); double speed_mpn_dc_tdiv_qr _PROTO ((struct speed_params *s)); +double speed_mpn_dc_div_qr_n _PROTO ((struct speed_params *s)); +double speed_mpn_dc_divappr_q _PROTO ((struct speed_params *s)); +double speed_mpn_inv_div_qr _PROTO ((struct speed_params *s)); +double speed_mpn_inv_divappr_q _PROTO ((struct speed_params *s)); double speed_MPN_COPY _PROTO ((struct speed_params *s)); double speed_MPN_COPY_DECR _PROTO ((struct speed_params *s)); double speed_MPN_COPY_INCR _PROTO ((struct speed_params *s)); @@ -1499,6 +1503,263 @@ int speed_routine_count_zeros_setup _PROTO ((struct speed_params *s, return t; \ } +#define SPEED_ROUTINE_MPN_DC_DIV_N_TSIZE(function, tsize) \ + { \ + unsigned i; \ + mp_ptr a, d, q, tmp; \ + mp_limb_t inv; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 2); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (a, 2*s->size, s->align_xp); \ + SPEED_TMP_ALLOC_LIMBS (d, s->size, s->align_yp); \ + SPEED_TMP_ALLOC_LIMBS (q, s->size+1, s->align_wp); \ + SPEED_TMP_ALLOC_LIMBS (tmp, tsize, s->align_wp2); \ + \ + MPN_COPY (a, s->xp, s->size); \ + MPN_COPY (a+s->size, s->xp, s->size); \ + \ + MPN_COPY (d, s->yp, s->size); \ + \ + /* normalize the data */ \ + d[s->size-1] |= GMP_NUMB_HIGHBIT; \ + a[2*s->size-1] = d[s->size-1] - 1; \ + \ + speed_operand_src (s, a, 2*s->size); \ + speed_operand_src (s, d, s->size); \ + speed_operand_dst (s, q, s->size+1); \ + speed_cache_fill (s); \ + \ + invert_1(inv, d[s->size-1], d[s->size-2]); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + function(q, a, d, s->size, inv, tmp); \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + +#define SPEED_ROUTINE_MPN_DC_DIV(function) \ + { \ + unsigned i; \ + mp_ptr a, d, q; \ + mp_limb_t inv; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 2); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (a, 2*s->size, s->align_xp); \ + SPEED_TMP_ALLOC_LIMBS (d, s->size, s->align_yp); \ + SPEED_TMP_ALLOC_LIMBS (q, s->size+1, s->align_wp); \ + \ + MPN_COPY (a, s->xp, s->size); \ + MPN_COPY (a+s->size, s->xp, s->size); \ + \ + MPN_COPY (d, s->yp, s->size); \ + \ + /* normalize the data */ \ + d[s->size-1] |= GMP_NUMB_HIGHBIT; \ + a[2*s->size-1] = d[s->size-1] - 1; \ + \ + speed_operand_src (s, a, 2*s->size); \ + speed_operand_src (s, d, s->size); \ + speed_operand_dst (s, q, s->size+1); \ + speed_cache_fill (s); \ + \ + invert_1(inv, d[s->size-1], d[s->size-2]); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + function(q, a, 2*s->size, d, s->size, inv); \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + +#define SPEED_ROUTINE_MPN_DC_DIV_SMALL_Q(function) \ + { \ + unsigned i; \ + mp_ptr a, d, q; \ + mp_limb_t inv; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 2); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (a, 3*s->size, s->align_xp); \ + SPEED_TMP_ALLOC_LIMBS (d, 2*s->size, s->align_yp); \ + SPEED_TMP_ALLOC_LIMBS (q, s->size+1, s->align_wp); \ + \ + MPN_COPY (a, s->xp, s->size); \ + MPN_COPY (a+s->size, s->xp, s->size); \ + MPN_COPY (a+2*s->size, s->xp, s->size); \ + \ + MPN_COPY (d, s->yp, s->size); \ + MPN_COPY (d+s->size, s->yp, s->size); \ + \ + /* normalize the data */ \ + d[2*s->size-1] |= GMP_NUMB_HIGHBIT; \ + a[3*s->size-1] = d[2*s->size-1] - 1; \ + \ + speed_operand_src (s, a, 3*s->size); \ + speed_operand_src (s, d, 2*s->size); \ + speed_operand_dst (s, q, s->size+1); \ + speed_cache_fill (s); \ + \ + invert_1(inv, d[2*s->size-1], d[2*s->size-2]); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + function(q, a, 3*s->size, d, 2*s->size, inv); \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + +#define SPEED_ROUTINE_MPN_INV_DIV_N(function) \ + { \ + unsigned i; \ + mp_ptr a, d, q, inv; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 2); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (a, 2*s->size, s->align_xp); \ + SPEED_TMP_ALLOC_LIMBS (d, s->size, s->align_yp); \ + SPEED_TMP_ALLOC_LIMBS (q, s->size+1, s->align_wp); \ + SPEED_TMP_ALLOC_LIMBS (inv, s->size, s->align_wp2); \ + \ + MPN_COPY (a, s->xp, s->size); \ + MPN_COPY (a+s->size, s->xp, s->size); \ + \ + MPN_COPY (d, s->yp, s->size); \ + \ + /* normalize the data */ \ + d[s->size-1] |= GMP_NUMB_HIGHBIT; \ + a[2*s->size-1] = d[s->size-1] - 1; \ + \ + speed_operand_src (s, a, 2*s->size); \ + speed_operand_src (s, d, s->size); \ + speed_operand_dst (s, q, s->size+1); \ + speed_cache_fill (s); \ + \ + mpn_invert(inv, d, s->size); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + function(q, a, d, s->size, inv); \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + +#define SPEED_ROUTINE_MPN_INV_DIV(function) \ + { \ + unsigned i; \ + mp_ptr a, d, q, inv; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 2); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (a, 2*s->size, s->align_xp); \ + SPEED_TMP_ALLOC_LIMBS (d, s->size, s->align_yp); \ + SPEED_TMP_ALLOC_LIMBS (q, s->size+1, s->align_wp); \ + SPEED_TMP_ALLOC_LIMBS (inv, s->size, s->align_wp2); \ + \ + MPN_COPY (a, s->xp, s->size); \ + MPN_COPY (a+s->size, s->xp, s->size); \ + \ + MPN_COPY (d, s->yp, s->size); \ + \ + /* normalize the data */ \ + d[s->size-1] |= GMP_NUMB_HIGHBIT; \ + a[2*s->size-1] = d[s->size-1] - 1; \ + \ + speed_operand_src (s, a, 2*s->size); \ + speed_operand_src (s, d, s->size); \ + speed_operand_dst (s, q, s->size+1); \ + speed_cache_fill (s); \ + \ + mpn_invert(inv, d, s->size); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + function(q, a, 2*s->size, d, s->size, inv); \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + +#define SPEED_ROUTINE_MPN_INV_DIV_SMALL_Q(function) \ + { \ + unsigned i; \ + mp_ptr a, d, q, inv; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 2); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (a, 3*s->size, s->align_xp); \ + SPEED_TMP_ALLOC_LIMBS (d, 2*s->size, s->align_yp); \ + SPEED_TMP_ALLOC_LIMBS (q, s->size+1, s->align_wp); \ + SPEED_TMP_ALLOC_LIMBS (inv, 2*s->size, s->align_wp2); \ + \ + MPN_COPY (a, s->xp, s->size); \ + MPN_COPY (a+s->size, s->xp, s->size); \ + MPN_COPY (a+2*s->size, s->xp, s->size); \ + \ + MPN_COPY (d, s->yp, s->size); \ + MPN_COPY (d+s->size, s->yp, s->size); \ + \ + /* normalize the data */ \ + d[2*s->size-1] |= GMP_NUMB_HIGHBIT; \ + a[3*s->size-1] = d[2*s->size-1] - 1; \ + \ + speed_operand_src (s, a, 3*s->size); \ + speed_operand_src (s, d, 2*s->size); \ + speed_operand_dst (s, q, s->size+1); \ + speed_cache_fill (s); \ + \ + mpn_invert(inv, d, 2*s->size); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + function(q, a, 3*s->size, d, 2*s->size, inv); \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + #define SPEED_ROUTINE_MPN_DC_DIVREM_N(function) \ SPEED_ROUTINE_MPN_DC_DIVREM_CALL((*function) (q, a, d, s->size)) diff --git a/tune/tuneup.c b/tune/tuneup.c index e2177223..1186af96 100644 --- a/tune/tuneup.c +++ b/tune/tuneup.c @@ -24,40 +24,16 @@ MA 02110-1301, USA. */ -t turns on some diagnostic traces, a second -t turns on more traces. - Notes: - - The code here isn't a vision of loveliness, mainly because it's subject - to ongoing changes according to new things wanting to be tuned, and - practical requirements of systems tested. - - Sometimes running the program twice produces slightly different results. - This is probably because there's so little separating algorithms near - their crossover, and on that basis it should make little or no difference - to the final speed of the relevant routines, but nothing has been done to - check that carefully. - Algorithm: - The thresholds are determined as follows. A crossover may not be a - single size but rather a range where it oscillates between method A or - method B faster. If the threshold is set making B used where A is faster - (or vice versa) that's bad. Badness is the percentage time lost and - total badness is the sum of this over all sizes measured. The threshold - is set to minimize total badness. + The thresholds are determined as follows: two algorithms A and B are + compared over a range of sizes. At each point, we define "badness" to + be the percentage time lost if algorithm B is chosen over algorithm A. + Then total badness is the sum of this over all sizes measured. The + threshold is set to minimize total badness. - Suppose, as sizes increase, method B becomes faster than method A. The - effect of the rule is that, as you look at increasing sizes, isolated - points where B is faster are ignored, but when it's consistently faster, - or faster on balance, then the threshold is set there. The same result - is obtained thinking in the other direction of A becoming faster at - smaller sizes. - - In practice the thresholds tend to be chosen to bring on the next - algorithm fairly quickly. - - This rule is attractive because it's got a basis in reason and is fairly - easy to implement, but no work has been done to actually compare it in - absolute terms to other possibilities. + In practice the thresholds tend to be chosen to bring on algorithm B + fairly quickly. Implementation: @@ -97,6 +73,57 @@ MA 02110-1301, USA. */ so each k is probably being brought on a touch early. This isn't likely to make a difference, and the simpler probing means fewer tests. + Code: + - main : checks for various command line options and calls all() + - all : prints the tuneup message, date and compiler, then calls + each of the individual tuning functions in turn, e.g. + tune_mul() + - tune_blah() : tunes function of type blah, e.g. tune_mul() tunes the + karatsuba and toom cutoffs. It sets up a param struct with the + following parameters: + a) name : the name of the threshold being tuned, e.g. + MUL_TOOM3_THRESHOLD + b) function : the first function being compared (this must be + of the form speed_blah and the function speed_blah will + exist in speed.h and speed.c + c) function2 : the second function being compared (if set to + NULL, this is automatically set to equal function + d) step_factor : the size of the step between sizes, + set to 0.01 by default, i.e. 1% increments + e) function_fudge : multiplier for the speed of function, used + to adjust for overheads, by default set to 1.0 + f) stop_since_change is a stop condition. If the threshold + has not changed for this many iterations, then stop. This + is set to 80 iterations by default. + g) stop_factor : this is another stop factor. If method B + becomes faster by at least this factor, then stop. By + default this is set to 1.2, i.e. 20% faster. + h) min_size : the minimum size to start comparing from. + i) min_is_always : if this is set to 1, then if the threshold + just ends up being min_size, then the threshold is actually + set to 0, i.e. algorithm B is always used. + j) max_size : the maximum size to compare up to. By default this + is set to DEFAULT_MAX_SIZE which is 1000 limbs. + h) check_size : if set, will check that the given starting size + is valid for both algorithms and that algorithm B is at least + 4% slower than algorithm A at that point. + i) size_extra : this is a bias added to each size when doing + measurements. It is subtracted off after each measurement. + It is basically used for shifting a threshold from the + measured value. + j) data_high : if set to 1, the high limb of xp and yp are set to + be less than s->r, if set to 2, the high limb of xp and yp are + set to be greater than or equal to s->r + k) noprint : if set, the threshold is computed but not printed. + + After setting all the appropriate parameters, the function one() is + called. It takes a reference to a parameter, e.g. mul_toom3_threshold + which is defined in a table below. That threshold will have been given + some initial value (usually MP_SIZE_T_MAX) in the table. It also takes + a reference to the param struct. + - one() : does repeated timings over the given range of sizes always setting + the threshold to size+1 for function and size for function2. + */ #define TUNE_PROGRAM_BUILD 1 /* for gmp-impl.h */ @@ -172,7 +199,10 @@ mp_size_t mulhigh_basecase_threshold = MP_SIZE_T_MAX; mp_size_t mulhigh_dc_threshold = MP_SIZE_T_MAX; mp_size_t mulhigh_mul_threshold = MP_SIZE_T_MAX; mp_size_t div_sb_preinv_threshold = MP_SIZE_T_MAX; -mp_size_t div_dc_threshold = MP_SIZE_T_MAX; +mp_size_t dc_div_qr_threshold = MP_SIZE_T_MAX; +mp_size_t dc_divappr_q_n_threshold = MP_SIZE_T_MAX; +mp_size_t inv_div_qr_threshold = MP_SIZE_T_MAX; +mp_size_t inv_divappr_q_n_threshold = MP_SIZE_T_MAX; mp_size_t powm_threshold = MP_SIZE_T_MAX; mp_size_t fac_ui_threshold = MP_SIZE_T_MAX; mp_size_t gcd_accel_threshold = MP_SIZE_T_MAX; @@ -1130,13 +1160,45 @@ tune_sb_preinv (gmp_randstate_t rands) void -tune_dc (gmp_randstate_t rands) +tune_dc_div (gmp_randstate_t rands) { + { static struct param_t param; - param.name = "DIV_DC_THRESHOLD"; - param.function = speed_mpn_dc_tdiv_qr; + param.name = "DC_DIV_QR_THRESHOLD"; + param.function = speed_mpn_dc_div_qr_n; + param.stop_factor = 1.1; param.step_factor = 0.02; - one (&div_dc_threshold, rands, ¶m); + one (&dc_div_qr_threshold, rands, ¶m); + } + + { + static struct param_t param; + param.name = "DC_DIVAPPR_Q_N_THRESHOLD"; + param.function = speed_mpn_dc_divappr_q; + param.stop_factor = 1.1; + param.step_factor = 0.02; + one (&dc_divappr_q_n_threshold, rands, ¶m); + } + + { + static struct param_t param; + param.name = "INV_DIV_QR_THRESHOLD"; + param.function = speed_mpn_inv_div_qr; + param.min_size = dc_div_qr_threshold*4; + param.stop_factor = 2.0; + param.step_factor = 0.2; + one (&inv_div_qr_threshold, rands, ¶m); + } + + { + static struct param_t param; + param.name = "INV_DIVAPPR_Q_N_THRESHOLD"; + param.function = speed_mpn_inv_divappr_q; + param.min_size = dc_divappr_q_n_threshold*4; + param.stop_factor = 2.0; + param.step_factor = 0.2; + one (&inv_divappr_q_n_threshold, rands, ¶m); + } } @@ -1866,7 +1928,18 @@ all (gmp_randstate_t rands) printf("\n"); tune_sb_preinv (rands); - tune_dc (rands); + + /* dc_div_qr_n, dc_divappr_q, inv_div_qr, inv_divappr_q */ + tune_dc_div (rands); + + /* mpn_tdiv_q : balanced + tune_dc_div_q (rands); + tune_inv_div_q (rands); + + /* mpn_tdiv_q : small quotient + tune_dc_divappr_q (rands); + tune_inv_divappr_q (rands); */ + tune_powm (rands); tune_fac_ui(rands); printf("\n");