From 6f542aa81da845f945441b2a1c07e09fcdbd8b24 Mon Sep 17 00:00:00 2001 From: wbhart Date: Wed, 8 Aug 2012 05:04:56 +0000 Subject: [PATCH] Fixed fft stuff in tuneup. --- gmp-impl.h | 75 ++--------------------- tune/common.c | 29 ++------- tune/speed.c | 8 +-- tune/speed.h | 6 +- tune/tuneup.c | 160 +++++++------------------------------------------- 5 files changed, 37 insertions(+), 241 deletions(-) diff --git a/gmp-impl.h b/gmp-impl.h index cb8c8f6f..ab533c56 100644 --- a/gmp-impl.h +++ b/gmp-impl.h @@ -1828,31 +1828,7 @@ __GMP_DECLSPEC extern const mp_limb_t __gmp_fib_table[]; #define SQR_TOOM8_THRESHOLD_LIMIT SQR_TOOM8_THRESHOLD #endif -/* First k to use for an FFT modF multiply. A modF FFT is an order - log(2^k)/log(2^(k-1)) algorithm, so k=3 is merely 1.5 like karatsuba, - whereas k=4 is 1.33 which is faster than toom3 at 1.485. */ -#define FFT_FIRST_K 4 - -/* Threshold at which FFT should be used to do a modF NxN -> N multiply. */ -#ifndef MUL_FFT_MODF_THRESHOLD -#define MUL_FFT_MODF_THRESHOLD (MUL_TOOM3_THRESHOLD * 3) -#endif -#ifndef SQR_FFT_MODF_THRESHOLD -#define SQR_FFT_MODF_THRESHOLD (SQR_TOOM3_THRESHOLD * 3) -#endif - -/* Threshold at which FFT should be used to do an NxN -> 2N multiply. This - will be a size where FFT is using k=7 or k=8, since an FFT-k used for an - NxN->2N multiply and not recursing into itself is an order - log(2^k)/log(2^(k-2)) algorithm, so it'll be at least k=7 at 1.39 which - is the first better than toom3. */ -#ifndef MUL_FFT_THRESHOLD -#define MUL_FFT_THRESHOLD (MUL_FFT_MODF_THRESHOLD * 10) -#endif -#ifndef SQR_FFT_THRESHOLD -#define SQR_FFT_THRESHOLD (SQR_FFT_MODF_THRESHOLD * 10) -#endif - +/* points at which fft is used for mul/sqr and mulmod_Bexp resp. */ #ifndef MUL_FFT_FULL_THRESHOLD #define MUL_FFT_FULL_THRESHOLD (MUL_TOOM8H_THRESHOLD * 10) #endif @@ -1860,36 +1836,13 @@ __GMP_DECLSPEC extern const mp_limb_t __gmp_fib_table[]; #define SQR_FFT_FULL_THRESHOLD (SQR_TOOM8_THRESHOLD * 10) #endif -/* Table of thresholds for successive modF FFT "k"s. The first entry is - where FFT_FIRST_K+1 should be used, the second FFT_FIRST_K+2, - etc. See mpn_fft_best_k(). */ -#ifndef MUL_FFT_TABLE -#define MUL_FFT_TABLE \ - { MUL_TOOM3_THRESHOLD * 4, /* k=5 */ \ - MUL_TOOM3_THRESHOLD * 8, /* k=6 */ \ - MUL_TOOM3_THRESHOLD * 16, /* k=7 */ \ - MUL_TOOM3_THRESHOLD * 32, /* k=8 */ \ - MUL_TOOM3_THRESHOLD * 96, /* k=9 */ \ - MUL_TOOM3_THRESHOLD * 288, /* k=10 */ \ - 0 } +#ifndef MUL_FFT_THRESHOLD +#define MUL_FFT_THRESHOLD (MUL_FFT_FULL_THRESHOLD / 2) #endif -#ifndef SQR_FFT_TABLE -#define SQR_FFT_TABLE \ - { SQR_TOOM3_THRESHOLD * 4, /* k=5 */ \ - SQR_TOOM3_THRESHOLD * 8, /* k=6 */ \ - SQR_TOOM3_THRESHOLD * 16, /* k=7 */ \ - SQR_TOOM3_THRESHOLD * 32, /* k=8 */ \ - SQR_TOOM3_THRESHOLD * 96, /* k=9 */ \ - SQR_TOOM3_THRESHOLD * 288, /* k=10 */ \ - 0 } +#ifndef SQR_FFT_THRESHOLD +#define SQR_FFT_THRESHOLD (SQR_FFT_FULL_THRESHOLD / 2) #endif -#ifndef FFT_TABLE_ATTRS -#define FFT_TABLE_ATTRS static const -#endif - -#define MPN_FFT_TABLE_SIZE 16 - #ifndef DC_DIV_QR_THRESHOLD #define DC_DIV_QR_THRESHOLD (3 * MUL_KARATSUBA_THRESHOLD) #endif @@ -4067,13 +4020,6 @@ extern mp_size_t mul_fft_threshold; #define MUL_FFT_FULL_THRESHOLD mul_fft_full_threshold extern mp_size_t mul_fft_full_threshold; -#undef MUL_FFT_MODF_THRESHOLD -#define MUL_FFT_MODF_THRESHOLD mul_fft_modf_threshold -extern mp_size_t mul_fft_modf_threshold; - -#undef MUL_FFT_TABLE -#define MUL_FFT_TABLE { 0 } - /* A native mpn_sqr_basecase is not tuned and SQR_BASECASE_THRESHOLD should remain as zero (always use it). */ #if ! HAVE_NATIVE_mpn_sqr_basecase @@ -4111,13 +4057,6 @@ extern mp_size_t sqr_fft_threshold; #define SQR_FFT_FULL_THRESHOLD sqr_fft_full_threshold extern mp_size_t sqr_fft_full_threshold; -#undef SQR_FFT_MODF_THRESHOLD -#define SQR_FFT_MODF_THRESHOLD sqr_fft_modf_threshold -extern mp_size_t sqr_fft_modf_threshold; - -#undef SQR_FFT_TABLE -#define SQR_FFT_TABLE { 0 } - #undef MULLOW_BASECASE_THRESHOLD #define MULLOW_BASECASE_THRESHOLD mullow_basecase_threshold extern mp_size_t mullow_basecase_threshold; @@ -4279,10 +4218,6 @@ extern mp_size_t set_str_dc_threshold; #define SET_STR_PRECOMPUTE_THRESHOLD set_str_precompute_threshold extern mp_size_t set_str_precompute_threshold; -#undef FFT_TABLE_ATTRS -#define FFT_TABLE_ATTRS -extern mp_size_t mpn_fft_table[2][MPN_FFT_TABLE_SIZE]; - /* Sizes the tune program tests up to, used in a couple of recompilations. */ #undef MUL_KARATSUBA_THRESHOLD_LIMIT #undef MUL_TOOM3_THRESHOLD_LIMIT diff --git a/tune/common.c b/tune/common.c index fbc8992e..654fa437 100644 --- a/tune/common.c +++ b/tune/common.c @@ -1150,16 +1150,16 @@ speed_mpn_toom8_sqr_n (struct speed_params *s) } double -speed_mpn_mul_fft_full (struct speed_params *s) +speed_mpn_mul_fft_main (struct speed_params *s) { SPEED_ROUTINE_MPN_MUL_N_CALL - (mpn_mul_fft_full (wp, s->xp, s->size, s->yp, s->size)); + (mpn_mul_fft_main (wp, s->xp, s->size, s->yp, s->size)); } double -speed_mpn_mul_fft_full_sqr (struct speed_params *s) +speed_mpn_sqr_fft_main (struct speed_params *s) { SPEED_ROUTINE_MPN_SQR_CALL - (mpn_mul_fft_full (wp, s->xp, s->size, s->xp, s->size)); + (mpn_mul_fft_main (wp, s->xp, s->size, s->xp, s->size)); } @@ -1171,20 +1171,15 @@ speed_mpn_mul_fft_full_sqr (struct speed_params *s) { \ mp_ptr wp; \ mp_size_t pl; \ - int k; \ unsigned i; \ double t; \ TMP_DECL; \ \ SPEED_RESTRICT_COND (s->size >= 1); \ \ - if (s->r != 0) \ - k = s->r; \ - else \ - k = mpn_fft_best_k (s->size, sqr); \ \ TMP_MARK; \ - pl = mpn_fft_next_size (s->size, k); \ + pl = fft_adjust_limbs (s->size); \ SPEED_TMP_ALLOC_LIMBS (wp, pl+1, s->align_wp); \ \ speed_operand_src (s, s->xp, s->size); \ @@ -1204,20 +1199,6 @@ speed_mpn_mul_fft_full_sqr (struct speed_params *s) return t; \ } -double -speed_mpn_mul_fft (struct speed_params *s) -{ - SPEED_ROUTINE_MPN_MUL_FFT_CALL - (mpn_mul_fft (wp, pl, s->xp, s->size, s->yp, s->size, k), 0); -} - -double -speed_mpn_mul_fft_sqr (struct speed_params *s) -{ - SPEED_ROUTINE_MPN_MUL_FFT_CALL - (mpn_mul_fft (wp, pl, s->xp, s->size, s->xp, s->size, k), 1); -} - double speed_mpn_mullow_n (struct speed_params *s) { diff --git a/tune/speed.c b/tune/speed.c index 03672d5a..38814734 100644 --- a/tune/speed.c +++ b/tune/speed.c @@ -326,11 +326,9 @@ const struct routine_t { { "mpn_toom8h_mul", speed_mpn_toom8h_mul }, { "mpn_toom3_sqr_n", speed_mpn_toom3_sqr_n }, { "mpn_toom4_sqr_n", speed_mpn_toom4_sqr_n }, - { "mpn_mul_fft_full", speed_mpn_mul_fft_full }, - { "mpn_mul_fft_full_sqr", speed_mpn_mul_fft_full_sqr }, - - { "mpn_mul_fft", speed_mpn_mul_fft, FLAG_R_OPTIONAL }, - { "mpn_mul_fft_sqr", speed_mpn_mul_fft_sqr, FLAG_R_OPTIONAL }, + + { "mpn_mul_fft_main", speed_mpn_mul_fft_main, FLAG_R_OPTIONAL }, + { "mpn_sqr_fft_main", speed_mpn_sqr_fft_main, FLAG_R_OPTIONAL }, { "mpn_mullow_n", speed_mpn_mullow_n }, { "mpn_mullow_n_basecase", speed_mpn_mullow_n_basecase}, diff --git a/tune/speed.h b/tune/speed.h index 8863c40a..57c00fdf 100644 --- a/tune/speed.h +++ b/tune/speed.h @@ -251,10 +251,8 @@ double speed_mpn_mul_1 _PROTO ((struct speed_params *s)); double speed_mpn_mul_1_inplace _PROTO ((struct speed_params *s)); double speed_mpn_mul_2 _PROTO ((struct speed_params *s)); double speed_mpn_mul_basecase _PROTO ((struct speed_params *s)); -double speed_mpn_mul_fft _PROTO ((struct speed_params *s)); -double speed_mpn_mul_fft_sqr _PROTO ((struct speed_params *s)); -double speed_mpn_mul_fft_full _PROTO ((struct speed_params *s)); -double speed_mpn_mul_fft_full_sqr _PROTO ((struct speed_params *s)); +double speed_mpn_mul_fft_main _PROTO ((struct speed_params *s)); +double speed_mpn_sqr_fft_main _PROTO ((struct speed_params *s)); double speed_mpn_mul_n _PROTO ((struct speed_params *s)); double speed_mpn_mul_n_sqr _PROTO ((struct speed_params *s)); double speed_mpn_mullow_n _PROTO ((struct speed_params *s)); diff --git a/tune/tuneup.c b/tune/tuneup.c index 02e8f351..719bdd9e 100644 --- a/tune/tuneup.c +++ b/tune/tuneup.c @@ -67,12 +67,6 @@ MA 02110-1301, USA. */ limits are recompiled though, to make them accept a bigger range of sizes than normal, eg. mpn_sqr_basecase to compare against mpn_kara_sqr_n. - Limitations: - - The FFTs aren't subject to the same badness rule as the other thresholds, - so each k is probably being brought on a touch early. This isn't likely - to make a difference, and the simpler probing means fewer tests. - Code: - main : checks for various command line options and calls all() - all : prints the tuneup message, date and compiler, then calls @@ -182,18 +176,14 @@ mp_size_t mul_karatsuba_threshold = MP_SIZE_T_MAX; mp_size_t mul_toom3_threshold = MUL_TOOM3_THRESHOLD_LIMIT; mp_size_t mul_toom4_threshold = MUL_TOOM4_THRESHOLD_LIMIT; mp_size_t mul_toom8h_threshold = MUL_TOOM8H_THRESHOLD_LIMIT; -mp_size_t mul_fft_threshold = MP_SIZE_T_MAX; mp_size_t mul_fft_full_threshold = MP_SIZE_T_MAX; -mp_size_t mul_fft_modf_threshold = MP_SIZE_T_MAX; mp_size_t sqr_basecase_threshold = MP_SIZE_T_MAX; mp_size_t sqr_karatsuba_threshold = (TUNE_SQR_KARATSUBA_MAX == 0 ? MP_SIZE_T_MAX : TUNE_SQR_KARATSUBA_MAX); mp_size_t sqr_toom3_threshold = SQR_TOOM3_THRESHOLD_LIMIT; mp_size_t sqr_toom4_threshold = SQR_TOOM4_THRESHOLD_LIMIT; mp_size_t sqr_toom8_threshold = SQR_TOOM8_THRESHOLD_LIMIT; -mp_size_t sqr_fft_threshold = MP_SIZE_T_MAX; mp_size_t sqr_fft_full_threshold = MP_SIZE_T_MAX; -mp_size_t sqr_fft_modf_threshold = MP_SIZE_T_MAX; mp_size_t mulmod_2expm1_threshold = MP_SIZE_T_MAX; mp_size_t mullow_basecase_threshold = MP_SIZE_T_MAX; mp_size_t mullow_dc_threshold = MP_SIZE_T_MAX; @@ -234,9 +224,6 @@ mp_size_t divrem_hensel_qr_1_threshold = MP_SIZE_T_MAX; mp_size_t rsh_divrem_hensel_qr_1_threshold = MP_SIZE_T_MAX; mp_size_t divrem_euclid_hensel_threshold = MP_SIZE_T_MAX; -mp_size_t fft_modf_sqr_threshold = MP_SIZE_T_MAX; -mp_size_t fft_modf_mul_threshold = MP_SIZE_T_MAX; - struct param_t { const char *name; speed_function_t function; @@ -702,11 +689,8 @@ one (mp_size_t *threshold, gmp_randstate_t rands,struct param_t *param) if this step effect remains. */ struct fft_param_t { - const char *table_name; const char *threshold_name; - const char *modf_threshold_name; mp_size_t *p_threshold; - mp_size_t *p_modf_threshold; mp_size_t first_size; mp_size_t max_size; speed_function_t function; @@ -714,103 +698,32 @@ struct fft_param_t { mp_size_t sqr; }; - -/* mpn_mul_fft requires pl a multiple of 2^k limbs, but with - N=pl*BIT_PER_MP_LIMB it internally also pads out so N/2^k is a multiple - of 2^(k-1) bits. */ - mp_size_t -fft_step_size (int k) +fft_step_size (int size) { mp_size_t step; - - step = MAX ((mp_size_t) 1 << (k-1), BITS_PER_MP_LIMB) / BITS_PER_MP_LIMB; - step *= (mp_size_t) 1 << k; + + step = fft_adjust_limbs(size + 1) - size; if (step <= 0) { - printf ("Can't handle k=%d\n", k); + printf ("Can't handle size=%d\n", size); abort (); } return step; } -mp_size_t -fft_next_size (mp_size_t pl, int k) -{ - mp_size_t m = fft_step_size (k); - -/* printf ("[k=%d %ld] %ld ->", k, m, pl); */ - - if (pl == 0 || (pl & (m-1)) != 0) - pl = (pl | (m-1)) + 1; - -/* printf (" %ld\n", pl); */ - return pl; -} - void fft (struct fft_param_t *p,gmp_randstate_t rands) { mp_size_t size; int i, k; - for (i = 0; i < numberof (mpn_fft_table[p->sqr]); i++) - mpn_fft_table[p->sqr][i] = MP_SIZE_T_MAX; - *p->p_threshold = MP_SIZE_T_MAX; - *p->p_modf_threshold = MP_SIZE_T_MAX; - + option_trace = MAX (option_trace, option_fft_trace); - printf ("#define %s {", p->table_name); - if (option_trace >= 2) - printf ("\n"); - - k = FFT_FIRST_K; - size = p->first_size; - for (;;) - { - double tk, tk1; - - size = fft_next_size (size+1, k+1); - - if (size >= p->max_size) - break; - if (k >= FFT_FIRST_K + numberof (mpn_fft_table[p->sqr])) - break; - - /* compare k to k+1 in the middle of the current k+1 step */ - s.size = size + fft_step_size (k+1) / 2; - s.r = k; - tk = tuneup_measure (p->function, rands, NULL, &s); - if (tk == -1.0) - abort (); - - s.r = k+1; - tk1 = tuneup_measure (p->function, rands, NULL, &s); - if (tk1 == -1.0) - abort (); - - if (option_trace >= 2) - printf ("at %ld size=%ld k=%d %.9f k=%d %.9f\n", - (long) size, (long) s.size, k, tk, k+1, tk1); - - /* declare the k+1 threshold as soon as it's faster at its midpoint */ - if (tk1 < tk) - { - mpn_fft_table[p->sqr][k-FFT_FIRST_K] = s.size; - printf (" %ld,", (long) s.size); - if (option_trace >= 2) printf ("\n"); - k++; - } - } - - mpn_fft_table[p->sqr][k-FFT_FIRST_K] = 0; - printf (" 0 }\n"); - - size = p->first_size; /* Declare an FFT faster than a plain toom3 etc multiplication found as @@ -818,62 +731,39 @@ fft (struct fft_param_t *p,gmp_randstate_t rands) middle of the FFT step is tested. */ for (;;) { - int modf = (*p->p_modf_threshold == MP_SIZE_T_MAX); double tk, tm; - /* k=7 should be the first FFT which can beat toom3 on a full - multiply, so jump to that threshold and save some probing after the - modf threshold is found. */ - if (!modf && size < mpn_fft_table[p->sqr][2]) - { - size = mpn_fft_table[p->sqr][2]; - if (option_trace >= 2) - printf ("jump to size=%ld\n", (long) size); - } - - size = fft_next_size (size+1, mpn_fft_best_k (size, p->sqr)); - k = mpn_fft_best_k (size, p->sqr); - + size = fft_adjust_limbs (size+1); + if (size >= p->max_size) break; - s.size = size + fft_step_size (k) / 2; - s.r = k; + s.size = size + fft_step_size (size) / 2; + tk = tuneup_measure (p->function, rands, NULL, &s); if (tk == -1.0) abort (); - if (!modf) s.size; tm = tuneup_measure (p->mul_function, rands, NULL, &s); if (tm == -1.0) abort (); if (option_trace >= 2) - printf ("at %ld size=%ld k=%d %.9f size=%ld %s mul %.9f\n", + printf ("at %ld size=%ld %.9f size=%ld %s mul %.9f\n", (long) size, - (long) size + fft_step_size (k) / 2, k, tk, - (long) s.size, modf ? "modf" : "full", tm); + (long) size + fft_step_size (size) / 2, tk, + (long) s.size, "full", tm); if (tk < tm) { - if (modf) - { - *p->p_modf_threshold = s.size; - print_define (p->modf_threshold_name, *p->p_modf_threshold); - } - else - { - *p->p_threshold = s.size; - print_define (p->threshold_name, *p->p_threshold); - break; - } + *p->p_threshold = s.size; + print_define (p->threshold_name, *p->p_threshold); + break; } } } - - /* Start karatsuba from 4, since the Cray t90 ieee code is much faster at 2, giving wrong results. */ void @@ -904,7 +794,7 @@ tune_mul (gmp_randstate_t rands) one (&mul_toom8h_threshold, rands, ¶m); /* disabled until tuned */ - MUL_FFT_THRESHOLD = MP_SIZE_T_MAX; + MUL_FFT_FULL_THRESHOLD = MP_SIZE_T_MAX; } @@ -936,7 +826,7 @@ tune_mullow (gmp_randstate_t rands) one (&mullow_mul_threshold, rands, ¶m); /* disabled until tuned */ - MUL_FFT_THRESHOLD = MP_SIZE_T_MAX; + MUL_FFT_FULL_THRESHOLD = MP_SIZE_T_MAX; } void @@ -949,7 +839,7 @@ tune_mulmod_2expm1 (gmp_randstate_t rands) //param.max_size = ?? ; one (&mulmod_2expm1_threshold, rands, ¶m); /* disabled until tuned */ - MUL_FFT_THRESHOLD = MP_SIZE_T_MAX; // ?????????????? + MUL_FFT_FULL_THRESHOLD = MP_SIZE_T_MAX; // ?????????????? } void @@ -978,7 +868,7 @@ tune_mulhigh (gmp_randstate_t rands) one (&mulhigh_mul_threshold, rands, ¶m); /* disabled until tuned */ - MUL_FFT_THRESHOLD = MP_SIZE_T_MAX; + MUL_FFT_FULL_THRESHOLD = MP_SIZE_T_MAX; } void @@ -1057,7 +947,7 @@ void tune_sqr (gmp_randstate_t rands) { /* disabled until tuned */ - SQR_FFT_THRESHOLD = MP_SIZE_T_MAX; + SQR_FFT_FULL_THRESHOLD = MP_SIZE_T_MAX; if (HAVE_NATIVE_mpn_sqr_basecase) { @@ -1909,14 +1799,11 @@ tune_fft_mul (gmp_randstate_t rands) if (option_fft_max_size == 0) return; - param.table_name = "MUL_FFT_TABLE"; param.threshold_name = "MUL_FFT_FULL_THRESHOLD"; param.p_threshold = &mul_fft_full_threshold; - param.modf_threshold_name = "MUL_FFT_MODF_THRESHOLD"; - param.p_modf_threshold = &mul_fft_modf_threshold; param.first_size = MUL_TOOM8H_THRESHOLD / 2; param.max_size = option_fft_max_size; - param.function = speed_mpn_mul_fft; + param.function = speed_mpn_mul_fft_main; param.mul_function = speed_mpn_mul_n; param.sqr = 0; fft (¶m,rands); @@ -1931,14 +1818,11 @@ tune_fft_sqr (gmp_randstate_t rands) if (option_fft_max_size == 0) return; - param.table_name = "SQR_FFT_TABLE"; param.threshold_name = "SQR_FFT_FULL_THRESHOLD"; param.p_threshold = &sqr_fft_full_threshold; - param.modf_threshold_name = "SQR_FFT_MODF_THRESHOLD"; - param.p_modf_threshold = &sqr_fft_modf_threshold; param.first_size = SQR_TOOM8_THRESHOLD / 2; param.max_size = option_fft_max_size; - param.function = speed_mpn_mul_fft_sqr; + param.function = speed_mpn_sqr_fft_main; param.mul_function = speed_mpn_sqr; param.sqr = 0; fft (¶m,rands);