Fixed fft stuff in tuneup.

This commit is contained in:
wbhart 2012-08-08 05:04:56 +00:00
parent ea8c9d864a
commit 6f542aa81d
5 changed files with 37 additions and 241 deletions

View File

@ -1828,31 +1828,7 @@ __GMP_DECLSPEC extern const mp_limb_t __gmp_fib_table[];
#define SQR_TOOM8_THRESHOLD_LIMIT SQR_TOOM8_THRESHOLD
#endif
/* First k to use for an FFT modF multiply. A modF FFT is an order
log(2^k)/log(2^(k-1)) algorithm, so k=3 is merely 1.5 like karatsuba,
whereas k=4 is 1.33 which is faster than toom3 at 1.485. */
#define FFT_FIRST_K 4
/* Threshold at which FFT should be used to do a modF NxN -> N multiply. */
#ifndef MUL_FFT_MODF_THRESHOLD
#define MUL_FFT_MODF_THRESHOLD (MUL_TOOM3_THRESHOLD * 3)
#endif
#ifndef SQR_FFT_MODF_THRESHOLD
#define SQR_FFT_MODF_THRESHOLD (SQR_TOOM3_THRESHOLD * 3)
#endif
/* Threshold at which FFT should be used to do an NxN -> 2N multiply. This
will be a size where FFT is using k=7 or k=8, since an FFT-k used for an
NxN->2N multiply and not recursing into itself is an order
log(2^k)/log(2^(k-2)) algorithm, so it'll be at least k=7 at 1.39 which
is the first better than toom3. */
#ifndef MUL_FFT_THRESHOLD
#define MUL_FFT_THRESHOLD (MUL_FFT_MODF_THRESHOLD * 10)
#endif
#ifndef SQR_FFT_THRESHOLD
#define SQR_FFT_THRESHOLD (SQR_FFT_MODF_THRESHOLD * 10)
#endif
/* points at which fft is used for mul/sqr and mulmod_Bexp resp. */
#ifndef MUL_FFT_FULL_THRESHOLD
#define MUL_FFT_FULL_THRESHOLD (MUL_TOOM8H_THRESHOLD * 10)
#endif
@ -1860,36 +1836,13 @@ __GMP_DECLSPEC extern const mp_limb_t __gmp_fib_table[];
#define SQR_FFT_FULL_THRESHOLD (SQR_TOOM8_THRESHOLD * 10)
#endif
/* Table of thresholds for successive modF FFT "k"s. The first entry is
where FFT_FIRST_K+1 should be used, the second FFT_FIRST_K+2,
etc. See mpn_fft_best_k(). */
#ifndef MUL_FFT_TABLE
#define MUL_FFT_TABLE \
{ MUL_TOOM3_THRESHOLD * 4, /* k=5 */ \
MUL_TOOM3_THRESHOLD * 8, /* k=6 */ \
MUL_TOOM3_THRESHOLD * 16, /* k=7 */ \
MUL_TOOM3_THRESHOLD * 32, /* k=8 */ \
MUL_TOOM3_THRESHOLD * 96, /* k=9 */ \
MUL_TOOM3_THRESHOLD * 288, /* k=10 */ \
0 }
#ifndef MUL_FFT_THRESHOLD
#define MUL_FFT_THRESHOLD (MUL_FFT_FULL_THRESHOLD / 2)
#endif
#ifndef SQR_FFT_TABLE
#define SQR_FFT_TABLE \
{ SQR_TOOM3_THRESHOLD * 4, /* k=5 */ \
SQR_TOOM3_THRESHOLD * 8, /* k=6 */ \
SQR_TOOM3_THRESHOLD * 16, /* k=7 */ \
SQR_TOOM3_THRESHOLD * 32, /* k=8 */ \
SQR_TOOM3_THRESHOLD * 96, /* k=9 */ \
SQR_TOOM3_THRESHOLD * 288, /* k=10 */ \
0 }
#ifndef SQR_FFT_THRESHOLD
#define SQR_FFT_THRESHOLD (SQR_FFT_FULL_THRESHOLD / 2)
#endif
#ifndef FFT_TABLE_ATTRS
#define FFT_TABLE_ATTRS static const
#endif
#define MPN_FFT_TABLE_SIZE 16
#ifndef DC_DIV_QR_THRESHOLD
#define DC_DIV_QR_THRESHOLD (3 * MUL_KARATSUBA_THRESHOLD)
#endif
@ -4067,13 +4020,6 @@ extern mp_size_t mul_fft_threshold;
#define MUL_FFT_FULL_THRESHOLD mul_fft_full_threshold
extern mp_size_t mul_fft_full_threshold;
#undef MUL_FFT_MODF_THRESHOLD
#define MUL_FFT_MODF_THRESHOLD mul_fft_modf_threshold
extern mp_size_t mul_fft_modf_threshold;
#undef MUL_FFT_TABLE
#define MUL_FFT_TABLE { 0 }
/* A native mpn_sqr_basecase is not tuned and SQR_BASECASE_THRESHOLD should
remain as zero (always use it). */
#if ! HAVE_NATIVE_mpn_sqr_basecase
@ -4111,13 +4057,6 @@ extern mp_size_t sqr_fft_threshold;
#define SQR_FFT_FULL_THRESHOLD sqr_fft_full_threshold
extern mp_size_t sqr_fft_full_threshold;
#undef SQR_FFT_MODF_THRESHOLD
#define SQR_FFT_MODF_THRESHOLD sqr_fft_modf_threshold
extern mp_size_t sqr_fft_modf_threshold;
#undef SQR_FFT_TABLE
#define SQR_FFT_TABLE { 0 }
#undef MULLOW_BASECASE_THRESHOLD
#define MULLOW_BASECASE_THRESHOLD mullow_basecase_threshold
extern mp_size_t mullow_basecase_threshold;
@ -4279,10 +4218,6 @@ extern mp_size_t set_str_dc_threshold;
#define SET_STR_PRECOMPUTE_THRESHOLD set_str_precompute_threshold
extern mp_size_t set_str_precompute_threshold;
#undef FFT_TABLE_ATTRS
#define FFT_TABLE_ATTRS
extern mp_size_t mpn_fft_table[2][MPN_FFT_TABLE_SIZE];
/* Sizes the tune program tests up to, used in a couple of recompilations. */
#undef MUL_KARATSUBA_THRESHOLD_LIMIT
#undef MUL_TOOM3_THRESHOLD_LIMIT

View File

@ -1150,16 +1150,16 @@ speed_mpn_toom8_sqr_n (struct speed_params *s)
}
double
speed_mpn_mul_fft_full (struct speed_params *s)
speed_mpn_mul_fft_main (struct speed_params *s)
{
SPEED_ROUTINE_MPN_MUL_N_CALL
(mpn_mul_fft_full (wp, s->xp, s->size, s->yp, s->size));
(mpn_mul_fft_main (wp, s->xp, s->size, s->yp, s->size));
}
double
speed_mpn_mul_fft_full_sqr (struct speed_params *s)
speed_mpn_sqr_fft_main (struct speed_params *s)
{
SPEED_ROUTINE_MPN_SQR_CALL
(mpn_mul_fft_full (wp, s->xp, s->size, s->xp, s->size));
(mpn_mul_fft_main (wp, s->xp, s->size, s->xp, s->size));
}
@ -1171,20 +1171,15 @@ speed_mpn_mul_fft_full_sqr (struct speed_params *s)
{ \
mp_ptr wp; \
mp_size_t pl; \
int k; \
unsigned i; \
double t; \
TMP_DECL; \
\
SPEED_RESTRICT_COND (s->size >= 1); \
\
if (s->r != 0) \
k = s->r; \
else \
k = mpn_fft_best_k (s->size, sqr); \
\
TMP_MARK; \
pl = mpn_fft_next_size (s->size, k); \
pl = fft_adjust_limbs (s->size); \
SPEED_TMP_ALLOC_LIMBS (wp, pl+1, s->align_wp); \
\
speed_operand_src (s, s->xp, s->size); \
@ -1204,20 +1199,6 @@ speed_mpn_mul_fft_full_sqr (struct speed_params *s)
return t; \
}
double
speed_mpn_mul_fft (struct speed_params *s)
{
SPEED_ROUTINE_MPN_MUL_FFT_CALL
(mpn_mul_fft (wp, pl, s->xp, s->size, s->yp, s->size, k), 0);
}
double
speed_mpn_mul_fft_sqr (struct speed_params *s)
{
SPEED_ROUTINE_MPN_MUL_FFT_CALL
(mpn_mul_fft (wp, pl, s->xp, s->size, s->xp, s->size, k), 1);
}
double
speed_mpn_mullow_n (struct speed_params *s)
{

View File

@ -326,11 +326,9 @@ const struct routine_t {
{ "mpn_toom8h_mul", speed_mpn_toom8h_mul },
{ "mpn_toom3_sqr_n", speed_mpn_toom3_sqr_n },
{ "mpn_toom4_sqr_n", speed_mpn_toom4_sqr_n },
{ "mpn_mul_fft_full", speed_mpn_mul_fft_full },
{ "mpn_mul_fft_full_sqr", speed_mpn_mul_fft_full_sqr },
{ "mpn_mul_fft", speed_mpn_mul_fft, FLAG_R_OPTIONAL },
{ "mpn_mul_fft_sqr", speed_mpn_mul_fft_sqr, FLAG_R_OPTIONAL },
{ "mpn_mul_fft_main", speed_mpn_mul_fft_main, FLAG_R_OPTIONAL },
{ "mpn_sqr_fft_main", speed_mpn_sqr_fft_main, FLAG_R_OPTIONAL },
{ "mpn_mullow_n", speed_mpn_mullow_n },
{ "mpn_mullow_n_basecase", speed_mpn_mullow_n_basecase},

View File

@ -251,10 +251,8 @@ double speed_mpn_mul_1 _PROTO ((struct speed_params *s));
double speed_mpn_mul_1_inplace _PROTO ((struct speed_params *s));
double speed_mpn_mul_2 _PROTO ((struct speed_params *s));
double speed_mpn_mul_basecase _PROTO ((struct speed_params *s));
double speed_mpn_mul_fft _PROTO ((struct speed_params *s));
double speed_mpn_mul_fft_sqr _PROTO ((struct speed_params *s));
double speed_mpn_mul_fft_full _PROTO ((struct speed_params *s));
double speed_mpn_mul_fft_full_sqr _PROTO ((struct speed_params *s));
double speed_mpn_mul_fft_main _PROTO ((struct speed_params *s));
double speed_mpn_sqr_fft_main _PROTO ((struct speed_params *s));
double speed_mpn_mul_n _PROTO ((struct speed_params *s));
double speed_mpn_mul_n_sqr _PROTO ((struct speed_params *s));
double speed_mpn_mullow_n _PROTO ((struct speed_params *s));

View File

@ -67,12 +67,6 @@ MA 02110-1301, USA. */
limits are recompiled though, to make them accept a bigger range of sizes
than normal, eg. mpn_sqr_basecase to compare against mpn_kara_sqr_n.
Limitations:
The FFTs aren't subject to the same badness rule as the other thresholds,
so each k is probably being brought on a touch early. This isn't likely
to make a difference, and the simpler probing means fewer tests.
Code:
- main : checks for various command line options and calls all()
- all : prints the tuneup message, date and compiler, then calls
@ -182,18 +176,14 @@ mp_size_t mul_karatsuba_threshold = MP_SIZE_T_MAX;
mp_size_t mul_toom3_threshold = MUL_TOOM3_THRESHOLD_LIMIT;
mp_size_t mul_toom4_threshold = MUL_TOOM4_THRESHOLD_LIMIT;
mp_size_t mul_toom8h_threshold = MUL_TOOM8H_THRESHOLD_LIMIT;
mp_size_t mul_fft_threshold = MP_SIZE_T_MAX;
mp_size_t mul_fft_full_threshold = MP_SIZE_T_MAX;
mp_size_t mul_fft_modf_threshold = MP_SIZE_T_MAX;
mp_size_t sqr_basecase_threshold = MP_SIZE_T_MAX;
mp_size_t sqr_karatsuba_threshold
= (TUNE_SQR_KARATSUBA_MAX == 0 ? MP_SIZE_T_MAX : TUNE_SQR_KARATSUBA_MAX);
mp_size_t sqr_toom3_threshold = SQR_TOOM3_THRESHOLD_LIMIT;
mp_size_t sqr_toom4_threshold = SQR_TOOM4_THRESHOLD_LIMIT;
mp_size_t sqr_toom8_threshold = SQR_TOOM8_THRESHOLD_LIMIT;
mp_size_t sqr_fft_threshold = MP_SIZE_T_MAX;
mp_size_t sqr_fft_full_threshold = MP_SIZE_T_MAX;
mp_size_t sqr_fft_modf_threshold = MP_SIZE_T_MAX;
mp_size_t mulmod_2expm1_threshold = MP_SIZE_T_MAX;
mp_size_t mullow_basecase_threshold = MP_SIZE_T_MAX;
mp_size_t mullow_dc_threshold = MP_SIZE_T_MAX;
@ -234,9 +224,6 @@ mp_size_t divrem_hensel_qr_1_threshold = MP_SIZE_T_MAX;
mp_size_t rsh_divrem_hensel_qr_1_threshold = MP_SIZE_T_MAX;
mp_size_t divrem_euclid_hensel_threshold = MP_SIZE_T_MAX;
mp_size_t fft_modf_sqr_threshold = MP_SIZE_T_MAX;
mp_size_t fft_modf_mul_threshold = MP_SIZE_T_MAX;
struct param_t {
const char *name;
speed_function_t function;
@ -702,11 +689,8 @@ one (mp_size_t *threshold, gmp_randstate_t rands,struct param_t *param)
if this step effect remains. */
struct fft_param_t {
const char *table_name;
const char *threshold_name;
const char *modf_threshold_name;
mp_size_t *p_threshold;
mp_size_t *p_modf_threshold;
mp_size_t first_size;
mp_size_t max_size;
speed_function_t function;
@ -714,103 +698,32 @@ struct fft_param_t {
mp_size_t sqr;
};
/* mpn_mul_fft requires pl a multiple of 2^k limbs, but with
N=pl*BIT_PER_MP_LIMB it internally also pads out so N/2^k is a multiple
of 2^(k-1) bits. */
mp_size_t
fft_step_size (int k)
fft_step_size (int size)
{
mp_size_t step;
step = MAX ((mp_size_t) 1 << (k-1), BITS_PER_MP_LIMB) / BITS_PER_MP_LIMB;
step *= (mp_size_t) 1 << k;
step = fft_adjust_limbs(size + 1) - size;
if (step <= 0)
{
printf ("Can't handle k=%d\n", k);
printf ("Can't handle size=%d\n", size);
abort ();
}
return step;
}
mp_size_t
fft_next_size (mp_size_t pl, int k)
{
mp_size_t m = fft_step_size (k);
/* printf ("[k=%d %ld] %ld ->", k, m, pl); */
if (pl == 0 || (pl & (m-1)) != 0)
pl = (pl | (m-1)) + 1;
/* printf (" %ld\n", pl); */
return pl;
}
void
fft (struct fft_param_t *p,gmp_randstate_t rands)
{
mp_size_t size;
int i, k;
for (i = 0; i < numberof (mpn_fft_table[p->sqr]); i++)
mpn_fft_table[p->sqr][i] = MP_SIZE_T_MAX;
*p->p_threshold = MP_SIZE_T_MAX;
*p->p_modf_threshold = MP_SIZE_T_MAX;
option_trace = MAX (option_trace, option_fft_trace);
printf ("#define %s {", p->table_name);
if (option_trace >= 2)
printf ("\n");
k = FFT_FIRST_K;
size = p->first_size;
for (;;)
{
double tk, tk1;
size = fft_next_size (size+1, k+1);
if (size >= p->max_size)
break;
if (k >= FFT_FIRST_K + numberof (mpn_fft_table[p->sqr]))
break;
/* compare k to k+1 in the middle of the current k+1 step */
s.size = size + fft_step_size (k+1) / 2;
s.r = k;
tk = tuneup_measure (p->function, rands, NULL, &s);
if (tk == -1.0)
abort ();
s.r = k+1;
tk1 = tuneup_measure (p->function, rands, NULL, &s);
if (tk1 == -1.0)
abort ();
if (option_trace >= 2)
printf ("at %ld size=%ld k=%d %.9f k=%d %.9f\n",
(long) size, (long) s.size, k, tk, k+1, tk1);
/* declare the k+1 threshold as soon as it's faster at its midpoint */
if (tk1 < tk)
{
mpn_fft_table[p->sqr][k-FFT_FIRST_K] = s.size;
printf (" %ld,", (long) s.size);
if (option_trace >= 2) printf ("\n");
k++;
}
}
mpn_fft_table[p->sqr][k-FFT_FIRST_K] = 0;
printf (" 0 }\n");
size = p->first_size;
/* Declare an FFT faster than a plain toom3 etc multiplication found as
@ -818,62 +731,39 @@ fft (struct fft_param_t *p,gmp_randstate_t rands)
middle of the FFT step is tested. */
for (;;)
{
int modf = (*p->p_modf_threshold == MP_SIZE_T_MAX);
double tk, tm;
/* k=7 should be the first FFT which can beat toom3 on a full
multiply, so jump to that threshold and save some probing after the
modf threshold is found. */
if (!modf && size < mpn_fft_table[p->sqr][2])
{
size = mpn_fft_table[p->sqr][2];
if (option_trace >= 2)
printf ("jump to size=%ld\n", (long) size);
}
size = fft_next_size (size+1, mpn_fft_best_k (size, p->sqr));
k = mpn_fft_best_k (size, p->sqr);
size = fft_adjust_limbs (size+1);
if (size >= p->max_size)
break;
s.size = size + fft_step_size (k) / 2;
s.r = k;
s.size = size + fft_step_size (size) / 2;
tk = tuneup_measure (p->function, rands, NULL, &s);
if (tk == -1.0)
abort ();
if (!modf) s.size;
tm = tuneup_measure (p->mul_function, rands, NULL, &s);
if (tm == -1.0)
abort ();
if (option_trace >= 2)
printf ("at %ld size=%ld k=%d %.9f size=%ld %s mul %.9f\n",
printf ("at %ld size=%ld %.9f size=%ld %s mul %.9f\n",
(long) size,
(long) size + fft_step_size (k) / 2, k, tk,
(long) s.size, modf ? "modf" : "full", tm);
(long) size + fft_step_size (size) / 2, tk,
(long) s.size, "full", tm);
if (tk < tm)
{
if (modf)
{
*p->p_modf_threshold = s.size;
print_define (p->modf_threshold_name, *p->p_modf_threshold);
}
else
{
*p->p_threshold = s.size;
print_define (p->threshold_name, *p->p_threshold);
break;
}
*p->p_threshold = s.size;
print_define (p->threshold_name, *p->p_threshold);
break;
}
}
}
/* Start karatsuba from 4, since the Cray t90 ieee code is much faster at 2,
giving wrong results. */
void
@ -904,7 +794,7 @@ tune_mul (gmp_randstate_t rands)
one (&mul_toom8h_threshold, rands, &param);
/* disabled until tuned */
MUL_FFT_THRESHOLD = MP_SIZE_T_MAX;
MUL_FFT_FULL_THRESHOLD = MP_SIZE_T_MAX;
}
@ -936,7 +826,7 @@ tune_mullow (gmp_randstate_t rands)
one (&mullow_mul_threshold, rands, &param);
/* disabled until tuned */
MUL_FFT_THRESHOLD = MP_SIZE_T_MAX;
MUL_FFT_FULL_THRESHOLD = MP_SIZE_T_MAX;
}
void
@ -949,7 +839,7 @@ tune_mulmod_2expm1 (gmp_randstate_t rands)
//param.max_size = ?? ;
one (&mulmod_2expm1_threshold, rands, &param);
/* disabled until tuned */
MUL_FFT_THRESHOLD = MP_SIZE_T_MAX; // ??????????????
MUL_FFT_FULL_THRESHOLD = MP_SIZE_T_MAX; // ??????????????
}
void
@ -978,7 +868,7 @@ tune_mulhigh (gmp_randstate_t rands)
one (&mulhigh_mul_threshold, rands, &param);
/* disabled until tuned */
MUL_FFT_THRESHOLD = MP_SIZE_T_MAX;
MUL_FFT_FULL_THRESHOLD = MP_SIZE_T_MAX;
}
void
@ -1057,7 +947,7 @@ void
tune_sqr (gmp_randstate_t rands)
{
/* disabled until tuned */
SQR_FFT_THRESHOLD = MP_SIZE_T_MAX;
SQR_FFT_FULL_THRESHOLD = MP_SIZE_T_MAX;
if (HAVE_NATIVE_mpn_sqr_basecase)
{
@ -1909,14 +1799,11 @@ tune_fft_mul (gmp_randstate_t rands)
if (option_fft_max_size == 0)
return;
param.table_name = "MUL_FFT_TABLE";
param.threshold_name = "MUL_FFT_FULL_THRESHOLD";
param.p_threshold = &mul_fft_full_threshold;
param.modf_threshold_name = "MUL_FFT_MODF_THRESHOLD";
param.p_modf_threshold = &mul_fft_modf_threshold;
param.first_size = MUL_TOOM8H_THRESHOLD / 2;
param.max_size = option_fft_max_size;
param.function = speed_mpn_mul_fft;
param.function = speed_mpn_mul_fft_main;
param.mul_function = speed_mpn_mul_n;
param.sqr = 0;
fft (&param,rands);
@ -1931,14 +1818,11 @@ tune_fft_sqr (gmp_randstate_t rands)
if (option_fft_max_size == 0)
return;
param.table_name = "SQR_FFT_TABLE";
param.threshold_name = "SQR_FFT_FULL_THRESHOLD";
param.p_threshold = &sqr_fft_full_threshold;
param.modf_threshold_name = "SQR_FFT_MODF_THRESHOLD";
param.p_modf_threshold = &sqr_fft_modf_threshold;
param.first_size = SQR_TOOM8_THRESHOLD / 2;
param.max_size = option_fft_max_size;
param.function = speed_mpn_mul_fft_sqr;
param.function = speed_mpn_sqr_fft_main;
param.mul_function = speed_mpn_sqr;
param.sqr = 0;
fft (&param,rands);