2303 lines
65 KiB
C
2303 lines
65 KiB
C
/* Create tuned thresholds for various algorithms.
|
|
|
|
Copyright 1999, 2000, 2001, 2002, 2003, 2005, 2006, 2008, 2009, 2010,
|
|
2011, 2012 Free Software Foundation, Inc.
|
|
|
|
This file is part of the GNU MP Library.
|
|
|
|
The GNU MP Library is free software; you can redistribute it and/or modify
|
|
it under the terms of the GNU Lesser General Public License as published by
|
|
the Free Software Foundation; either version 3 of the License, or (at your
|
|
option) any later version.
|
|
|
|
The GNU MP Library is distributed in the hope that it will be useful, but
|
|
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
|
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
|
License for more details.
|
|
|
|
You should have received a copy of the GNU Lesser General Public License
|
|
along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
|
|
|
|
*/
|
|
|
|
|
|
/* Usage: tuneup [-t] [-t] [-p precision]
|
|
|
|
-t turns on some diagnostic traces, a second -t turns on more traces.
|
|
|
|
Algorithm:
|
|
|
|
The thresholds are determined as follows: two algorithms A and B are
|
|
compared over a range of sizes. At each point, we define "badness" to
|
|
be the percentage time lost if algorithm B is chosen over algorithm A.
|
|
Then total badness is the sum of this over all sizes measured. The
|
|
threshold is set to minimize total badness.
|
|
|
|
In practice the thresholds tend to be chosen to bring on algorithm B
|
|
fairly quickly.
|
|
|
|
Implementation:
|
|
|
|
In a normal library build the thresholds are constants. To tune them
|
|
selected objects are recompiled with the thresholds as global variables
|
|
instead. #define TUNE_PROGRAM_BUILD does this, with help from code at
|
|
the end of gmp-impl.h, and rules in tune/Makefile.am.
|
|
|
|
MUL_KARATSUBA_THRESHOLD for example uses a recompiled mpn_mul_n. The
|
|
threshold is set to "size+1" to avoid karatsuba, or to "size" to use one
|
|
level, but recurse into the basecase.
|
|
|
|
MUL_TOOM3_THRESHOLD makes use of the tuned MUL_KARATSUBA_THRESHOLD value.
|
|
Other routines in turn will make use of both of those. Naturally the
|
|
dependencies must be tuned first.
|
|
|
|
In a couple of cases, like DIVEXACT_1_THRESHOLD, there's no recompiling,
|
|
just a threshold based on comparing two routines (mpn_divrem_1 and
|
|
mpn_divexact_1), and no further use of the value determined.
|
|
|
|
Flags like USE_PREINV_MOD_1 or JACOBI_BASE_METHOD are even simpler, being
|
|
just comparisons between certain routines on representative data.
|
|
|
|
Shortcuts are applied when native (assembler) versions of routines exist.
|
|
For instance a native mpn_sqr_basecase is assumed to be always faster
|
|
than mpn_mul_basecase, with no measuring.
|
|
|
|
No attempt is made to tune within assembler routines, for instance
|
|
DIVREM_1_NORM_THRESHOLD. An assembler mpn_divrem_1 is expected to be
|
|
written and tuned all by hand. Assembler routines that might have hard
|
|
limits are recompiled though, to make them accept a bigger range of sizes
|
|
than normal, eg. mpn_sqr_basecase to compare against mpn_kara_sqr_n.
|
|
|
|
Code:
|
|
- main : checks for various command line options and calls all()
|
|
- all : prints the tuneup message, date and compiler, then calls
|
|
each of the individual tuning functions in turn, e.g.
|
|
tune_mul()
|
|
- tune_blah() : tunes function of type blah, e.g. tune_mul() tunes the
|
|
karatsuba and toom cutoffs. It sets up a param struct with the
|
|
following parameters:
|
|
a) name : the name of the threshold being tuned, e.g.
|
|
MUL_TOOM3_THRESHOLD
|
|
b) function : the first function being compared (this must be
|
|
of the form speed_blah and the function speed_blah will
|
|
exist in speed.h and speed.c)
|
|
c) function2 : the second function being compared (if set to
|
|
NULL, this is automatically set to equal function)
|
|
d) step_factor : the size of the step between sizes,
|
|
set to 0.01 by default, i.e. 1% increments
|
|
e) function_fudge : multiplier for the speed of function, used
|
|
to adjust for overheads, by default set to 1.0
|
|
f) stop_since_change is a stop condition. If the threshold
|
|
has not changed for this many iterations, then stop. This
|
|
is set to 80 iterations by default.
|
|
g) stop_factor : this is another stop factor. If method B
|
|
becomes faster by at least this factor, then stop. By
|
|
default this is set to 1.2, i.e. 20% faster.
|
|
h) min_size : the minimum size to start comparing from.
|
|
i) min_is_always : if this is set to 1, then if the threshold
|
|
just ends up being min_size, then the threshold is actually
|
|
set to 0, i.e. algorithm B is always used.
|
|
j) max_size : the maximum size to compare up to. By default this
|
|
is set to DEFAULT_MAX_SIZE which is 1000 limbs.
|
|
k) check_size : if set, will check that the given starting size
is valid for both algorithms and that algorithm B is at least
4% faster than algorithm A at that point. If it is not, the
threshold is set to MP_SIZE_T_MAX (never use algorithm B).
l) size_extra : this is a bias added to each size when doing
measurements. It is subtracted off after each measurement.
It is basically used for shifting a threshold from the
measured value.
m) data_high : if set to 1, the high limb of xp and yp are set to
be less than s->r, if set to 2, the high limb of xp and yp are
set to be greater than or equal to s->r
n) noprint : if set, the threshold is computed but not printed.
|
|
|
|
After setting all the appropriate parameters, the function one() is
|
|
called. It takes a reference to a parameter, e.g. mul_toom3_threshold
|
|
which is defined in a table below. That threshold will have been given
|
|
some initial value (usually MP_SIZE_T_MAX) in the table. It also takes
|
|
a reference to the param struct.
|
|
- one() : does repeated timings over the given range of sizes always setting
|
|
the threshold to size+1 for function and size for function2.
|
|
|
|
N.B: the functions that need to be rebuilt to use variable thresholds must be
|
|
added to the Makefile.am file (and automake run) before tune can work.
|
|
|
|
*/
|
|
|
|
#define TUNE_PROGRAM_BUILD 1 /* for gmp-impl.h */
|
|
|
|
#include "config.h"
|
|
|
|
#include <math.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <time.h>
|
|
#if HAVE_UNISTD_H
|
|
#include <unistd.h>
|
|
#endif
|
|
|
|
#include "mpir.h"
|
|
#include "gmp-impl.h"
|
|
#include "longlong.h"
|
|
|
|
#include "tests.h"
|
|
#include "speed.h"
|
|
|
|
#if !HAVE_DECL_OPTARG
|
|
extern char *optarg;
|
|
extern int optind, opterr;
|
|
#endif
|
|
|
|
|
|
#define DEFAULT_MAX_SIZE 1000 /* limbs */
|
|
|
|
#if WANT_FFT
|
|
mp_size_t option_fft_max_size = 50000; /* limbs */
|
|
#else
|
|
mp_size_t option_fft_max_size = 0;
|
|
#endif
|
|
int option_trace = 0;
|
|
int option_fft_trace = 0;
|
|
struct speed_params s;
|
|
|
|
struct dat_t {
|
|
mp_size_t size;
|
|
double d;
|
|
} *dat = NULL;
|
|
int ndat = 0;
|
|
int allocdat = 0;
|
|
|
|
/* This is not defined if mpn_sqr_basecase doesn't declare a limit. In that
|
|
case use zero here, which for params.max_size means no limit. */
|
|
#ifndef TUNE_SQR_KARATSUBA_MAX
|
|
#define TUNE_SQR_KARATSUBA_MAX 0
|
|
#endif
|
|
|
|
mp_size_t mul_karatsuba_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t mul_toom3_threshold = MUL_TOOM3_THRESHOLD_LIMIT;
|
|
mp_size_t mul_toom4_threshold = MUL_TOOM4_THRESHOLD_LIMIT;
|
|
mp_size_t mul_toom8h_threshold = MUL_TOOM8H_THRESHOLD_LIMIT;
|
|
mp_size_t mul_fft_full_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t sqr_basecase_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t sqr_karatsuba_threshold
|
|
= (TUNE_SQR_KARATSUBA_MAX == 0 ? MP_SIZE_T_MAX : TUNE_SQR_KARATSUBA_MAX);
|
|
mp_size_t sqr_toom3_threshold = SQR_TOOM3_THRESHOLD_LIMIT;
|
|
mp_size_t sqr_toom4_threshold = SQR_TOOM4_THRESHOLD_LIMIT;
|
|
mp_size_t sqr_toom8_threshold = SQR_TOOM8_THRESHOLD_LIMIT;
|
|
mp_size_t sqr_fft_full_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t mulmod_2expm1_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t mullow_basecase_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t mullow_dc_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t mullow_mul_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t mulmid_toom42_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t mulhigh_basecase_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t mulhigh_dc_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t mulhigh_mul_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t div_sb_preinv_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t dc_div_qr_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t inv_div_qr_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t inv_divappr_q_n_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t dc_div_q_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t inv_div_q_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t dc_divappr_q_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t inv_divappr_q_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t dc_bdiv_qr_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t dc_bdiv_q_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t binv_newton_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t redc_1_to_redc_2_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t redc_1_to_redc_n_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t redc_2_to_redc_n_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t matrix22_strassen_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t hgcd_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t hgcd_appr_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t hgcd_reduce_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t gcd_dc_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t gcdext_dc_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t divrem_1_norm_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t divrem_1_unnorm_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t mod_1_norm_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t mod_1_1_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t mod_1_2_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t mod_1_3_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t mod_1_unnorm_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t divrem_2_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t get_str_dc_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t get_str_precompute_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t set_str_dc_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t set_str_precompute_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t rootrem_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t divrem_hensel_qr_1_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t rsh_divrem_hensel_qr_1_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t divrem_euclid_hensel_threshold = MP_SIZE_T_MAX;
|
|
mp_size_t fac_odd_threshold = 0;
|
|
mp_size_t fac_dsc_threshold = FAC_DSC_THRESHOLD_LIMIT;
|
|
|
|
struct param_t {
|
|
const char *name;
|
|
speed_function_t function;
|
|
speed_function_t function2;
|
|
double step_factor; /* how much to step sizes (rounded down) */
|
|
double function_fudge; /* multiplier for "function" speeds */
|
|
int stop_since_change;
|
|
double stop_factor;
|
|
mp_size_t min_size;
|
|
int min_is_always;
|
|
mp_size_t max_size;
|
|
mp_size_t check_size;
|
|
mp_size_t size_extra;
|
|
|
|
#define DATA_HIGH_LT_R 1
|
|
#define DATA_HIGH_GE_R 2
|
|
int data_high;
|
|
|
|
int noprint;
|
|
};
|
|
|
|
|
|
/* The following are normally left undefined when false, which is fine for
   "#if".  Define them as zero so they can also appear in ordinary C "if"
   conditions.  */
#ifndef UDIV_PREINV_ALWAYS
#define UDIV_PREINV_ALWAYS 0
#endif

#ifndef HAVE_NATIVE_mpn_divexact_1
#define HAVE_NATIVE_mpn_divexact_1 0
#endif

#ifndef HAVE_NATIVE_mpn_divrem_1
#define HAVE_NATIVE_mpn_divrem_1 0
#endif

#ifndef HAVE_NATIVE_mpn_divrem_2
#define HAVE_NATIVE_mpn_divrem_2 0
#endif

#ifndef HAVE_NATIVE_mpn_mod_1
#define HAVE_NATIVE_mpn_mod_1 0
#endif

#ifndef HAVE_NATIVE_mpn_modexact_1_odd
#define HAVE_NATIVE_mpn_modexact_1_odd 0
#endif

#ifndef HAVE_NATIVE_mpn_preinv_divrem_1
#define HAVE_NATIVE_mpn_preinv_divrem_1 0
#endif

#ifndef HAVE_NATIVE_mpn_preinv_mod_1
#define HAVE_NATIVE_mpn_preinv_mod_1 0
#endif

#ifndef HAVE_NATIVE_mpn_sqr_basecase
#define HAVE_NATIVE_mpn_sqr_basecase 0
#endif


/* Largest of three values, built on the two-argument MAX.  */
#define MAX3(a,b,c) MAX (MAX (a, b), c)
|
|
|
|
mp_limb_t
|
|
randlimb_norm (gmp_randstate_t rands)
|
|
{
|
|
mp_limb_t n;
|
|
mpn_randomb (&n,rands, 1);
|
|
n |= GMP_NUMB_HIGHBIT;
|
|
return n;
|
|
}
|
|
|
|
#define GMP_NUMB_HALFMASK ((CNST_LIMB(1) << (GMP_NUMB_BITS/2)) - 1)
|
|
|
|
mp_limb_t
|
|
randlimb_half (gmp_randstate_t rands)
|
|
{
|
|
mp_limb_t n;
|
|
mpn_randomb (&n, rands,1);
|
|
n &= GMP_NUMB_HALFMASK;
|
|
n += (n==0);
|
|
return n;
|
|
}
|
|
|
|
|
|
/* Add an entry to the end of the dat[] array, reallocing to make it bigger
|
|
if necessary. */
|
|
void
|
|
add_dat (mp_size_t size, double d)
|
|
{
|
|
#define ALLOCDAT_STEP 500
|
|
|
|
ASSERT_ALWAYS (ndat <= allocdat);
|
|
|
|
if (ndat == allocdat)
|
|
{
|
|
dat = (struct dat_t *) __gmp_allocate_or_reallocate
|
|
(dat, allocdat * sizeof(dat[0]),
|
|
(allocdat+ALLOCDAT_STEP) * sizeof(dat[0]));
|
|
allocdat += ALLOCDAT_STEP;
|
|
}
|
|
|
|
dat[ndat].size = size;
|
|
dat[ndat].d = d;
|
|
ndat++;
|
|
}
|
|
|
|
|
|
/* Return the threshold size based on the data accumulated. */
|
|
mp_size_t
|
|
analyze_dat (int final)
|
|
{
|
|
double x, min_x;
|
|
int j, min_j;
|
|
|
|
/* If the threshold is set at dat[0].size, any positive values are bad. */
|
|
x = 0.0;
|
|
for (j = 0; j < ndat; j++)
|
|
if (dat[j].d > 0.0)
|
|
x += dat[j].d;
|
|
|
|
if (option_trace >= 2 && final)
|
|
{
|
|
printf ("\n");
|
|
printf ("x is the sum of the badness from setting thresh at given size\n");
|
|
printf (" (minimum x is sought)\n");
|
|
printf ("size=%ld first x=%.4f\n", (long) dat[j].size, x);
|
|
}
|
|
|
|
min_x = x;
|
|
min_j = 0;
|
|
|
|
|
|
/* When stepping to the next dat[j].size, positive values are no longer
|
|
bad (so subtracted), negative values become bad (so add the absolute
|
|
value, meaning subtract). */
|
|
for (j = 0; j < ndat; x -= dat[j].d, j++)
|
|
{
|
|
if (option_trace >= 2 && final)
|
|
printf ("size=%ld x=%.4f\n", (long) dat[j].size, x);
|
|
|
|
if (x < min_x)
|
|
{
|
|
min_x = x;
|
|
min_j = j;
|
|
}
|
|
}
|
|
|
|
return min_j;
|
|
}
|
|
|
|
|
|
/* Measuring for recompiled mpn/generic/divrem_1.c and mpn/generic/mod_1.c */
|
|
|
|
mp_limb_t mpn_divrem_1_tune(mp_ptr qp, mp_size_t xsize, mp_srcptr ap, mp_size_t size, mp_limb_t d);
|
|
mp_limb_t mpn_mod_1_tune(mp_srcptr ap, mp_size_t size, mp_limb_t d);
|
|
|
|
void mpz_fac_ui_tune(mpz_ptr, mpir_ui);
|
|
|
|
double
|
|
speed_mpn_mod_1_tune (struct speed_params *s)
|
|
{
|
|
SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_tune);
|
|
}
|
|
double
|
|
speed_mpn_divrem_1_tune (struct speed_params *s)
|
|
{
|
|
SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_tune);
|
|
}
|
|
|
|
double
|
|
speed_mpz_fac_ui_tune (struct speed_params *s)
|
|
{
|
|
SPEED_ROUTINE_MPZ_FAC_UI (mpz_fac_ui_tune);
|
|
}
|
|
|
|
double
|
|
tuneup_measure (speed_function_t fun,gmp_randstate_t rands,
|
|
const struct param_t *param,
|
|
struct speed_params *s)
|
|
{
|
|
static struct param_t dummy;
|
|
double t;
|
|
TMP_DECL;
|
|
|
|
if (! param)
|
|
param = &dummy;
|
|
|
|
s->size += param->size_extra;
|
|
|
|
TMP_MARK;
|
|
SPEED_TMP_ALLOC_LIMBS (s->xp, s->size, 0);
|
|
SPEED_TMP_ALLOC_LIMBS (s->yp, s->size, 0);
|
|
|
|
mpn_randomb (s->xp, rands, s->size);
|
|
mpn_randomb (s->yp, rands, s->size);
|
|
|
|
switch (param->data_high) {
|
|
case DATA_HIGH_LT_R:
|
|
s->xp[s->size-1] %= s->r;
|
|
s->yp[s->size-1] %= s->r;
|
|
break;
|
|
case DATA_HIGH_GE_R:
|
|
s->xp[s->size-1] |= s->r;
|
|
s->yp[s->size-1] |= s->r;
|
|
break;
|
|
}
|
|
|
|
t = speed_measure (fun, s);
|
|
|
|
s->size -= param->size_extra;
|
|
|
|
TMP_FREE;
|
|
return t;
|
|
}
|
|
|
|
|
|
#define PRINT_WIDTH 28
|
|
|
|
void
|
|
print_define_start (const char *name)
|
|
{
|
|
printf ("#define %-*s ", PRINT_WIDTH, name);
|
|
if (option_trace)
|
|
printf ("...\n");
|
|
}
|
|
|
|
void
|
|
print_define_end_remark (const char *name, mp_size_t value, const char *remark)
|
|
{
|
|
if (option_trace)
|
|
printf ("#define %-*s ", PRINT_WIDTH, name);
|
|
|
|
if (value == MP_SIZE_T_MAX)
|
|
printf ("MP_SIZE_T_MAX");
|
|
else
|
|
printf ("%5ld", (long) value);
|
|
|
|
if (remark != NULL)
|
|
printf (" /* %s */", remark);
|
|
printf ("\n");
|
|
}
|
|
|
|
void
|
|
print_define_end (const char *name, mp_size_t value)
|
|
{
|
|
const char *remark;
|
|
if (value == MP_SIZE_T_MAX)
|
|
remark = "never";
|
|
else if (value == 0)
|
|
remark = "always";
|
|
else
|
|
remark = NULL;
|
|
print_define_end_remark (name, value, remark);
|
|
}
|
|
|
|
void
|
|
print_define (const char *name, mp_size_t value)
|
|
{
|
|
print_define_start (name);
|
|
print_define_end (name, value);
|
|
}
|
|
|
|
void
|
|
print_define_remark (const char *name, mp_size_t value, const char *remark)
|
|
{
|
|
print_define_start (name);
|
|
print_define_end_remark (name, value, remark);
|
|
}
|
|
|
|
|
|
void
|
|
one (mp_size_t *threshold, gmp_randstate_t rands,struct param_t *param)
|
|
{
|
|
int since_positive, since_thresh_change;
|
|
int thresh_idx, new_thresh_idx;
|
|
|
|
#define DEFAULT(x,n) do { if (! (x)) (x) = (n); } while (0)
|
|
|
|
DEFAULT (param->function_fudge, 1.0);
|
|
DEFAULT (param->function2, param->function);
|
|
DEFAULT (param->step_factor, 0.01); /* small steps by default */
|
|
DEFAULT (param->stop_since_change, 80);
|
|
DEFAULT (param->stop_factor, 1.2);
|
|
DEFAULT (param->min_size, 10);
|
|
DEFAULT (param->max_size, DEFAULT_MAX_SIZE);
|
|
|
|
if (param->check_size != 0)
|
|
{
|
|
double t1, t2;
|
|
s.size = param->check_size;
|
|
|
|
*threshold = s.size+1;
|
|
t1 = tuneup_measure (param->function, rands,param, &s);
|
|
|
|
*threshold = s.size;
|
|
t2 = tuneup_measure (param->function2, rands, param, &s);
|
|
if (t1 == -1.0 || t2 == -1.0)
|
|
{
|
|
printf ("Oops, can't run both functions at size %ld\n",
|
|
(long) s.size);
|
|
abort ();
|
|
}
|
|
t1 *= param->function_fudge;
|
|
|
|
/* ask that t2 is at least 4% below t1 */
|
|
if (t1 < t2*1.04)
|
|
{
|
|
if (option_trace)
|
|
printf ("function2 never enough faster: t1=%.9f t2=%.9f\n", t1, t2);
|
|
*threshold = MP_SIZE_T_MAX;
|
|
if (! param->noprint)
|
|
print_define (param->name, *threshold);
|
|
return;
|
|
}
|
|
|
|
if (option_trace >= 2)
|
|
printf ("function2 enough faster at size=%ld: t1=%.9f t2=%.9f\n",
|
|
(long) s.size, t1, t2);
|
|
}
|
|
|
|
if (! param->noprint || option_trace)
|
|
print_define_start (param->name);
|
|
|
|
ndat = 0;
|
|
since_positive = 0;
|
|
since_thresh_change = 0;
|
|
thresh_idx = 0;
|
|
|
|
if (option_trace >= 2)
|
|
{
|
|
printf (" algorithm-A algorithm-B ratio possible\n");
|
|
printf (" (seconds) (seconds) diff thresh\n");
|
|
}
|
|
|
|
for (s.size = param->min_size;
|
|
s.size < param->max_size;
|
|
s.size += MAX ((mp_size_t) floor (s.size * param->step_factor), 1))
|
|
{
|
|
double ti, tiplus1, d;
|
|
|
|
/* If there's a size limit and it's reached then it should still
|
|
be sensible to analyze the data since we want the threshold put
|
|
either at or near the limit. */
|
|
if (s.size >= param->max_size)
|
|
{
|
|
if (option_trace)
|
|
printf ("Reached maximum size (%ld) without otherwise stopping\n",
|
|
(long) param->max_size);
|
|
break;
|
|
}
|
|
|
|
/*
|
|
FIXME: check minimum size requirements are met, possibly by just
|
|
checking for the -1 returns from the speed functions.
|
|
*/
|
|
|
|
/* using method A at this size */
|
|
*threshold = s.size+1;
|
|
ti = tuneup_measure (param->function, rands,param, &s);
|
|
if (ti == -1.0)
|
|
abort ();
|
|
ti *= param->function_fudge;
|
|
|
|
/* using method B at this size */
|
|
*threshold = s.size;
|
|
tiplus1 = tuneup_measure (param->function2, rands, param, &s);
|
|
if (tiplus1 == -1.0)
|
|
abort ();
|
|
|
|
/* Calculate the fraction by which the one or the other routine is
|
|
slower. */
|
|
if (tiplus1 >= ti)
|
|
d = (tiplus1 - ti) / tiplus1; /* negative */
|
|
else
|
|
d = (tiplus1 - ti) / ti; /* positive */
|
|
|
|
add_dat (s.size, d);
|
|
|
|
new_thresh_idx = analyze_dat (0);
|
|
|
|
if (option_trace >= 2)
|
|
printf ("size=%ld %.9f %.9f % .4f %c %ld\n",
|
|
(long) s.size, ti, tiplus1, d,
|
|
ti > tiplus1 ? '#' : ' ',
|
|
(long) dat[new_thresh_idx].size);
|
|
|
|
/* Stop if the last time method i was faster was more than a
|
|
certain number of measurements ago. */
|
|
#define STOP_SINCE_POSITIVE 200
|
|
if (d >= 0)
|
|
since_positive = 0;
|
|
else
|
|
if (++since_positive > STOP_SINCE_POSITIVE)
|
|
{
|
|
if (option_trace >= 1)
|
|
printf ("stopped due to since_positive (%d)\n",
|
|
STOP_SINCE_POSITIVE);
|
|
break;
|
|
}
|
|
|
|
/* Stop if method A has become slower by a certain factor. */
|
|
if (ti >= tiplus1 * param->stop_factor)
|
|
{
|
|
if (option_trace >= 1)
|
|
printf ("stopped due to ti >= tiplus1 * factor (%.1f)\n",
|
|
param->stop_factor);
|
|
break;
|
|
}
|
|
|
|
/* Stop if the threshold implied hasn't changed in a certain
|
|
number of measurements. (It's this condition that ususally
|
|
stops the loop.) */
|
|
if (thresh_idx != new_thresh_idx)
|
|
since_thresh_change = 0, thresh_idx = new_thresh_idx;
|
|
else
|
|
if (++since_thresh_change > param->stop_since_change)
|
|
{
|
|
if (option_trace >= 1)
|
|
printf ("stopped due to since_thresh_change (%d)\n",
|
|
param->stop_since_change);
|
|
break;
|
|
}
|
|
|
|
/* Stop if the threshold implied is more than a certain number of
|
|
measurements ago. */
|
|
#define STOP_SINCE_AFTER 500
|
|
if (ndat - thresh_idx > STOP_SINCE_AFTER)
|
|
{
|
|
if (option_trace >= 1)
|
|
printf ("stopped due to ndat - thresh_idx > amount (%d)\n",
|
|
STOP_SINCE_AFTER);
|
|
break;
|
|
}
|
|
|
|
/* Stop when the size limit is reached before the end of the
|
|
crossover, but only show this as an error for >= the default max
|
|
size. FIXME: Maybe should make it a param choice whether this is
|
|
an error. */
|
|
if (s.size >= param->max_size && param->max_size >= DEFAULT_MAX_SIZE)
|
|
{
|
|
fprintf (stderr, "%s\n", param->name);
|
|
fprintf (stderr, "sizes %ld to %ld total %d measurements\n",
|
|
(long) dat[0].size, (long) dat[ndat-1].size, ndat);
|
|
fprintf (stderr, " max size reached before end of crossover\n");
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (option_trace >= 1)
|
|
printf ("sizes %ld to %ld total %d measurements\n",
|
|
(long) dat[0].size, (long) dat[ndat-1].size, ndat);
|
|
|
|
*threshold = dat[analyze_dat (1)].size;
|
|
|
|
if (param->min_is_always)
|
|
{
|
|
if (*threshold == param->min_size)
|
|
*threshold = 0;
|
|
}
|
|
|
|
if (! param->noprint || option_trace)
|
|
print_define_end (param->name, *threshold);
|
|
}
|
|
|
|
|
|
/* Special probing for the fft thresholds. The size restrictions on the
|
|
FFTs mean the graph of time vs size has a step effect. See this for
|
|
example using
|
|
|
|
./speed -s 4096-16384 -t 128 -P foo mpn_mul_fft.8 mpn_mul_fft.9
|
|
gnuplot foo.gnuplot
|
|
|
|
The current approach is to compare routines at the midpoint of relevant
|
|
steps. Arguably a more sophisticated system of threshold data is wanted
|
|
if this step effect remains. */
|
|
|
|
struct fft_param_t {
|
|
const char *threshold_name;
|
|
mp_size_t *p_threshold;
|
|
mp_size_t first_size;
|
|
mp_size_t max_size;
|
|
speed_function_t function;
|
|
speed_function_t mul_function;
|
|
mp_size_t sqr;
|
|
};
|
|
|
|
mp_size_t
|
|
fft_step_size (int size)
|
|
{
|
|
mp_size_t step;
|
|
|
|
step = mpir_fft_adjust_limbs(size + 1) - size;
|
|
|
|
if (step <= 0)
|
|
{
|
|
printf ("Can't handle size=%d\n", size);
|
|
abort ();
|
|
}
|
|
|
|
return step;
|
|
}
|
|
|
|
void
|
|
fft (struct fft_param_t *p,gmp_randstate_t rands)
|
|
{
|
|
mp_size_t size;
|
|
int i, k;
|
|
|
|
*p->p_threshold = MP_SIZE_T_MAX;
|
|
|
|
option_trace = MAX (option_trace, option_fft_trace);
|
|
|
|
size = p->first_size;
|
|
|
|
/* Declare an FFT faster than a plain toom3 etc multiplication found as
|
|
soon as one faster measurement obtained. A multiplication in the
|
|
middle of the FFT step is tested. */
|
|
for (;;)
|
|
{
|
|
double tk, tm;
|
|
|
|
size = mpir_fft_adjust_limbs (size+1);
|
|
|
|
if (size >= p->max_size)
|
|
break;
|
|
|
|
s.size = size + fft_step_size (size) / 2;
|
|
|
|
tk = tuneup_measure (p->function, rands, NULL, &s);
|
|
if (tk == -1.0)
|
|
abort ();
|
|
|
|
tm = tuneup_measure (p->mul_function, rands, NULL, &s);
|
|
if (tm == -1.0)
|
|
abort ();
|
|
|
|
if (option_trace >= 2)
|
|
printf ("at %ld size=%ld %.9f size=%ld %s mul %.9f\n",
|
|
(long) size,
|
|
(long) size + fft_step_size (size) / 2, tk,
|
|
(long) s.size, "full", tm);
|
|
|
|
if (tk < tm)
|
|
{
|
|
*p->p_threshold = s.size;
|
|
print_define (p->threshold_name, *p->p_threshold);
|
|
break;
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
/* Start karatsuba from 4, since the Cray t90 ieee code is much faster at 2,
|
|
giving wrong results. */
|
|
void
|
|
tune_mul (gmp_randstate_t rands)
|
|
{
|
|
static struct param_t param;
|
|
|
|
param.function = speed_mpn_mul_n;
|
|
|
|
param.name = "MUL_KARATSUBA_THRESHOLD";
|
|
param.min_size = MAX (4, MPN_KARA_MUL_N_MINSIZE);
|
|
param.max_size = MUL_KARATSUBA_THRESHOLD_LIMIT-1;
|
|
one (&mul_karatsuba_threshold, rands,¶m);
|
|
|
|
param.name = "MUL_TOOM3_THRESHOLD";
|
|
param.min_size = MAX (mul_karatsuba_threshold, MPN_TOOM3_MUL_N_MINSIZE);
|
|
param.max_size = MUL_TOOM3_THRESHOLD_LIMIT-1;
|
|
one (&mul_toom3_threshold, rands, ¶m);
|
|
|
|
param.name = "MUL_TOOM4_THRESHOLD";
|
|
param.min_size = MAX (mul_toom3_threshold, MPN_TOOM4_MUL_N_MINSIZE);
|
|
param.max_size = MUL_TOOM4_THRESHOLD_LIMIT-1;
|
|
one (&mul_toom4_threshold, rands, ¶m);
|
|
|
|
param.name = "MUL_TOOM8H_THRESHOLD";
|
|
param.min_size = MAX (mul_toom4_threshold, MPN_TOOM8H_MUL_MINSIZE);
|
|
param.max_size = MUL_TOOM8H_THRESHOLD_LIMIT-1;
|
|
one (&mul_toom8h_threshold, rands, ¶m);
|
|
|
|
/* disabled until tuned */
|
|
MUL_FFT_FULL_THRESHOLD = MP_SIZE_T_MAX;
|
|
}
|
|
|
|
|
|
/* This was written by the tuneup challenged tege. Kevin, please delete
|
|
this comment when you've reviewed/rewritten this. :-) */
|
|
void
|
|
tune_mullow (gmp_randstate_t rands)
|
|
{
|
|
static struct param_t param;
|
|
|
|
param.function = speed_mpn_mullow_n;
|
|
|
|
param.name = "MULLOW_BASECASE_THRESHOLD";
|
|
param.min_size = 3;
|
|
param.min_is_always = 1;
|
|
//param.max_size = MULLOW_BASECASE_THRESHOLD_LIMIT-1;
|
|
one (&mullow_basecase_threshold, rands, ¶m);
|
|
|
|
param.min_is_always = 0; /* ??? */
|
|
|
|
param.name = "MULLOW_DC_THRESHOLD";
|
|
param.min_size = mullow_basecase_threshold;
|
|
param.max_size = 1000;
|
|
one (&mullow_dc_threshold, rands, ¶m);
|
|
|
|
param.name = "MULLOW_MUL_THRESHOLD";
|
|
param.min_size = mullow_dc_threshold;
|
|
param.max_size = 10000;
|
|
one (&mullow_mul_threshold, rands, ¶m);
|
|
|
|
/* disabled until tuned */
|
|
MUL_FFT_FULL_THRESHOLD = MP_SIZE_T_MAX;
|
|
}
|
|
|
|
void
|
|
tune_mulmid (gmp_randstate_t rands)
|
|
{
|
|
static struct param_t param;
|
|
|
|
param.name = "MULMID_TOOM42_THRESHOLD";
|
|
param.function = speed_mpn_mulmid_n;
|
|
param.min_size = 4;
|
|
param.max_size = 100;
|
|
one (&mulmid_toom42_threshold, rands, ¶m);
|
|
|
|
/* disabled until tuned */
|
|
MUL_FFT_FULL_THRESHOLD = MP_SIZE_T_MAX;
|
|
}
|
|
|
|
void
|
|
tune_mulmod_2expm1 (gmp_randstate_t rands)
|
|
{
|
|
static struct param_t param;
|
|
param.function = speed_mpn_mulmod_2expm1;
|
|
param.name = "MULMOD_2EXPM1_THRESHOLD";
|
|
param.min_size = 1;
|
|
//param.max_size = ?? ;
|
|
one (&mulmod_2expm1_threshold, rands, ¶m);
|
|
/* disabled until tuned */
|
|
MUL_FFT_FULL_THRESHOLD = MP_SIZE_T_MAX; // ??????????????
|
|
}
|
|
|
|
void
|
|
tune_mulhigh (gmp_randstate_t rands)
|
|
{
|
|
static struct param_t param;
|
|
|
|
param.function = speed_mpn_mulhigh_n;
|
|
|
|
param.name = "MULHIGH_BASECASE_THRESHOLD";
|
|
param.min_size = 3;
|
|
param.min_is_always = 3;
|
|
//param.max_size = MULHIGH_BASECASE_THRESHOLD_LIMIT-1;
|
|
one (&mulhigh_basecase_threshold, rands, ¶m);
|
|
|
|
param.min_is_always = 0; /* ??? */
|
|
|
|
param.name = "MULHIGH_DC_THRESHOLD";
|
|
param.min_size = MAX(mulhigh_basecase_threshold,4);
|
|
param.max_size = 1000;
|
|
one (&mulhigh_dc_threshold, rands, ¶m);
|
|
|
|
param.name = "MULHIGH_MUL_THRESHOLD";
|
|
param.min_size = mulhigh_dc_threshold;
|
|
param.max_size = 10000;
|
|
one (&mulhigh_mul_threshold, rands, ¶m);
|
|
|
|
/* disabled until tuned */
|
|
MUL_FFT_FULL_THRESHOLD = MP_SIZE_T_MAX;
|
|
}
|
|
|
|
void
|
|
tune_rootrem (gmp_randstate_t rands)
|
|
{
|
|
|
|
static struct param_t param;
|
|
s.r=5; // tune for 5th root
|
|
param.function = speed_mpn_rootrem;
|
|
param.name = "ROOTREM_THRESHOLD";
|
|
param.min_size = 1;
|
|
one (&rootrem_threshold, rands, ¶m);
|
|
}
|
|
|
|
void
|
|
tune_divrem_hensel_qr_1 (gmp_randstate_t rands)
|
|
{
|
|
static struct param_t param;
|
|
param.function = speed_mpn_divrem_hensel_qr_1;
|
|
param.name = "DIVREM_HENSEL_QR_1_THRESHOLD";
|
|
param.min_size = 2;
|
|
one (&divrem_hensel_qr_1_threshold, rands, ¶m);
|
|
}
|
|
|
|
void
|
|
tune_rsh_divrem_hensel_qr_1 (gmp_randstate_t rands)
|
|
{
|
|
static struct param_t param;
|
|
param.function = speed_mpn_rsh_divrem_hensel_qr_1;
|
|
param.name = "RSH_DIVREM_HENSEL_QR_1_THRESHOLD";
|
|
param.min_size = 3;
|
|
one (&rsh_divrem_hensel_qr_1_threshold, rands, ¶m);
|
|
}
|
|
|
|
void
|
|
tune_divrem_euclid_hensel (gmp_randstate_t rands)
|
|
{
|
|
static struct param_t param;
|
|
param.function = speed_mpn_divrem_1;
|
|
param.name = "DIVREM_EUCLID_HENSEL_THRESHOLD";
|
|
param.min_size = 8;
|
|
s.r=0x81234567;// tune for this divisor
|
|
one (&divrem_euclid_hensel_threshold, rands, ¶m);
|
|
}
|
|
|
|
// for tuning we dont care if the divisors go out of range as it doesn't affect the runtime
|
|
void tune_mod_1_k (gmp_randstate_t rands)
|
|
{
|
|
static struct param_t param;
|
|
|
|
param.function = speed_mpn_divrem_euclidean_r_1;
|
|
|
|
param.name = "MOD_1_1_THRESHOLD";
|
|
param.min_size = 3;
|
|
one (&mod_1_1_threshold, rands, ¶m);
|
|
|
|
param.name = "MOD_1_2_THRESHOLD";
|
|
param.min_size = MAX(mod_1_1_threshold,4);
|
|
//param.max_size = 1000;
|
|
one (&mod_1_2_threshold, rands, ¶m);
|
|
|
|
param.name = "MOD_1_3_THRESHOLD";
|
|
param.min_size = MAX(mod_1_2_threshold,5);
|
|
//param.max_size = 10000;
|
|
one (&mod_1_3_threshold, rands, ¶m);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Start the basecase from 3, since 1 is a special case, and if mul_basecase
|
|
is faster only at size==2 then we don't want to bother with extra code
|
|
just for that. Start karatsuba from 4 same as MUL above. */
|
|
|
|
void
|
|
tune_sqr (gmp_randstate_t rands)
|
|
{
|
|
/* disabled until tuned */
|
|
SQR_FFT_FULL_THRESHOLD = MP_SIZE_T_MAX;
|
|
|
|
if (HAVE_NATIVE_mpn_sqr_basecase)
|
|
{
|
|
print_define_remark ("SQR_BASECASE_THRESHOLD", 0, "always (native)");
|
|
sqr_basecase_threshold = 0;
|
|
}
|
|
else
|
|
{
|
|
static struct param_t param;
|
|
param.name = "SQR_BASECASE_THRESHOLD";
|
|
param.function = speed_mpn_sqr;
|
|
param.min_size = 3;
|
|
param.min_is_always = 1;
|
|
param.max_size = TUNE_SQR_KARATSUBA_MAX;
|
|
param.noprint = 1;
|
|
one (&sqr_basecase_threshold, rands, ¶m);
|
|
}
|
|
|
|
{
|
|
static struct param_t param;
|
|
param.name = "SQR_KARATSUBA_THRESHOLD";
|
|
param.function = speed_mpn_sqr;
|
|
param.min_size = MAX (4, MPN_KARA_SQR_N_MINSIZE);
|
|
param.max_size = TUNE_SQR_KARATSUBA_MAX;
|
|
param.noprint = 1;
|
|
one (&sqr_karatsuba_threshold, rands, ¶m);
|
|
|
|
if (! HAVE_NATIVE_mpn_sqr_basecase
|
|
&& sqr_karatsuba_threshold < sqr_basecase_threshold)
|
|
{
|
|
/* Karatsuba becomes faster than mul_basecase before
|
|
sqr_basecase does. Arrange for the expression
|
|
"BELOW_THRESHOLD (un, SQR_KARATSUBA_THRESHOLD))" which
|
|
selects mpn_sqr_basecase in mpn_sqr to be false, by setting
|
|
SQR_KARATSUBA_THRESHOLD to zero, making
|
|
SQR_BASECASE_THRESHOLD the karatsuba threshold. */
|
|
|
|
sqr_basecase_threshold = SQR_KARATSUBA_THRESHOLD;
|
|
SQR_KARATSUBA_THRESHOLD = 0;
|
|
|
|
print_define_remark ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold,
|
|
"karatsuba");
|
|
print_define_remark ("SQR_KARATSUBA_THRESHOLD",SQR_KARATSUBA_THRESHOLD,
|
|
"never sqr_basecase");
|
|
}
|
|
else
|
|
{
|
|
if (! HAVE_NATIVE_mpn_sqr_basecase)
|
|
print_define ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold);
|
|
print_define ("SQR_KARATSUBA_THRESHOLD", SQR_KARATSUBA_THRESHOLD);
|
|
}
|
|
}
|
|
|
|
{
|
|
static struct param_t param;
|
|
param.function = speed_mpn_sqr;
|
|
|
|
{
|
|
param.name = "SQR_TOOM3_THRESHOLD";
|
|
param.min_size = MAX3 (MPN_TOOM3_SQR_N_MINSIZE,
|
|
SQR_KARATSUBA_THRESHOLD, SQR_BASECASE_THRESHOLD);
|
|
param.max_size = SQR_TOOM3_THRESHOLD_LIMIT-1;
|
|
one (&sqr_toom3_threshold, rands, ¶m);
|
|
}
|
|
|
|
{
|
|
param.name = "SQR_TOOM4_THRESHOLD";
|
|
param.min_size = MAX (MPN_TOOM4_SQR_N_MINSIZE, sqr_toom3_threshold);
|
|
param.max_size = SQR_TOOM4_THRESHOLD_LIMIT-1;
|
|
one (&sqr_toom4_threshold, rands, ¶m);
|
|
}
|
|
|
|
{
|
|
param.name = "SQR_TOOM8_THRESHOLD";
|
|
param.min_size = MAX (MPN_TOOM8_SQR_N_MINSIZE, sqr_toom4_threshold);
|
|
param.max_size = SQR_TOOM8_THRESHOLD_LIMIT-1;
|
|
one (&sqr_toom8_threshold, rands, ¶m);
|
|
}
|
|
}
|
|
}
|
|
|
|
void
|
|
tune_dc_div (gmp_randstate_t rands)
|
|
{
|
|
{
|
|
static struct param_t param;
|
|
param.name = "DC_DIV_QR_THRESHOLD";
|
|
param.function = speed_mpn_dc_div_qr_n;
|
|
param.step_factor = 0.02;
|
|
one (&dc_div_qr_threshold, rands, ¶m);
|
|
}
|
|
|
|
{
|
|
static struct param_t param;
|
|
param.name = "INV_DIV_QR_THRESHOLD";
|
|
param.max_size = 10000;
|
|
param.function = speed_mpn_inv_div_qr;
|
|
param.min_size = dc_div_qr_threshold;
|
|
param.step_factor = 0.02;
|
|
one (&inv_div_qr_threshold, rands, ¶m);
|
|
}
|
|
|
|
{
|
|
static struct param_t param;
|
|
param.name = "INV_DIVAPPR_Q_N_THRESHOLD";
|
|
param.function = speed_mpn_inv_divappr_q;
|
|
param.max_size = 10000;
|
|
param.min_size = dc_divappr_q_threshold;
|
|
param.step_factor = 0.1;
|
|
param.stop_factor = 0.2;
|
|
one (&inv_divappr_q_n_threshold, rands, ¶m);
|
|
}
|
|
}
|
|
|
|
void
|
|
tune_tdiv_q (gmp_randstate_t rands)
|
|
{
|
|
{
|
|
static struct param_t param;
|
|
param.name = "DC_DIV_Q_THRESHOLD";
|
|
param.function = speed_mpn_tdiv_q1;
|
|
param.step_factor = 0.02;
|
|
one (&dc_div_q_threshold, rands, ¶m);
|
|
}
|
|
|
|
{
|
|
static struct param_t param;
|
|
param.name = "INV_DIV_Q_THRESHOLD";
|
|
param.function = speed_mpn_tdiv_q1;
|
|
param.max_size = 10000;
|
|
param.min_size = dc_div_q_threshold;
|
|
param.step_factor = 0.02;
|
|
one (&inv_div_q_threshold, rands, ¶m);
|
|
}
|
|
|
|
{
|
|
static struct param_t param;
|
|
param.name = "DC_DIVAPPR_Q_THRESHOLD";
|
|
param.function = speed_mpn_tdiv_q2;
|
|
param.step_factor = 0.02;
|
|
one (&dc_divappr_q_threshold, rands, ¶m);
|
|
}
|
|
|
|
{
|
|
static struct param_t param;
|
|
param.name = "INV_DIVAPPR_Q_THRESHOLD";
|
|
param.function = speed_mpn_tdiv_q2;
|
|
param.max_size = 20000;
|
|
param.min_size = dc_divappr_q_threshold;
|
|
param.step_factor = 0.1;
|
|
one (&inv_divappr_q_threshold, rands, ¶m);
|
|
}
|
|
}
|
|
|
|
void
|
|
tune_dc_bdiv (gmp_randstate_t rands)
|
|
{
|
|
{
|
|
static struct param_t param;
|
|
param.name = "DC_BDIV_QR_THRESHOLD";
|
|
param.function = speed_mpn_dc_bdiv_qr_n;
|
|
param.step_factor = 0.02;
|
|
one (&dc_bdiv_qr_threshold, rands, ¶m);
|
|
}
|
|
|
|
{
|
|
static struct param_t param;
|
|
param.name = "DC_BDIV_Q_THRESHOLD";
|
|
param.function = speed_mpn_dc_bdiv_q;
|
|
param.step_factor = 0.02;
|
|
one (&dc_bdiv_q_threshold, rands, ¶m);
|
|
}
|
|
}
|
|
|
|
void
|
|
tune_binvert (gmp_randstate_t rands)
|
|
{
|
|
static struct param_t param;
|
|
|
|
param.function = speed_mpn_binvert;
|
|
param.name = "BINV_NEWTON_THRESHOLD";
|
|
param.min_size = 8; /* pointless with smaller operands */
|
|
one (&binv_newton_threshold, rands, ¶m);
|
|
}
|
|
|
|
void
|
|
tune_redc (gmp_randstate_t rands)
|
|
{
|
|
#define TUNE_REDC_2_MAX 100
|
|
#if HAVE_NATIVE_mpn_addmul_2 || HAVE_NATIVE_mpn_redc_2
|
|
#define WANT_REDC_2 1
|
|
#endif
|
|
|
|
#if WANT_REDC_2
|
|
{
|
|
static struct param_t param;
|
|
param.name = "REDC_1_TO_REDC_2_THRESHOLD";
|
|
param.function = speed_mpn_redc_1;
|
|
param.function2 = speed_mpn_redc_2;
|
|
param.min_size = 1;
|
|
param.min_is_always = 1;
|
|
param.max_size = TUNE_REDC_2_MAX;
|
|
param.noprint = 1;
|
|
param.stop_factor = 1.5;
|
|
one (&redc_1_to_redc_2_threshold, rands, ¶m);
|
|
}
|
|
{
|
|
static struct param_t param;
|
|
param.name = "REDC_2_TO_REDC_N_THRESHOLD";
|
|
param.function = speed_mpn_redc_2;
|
|
param.function2 = speed_mpn_redc_n;
|
|
param.min_size = 16;
|
|
param.noprint = 1;
|
|
one (&redc_2_to_redc_n_threshold, rands, ¶m);
|
|
}
|
|
if (redc_1_to_redc_2_threshold >= redc_2_to_redc_n_threshold)
|
|
{
|
|
redc_2_to_redc_n_threshold = 0; /* disable redc_2 */
|
|
|
|
/* Never use redc2, measure redc_1 -> redc_n cutoff, store result as
|
|
REDC_1_TO_REDC_2_THRESHOLD. */
|
|
{
|
|
static struct param_t param;
|
|
param.name = "REDC_1_TO_REDC_2_THRESHOLD";
|
|
param.function = speed_mpn_redc_1;
|
|
param.function2 = speed_mpn_redc_n;
|
|
param.min_size = 16;
|
|
param.noprint = 1;
|
|
one (&redc_1_to_redc_2_threshold, rands, ¶m);
|
|
}
|
|
}
|
|
print_define ("REDC_1_TO_REDC_2_THRESHOLD", REDC_1_TO_REDC_2_THRESHOLD);
|
|
print_define ("REDC_2_TO_REDC_N_THRESHOLD", REDC_2_TO_REDC_N_THRESHOLD);
|
|
#else
|
|
{
|
|
static struct param_t param;
|
|
param.name = "REDC_1_TO_REDC_N_THRESHOLD";
|
|
param.function = speed_mpn_redc_1;
|
|
param.function2 = speed_mpn_redc_n;
|
|
param.min_size = 16;
|
|
one (&redc_1_to_redc_n_threshold, rands, ¶m);
|
|
}
|
|
#endif
|
|
}
|
|
|
|
void
|
|
tune_matrix22_mul (gmp_randstate_t rands)
|
|
{
|
|
static struct param_t param;
|
|
param.name = "MATRIX22_STRASSEN_THRESHOLD";
|
|
param.function = speed_mpn_matrix22_mul;
|
|
param.min_size = 2;
|
|
one (&matrix22_strassen_threshold, rands, ¶m);
|
|
}
|
|
|
|
void
|
|
tune_hgcd (gmp_randstate_t rands)
|
|
{
|
|
static struct param_t param;
|
|
param.name = "HGCD_THRESHOLD";
|
|
param.function = speed_mpn_hgcd;
|
|
/* We seem to get strange results for small sizes */
|
|
param.min_size = 30;
|
|
one (&hgcd_threshold, rands, ¶m);
|
|
}
|
|
|
|
void
|
|
tune_hgcd_appr (gmp_randstate_t rands)
|
|
{
|
|
static struct param_t param;
|
|
param.name = "HGCD_APPR_THRESHOLD";
|
|
param.function = speed_mpn_hgcd_appr;
|
|
/* We seem to get strange results for small sizes */
|
|
param.min_size = 50;
|
|
param.stop_since_change = 150;
|
|
one (&hgcd_appr_threshold, rands, ¶m);
|
|
}
|
|
|
|
void
|
|
tune_hgcd_reduce (gmp_randstate_t rands)
|
|
{
|
|
static struct param_t param;
|
|
param.name = "HGCD_REDUCE_THRESHOLD";
|
|
param.function = speed_mpn_hgcd_reduce;
|
|
param.min_size = 30;
|
|
param.max_size = 7000;
|
|
param.step_factor = 0.04;
|
|
one (&hgcd_reduce_threshold, rands, ¶m);
|
|
}
|
|
|
|
void
|
|
tune_gcd_dc (gmp_randstate_t rands)
|
|
{
|
|
static struct param_t param;
|
|
param.name = "GCD_DC_THRESHOLD";
|
|
param.function = speed_mpn_gcd;
|
|
param.min_size = hgcd_threshold;
|
|
param.max_size = 3000;
|
|
param.step_factor = 0.02;
|
|
one (&gcd_dc_threshold, rands, ¶m);
|
|
}
|
|
|
|
void
|
|
tune_gcdext_dc (gmp_randstate_t rands)
|
|
{
|
|
static struct param_t param;
|
|
param.name = "GCDEXT_DC_THRESHOLD";
|
|
param.function = speed_mpn_gcdext;
|
|
param.min_size = hgcd_threshold;
|
|
param.max_size = 3000;
|
|
param.step_factor = 0.02;
|
|
one (&gcdext_dc_threshold, rands, ¶m);
|
|
}
|
|
|
|
|
|
|
|
/* Common param_t settings for the single-limb divisor thresholds
   (DIVREM_1_* and MOD_1_*).  Expects a variable named "param" in scope.

   size_extra==1 reflects the fact that with high<divisor one division is
   always skipped.  Forcing high<divisor while testing ensures consistency
   while stepping through sizes, ie. that size-1 divides will be done each
   time.

   min_size==2 and min_is_always are used so that if plain division is only
   better at size==1 then don't bother including that code just for that
   case, instead go with preinv always and get a size saving.

   Wrapped in do/while(0) so that "DIV_1_PARAMS;" is a single statement.  */

#define DIV_1_PARAMS                    \
  do {                                  \
    param.check_size = 256;             \
    param.min_size = 2;                 \
    param.min_is_always = 1;            \
    param.data_high = DATA_HIGH_LT_R;   \
    param.size_extra = 1;               \
    param.stop_factor = 2.0;            \
  } while (0)


double (*tuned_speed_mpn_divrem_1)(struct speed_params *s);
|
|
|
|
void
|
|
tune_divrem_1 (gmp_randstate_t rands)
|
|
{
|
|
/* plain version by default */
|
|
tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1;
|
|
|
|
/* No support for tuning native assembler code, do that by hand and put
|
|
the results in the .asm file, there's no need for such thresholds to
|
|
appear in gmp-mparam.h. */
|
|
if (HAVE_NATIVE_mpn_divrem_1)
|
|
return;
|
|
|
|
if (GMP_NAIL_BITS != 0)
|
|
{
|
|
print_define_remark ("DIVREM_1_NORM_THRESHOLD", MP_SIZE_T_MAX,
|
|
"no preinv with nails");
|
|
print_define_remark ("DIVREM_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX,
|
|
"no preinv with nails");
|
|
return;
|
|
}
|
|
|
|
if (UDIV_PREINV_ALWAYS)
|
|
{
|
|
print_define_remark ("DIVREM_1_NORM_THRESHOLD", 0L, "preinv always");
|
|
print_define ("DIVREM_1_UNNORM_THRESHOLD", 0L);
|
|
return;
|
|
}
|
|
|
|
tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1_tune;
|
|
|
|
/* Tune for the integer part of mpn_divrem_1. This will very possibly be
|
|
a bit out for the fractional part, but that's too bad, the integer part
|
|
is more important. */
|
|
{
|
|
static struct param_t param;
|
|
param.name = "DIVREM_1_NORM_THRESHOLD";
|
|
DIV_1_PARAMS;
|
|
s.r = randlimb_norm (rands);
|
|
param.function = speed_mpn_divrem_1_tune;
|
|
one (&divrem_1_norm_threshold, rands, ¶m);
|
|
}
|
|
{
|
|
static struct param_t param;
|
|
param.name = "DIVREM_1_UNNORM_THRESHOLD";
|
|
DIV_1_PARAMS;
|
|
s.r = randlimb_half (rands);
|
|
param.function = speed_mpn_divrem_1_tune;
|
|
one (&divrem_1_unnorm_threshold, rands, ¶m);
|
|
}
|
|
}
|
|
|
|
|
|
double (*tuned_speed_mpn_mod_1)(struct speed_params *s);
|
|
|
|
void
|
|
tune_mod_1 (gmp_randstate_t rands)
|
|
{
|
|
/* plain version by default */
|
|
tuned_speed_mpn_mod_1 = speed_mpn_mod_1;
|
|
|
|
/* No support for tuning native assembler code, do that by hand and put
|
|
the results in the .asm file, there's no need for such thresholds to
|
|
appear in gmp-mparam.h. */
|
|
if (HAVE_NATIVE_mpn_mod_1)
|
|
return;
|
|
|
|
if (GMP_NAIL_BITS != 0)
|
|
{
|
|
print_define_remark ("MOD_1_NORM_THRESHOLD", MP_SIZE_T_MAX,
|
|
"no preinv with nails");
|
|
print_define_remark ("MOD_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX,
|
|
"no preinv with nails");
|
|
return;
|
|
}
|
|
|
|
if (UDIV_PREINV_ALWAYS)
|
|
{
|
|
print_define ("MOD_1_NORM_THRESHOLD", 0L);
|
|
print_define ("MOD_1_UNNORM_THRESHOLD", 0L);
|
|
return;
|
|
}
|
|
|
|
tuned_speed_mpn_mod_1 = speed_mpn_mod_1_tune;
|
|
|
|
{
|
|
static struct param_t param;
|
|
param.name = "MOD_1_NORM_THRESHOLD";
|
|
DIV_1_PARAMS;
|
|
s.r = randlimb_norm (rands);
|
|
param.function = speed_mpn_mod_1_tune;
|
|
one (&mod_1_norm_threshold, rands, ¶m);
|
|
}
|
|
{
|
|
static struct param_t param;
|
|
param.name = "MOD_1_UNNORM_THRESHOLD";
|
|
DIV_1_PARAMS;
|
|
s.r = randlimb_half (rands);
|
|
param.function = speed_mpn_mod_1_tune;
|
|
one (&mod_1_unnorm_threshold, rands, ¶m);
|
|
}
|
|
}
|
|
|
|
|
|
/* A non-zero DIVREM_1_UNNORM_THRESHOLD (or DIVREM_1_NORM_THRESHOLD) would
|
|
imply that udiv_qrnnd_preinv is worth using, but it seems most
|
|
straightforward to compare mpn_preinv_divrem_1 and mpn_divrem_1_div
|
|
directly. */
|
|
|
|
void
|
|
tune_preinv_divrem_1 (gmp_randstate_t rands)
|
|
{
|
|
static struct param_t param;
|
|
speed_function_t divrem_1;
|
|
const char *divrem_1_name;
|
|
double t1, t2;
|
|
|
|
if (GMP_NAIL_BITS != 0)
|
|
{
|
|
print_define_remark ("USE_PREINV_DIVREM_1", 0, "no preinv with nails");
|
|
return;
|
|
}
|
|
|
|
/* Any native version of mpn_preinv_divrem_1 is assumed to exist because
|
|
it's faster than mpn_divrem_1. */
|
|
if (HAVE_NATIVE_mpn_preinv_divrem_1)
|
|
{
|
|
print_define_remark ("USE_PREINV_DIVREM_1", 1, "native");
|
|
return;
|
|
}
|
|
|
|
/* If udiv_qrnnd_preinv is the only division method then of course
|
|
mpn_preinv_divrem_1 should be used. */
|
|
if (UDIV_PREINV_ALWAYS)
|
|
{
|
|
print_define_remark ("USE_PREINV_DIVREM_1", 1, "preinv always");
|
|
return;
|
|
}
|
|
|
|
/* If we've got an assembler version of mpn_divrem_1, then compare against
|
|
that, not the mpn_divrem_1_div generic C. */
|
|
if (HAVE_NATIVE_mpn_divrem_1)
|
|
{
|
|
divrem_1 = speed_mpn_divrem_1;
|
|
divrem_1_name = "mpn_divrem_1";
|
|
}
|
|
else
|
|
{
|
|
divrem_1 = speed_mpn_divrem_1_div;
|
|
divrem_1_name = "mpn_divrem_1_div";
|
|
}
|
|
|
|
param.data_high = DATA_HIGH_LT_R; /* allow skip one division */
|
|
s.size = 200; /* generous but not too big */
|
|
/* Divisor, nonzero. Unnormalized so as to exercise the shift!=0 case,
|
|
since in general that's probably most common, though in fact for a
|
|
64-bit limb mp_bases[10].big_base is normalized. */
|
|
s.r = urandom(rands) & (GMP_NUMB_MASK >> 4);
|
|
if (s.r == 0) s.r = 123;
|
|
|
|
t1 = tuneup_measure (speed_mpn_preinv_divrem_1, rands, ¶m, &s);
|
|
t2 = tuneup_measure (divrem_1, rands, ¶m, &s);
|
|
if (t1 == -1.0 || t2 == -1.0)
|
|
{
|
|
printf ("Oops, can't measure mpn_preinv_divrem_1 and %s at %ld\n",
|
|
divrem_1_name, (long) s.size);
|
|
abort ();
|
|
}
|
|
if (option_trace >= 1)
|
|
printf ("size=%ld, mpn_preinv_divrem_1 %.9f, %s %.9f\n",
|
|
(long) s.size, t1, divrem_1_name, t2);
|
|
|
|
print_define_remark ("USE_PREINV_DIVREM_1", (mp_size_t) (t1 < t2), NULL);
|
|
}
|
|
|
|
|
|
/* A non-zero MOD_1_UNNORM_THRESHOLD (or MOD_1_NORM_THRESHOLD) would imply
|
|
that udiv_qrnnd_preinv is worth using, but it seems most straightforward
|
|
to compare mpn_preinv_mod_1 and mpn_mod_1_div directly. */
|
|
|
|
void
|
|
tune_preinv_mod_1 (gmp_randstate_t rands)
|
|
{
|
|
static struct param_t param;
|
|
speed_function_t mod_1;
|
|
const char *mod_1_name;
|
|
double t1, t2;
|
|
|
|
/* Any native version of mpn_preinv_mod_1 is assumed to exist because it's
|
|
faster than mpn_mod_1. */
|
|
if (HAVE_NATIVE_mpn_preinv_mod_1)
|
|
{
|
|
print_define_remark ("USE_PREINV_MOD_1", 1, "native");
|
|
return;
|
|
}
|
|
|
|
if (GMP_NAIL_BITS != 0)
|
|
{
|
|
print_define_remark ("USE_PREINV_MOD_1", 0, "no preinv with nails");
|
|
return;
|
|
}
|
|
|
|
/* If udiv_qrnnd_preinv is the only division method then of course
|
|
mpn_preinv_mod_1 should be used. */
|
|
if (UDIV_PREINV_ALWAYS)
|
|
{
|
|
print_define_remark ("USE_PREINV_MOD_1", 1, "preinv always");
|
|
return;
|
|
}
|
|
|
|
/* If we've got an assembler version of mpn_mod_1, then compare against
|
|
that, not the mpn_mod_1_div generic C. */
|
|
if (HAVE_NATIVE_mpn_mod_1)
|
|
{
|
|
mod_1 = speed_mpn_mod_1;
|
|
mod_1_name = "mpn_mod_1";
|
|
}
|
|
else
|
|
{
|
|
mod_1 = speed_mpn_mod_1_div;
|
|
mod_1_name = "mpn_mod_1_div";
|
|
}
|
|
|
|
param.data_high = DATA_HIGH_LT_R; /* let mpn_mod_1 skip one division */
|
|
s.size = 200; /* generous but not too big */
|
|
s.r = randlimb_norm(rands); /* divisor */
|
|
|
|
t1 = tuneup_measure (speed_mpn_preinv_mod_1, rands, ¶m, &s);
|
|
t2 = tuneup_measure (mod_1, rands, ¶m, &s);
|
|
if (t1 == -1.0 || t2 == -1.0)
|
|
{
|
|
printf ("Oops, can't measure mpn_preinv_mod_1 and %s at %ld\n",
|
|
mod_1_name, (long) s.size);
|
|
abort ();
|
|
}
|
|
if (option_trace >= 1)
|
|
printf ("size=%ld, mpn_preinv_mod_1 %.9f, %s %.9f\n",
|
|
(long) s.size, t1, mod_1_name, t2);
|
|
|
|
print_define_remark ("USE_PREINV_MOD_1", (mp_size_t) (t1 < t2), NULL);
|
|
}
|
|
|
|
|
|
void
|
|
tune_divrem_2 (gmp_randstate_t rands)
|
|
{
|
|
static struct param_t param;
|
|
|
|
/* No support for tuning native assembler code, do that by hand and put
|
|
the results in the .asm file, and there's no need for such thresholds
|
|
to appear in gmp-mparam.h. */
|
|
if (HAVE_NATIVE_mpn_divrem_2)
|
|
return;
|
|
|
|
if (GMP_NAIL_BITS != 0)
|
|
{
|
|
print_define_remark ("DIVREM_2_THRESHOLD", MP_SIZE_T_MAX,
|
|
"no preinv with nails");
|
|
return;
|
|
}
|
|
|
|
if (UDIV_PREINV_ALWAYS)
|
|
{
|
|
print_define_remark ("DIVREM_2_THRESHOLD", 0L, "preinv always");
|
|
return;
|
|
}
|
|
|
|
/* Tune for the integer part of mpn_divrem_2. This will very possibly be
|
|
a bit out for the fractional part, but that's too bad, the integer part
|
|
is more important.
|
|
|
|
min_size must be >=2 since nsize>=2 is required, but is set to 4 to save
|
|
code space if plain division is better only at size==2 or size==3. */
|
|
param.name = "DIVREM_2_THRESHOLD";
|
|
param.check_size = 256;
|
|
param.min_size = 4;
|
|
param.min_is_always = 1;
|
|
param.size_extra = 2; /* does qsize==nsize-2 divisions */
|
|
param.stop_factor = 2.0;
|
|
|
|
s.r = randlimb_norm (rands);
|
|
param.function = speed_mpn_divrem_2;
|
|
one (&divrem_2_threshold, rands, ¶m);
|
|
}
|
|
|
|
|
|
/* mpn_divexact_1 is vaguely expected to be used on smallish divisors, so
|
|
tune for that. Its speed can differ on odd or even divisor, so take an
|
|
average threshold for the two.
|
|
|
|
mpn_divrem_1 can vary with high<divisor or not, whereas mpn_divexact_1
|
|
might not vary that way, but don't test this since high<divisor isn't
|
|
expected to occur often with small divisors. */
|
|
|
|
void
|
|
tune_divexact_1 (gmp_randstate_t rands)
|
|
{
|
|
static struct param_t param;
|
|
mp_size_t thresh[2], average;
|
|
int low, i;
|
|
|
|
/* Any native mpn_divexact_1 is assumed to incorporate all the speed of a
|
|
full mpn_divrem_1. */
|
|
if (HAVE_NATIVE_mpn_divexact_1)
|
|
{
|
|
print_define_remark ("DIVEXACT_1_THRESHOLD", 0, "always (native)");
|
|
return;
|
|
}
|
|
|
|
ASSERT_ALWAYS (tuned_speed_mpn_divrem_1 != NULL);
|
|
|
|
param.name = "DIVEXACT_1_THRESHOLD";
|
|
param.data_high = DATA_HIGH_GE_R;
|
|
param.check_size = 256;
|
|
param.min_size = 2;
|
|
param.stop_factor = 1.5;
|
|
param.function = tuned_speed_mpn_divrem_1;
|
|
param.function2 = speed_mpn_divexact_1;
|
|
param.noprint = 1;
|
|
|
|
print_define_start (param.name);
|
|
|
|
for (low = 0; low <= 1; low++)
|
|
{
|
|
s.r = randlimb_half(rands);
|
|
if (low == 0)
|
|
s.r |= 1;
|
|
else
|
|
s.r &= ~CNST_LIMB(7);
|
|
|
|
one (&thresh[low], rands, ¶m);
|
|
if (option_trace)
|
|
printf ("low=%d thresh %ld\n", low, (long) thresh[low]);
|
|
|
|
if (thresh[low] == MP_SIZE_T_MAX)
|
|
{
|
|
average = MP_SIZE_T_MAX;
|
|
goto divexact_1_done;
|
|
}
|
|
}
|
|
|
|
if (option_trace)
|
|
{
|
|
printf ("average of:");
|
|
for (i = 0; i < numberof(thresh); i++)
|
|
printf (" %ld", (long) thresh[i]);
|
|
printf ("\n");
|
|
}
|
|
|
|
average = 0;
|
|
for (i = 0; i < numberof(thresh); i++)
|
|
average += thresh[i];
|
|
average /= numberof(thresh);
|
|
|
|
/* If divexact turns out to be better as early as 3 limbs, then use it
|
|
always, so as to reduce code size and conditional jumps. */
|
|
if (average <= 3)
|
|
average = 0;
|
|
|
|
divexact_1_done:
|
|
print_define_end (param.name, average);
|
|
}
|
|
|
|
|
|
/* The generic mpn_modexact_1_odd skips a divide step if high<divisor, the
|
|
same as mpn_mod_1, but this might not be true of an assembler
|
|
implementation. The threshold used is an average based on data where a
|
|
divide can be skipped and where it can't.
|
|
|
|
If modexact turns out to be better as early as 3 limbs, then use it
|
|
always, so as to reduce code size and conditional jumps. */
|
|
|
|
void
|
|
tune_modexact_1_odd (gmp_randstate_t rands)
|
|
{
|
|
static struct param_t param;
|
|
mp_size_t thresh_lt, thresh_ge, average;
|
|
|
|
/* Any native mpn_modexact_1_odd is assumed to incorporate all the speed
|
|
of a full mpn_mod_1. */
|
|
if (HAVE_NATIVE_mpn_modexact_1_odd)
|
|
{
|
|
print_define_remark ("MODEXACT_1_ODD_THRESHOLD", 0, "always (native)");
|
|
return;
|
|
}
|
|
|
|
ASSERT_ALWAYS (tuned_speed_mpn_mod_1 != NULL);
|
|
|
|
param.name = "MODEXACT_1_ODD_THRESHOLD";
|
|
param.check_size = 256;
|
|
param.min_size = 2;
|
|
param.stop_factor = 1.5;
|
|
param.function = tuned_speed_mpn_mod_1;
|
|
param.function2 = speed_mpn_modexact_1c_odd;
|
|
param.noprint = 1;
|
|
s.r = randlimb_half (rands) | 1;
|
|
|
|
print_define_start (param.name);
|
|
|
|
param.data_high = DATA_HIGH_LT_R;
|
|
one (&thresh_lt, rands, ¶m);
|
|
if (option_trace)
|
|
printf ("lt thresh %ld\n", (long) thresh_lt);
|
|
|
|
average = thresh_lt;
|
|
if (thresh_lt != MP_SIZE_T_MAX)
|
|
{
|
|
param.data_high = DATA_HIGH_GE_R;
|
|
one (&thresh_ge, rands, ¶m);
|
|
if (option_trace)
|
|
printf ("ge thresh %ld\n", (long) thresh_ge);
|
|
|
|
if (thresh_ge != MP_SIZE_T_MAX)
|
|
{
|
|
average = (thresh_ge + thresh_lt) / 2;
|
|
if (thresh_ge <= 3)
|
|
average = 0;
|
|
}
|
|
}
|
|
|
|
print_define_end (param.name, average);
|
|
}
|
|
|
|
|
|
void
|
|
tune_jacobi_base (gmp_randstate_t rands)
|
|
{
|
|
static struct param_t param;
|
|
double t1, t2, t3, t4;
|
|
int method;
|
|
|
|
s.size = GMP_LIMB_BITS * 3 / 4;
|
|
|
|
t1 = tuneup_measure (speed_mpn_jacobi_base_1, rands, ¶m, &s);
|
|
if (option_trace >= 1)
|
|
printf ("size=%ld, mpn_jacobi_base_1 %.9f\n", (long) s.size, t1);
|
|
|
|
t2 = tuneup_measure (speed_mpn_jacobi_base_2, rands, ¶m, &s);
|
|
if (option_trace >= 1)
|
|
printf ("size=%ld, mpn_jacobi_base_2 %.9f\n", (long) s.size, t2);
|
|
|
|
t3 = tuneup_measure (speed_mpn_jacobi_base_3, rands, ¶m, &s);
|
|
if (option_trace >= 1)
|
|
printf ("size=%ld, mpn_jacobi_base_3 %.9f\n", (long) s.size, t3);
|
|
|
|
t4 = tuneup_measure (speed_mpn_jacobi_base_4, rands, ¶m, &s);
|
|
if (option_trace >= 1)
|
|
printf ("size=%ld, mpn_jacobi_base_4 %.9f\n", (long) s.size, t4);
|
|
|
|
if (t1 == -1.0 || t2 == -1.0 || t3 == -1.0 || t4 == -1.0)
|
|
{
|
|
printf ("Oops, can't measure all mpn_jacobi_base methods at %ld\n",
|
|
(long) s.size);
|
|
abort ();
|
|
}
|
|
|
|
if (t1 < t2 && t1 < t3 && t1 < t4)
|
|
method = 1;
|
|
else if (t2 < t3 && t2 < t4)
|
|
method = 2;
|
|
else if (t3 < t4)
|
|
method = 3;
|
|
else
|
|
method = 4;
|
|
|
|
print_define ("JACOBI_BASE_METHOD", method);
|
|
}
|
|
|
|
|
|
|
|
void
|
|
tune_get_str (gmp_randstate_t rands)
|
|
{
|
|
/* Tune for decimal, it being most common. Some rough testing suggests
|
|
other bases are different, but not by very much. */
|
|
s.r = 10;
|
|
{
|
|
static struct param_t param;
|
|
GET_STR_PRECOMPUTE_THRESHOLD = 0;
|
|
param.name = "GET_STR_DC_THRESHOLD";
|
|
param.function = speed_mpn_get_str;
|
|
param.min_size = 4;
|
|
param.max_size = GET_STR_THRESHOLD_LIMIT;
|
|
one (&get_str_dc_threshold, rands, ¶m);
|
|
}
|
|
{
|
|
static struct param_t param;
|
|
param.name = "GET_STR_PRECOMPUTE_THRESHOLD";
|
|
param.function = speed_mpn_get_str;
|
|
param.min_size = GET_STR_DC_THRESHOLD;
|
|
param.max_size = GET_STR_THRESHOLD_LIMIT;
|
|
one (&get_str_precompute_threshold, rands, ¶m);
|
|
}
|
|
}
|
|
|
|
double
|
|
speed_mpn_pre_set_str (struct speed_params *s)
|
|
{
|
|
unsigned char *str;
|
|
mp_ptr wp;
|
|
mp_size_t wn;
|
|
unsigned i;
|
|
int base;
|
|
double t;
|
|
mp_ptr powtab_mem, tp;
|
|
powers_t powtab[GMP_LIMB_BITS];
|
|
mp_size_t un;
|
|
int chars_per_limb;
|
|
TMP_DECL;
|
|
|
|
SPEED_RESTRICT_COND (s->size >= 1);
|
|
|
|
base = s->r == 0 ? 10 : s->r;
|
|
SPEED_RESTRICT_COND (base >= 2 && base <= 256);
|
|
|
|
TMP_MARK;
|
|
|
|
str = TMP_ALLOC (s->size);
|
|
for (i = 0; i < s->size; i++)
|
|
str[i] = s->xp[i] % base;
|
|
|
|
wn = ((mp_size_t) (s->size / mp_bases[base].chars_per_bit_exactly))
|
|
/ GMP_LIMB_BITS + 2;
|
|
SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp);
|
|
|
|
/* use this during development to check wn is big enough */
|
|
/*
|
|
ASSERT_ALWAYS (mpn_set_str (wp, str, s->size, base) <= wn);
|
|
*/
|
|
|
|
speed_operand_src (s, (mp_ptr) str, s->size/BYTES_PER_MP_LIMB);
|
|
speed_operand_dst (s, wp, wn);
|
|
speed_cache_fill (s);
|
|
|
|
chars_per_limb = mp_bases[base].chars_per_limb;
|
|
un = s->size / chars_per_limb + 1;
|
|
powtab_mem = TMP_BALLOC_LIMBS (mpn_dc_set_str_powtab_alloc (un));
|
|
mpn_set_str_compute_powtab (powtab, powtab_mem, un, base);
|
|
tp = TMP_BALLOC_LIMBS (mpn_dc_set_str_itch (un));
|
|
|
|
speed_starttime ();
|
|
i = s->reps;
|
|
do
|
|
{
|
|
mpn_pre_set_str (wp, str, s->size, powtab, tp);
|
|
}
|
|
while (--i != 0);
|
|
t = speed_endtime ();
|
|
|
|
TMP_FREE;
|
|
return t;
|
|
}
|
|
|
|
void
|
|
tune_set_str (gmp_randstate_t rands)
|
|
{
|
|
s.r = 10; /* decimal */
|
|
{
|
|
static struct param_t param;
|
|
SET_STR_PRECOMPUTE_THRESHOLD = 0;
|
|
param.step_factor = 0.01;
|
|
param.name = "SET_STR_DC_THRESHOLD";
|
|
param.function = speed_mpn_pre_set_str;
|
|
param.min_size = 100;
|
|
param.max_size = 50000;
|
|
one (&set_str_dc_threshold, rands, ¶m);
|
|
}
|
|
{
|
|
static struct param_t param;
|
|
param.step_factor = 0.02;
|
|
param.name = "SET_STR_PRECOMPUTE_THRESHOLD";
|
|
param.function = speed_mpn_set_str;
|
|
param.min_size = SET_STR_DC_THRESHOLD;
|
|
param.max_size = 100000;
|
|
one (&set_str_precompute_threshold, rands, ¶m);
|
|
}
|
|
}
|
|
|
|
void
|
|
tune_fft(gmp_randstate_t state)
|
|
{
|
|
mp_bitcnt_t depth, w, depth1, w1;
|
|
clock_t start, end;
|
|
double elapsed;
|
|
double best = 0.0;
|
|
mp_size_t best_off, off, best_d, best_w, num_twos, num_printed;
|
|
|
|
if (option_fft_max_size == 0)
|
|
return;
|
|
|
|
printf("/* fft_tuning -- autogenerated by tune-fft */\n\n");
|
|
printf("#define FFT_TAB \\\n");
|
|
fflush(stdout);
|
|
|
|
printf(" { "); fflush(stdout);
|
|
for (depth = 6; depth <= 10; depth++)
|
|
{
|
|
printf("{ "); fflush(stdout);
|
|
for (w = 1; w <= 2; w++)
|
|
{
|
|
int iters = 100*((mp_size_t) 1 << (3*(10 - depth)/2)), i;
|
|
|
|
mp_size_t n = ((mp_limb_t)1<<depth);
|
|
mp_bitcnt_t bits1 = (n*w - (depth + 1))/2;
|
|
mp_size_t len1 = 2*n;
|
|
mp_size_t len2 = 2*n;
|
|
|
|
mp_bitcnt_t b1 = len1*bits1, b2 = len2*bits1;
|
|
mp_size_t n1, n2;
|
|
mp_size_t j;
|
|
mp_limb_t * i1, *i2, *r1;
|
|
|
|
n1 = (b1 - 1)/GMP_LIMB_BITS + 1;
|
|
n2 = (b2 - 1)/GMP_LIMB_BITS + 1;
|
|
|
|
i1 = malloc(2*(n1 + n2)*sizeof(mp_limb_t));
|
|
i2 = i1 + n1;
|
|
r1 = i2 + n2;
|
|
|
|
mpn_urandomb(i1, state, b1);
|
|
mpn_urandomb(i2, state, b2);
|
|
|
|
best_off = -1;
|
|
|
|
for (off = 0; off <= 4; off++)
|
|
{
|
|
start = clock();
|
|
for (i = 0; i < iters; i++)
|
|
mpn_mul_trunc_sqrt2(r1, i1, n1, i2, n2, depth - off, w*((mp_size_t)1 << (off*2)));
|
|
end = clock();
|
|
|
|
elapsed = ((double) (end - start)) / CLOCKS_PER_SEC;
|
|
|
|
if (elapsed < best || best_off == -1)
|
|
{
|
|
best_off = off;
|
|
best = elapsed;
|
|
}
|
|
}
|
|
|
|
printf("%ld", best_off);
|
|
if (w != 2) printf(",");
|
|
printf(" "); fflush(stdout);
|
|
|
|
free(i1);
|
|
}
|
|
printf("}");
|
|
if (depth != 10) printf(",");
|
|
printf(" "); fflush(stdout);
|
|
}
|
|
|
|
printf("}\n\n");
|
|
|
|
best_d = 12;
|
|
best_w = 1;
|
|
best_off = -1;
|
|
num_printed = 0;
|
|
num_twos = 0;
|
|
|
|
printf("#define MULMOD_TAB \\\n");
|
|
fflush(stdout);
|
|
printf(" { "); fflush(stdout);
|
|
for (depth = 12; best_off != 1 && !(num_printed >= 25 && best_off == 2 && num_twos >= 5) ; depth++)
|
|
{
|
|
for (w = 1; w <= 2; w++)
|
|
{
|
|
int iters = 100*((mp_size_t) 1 << (3*(18 - depth)/2)), i;
|
|
mp_size_t n = ((mp_limb_t)1<<depth);
|
|
mp_bitcnt_t bits = n*w;
|
|
mp_size_t int_limbs = (bits - 1)/GMP_LIMB_BITS + 1;
|
|
mp_size_t j;
|
|
mp_limb_t c, * i1, * i2, * r1, * tt;
|
|
|
|
if (depth <= 21) iters = 32*((mp_size_t) 1 << (21 - depth));
|
|
else iters = MAX(32/((mp_size_t) 1 << (depth - 21)), 1);
|
|
|
|
i1 = malloc(6*(int_limbs+1)*sizeof(mp_limb_t));
|
|
i2 = i1 + int_limbs + 1;
|
|
r1 = i2 + int_limbs + 1;
|
|
tt = r1 + 2*(int_limbs + 1);
|
|
|
|
mpn_urandomb(i1, state, int_limbs*GMP_LIMB_BITS);
|
|
mpn_urandomb(i2, state, int_limbs*GMP_LIMB_BITS);
|
|
i1[int_limbs] = 0;
|
|
i2[int_limbs] = 0;
|
|
|
|
depth1 = 1;
|
|
while ((((mp_limb_t)1)<<depth1) < bits) depth1++;
|
|
depth1 = depth1/2;
|
|
|
|
w1 = bits/(((mp_limb_t)1)<<(2*depth1));
|
|
|
|
best_off = -1;
|
|
|
|
for (off = 0; off <= 4; off++)
|
|
{
|
|
start = clock();
|
|
for (i = 0; i < iters; i++)
|
|
mpir_fft_mulmod_2expp1(r1, i1, i2, int_limbs, depth1 - off, w1*((mp_size_t)1 << (off*2)));
|
|
end = clock();
|
|
|
|
elapsed = ((double) (end - start)) / CLOCKS_PER_SEC;
|
|
|
|
if (best_off == -1 || elapsed < best)
|
|
{
|
|
best_off = off;
|
|
best = elapsed;
|
|
}
|
|
}
|
|
|
|
start = clock();
|
|
for (i = 0; i < iters; i++)
|
|
mpn_mulmod_2expp1_basecase(r1, i1, i2, 0, bits, tt);
|
|
end = clock();
|
|
|
|
elapsed = ((double) (end - start)) / CLOCKS_PER_SEC;
|
|
if (elapsed < best)
|
|
{
|
|
best_d = depth + (w == 2);
|
|
best_w = w + 1 - 2*(w == 2);
|
|
}
|
|
|
|
printf("%ld", best_off);
|
|
if (best_off == 2)
|
|
num_twos++;
|
|
else
|
|
num_twos = 0;
|
|
num_printed++;
|
|
if (w != 2) printf(", "); fflush(stdout);
|
|
|
|
free(i1);
|
|
}
|
|
printf(", "); fflush(stdout);
|
|
}
|
|
if (best_off == 2)
|
|
{
|
|
printf("2, 2, 2, 2, 2, 1, 1 }\n\n");
|
|
num_printed += 6;
|
|
} else
|
|
printf("1 }\n\n");
|
|
|
|
printf("#define FFT_N_NUM %ld\n\n", num_printed + 1);
|
|
|
|
printf("#define FFT_MULMOD_2EXPP1_CUTOFF %ld\n\n", ((mp_limb_t) 1 << best_d)*best_w/(2*GMP_LIMB_BITS));
|
|
}
|
|
|
|
void
|
|
tune_fac_ui (gmp_randstate_t rands)
|
|
{
|
|
static struct param_t param;
|
|
|
|
param.function = speed_mpz_fac_ui_tune;
|
|
|
|
param.name = "FAC_DSC_THRESHOLD";
|
|
param.min_size = 70;
|
|
param.max_size = FAC_DSC_THRESHOLD_LIMIT;
|
|
one (&fac_dsc_threshold, rands, ¶m);
|
|
|
|
param.name = "FAC_ODD_THRESHOLD";
|
|
param.min_size = 22;
|
|
param.stop_factor = 1.7;
|
|
param.min_is_always = 1;
|
|
one (&fac_odd_threshold, rands, ¶m);
|
|
}
|
|
|
|
void
|
|
tune_fft_mul (gmp_randstate_t rands)
|
|
{
|
|
static struct fft_param_t param;
|
|
|
|
if (option_fft_max_size == 0)
|
|
return;
|
|
|
|
param.threshold_name = "MUL_FFT_FULL_THRESHOLD";
|
|
param.p_threshold = &mul_fft_full_threshold;
|
|
param.first_size = MUL_TOOM8H_THRESHOLD / 2;
|
|
param.max_size = option_fft_max_size;
|
|
param.function = speed_mpn_mul_fft_main;
|
|
param.mul_function = speed_mpn_mul_n;
|
|
param.sqr = 0;
|
|
fft (¶m,rands);
|
|
}
|
|
|
|
|
|
void
|
|
tune_fft_sqr (gmp_randstate_t rands)
|
|
{
|
|
static struct fft_param_t param;
|
|
|
|
if (option_fft_max_size == 0)
|
|
return;
|
|
|
|
param.threshold_name = "SQR_FFT_FULL_THRESHOLD";
|
|
param.p_threshold = &sqr_fft_full_threshold;
|
|
param.first_size = SQR_TOOM8_THRESHOLD / 2;
|
|
param.max_size = option_fft_max_size;
|
|
param.function = speed_mpn_sqr_fft_main;
|
|
param.mul_function = speed_mpn_sqr;
|
|
param.sqr = 0;
|
|
fft (¶m,rands);
|
|
}
|
|
|
|
/* Under MSVC, name vc_gmp_mparam.h as the file shown in the
   "Parameters for ..." banner that all() prints.  */
#ifdef _MSC_VER
#define GMP_MPARAM_H_SUGGEST "vc_gmp_mparam.h"
#endif

void
|
|
all (gmp_randstate_t rands)
|
|
{
|
|
time_t start_time, end_time;
|
|
TMP_DECL;
|
|
|
|
TMP_MARK;
|
|
SPEED_TMP_ALLOC_LIMBS (s.xp_block, SPEED_BLOCK_SIZE, 0);
|
|
SPEED_TMP_ALLOC_LIMBS (s.yp_block, SPEED_BLOCK_SIZE, 0);
|
|
|
|
mpn_randomb (s.xp_block, rands, SPEED_BLOCK_SIZE);
|
|
mpn_randomb (s.yp_block, rands, SPEED_BLOCK_SIZE);
|
|
|
|
fprintf (stderr, "Parameters for %s\n", GMP_MPARAM_H_SUGGEST);
|
|
|
|
speed_time_init ();
|
|
fprintf (stderr, "Using: %s\n", speed_time_string);
|
|
|
|
fprintf (stderr, "speed_precision %d", speed_precision);
|
|
if (speed_unittime == 1.0)
|
|
fprintf (stderr, ", speed_unittime 1 cycle");
|
|
else
|
|
fprintf (stderr, ", speed_unittime %.2e secs", speed_unittime);
|
|
if (speed_cycletime == 1.0 || speed_cycletime == 0.0)
|
|
fprintf (stderr, ", CPU freq unknown\n");
|
|
else
|
|
fprintf (stderr, ", CPU freq %.2f MHz\n", 1e-6/speed_cycletime);
|
|
|
|
fprintf (stderr, "DEFAULT_MAX_SIZE %d, fft_max_size %ld\n",
|
|
DEFAULT_MAX_SIZE, (long) option_fft_max_size);
|
|
fprintf (stderr, "\n");
|
|
|
|
time (&start_time);
|
|
{
|
|
struct tm *tp;
|
|
tp = localtime (&start_time);
|
|
printf ("/* Generated by tuneup.c, %d-%02d-%02d, ",
|
|
tp->tm_year+1900, tp->tm_mon+1, tp->tm_mday);
|
|
|
|
#ifdef __GNUC__
|
|
/* gcc sub-minor version doesn't seem to come through as a define */
|
|
printf ("gcc %d.%d */\n", __GNUC__, __GNUC_MINOR__);
|
|
#define PRINTED_COMPILER
|
|
#endif
|
|
#if defined (__SUNPRO_C)
|
|
printf ("Sun C %d.%d */\n", __SUNPRO_C / 0x100, __SUNPRO_C % 0x100);
|
|
#define PRINTED_COMPILER
|
|
#endif
|
|
#if ! defined (__GNUC__) && defined (__sgi) && defined (_COMPILER_VERSION)
|
|
/* gcc defines __sgi and _COMPILER_VERSION on irix 6, avoid that */
|
|
printf ("MIPSpro C %d.%d.%d */\n",
|
|
_COMPILER_VERSION / 100,
|
|
_COMPILER_VERSION / 10 % 10,
|
|
_COMPILER_VERSION % 10);
|
|
#define PRINTED_COMPILER
|
|
#endif
|
|
#if defined (__DECC) && defined (__DECC_VER)
|
|
printf ("DEC C %d */\n", __DECC_VER);
|
|
#define PRINTED_COMPILER
|
|
#endif
|
|
#if ! defined (PRINTED_COMPILER)
|
|
printf ("system compiler */\n");
|
|
#endif
|
|
}
|
|
printf ("\n");
|
|
|
|
tune_mul (rands);
|
|
printf("\n");
|
|
|
|
tune_sqr (rands);
|
|
printf("\n");
|
|
|
|
tune_divrem_1 (rands);
|
|
tune_mod_1 (rands);
|
|
tune_preinv_divrem_1 (rands);
|
|
tune_preinv_mod_1 (rands);
|
|
tune_divrem_2 (rands);
|
|
tune_divexact_1 (rands);
|
|
tune_modexact_1_odd (rands);
|
|
tune_mod_1_k(rands);
|
|
tune_divrem_hensel_qr_1(rands);
|
|
tune_rsh_divrem_hensel_qr_1(rands);
|
|
tune_divrem_euclid_hensel(rands);
|
|
printf("\n");
|
|
|
|
tune_fft_mul (rands);
|
|
printf("\n");
|
|
|
|
tune_fft_sqr (rands);
|
|
printf ("\n");
|
|
|
|
tune_mullow (rands);
|
|
printf("\n");
|
|
tune_mulmid (rands);
|
|
printf("\n");
|
|
tune_mulhigh (rands);
|
|
printf("\n");
|
|
|
|
tune_mulmod_2expm1(rands);
|
|
printf("\n");
|
|
|
|
/* dc_div_qr_n, dc_divappr_q, inv_div_qr, inv_divappr_q */
|
|
tune_dc_div (rands);
|
|
|
|
/* mpn_tdiv_q : balanced */
|
|
tune_tdiv_q (rands);
|
|
|
|
/* dc_bdiv_qr_n, dc_bdiv_q */
|
|
tune_dc_bdiv (rands);
|
|
printf("\n");
|
|
|
|
tune_binvert (rands);
|
|
tune_redc (rands);
|
|
printf("\n");
|
|
|
|
tune_rootrem(rands);
|
|
printf("\n");
|
|
|
|
tune_matrix22_mul (rands);
|
|
tune_hgcd (rands);
|
|
tune_hgcd_appr (rands);
|
|
tune_hgcd_reduce(rands);
|
|
tune_gcd_dc (rands);
|
|
tune_gcdext_dc (rands);
|
|
tune_jacobi_base (rands);
|
|
printf("\n");
|
|
|
|
tune_get_str (rands);
|
|
tune_set_str (rands);
|
|
printf("\n");
|
|
|
|
tune_fac_ui (rands);
|
|
printf("\n");
|
|
|
|
tune_fft (rands);
|
|
printf("\n");
|
|
|
|
time (&end_time);
|
|
printf ("/* Tuneup completed successfully, took %ld seconds */\n",
|
|
end_time - start_time);
|
|
|
|
TMP_FREE;
|
|
}
|
|
|
|
|
|
int
|
|
main (int argc, char *argv[])
|
|
{
|
|
int opt;
|
|
gmp_randstate_t rands;
|
|
|
|
gmp_randinit_default(rands);
|
|
/* Unbuffered so if output is redirected to a file it isn't lost if the
|
|
program is killed part way through. */
|
|
setbuf (stdout, NULL);
|
|
setbuf (stderr, NULL);
|
|
|
|
while ((opt = getopt(argc, argv, "f:o:p:t")) != EOF)
|
|
{
|
|
switch (opt) {
|
|
case 'f':
|
|
if (optarg[0] == 't')
|
|
option_fft_trace = 2;
|
|
else
|
|
option_fft_max_size = atol (optarg);
|
|
break;
|
|
case 'o':
|
|
speed_option_set (optarg);
|
|
break;
|
|
case 'p':
|
|
speed_precision = atoi (optarg);
|
|
break;
|
|
case 't':
|
|
option_trace++;
|
|
break;
|
|
case '?':
|
|
exit(1);
|
|
}
|
|
}
|
|
|
|
all (rands);
|
|
exit (0);
|
|
}
|