Attempt to tune some of the division functions.

This commit is contained in:
(no author) 2010-02-19 12:54:56 +00:00
parent 89ef2d6caf
commit f444a2bf6c
14 changed files with 447 additions and 124 deletions

View File

@ -584,51 +584,6 @@ void __gmp_tmp_debug_free _PROTO ((const char *, int, int,
#define GMP_NAIL_LOWBIT (CNST_LIMB(1) << GMP_NUMB_BITS)
#endif
#if GMP_NAIL_BITS != 0
/* Set various *_THRESHOLD values to be used for nails. Thus we avoid using
code that has not yet been qualified. */
#undef DIV_SB_PREINV_THRESHOLD
#undef DIV_DC_THRESHOLD
#undef POWM_THRESHOLD
#define DIV_SB_PREINV_THRESHOLD MP_SIZE_T_MAX
#define DIV_DC_THRESHOLD 50
#define POWM_THRESHOLD 0
#undef GCD_ACCEL_THRESHOLD
#define GCD_ACCEL_THRESHOLD 3
#undef DIVREM_1_NORM_THRESHOLD
#undef DIVREM_1_UNNORM_THRESHOLD
#undef MOD_1_NORM_THRESHOLD
#undef MOD_1_UNNORM_THRESHOLD
#undef USE_PREINV_DIVREM_1
#undef USE_PREINV_MOD_1
#undef DIVREM_2_THRESHOLD
#undef DIVEXACT_1_THRESHOLD
#undef MODEXACT_1_ODD_THRESHOLD
#define DIVREM_1_NORM_THRESHOLD MP_SIZE_T_MAX /* no preinv */
#define DIVREM_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* no preinv */
#define MOD_1_NORM_THRESHOLD MP_SIZE_T_MAX /* no preinv */
#define MOD_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* no preinv */
#define USE_PREINV_DIVREM_1 0 /* no preinv */
#define USE_PREINV_MOD_1 0 /* no preinv */
#define DIVREM_2_THRESHOLD MP_SIZE_T_MAX /* no preinv */
#undef GET_STR_DC_THRESHOLD
#undef GET_STR_PRECOMPUTE_THRESHOLD
#undef SET_STR_THRESHOLD
#define GET_STR_DC_THRESHOLD 22
#define GET_STR_PRECOMPUTE_THRESHOLD 42
#define SET_STR_THRESHOLD 3259
/* mpn/generic/mul_fft.c is not nails-capable. */
#undef MUL_FFT_THRESHOLD
#undef SQR_FFT_THRESHOLD
#define MUL_FFT_THRESHOLD MP_SIZE_T_MAX
#define SQR_FFT_THRESHOLD MP_SIZE_T_MAX
#endif
/* Swap macros. */
#define MP_LIMB_T_SWAP(x, y) \
@ -1270,6 +1225,8 @@ void mpn_mul_fft_full _PROTO ((mp_ptr op,
#define mpn_fft_next_size __MPN(fft_next_size)
mp_size_t mpn_fft_next_size _PROTO ((mp_size_t pl, int k)) ATTRIBUTE_CONST;
#define DC_DIVAPPR_Q_N_ITCH(n) (n*10)
#define mpn_sb_divrem_mn __MPN(sb_divrem_mn)
mp_limb_t mpn_sb_divrem_mn _PROTO ((mp_ptr, mp_ptr, mp_size_t,
mp_srcptr, mp_size_t));
@ -1790,18 +1747,37 @@ __GMP_DECLSPEC extern const mp_limb_t __gmp_fib_table[];
#define MPN_FFT_TABLE_SIZE 16
/* mpn_dc_divrem_n(n) calls 2*mul(n/2)+2*div(n/2), thus to be faster than
div(n) = 4*div(n/2), we need mul(n/2) to be faster than the classic way,
i.e. n/2 >= MUL_KARATSUBA_THRESHOLD
Measured values are between 2 and 4 times MUL_KARATSUBA_THRESHOLD, so go
for 3 as an average. */
#ifndef DIV_DC_THRESHOLD
#define DIV_DC_THRESHOLD (3 * MUL_KARATSUBA_THRESHOLD)
#ifndef DC_DIV_QR_THRESHOLD
#define DC_DIV_QR_THRESHOLD (3 * MUL_KARATSUBA_THRESHOLD)
#endif
#ifndef DC_DIVAPPR_Q_N_THRESHOLD
#define DC_DIVAPPR_Q_N_THRESHOLD (3 * MUL_KARATSUBA_THRESHOLD)
#endif
#ifndef INV_DIV_QR_THRESHOLD
#define INV_DIV_QR_THRESHOLD (MUL_FFT_THRESHOLD/3)
#endif
#ifndef INV_DIVAPPR_Q_N_THRESHOLD
#define INV_DIVAPPR_Q_N_THRESHOLD (MUL_FFT_THRESHOLD/3)
#endif
#ifndef DC_DIV_Q_THRESHOLD
#define DC_DIV_Q_THRESHOLD (3 * MUL_KARATSUBA_THRESHOLD)
#endif
#ifndef INV_DIV_Q_THRESHOLD
#define INV_DIV_Q_THRESHOLD (MUL_FFT_THRESHOLD/3)
#endif
#ifndef DC_DIVAPPR_Q_THRESHOLD
#define DC_DIVAPPR_Q_THRESHOLD (3 * MUL_TOOM3_THRESHOLD)
#endif
#ifndef INV_DIVAPPR_Q_THRESHOLD
#define INV_DIVAPPR_Q_THRESHOLD (MUL_FFT_THRESHOLD/2)
#endif
/* Return non-zero if xp,xsize and yp,ysize overlap.
If xp+xsize<=yp there's no overlap, or if yp+ysize<=xp there's no
@ -4180,9 +4156,21 @@ extern mp_size_t mulmod_2expm1_threshold;
extern mp_size_t div_sb_preinv_threshold;
#endif
#undef DIV_DC_THRESHOLD
#define DIV_DC_THRESHOLD div_dc_threshold
extern mp_size_t div_dc_threshold;
#undef DC_DIV_QR_THRESHOLD
#define DC_DIV_QR_THRESHOLD dc_div_qr_threshold
extern mp_size_t dc_div_qr_threshold;
#undef DC_DIVAPPR_Q_N_THRESHOLD
#define DC_DIVAPPR_Q_N_THRESHOLD dc_divappr_q_n_threshold
extern mp_size_t dc_divappr_q_n_threshold;
#undef INV_DIV_QR_THRESHOLD
#define INV_DIV_QR_THRESHOLD inv_div_qr_threshold
extern mp_size_t inv_div_qr_threshold;
#undef INV_DIVAPPR_Q_N_THRESHOLD
#define INV_DIVAPPR_Q_N_THRESHOLD inv_divappr_q_n_threshold
extern mp_size_t inv_divappr_q_n_threshold;
#undef POWM_THRESHOLD
#define POWM_THRESHOLD powm_threshold

View File

@ -28,8 +28,6 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp-impl.h"
#include "longlong.h"
#define DC_DIV_QR_THRESHOLD 56
mp_limb_t
mpn_dc_div_qr_n (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n,
mp_limb_t dinv, mp_ptr tp)

View File

@ -28,10 +28,6 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp-impl.h"
#include "longlong.h"
#define DC_DIVAPPR_Q_THRESHOLD 141 /*FIXME: tune these */
#define DC_DIVAPPR_QR_THRESHOLD 145
#define DC_DIV_QR_THRESHOLD 56
mp_limb_t
mpn_dc_divappr_q (mp_ptr qp, mp_ptr np, mp_size_t nn,
mp_srcptr dp, mp_size_t dn, mp_limb_t dinv)
@ -187,7 +183,7 @@ mpn_dc_divappr_q (mp_ptr qp, mp_ptr np, mp_size_t nn,
q2p = TMP_SALLOC_LIMBS (qn + 1);
if (BELOW_THRESHOLD (qn, DC_DIVAPPR_Q_THRESHOLD))
if (BELOW_THRESHOLD (qn, DC_DIVAPPR_Q_N_THRESHOLD))
{
qh = mpn_sb_divappr_q (q2p, np - qn - 2, 2 * (qn + 1),
dp - (qn + 1), qn + 1, dinv);

View File

@ -64,8 +64,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
has that description.
*/
#define DC_DIVAPPR_Q_N_THRESHOLD 56
mp_limb_t
mpn_dc_divappr_q_n (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n,
mp_limb_t dip, mp_ptr tp)

View File

@ -29,6 +29,8 @@ MA 02110-1301, USA. */
#include "mpir.h"
#include "gmp-impl.h"
#define DIV_DC_THRESHOLD 16
/*
[1] Fast Recursive Division, by Christoph Burnikel and Joachim Ziegler,
Technical report MPI-I-98-1-022, october 1998.

View File

@ -30,9 +30,6 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp-impl.h"
#include "longlong.h"
#define DC_DIV_QR_THRESHOLD 56
#define INV_DIV_QR_THRESHOLD 1200
mp_limb_t
mpn_inv_div_qr (mp_ptr qp,
mp_ptr np, mp_size_t nn,

View File

@ -28,12 +28,6 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp-impl.h"
#include "longlong.h"
#define DC_DIVAPPR_Q_THRESHOLD 141 /*FIXME: tune these */
#define INV_DIVAPPR_Q_THRESHOLD 1000
#define INV_DIV_QR_THRESHOLD 1200
#define DC_DIVAPPR_QR_THRESHOLD 145
#define DC_DIV_QR_THRESHOLD 56
mp_limb_t
mpn_inv_divappr_q (mp_ptr qp, mp_ptr np, mp_size_t nn,
mp_srcptr dp, mp_size_t dn, mp_srcptr dinv)
@ -190,7 +184,7 @@ mpn_inv_divappr_q (mp_ptr qp, mp_ptr np, mp_size_t nn,
qh = mpn_sb_divappr_q (q2p, np - qn - 2, 2 * (qn + 1),
dp - (qn + 1), qn + 1, dinv2);
}
else if (BELOW_THRESHOLD (qn, INV_DIVAPPR_Q_THRESHOLD))
else if (BELOW_THRESHOLD (qn, INV_DIVAPPR_Q_N_THRESHOLD))
{
/* It is tempting to use qp for recursive scratch and put quotient in
tp, but the recursive scratch needs one limb too many. */

View File

@ -81,11 +81,6 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
for the code to be correct. */
#define FUDGE 5 /* FIXME: tune this */
#define DC_DIV_Q_THRESHOLD 156 /* FIXME: tune this */
#define DC_DIVAPPR_Q_THRESHOLD 156 /* FIXME: tune this */
#define INV_DIV_Q_THRESHOLD 1000
#define INV_DIVAPPR_Q_THRESHOLD 2000
void
mpn_tdiv_q (mp_ptr qp,
mp_srcptr np, mp_size_t nn,

View File

@ -35,9 +35,6 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
#include "gmp-impl.h"
#include "longlong.h"
#define DC_DIV_QR_THRESHOLD 152 /*FIXME: tune this!*/
#define INV_DIV_QR_THRESHOLD 1000
void
mpn_tdiv_qr (mp_ptr qp, mp_ptr rp, mp_size_t qxn,
mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn)

View File

@ -23,7 +23,10 @@
#define MULMOD_2EXPM1_THRESHOLD 18
#define DIV_SB_PREINV_THRESHOLD 0 /* always */
#define DIV_DC_THRESHOLD 372
#define DC_DIV_QR_THRESHOLD 14
#define DC_DIVAPPR_Q_N_THRESHOLD 10
#define INV_DIV_QR_THRESHOLD 926
#define INV_DIVAPPR_Q_N_THRESHOLD 926
#define POWM_THRESHOLD 154
#define FAC_UI_THRESHOLD 17010

View File

@ -129,7 +129,8 @@ TUNE_MPN_SRCS = $(TUNE_MPN_SRCS_BASIC) divrem_1.c mod_1.c
TUNE_MPN_SRCS_BASIC = dc_divrem_n.c divrem_2.c gcd.c gcdext.c get_str.c \
mul_n.c mullow_n.c mulhigh_n.c mul_fft.c mul.c sb_divrem_mn.c tdiv_qr.c toom4_mul_n.c \
toom7_mul_n.c mulmod_2expm1.c rootrem.c divrem_euclidean_r_1.c divrem_hensel_qr_1.c \
rsh_divrem_hensel_qr_1.c
rsh_divrem_hensel_qr_1.c dc_divappr_q.c dc_div_qr.c inv_divappr_q.c \
inv_div_qr.c
$(TUNE_MPN_SRCS_BASIC):
for i in $(TUNE_MPN_SRCS_BASIC); do \

View File

@ -874,6 +874,26 @@ speed_mpn_sb_divrem_m3_inv (struct speed_params *s)
{
SPEED_ROUTINE_MPN_SB_DIVREM_M3 (mpn_sb_divrem_mn_inv);
}
double
speed_mpn_dc_div_qr_n (struct speed_params *s)
{
SPEED_ROUTINE_MPN_DC_DIV_N_TSIZE (mpn_dc_div_qr_n, DC_DIVAPPR_Q_N_ITCH(s->size));
}
double
speed_mpn_dc_divappr_q (struct speed_params *s)
{
SPEED_ROUTINE_MPN_DC_DIV_SMALL_Q (mpn_dc_divappr_q);
}
double
speed_mpn_inv_div_qr (struct speed_params *s)
{
SPEED_ROUTINE_MPN_INV_DIV (mpn_inv_div_qr);
}
double
speed_mpn_inv_divappr_q (struct speed_params *s)
{
SPEED_ROUTINE_MPN_INV_DIV_SMALL_Q (mpn_inv_divappr_q);
}
double
speed_mpz_mod (struct speed_params *s)

View File

@ -178,6 +178,10 @@ double speed_mpn_dc_divrem_sb _PROTO ((struct speed_params *s));
double speed_mpn_dc_divrem_sb_div _PROTO ((struct speed_params *s));
double speed_mpn_dc_divrem_sb_inv _PROTO ((struct speed_params *s));
double speed_mpn_dc_tdiv_qr _PROTO ((struct speed_params *s));
double speed_mpn_dc_div_qr_n _PROTO ((struct speed_params *s));
double speed_mpn_dc_divappr_q _PROTO ((struct speed_params *s));
double speed_mpn_inv_div_qr _PROTO ((struct speed_params *s));
double speed_mpn_inv_divappr_q _PROTO ((struct speed_params *s));
double speed_MPN_COPY _PROTO ((struct speed_params *s));
double speed_MPN_COPY_DECR _PROTO ((struct speed_params *s));
double speed_MPN_COPY_INCR _PROTO ((struct speed_params *s));
@ -1499,6 +1503,263 @@ int speed_routine_count_zeros_setup _PROTO ((struct speed_params *s,
return t; \
}
#define SPEED_ROUTINE_MPN_DC_DIV_N_TSIZE(function, tsize) \
{ \
unsigned i; \
mp_ptr a, d, q, tmp; \
mp_limb_t inv; \
double t; \
TMP_DECL; \
\
SPEED_RESTRICT_COND (s->size >= 2); \
\
TMP_MARK; \
SPEED_TMP_ALLOC_LIMBS (a, 2*s->size, s->align_xp); \
SPEED_TMP_ALLOC_LIMBS (d, s->size, s->align_yp); \
SPEED_TMP_ALLOC_LIMBS (q, s->size+1, s->align_wp); \
SPEED_TMP_ALLOC_LIMBS (tmp, tsize, s->align_wp2); \
\
MPN_COPY (a, s->xp, s->size); \
MPN_COPY (a+s->size, s->xp, s->size); \
\
MPN_COPY (d, s->yp, s->size); \
\
/* normalize the data */ \
d[s->size-1] |= GMP_NUMB_HIGHBIT; \
a[2*s->size-1] = d[s->size-1] - 1; \
\
speed_operand_src (s, a, 2*s->size); \
speed_operand_src (s, d, s->size); \
speed_operand_dst (s, q, s->size+1); \
speed_cache_fill (s); \
\
invert_1(inv, d[s->size-1], d[s->size-2]); \
\
speed_starttime (); \
i = s->reps; \
do \
function(q, a, d, s->size, inv, tmp); \
while (--i != 0); \
t = speed_endtime (); \
\
TMP_FREE; \
return t; \
}
#define SPEED_ROUTINE_MPN_DC_DIV(function) \
{ \
unsigned i; \
mp_ptr a, d, q; \
mp_limb_t inv; \
double t; \
TMP_DECL; \
\
SPEED_RESTRICT_COND (s->size >= 2); \
\
TMP_MARK; \
SPEED_TMP_ALLOC_LIMBS (a, 2*s->size, s->align_xp); \
SPEED_TMP_ALLOC_LIMBS (d, s->size, s->align_yp); \
SPEED_TMP_ALLOC_LIMBS (q, s->size+1, s->align_wp); \
\
MPN_COPY (a, s->xp, s->size); \
MPN_COPY (a+s->size, s->xp, s->size); \
\
MPN_COPY (d, s->yp, s->size); \
\
/* normalize the data */ \
d[s->size-1] |= GMP_NUMB_HIGHBIT; \
a[2*s->size-1] = d[s->size-1] - 1; \
\
speed_operand_src (s, a, 2*s->size); \
speed_operand_src (s, d, s->size); \
speed_operand_dst (s, q, s->size+1); \
speed_cache_fill (s); \
\
invert_1(inv, d[s->size-1], d[s->size-2]); \
\
speed_starttime (); \
i = s->reps; \
do \
function(q, a, 2*s->size, d, s->size, inv); \
while (--i != 0); \
t = speed_endtime (); \
\
TMP_FREE; \
return t; \
}
#define SPEED_ROUTINE_MPN_DC_DIV_SMALL_Q(function) \
{ \
unsigned i; \
mp_ptr a, d, q; \
mp_limb_t inv; \
double t; \
TMP_DECL; \
\
SPEED_RESTRICT_COND (s->size >= 2); \
\
TMP_MARK; \
SPEED_TMP_ALLOC_LIMBS (a, 3*s->size, s->align_xp); \
SPEED_TMP_ALLOC_LIMBS (d, 2*s->size, s->align_yp); \
SPEED_TMP_ALLOC_LIMBS (q, s->size+1, s->align_wp); \
\
MPN_COPY (a, s->xp, s->size); \
MPN_COPY (a+s->size, s->xp, s->size); \
MPN_COPY (a+2*s->size, s->xp, s->size); \
\
MPN_COPY (d, s->yp, s->size); \
MPN_COPY (d+s->size, s->yp, s->size); \
\
/* normalize the data */ \
d[2*s->size-1] |= GMP_NUMB_HIGHBIT; \
a[3*s->size-1] = d[2*s->size-1] - 1; \
\
speed_operand_src (s, a, 3*s->size); \
speed_operand_src (s, d, 2*s->size); \
speed_operand_dst (s, q, s->size+1); \
speed_cache_fill (s); \
\
invert_1(inv, d[2*s->size-1], d[2*s->size-2]); \
\
speed_starttime (); \
i = s->reps; \
do \
function(q, a, 3*s->size, d, 2*s->size, inv); \
while (--i != 0); \
t = speed_endtime (); \
\
TMP_FREE; \
return t; \
}
#define SPEED_ROUTINE_MPN_INV_DIV_N(function) \
{ \
unsigned i; \
mp_ptr a, d, q, inv; \
double t; \
TMP_DECL; \
\
SPEED_RESTRICT_COND (s->size >= 2); \
\
TMP_MARK; \
SPEED_TMP_ALLOC_LIMBS (a, 2*s->size, s->align_xp); \
SPEED_TMP_ALLOC_LIMBS (d, s->size, s->align_yp); \
SPEED_TMP_ALLOC_LIMBS (q, s->size+1, s->align_wp); \
SPEED_TMP_ALLOC_LIMBS (inv, s->size, s->align_wp2); \
\
MPN_COPY (a, s->xp, s->size); \
MPN_COPY (a+s->size, s->xp, s->size); \
\
MPN_COPY (d, s->yp, s->size); \
\
/* normalize the data */ \
d[s->size-1] |= GMP_NUMB_HIGHBIT; \
a[2*s->size-1] = d[s->size-1] - 1; \
\
speed_operand_src (s, a, 2*s->size); \
speed_operand_src (s, d, s->size); \
speed_operand_dst (s, q, s->size+1); \
speed_cache_fill (s); \
\
mpn_invert(inv, d, s->size); \
\
speed_starttime (); \
i = s->reps; \
do \
function(q, a, d, s->size, inv); \
while (--i != 0); \
t = speed_endtime (); \
\
TMP_FREE; \
return t; \
}
#define SPEED_ROUTINE_MPN_INV_DIV(function) \
{ \
unsigned i; \
mp_ptr a, d, q, inv; \
double t; \
TMP_DECL; \
\
SPEED_RESTRICT_COND (s->size >= 2); \
\
TMP_MARK; \
SPEED_TMP_ALLOC_LIMBS (a, 2*s->size, s->align_xp); \
SPEED_TMP_ALLOC_LIMBS (d, s->size, s->align_yp); \
SPEED_TMP_ALLOC_LIMBS (q, s->size+1, s->align_wp); \
SPEED_TMP_ALLOC_LIMBS (inv, s->size, s->align_wp2); \
\
MPN_COPY (a, s->xp, s->size); \
MPN_COPY (a+s->size, s->xp, s->size); \
\
MPN_COPY (d, s->yp, s->size); \
\
/* normalize the data */ \
d[s->size-1] |= GMP_NUMB_HIGHBIT; \
a[2*s->size-1] = d[s->size-1] - 1; \
\
speed_operand_src (s, a, 2*s->size); \
speed_operand_src (s, d, s->size); \
speed_operand_dst (s, q, s->size+1); \
speed_cache_fill (s); \
\
mpn_invert(inv, d, s->size); \
\
speed_starttime (); \
i = s->reps; \
do \
function(q, a, 2*s->size, d, s->size, inv); \
while (--i != 0); \
t = speed_endtime (); \
\
TMP_FREE; \
return t; \
}
#define SPEED_ROUTINE_MPN_INV_DIV_SMALL_Q(function) \
{ \
unsigned i; \
mp_ptr a, d, q, inv; \
double t; \
TMP_DECL; \
\
SPEED_RESTRICT_COND (s->size >= 2); \
\
TMP_MARK; \
SPEED_TMP_ALLOC_LIMBS (a, 3*s->size, s->align_xp); \
SPEED_TMP_ALLOC_LIMBS (d, 2*s->size, s->align_yp); \
SPEED_TMP_ALLOC_LIMBS (q, s->size+1, s->align_wp); \
SPEED_TMP_ALLOC_LIMBS (inv, 2*s->size, s->align_wp2); \
\
MPN_COPY (a, s->xp, s->size); \
MPN_COPY (a+s->size, s->xp, s->size); \
MPN_COPY (a+2*s->size, s->xp, s->size); \
\
MPN_COPY (d, s->yp, s->size); \
MPN_COPY (d+s->size, s->yp, s->size); \
\
/* normalize the data */ \
d[2*s->size-1] |= GMP_NUMB_HIGHBIT; \
a[3*s->size-1] = d[2*s->size-1] - 1; \
\
speed_operand_src (s, a, 3*s->size); \
speed_operand_src (s, d, 2*s->size); \
speed_operand_dst (s, q, s->size+1); \
speed_cache_fill (s); \
\
mpn_invert(inv, d, 2*s->size); \
\
speed_starttime (); \
i = s->reps; \
do \
function(q, a, 3*s->size, d, 2*s->size, inv); \
while (--i != 0); \
t = speed_endtime (); \
\
TMP_FREE; \
return t; \
}
#define SPEED_ROUTINE_MPN_DC_DIVREM_N(function) \
SPEED_ROUTINE_MPN_DC_DIVREM_CALL((*function) (q, a, d, s->size))

View File

@ -24,40 +24,16 @@ MA 02110-1301, USA. */
-t turns on some diagnostic traces, a second -t turns on more traces.
Notes:
The code here isn't a vision of loveliness, mainly because it's subject
to ongoing changes according to new things wanting to be tuned, and
practical requirements of systems tested.
Sometimes running the program twice produces slightly different results.
This is probably because there's so little separating algorithms near
their crossover, and on that basis it should make little or no difference
to the final speed of the relevant routines, but nothing has been done to
check that carefully.
Algorithm:
The thresholds are determined as follows. A crossover may not be a
single size but rather a range where it oscillates between method A or
method B faster. If the threshold is set making B used where A is faster
(or vice versa) that's bad. Badness is the percentage time lost and
total badness is the sum of this over all sizes measured. The threshold
is set to minimize total badness.
The thresholds are determined as follows: two algorithms A and B are
compared over a range of sizes. At each point, we define "badness" to
be the percentage time lost if algorithm B is chosen over algorithm A.
Then total badness is the sum of this over all sizes measured. The
threshold is set to minimize total badness.
Suppose, as sizes increase, method B becomes faster than method A. The
effect of the rule is that, as you look at increasing sizes, isolated
points where B is faster are ignored, but when it's consistently faster,
or faster on balance, then the threshold is set there. The same result
is obtained thinking in the other direction of A becoming faster at
smaller sizes.
In practice the thresholds tend to be chosen to bring on the next
algorithm fairly quickly.
This rule is attractive because it's got a basis in reason and is fairly
easy to implement, but no work has been done to actually compare it in
absolute terms to other possibilities.
In practice the thresholds tend to be chosen to bring on algorithm B
fairly quickly.
Implementation:
@ -97,6 +73,57 @@ MA 02110-1301, USA. */
so each k is probably being brought on a touch early. This isn't likely
to make a difference, and the simpler probing means fewer tests.
Code:
- main : checks for various command line options and calls all()
- all : prints the tuneup message, date and compiler, then calls
each of the individual tuning functions in turn, e.g.
tune_mul()
- tune_blah() : tunes function of type blah, e.g. tune_mul() tunes the
karatsuba and toom cutoffs. It sets up a param struct with the
following parameters:
a) name : the name of the threshold being tuned, e.g.
MUL_TOOM3_THRESHOLD
b) function : the first function being compared (this must be
of the form speed_blah and the function speed_blah will
exist in speed.h and speed.c
c) function2 : the second function being compared (if set to
NULL, this is automatically set to equal function
d) step_factor : the size of the step between sizes,
set to 0.01 by default, i.e. 1% increments
e) function_fudge : multiplier for the speed of function, used
to adjust for overheads, by default set to 1.0
f) stop_since_change is a stop condition. If the threshold
has not changed for this many iterations, then stop. This
is set to 80 iterations by default.
g) stop_factor : this is another stop factor. If method B
becomes faster by at least this factor, then stop. By
default this is set to 1.2, i.e. 20% faster.
h) min_size : the minimum size to start comparing from.
i) min_is_always : if this is set to 1, then if the threshold
just ends up being min_size, then the threshold is actually
set to 0, i.e. algorithm B is always used.
j) max_size : the maximum size to compare up to. By default this
is set to DEFAULT_MAX_SIZE which is 1000 limbs.
h) check_size : if set, will check that the given starting size
is valid for both algorithms and that algorithm B is at least
4% slower than algorithm A at that point.
i) size_extra : this is a bias added to each size when doing
measurements. It is subtracted off after each measurement.
It is basically used for shifting a threshold from the
measured value.
j) data_high : if set to 1, the high limb of xp and yp are set to
be less than s->r, if set to 2, the high limb of xp and yp are
set to be greater than or equal to s->r
k) noprint : if set, the threshold is computed but not printed.
After setting all the appropriate parameters, the function one() is
called. It takes a reference to a parameter, e.g. mul_toom3_threshold
which is defined in a table below. That threshold will have been given
some initial value (usually MP_SIZE_T_MAX) in the table. It also takes
a reference to the param struct.
- one() : does repeated timings over the given range of sizes always setting
the threshold to size+1 for function and size for function2.
*/
#define TUNE_PROGRAM_BUILD 1 /* for gmp-impl.h */
@ -172,7 +199,10 @@ mp_size_t mulhigh_basecase_threshold = MP_SIZE_T_MAX;
mp_size_t mulhigh_dc_threshold = MP_SIZE_T_MAX;
mp_size_t mulhigh_mul_threshold = MP_SIZE_T_MAX;
mp_size_t div_sb_preinv_threshold = MP_SIZE_T_MAX;
mp_size_t div_dc_threshold = MP_SIZE_T_MAX;
mp_size_t dc_div_qr_threshold = MP_SIZE_T_MAX;
mp_size_t dc_divappr_q_n_threshold = MP_SIZE_T_MAX;
mp_size_t inv_div_qr_threshold = MP_SIZE_T_MAX;
mp_size_t inv_divappr_q_n_threshold = MP_SIZE_T_MAX;
mp_size_t powm_threshold = MP_SIZE_T_MAX;
mp_size_t fac_ui_threshold = MP_SIZE_T_MAX;
mp_size_t gcd_accel_threshold = MP_SIZE_T_MAX;
@ -1130,13 +1160,45 @@ tune_sb_preinv (gmp_randstate_t rands)
void
tune_dc (gmp_randstate_t rands)
tune_dc_div (gmp_randstate_t rands)
{
{
static struct param_t param;
param.name = "DIV_DC_THRESHOLD";
param.function = speed_mpn_dc_tdiv_qr;
param.name = "DC_DIV_QR_THRESHOLD";
param.function = speed_mpn_dc_div_qr_n;
param.stop_factor = 1.1;
param.step_factor = 0.02;
one (&div_dc_threshold, rands, &param);
one (&dc_div_qr_threshold, rands, &param);
}
{
static struct param_t param;
param.name = "DC_DIVAPPR_Q_N_THRESHOLD";
param.function = speed_mpn_dc_divappr_q;
param.stop_factor = 1.1;
param.step_factor = 0.02;
one (&dc_divappr_q_n_threshold, rands, &param);
}
{
static struct param_t param;
param.name = "INV_DIV_QR_THRESHOLD";
param.function = speed_mpn_inv_div_qr;
param.min_size = dc_div_qr_threshold*4;
param.stop_factor = 2.0;
param.step_factor = 0.2;
one (&inv_div_qr_threshold, rands, &param);
}
{
static struct param_t param;
param.name = "INV_DIVAPPR_Q_N_THRESHOLD";
param.function = speed_mpn_inv_divappr_q;
param.min_size = dc_divappr_q_n_threshold*4;
param.stop_factor = 2.0;
param.step_factor = 0.2;
one (&inv_divappr_q_n_threshold, rands, &param);
}
}
@ -1866,7 +1928,18 @@ all (gmp_randstate_t rands)
printf("\n");
tune_sb_preinv (rands);
tune_dc (rands);
/* dc_div_qr_n, dc_divappr_q, inv_div_qr, inv_divappr_q */
tune_dc_div (rands);
/* mpn_tdiv_q : balanced
tune_dc_div_q (rands);
tune_inv_div_q (rands);
/* mpn_tdiv_q : small quotient
tune_dc_divappr_q (rands);
tune_inv_divappr_q (rands); */
tune_powm (rands);
tune_fac_ui(rands);
printf("\n");