Attempt to tune some of the division functions.
This commit is contained in:
parent
89ef2d6caf
commit
f444a2bf6c
104
gmp-impl.h
104
gmp-impl.h
@ -584,51 +584,6 @@ void __gmp_tmp_debug_free _PROTO ((const char *, int, int,
|
||||
#define GMP_NAIL_LOWBIT (CNST_LIMB(1) << GMP_NUMB_BITS)
|
||||
#endif
|
||||
|
||||
#if GMP_NAIL_BITS != 0
|
||||
/* Set various *_THRESHOLD values to be used for nails. Thus we avoid using
|
||||
code that has not yet been qualified. */
|
||||
|
||||
#undef DIV_SB_PREINV_THRESHOLD
|
||||
#undef DIV_DC_THRESHOLD
|
||||
#undef POWM_THRESHOLD
|
||||
#define DIV_SB_PREINV_THRESHOLD MP_SIZE_T_MAX
|
||||
#define DIV_DC_THRESHOLD 50
|
||||
#define POWM_THRESHOLD 0
|
||||
|
||||
#undef GCD_ACCEL_THRESHOLD
|
||||
#define GCD_ACCEL_THRESHOLD 3
|
||||
|
||||
#undef DIVREM_1_NORM_THRESHOLD
|
||||
#undef DIVREM_1_UNNORM_THRESHOLD
|
||||
#undef MOD_1_NORM_THRESHOLD
|
||||
#undef MOD_1_UNNORM_THRESHOLD
|
||||
#undef USE_PREINV_DIVREM_1
|
||||
#undef USE_PREINV_MOD_1
|
||||
#undef DIVREM_2_THRESHOLD
|
||||
#undef DIVEXACT_1_THRESHOLD
|
||||
#undef MODEXACT_1_ODD_THRESHOLD
|
||||
#define DIVREM_1_NORM_THRESHOLD MP_SIZE_T_MAX /* no preinv */
|
||||
#define DIVREM_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* no preinv */
|
||||
#define MOD_1_NORM_THRESHOLD MP_SIZE_T_MAX /* no preinv */
|
||||
#define MOD_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* no preinv */
|
||||
#define USE_PREINV_DIVREM_1 0 /* no preinv */
|
||||
#define USE_PREINV_MOD_1 0 /* no preinv */
|
||||
#define DIVREM_2_THRESHOLD MP_SIZE_T_MAX /* no preinv */
|
||||
|
||||
#undef GET_STR_DC_THRESHOLD
|
||||
#undef GET_STR_PRECOMPUTE_THRESHOLD
|
||||
#undef SET_STR_THRESHOLD
|
||||
#define GET_STR_DC_THRESHOLD 22
|
||||
#define GET_STR_PRECOMPUTE_THRESHOLD 42
|
||||
#define SET_STR_THRESHOLD 3259
|
||||
|
||||
/* mpn/generic/mul_fft.c is not nails-capable. */
|
||||
#undef MUL_FFT_THRESHOLD
|
||||
#undef SQR_FFT_THRESHOLD
|
||||
#define MUL_FFT_THRESHOLD MP_SIZE_T_MAX
|
||||
#define SQR_FFT_THRESHOLD MP_SIZE_T_MAX
|
||||
#endif
|
||||
|
||||
/* Swap macros. */
|
||||
|
||||
#define MP_LIMB_T_SWAP(x, y) \
|
||||
@ -1270,6 +1225,8 @@ void mpn_mul_fft_full _PROTO ((mp_ptr op,
|
||||
#define mpn_fft_next_size __MPN(fft_next_size)
|
||||
mp_size_t mpn_fft_next_size _PROTO ((mp_size_t pl, int k)) ATTRIBUTE_CONST;
|
||||
|
||||
#define DC_DIVAPPR_Q_N_ITCH(n) (n*10)
|
||||
|
||||
#define mpn_sb_divrem_mn __MPN(sb_divrem_mn)
|
||||
mp_limb_t mpn_sb_divrem_mn _PROTO ((mp_ptr, mp_ptr, mp_size_t,
|
||||
mp_srcptr, mp_size_t));
|
||||
@ -1790,18 +1747,37 @@ __GMP_DECLSPEC extern const mp_limb_t __gmp_fib_table[];
|
||||
|
||||
#define MPN_FFT_TABLE_SIZE 16
|
||||
|
||||
|
||||
/* mpn_dc_divrem_n(n) calls 2*mul(n/2)+2*div(n/2), thus to be faster than
|
||||
div(n) = 4*div(n/2), we need mul(n/2) to be faster than the classic way,
|
||||
i.e. n/2 >= MUL_KARATSUBA_THRESHOLD
|
||||
|
||||
Measured values are between 2 and 4 times MUL_KARATSUBA_THRESHOLD, so go
|
||||
for 3 as an average. */
|
||||
|
||||
#ifndef DIV_DC_THRESHOLD
|
||||
#define DIV_DC_THRESHOLD (3 * MUL_KARATSUBA_THRESHOLD)
|
||||
#ifndef DC_DIV_QR_THRESHOLD
|
||||
#define DC_DIV_QR_THRESHOLD (3 * MUL_KARATSUBA_THRESHOLD)
|
||||
#endif
|
||||
|
||||
#ifndef DC_DIVAPPR_Q_N_THRESHOLD
|
||||
#define DC_DIVAPPR_Q_N_THRESHOLD (3 * MUL_KARATSUBA_THRESHOLD)
|
||||
#endif
|
||||
|
||||
#ifndef INV_DIV_QR_THRESHOLD
|
||||
#define INV_DIV_QR_THRESHOLD (MUL_FFT_THRESHOLD/3)
|
||||
#endif
|
||||
|
||||
#ifndef INV_DIVAPPR_Q_N_THRESHOLD
|
||||
#define INV_DIVAPPR_Q_N_THRESHOLD (MUL_FFT_THRESHOLD/3)
|
||||
#endif
|
||||
|
||||
#ifndef DC_DIV_Q_THRESHOLD
|
||||
#define DC_DIV_Q_THRESHOLD (3 * MUL_KARATSUBA_THRESHOLD)
|
||||
#endif
|
||||
|
||||
#ifndef INV_DIV_Q_THRESHOLD
|
||||
#define INV_DIV_Q_THRESHOLD (MUL_FFT_THRESHOLD/3)
|
||||
#endif
|
||||
|
||||
#ifndef DC_DIVAPPR_Q_THRESHOLD
|
||||
#define DC_DIVAPPR_Q_THRESHOLD (3 * MUL_TOOM3_THRESHOLD)
|
||||
#endif
|
||||
|
||||
#ifndef INV_DIVAPPR_Q_THRESHOLD
|
||||
#define INV_DIVAPPR_Q_THRESHOLD (MUL_FFT_THRESHOLD/2)
|
||||
#endif
|
||||
|
||||
/* Return non-zero if xp,xsize and yp,ysize overlap.
|
||||
If xp+xsize<=yp there's no overlap, or if yp+ysize<=xp there's no
|
||||
@ -4180,9 +4156,21 @@ extern mp_size_t mulmod_2expm1_threshold;
|
||||
extern mp_size_t div_sb_preinv_threshold;
|
||||
#endif
|
||||
|
||||
#undef DIV_DC_THRESHOLD
|
||||
#define DIV_DC_THRESHOLD div_dc_threshold
|
||||
extern mp_size_t div_dc_threshold;
|
||||
#undef DC_DIV_QR_THRESHOLD
|
||||
#define DC_DIV_QR_THRESHOLD dc_div_qr_threshold
|
||||
extern mp_size_t dc_div_qr_threshold;
|
||||
|
||||
#undef DC_DIVAPPR_Q_N_THRESHOLD
|
||||
#define DC_DIVAPPR_Q_N_THRESHOLD dc_divappr_q_n_threshold
|
||||
extern mp_size_t dc_divappr_q_n_threshold;
|
||||
|
||||
#undef INV_DIV_QR_THRESHOLD
|
||||
#define INV_DIV_QR_THRESHOLD inv_div_qr_threshold
|
||||
extern mp_size_t inv_div_qr_threshold;
|
||||
|
||||
#undef INV_DIVAPPR_Q_N_THRESHOLD
|
||||
#define INV_DIVAPPR_Q_N_THRESHOLD inv_divappr_q_n_threshold
|
||||
extern mp_size_t inv_divappr_q_n_threshold;
|
||||
|
||||
#undef POWM_THRESHOLD
|
||||
#define POWM_THRESHOLD powm_threshold
|
||||
|
@ -28,8 +28,6 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
|
||||
#include "gmp-impl.h"
|
||||
#include "longlong.h"
|
||||
|
||||
#define DC_DIV_QR_THRESHOLD 56
|
||||
|
||||
mp_limb_t
|
||||
mpn_dc_div_qr_n (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n,
|
||||
mp_limb_t dinv, mp_ptr tp)
|
||||
|
@ -28,10 +28,6 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
|
||||
#include "gmp-impl.h"
|
||||
#include "longlong.h"
|
||||
|
||||
#define DC_DIVAPPR_Q_THRESHOLD 141 /*FIXME: tune these */
|
||||
#define DC_DIVAPPR_QR_THRESHOLD 145
|
||||
#define DC_DIV_QR_THRESHOLD 56
|
||||
|
||||
mp_limb_t
|
||||
mpn_dc_divappr_q (mp_ptr qp, mp_ptr np, mp_size_t nn,
|
||||
mp_srcptr dp, mp_size_t dn, mp_limb_t dinv)
|
||||
@ -187,7 +183,7 @@ mpn_dc_divappr_q (mp_ptr qp, mp_ptr np, mp_size_t nn,
|
||||
|
||||
q2p = TMP_SALLOC_LIMBS (qn + 1);
|
||||
|
||||
if (BELOW_THRESHOLD (qn, DC_DIVAPPR_Q_THRESHOLD))
|
||||
if (BELOW_THRESHOLD (qn, DC_DIVAPPR_Q_N_THRESHOLD))
|
||||
{
|
||||
qh = mpn_sb_divappr_q (q2p, np - qn - 2, 2 * (qn + 1),
|
||||
dp - (qn + 1), qn + 1, dinv);
|
||||
|
@ -64,8 +64,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
has that description.
|
||||
*/
|
||||
|
||||
#define DC_DIVAPPR_Q_N_THRESHOLD 56
|
||||
|
||||
mp_limb_t
|
||||
mpn_dc_divappr_q_n (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n,
|
||||
mp_limb_t dip, mp_ptr tp)
|
||||
|
@ -29,6 +29,8 @@ MA 02110-1301, USA. */
|
||||
#include "mpir.h"
|
||||
#include "gmp-impl.h"
|
||||
|
||||
#define DIV_DC_THRESHOLD 16
|
||||
|
||||
/*
|
||||
[1] Fast Recursive Division, by Christoph Burnikel and Joachim Ziegler,
|
||||
Technical report MPI-I-98-1-022, october 1998.
|
||||
|
@ -30,9 +30,6 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
|
||||
#include "gmp-impl.h"
|
||||
#include "longlong.h"
|
||||
|
||||
#define DC_DIV_QR_THRESHOLD 56
|
||||
#define INV_DIV_QR_THRESHOLD 1200
|
||||
|
||||
mp_limb_t
|
||||
mpn_inv_div_qr (mp_ptr qp,
|
||||
mp_ptr np, mp_size_t nn,
|
||||
|
@ -28,12 +28,6 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
|
||||
#include "gmp-impl.h"
|
||||
#include "longlong.h"
|
||||
|
||||
#define DC_DIVAPPR_Q_THRESHOLD 141 /*FIXME: tune these */
|
||||
#define INV_DIVAPPR_Q_THRESHOLD 1000
|
||||
#define INV_DIV_QR_THRESHOLD 1200
|
||||
#define DC_DIVAPPR_QR_THRESHOLD 145
|
||||
#define DC_DIV_QR_THRESHOLD 56
|
||||
|
||||
mp_limb_t
|
||||
mpn_inv_divappr_q (mp_ptr qp, mp_ptr np, mp_size_t nn,
|
||||
mp_srcptr dp, mp_size_t dn, mp_srcptr dinv)
|
||||
@ -190,7 +184,7 @@ mpn_inv_divappr_q (mp_ptr qp, mp_ptr np, mp_size_t nn,
|
||||
qh = mpn_sb_divappr_q (q2p, np - qn - 2, 2 * (qn + 1),
|
||||
dp - (qn + 1), qn + 1, dinv2);
|
||||
}
|
||||
else if (BELOW_THRESHOLD (qn, INV_DIVAPPR_Q_THRESHOLD))
|
||||
else if (BELOW_THRESHOLD (qn, INV_DIVAPPR_Q_N_THRESHOLD))
|
||||
{
|
||||
/* It is tempting to use qp for recursive scratch and put quotient in
|
||||
tp, but the recursive scratch needs one limb too many. */
|
||||
|
@ -81,11 +81,6 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
|
||||
for the code to be correct. */
|
||||
#define FUDGE 5 /* FIXME: tune this */
|
||||
|
||||
#define DC_DIV_Q_THRESHOLD 156 /* FIXME: tune this */
|
||||
#define DC_DIVAPPR_Q_THRESHOLD 156 /* FIXME: tune this */
|
||||
#define INV_DIV_Q_THRESHOLD 1000
|
||||
#define INV_DIVAPPR_Q_THRESHOLD 2000
|
||||
|
||||
void
|
||||
mpn_tdiv_q (mp_ptr qp,
|
||||
mp_srcptr np, mp_size_t nn,
|
||||
|
@ -35,9 +35,6 @@ along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
|
||||
#include "gmp-impl.h"
|
||||
#include "longlong.h"
|
||||
|
||||
#define DC_DIV_QR_THRESHOLD 152 /*FIXME: tune this!*/
|
||||
#define INV_DIV_QR_THRESHOLD 1000
|
||||
|
||||
void
|
||||
mpn_tdiv_qr (mp_ptr qp, mp_ptr rp, mp_size_t qxn,
|
||||
mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn)
|
||||
|
@ -23,7 +23,10 @@
|
||||
#define MULMOD_2EXPM1_THRESHOLD 18
|
||||
|
||||
#define DIV_SB_PREINV_THRESHOLD 0 /* always */
|
||||
#define DIV_DC_THRESHOLD 372
|
||||
#define DC_DIV_QR_THRESHOLD 14
|
||||
#define DC_DIVAPPR_Q_N_THRESHOLD 10
|
||||
#define INV_DIV_QR_THRESHOLD 926
|
||||
#define INV_DIVAPPR_Q_N_THRESHOLD 926
|
||||
#define POWM_THRESHOLD 154
|
||||
#define FAC_UI_THRESHOLD 17010
|
||||
|
||||
|
@ -129,7 +129,8 @@ TUNE_MPN_SRCS = $(TUNE_MPN_SRCS_BASIC) divrem_1.c mod_1.c
|
||||
TUNE_MPN_SRCS_BASIC = dc_divrem_n.c divrem_2.c gcd.c gcdext.c get_str.c \
|
||||
mul_n.c mullow_n.c mulhigh_n.c mul_fft.c mul.c sb_divrem_mn.c tdiv_qr.c toom4_mul_n.c \
|
||||
toom7_mul_n.c mulmod_2expm1.c rootrem.c divrem_euclidean_r_1.c divrem_hensel_qr_1.c \
|
||||
rsh_divrem_hensel_qr_1.c
|
||||
rsh_divrem_hensel_qr_1.c dc_divappr_q.c dc_div_qr.c inv_divappr_q.c \
|
||||
inv_div_qr.c
|
||||
|
||||
$(TUNE_MPN_SRCS_BASIC):
|
||||
for i in $(TUNE_MPN_SRCS_BASIC); do \
|
||||
|
@ -874,6 +874,26 @@ speed_mpn_sb_divrem_m3_inv (struct speed_params *s)
|
||||
{
|
||||
SPEED_ROUTINE_MPN_SB_DIVREM_M3 (mpn_sb_divrem_mn_inv);
|
||||
}
|
||||
double
|
||||
speed_mpn_dc_div_qr_n (struct speed_params *s)
|
||||
{
|
||||
SPEED_ROUTINE_MPN_DC_DIV_N_TSIZE (mpn_dc_div_qr_n, DC_DIVAPPR_Q_N_ITCH(s->size));
|
||||
}
|
||||
double
|
||||
speed_mpn_dc_divappr_q (struct speed_params *s)
|
||||
{
|
||||
SPEED_ROUTINE_MPN_DC_DIV_SMALL_Q (mpn_dc_divappr_q);
|
||||
}
|
||||
double
|
||||
speed_mpn_inv_div_qr (struct speed_params *s)
|
||||
{
|
||||
SPEED_ROUTINE_MPN_INV_DIV (mpn_inv_div_qr);
|
||||
}
|
||||
double
|
||||
speed_mpn_inv_divappr_q (struct speed_params *s)
|
||||
{
|
||||
SPEED_ROUTINE_MPN_INV_DIV_SMALL_Q (mpn_inv_divappr_q);
|
||||
}
|
||||
|
||||
double
|
||||
speed_mpz_mod (struct speed_params *s)
|
||||
|
261
tune/speed.h
261
tune/speed.h
@ -178,6 +178,10 @@ double speed_mpn_dc_divrem_sb _PROTO ((struct speed_params *s));
|
||||
double speed_mpn_dc_divrem_sb_div _PROTO ((struct speed_params *s));
|
||||
double speed_mpn_dc_divrem_sb_inv _PROTO ((struct speed_params *s));
|
||||
double speed_mpn_dc_tdiv_qr _PROTO ((struct speed_params *s));
|
||||
double speed_mpn_dc_div_qr_n _PROTO ((struct speed_params *s));
|
||||
double speed_mpn_dc_divappr_q _PROTO ((struct speed_params *s));
|
||||
double speed_mpn_inv_div_qr _PROTO ((struct speed_params *s));
|
||||
double speed_mpn_inv_divappr_q _PROTO ((struct speed_params *s));
|
||||
double speed_MPN_COPY _PROTO ((struct speed_params *s));
|
||||
double speed_MPN_COPY_DECR _PROTO ((struct speed_params *s));
|
||||
double speed_MPN_COPY_INCR _PROTO ((struct speed_params *s));
|
||||
@ -1499,6 +1503,263 @@ int speed_routine_count_zeros_setup _PROTO ((struct speed_params *s,
|
||||
return t; \
|
||||
}
|
||||
|
||||
#define SPEED_ROUTINE_MPN_DC_DIV_N_TSIZE(function, tsize) \
|
||||
{ \
|
||||
unsigned i; \
|
||||
mp_ptr a, d, q, tmp; \
|
||||
mp_limb_t inv; \
|
||||
double t; \
|
||||
TMP_DECL; \
|
||||
\
|
||||
SPEED_RESTRICT_COND (s->size >= 2); \
|
||||
\
|
||||
TMP_MARK; \
|
||||
SPEED_TMP_ALLOC_LIMBS (a, 2*s->size, s->align_xp); \
|
||||
SPEED_TMP_ALLOC_LIMBS (d, s->size, s->align_yp); \
|
||||
SPEED_TMP_ALLOC_LIMBS (q, s->size+1, s->align_wp); \
|
||||
SPEED_TMP_ALLOC_LIMBS (tmp, tsize, s->align_wp2); \
|
||||
\
|
||||
MPN_COPY (a, s->xp, s->size); \
|
||||
MPN_COPY (a+s->size, s->xp, s->size); \
|
||||
\
|
||||
MPN_COPY (d, s->yp, s->size); \
|
||||
\
|
||||
/* normalize the data */ \
|
||||
d[s->size-1] |= GMP_NUMB_HIGHBIT; \
|
||||
a[2*s->size-1] = d[s->size-1] - 1; \
|
||||
\
|
||||
speed_operand_src (s, a, 2*s->size); \
|
||||
speed_operand_src (s, d, s->size); \
|
||||
speed_operand_dst (s, q, s->size+1); \
|
||||
speed_cache_fill (s); \
|
||||
\
|
||||
invert_1(inv, d[s->size-1], d[s->size-2]); \
|
||||
\
|
||||
speed_starttime (); \
|
||||
i = s->reps; \
|
||||
do \
|
||||
function(q, a, d, s->size, inv, tmp); \
|
||||
while (--i != 0); \
|
||||
t = speed_endtime (); \
|
||||
\
|
||||
TMP_FREE; \
|
||||
return t; \
|
||||
}
|
||||
|
||||
#define SPEED_ROUTINE_MPN_DC_DIV(function) \
|
||||
{ \
|
||||
unsigned i; \
|
||||
mp_ptr a, d, q; \
|
||||
mp_limb_t inv; \
|
||||
double t; \
|
||||
TMP_DECL; \
|
||||
\
|
||||
SPEED_RESTRICT_COND (s->size >= 2); \
|
||||
\
|
||||
TMP_MARK; \
|
||||
SPEED_TMP_ALLOC_LIMBS (a, 2*s->size, s->align_xp); \
|
||||
SPEED_TMP_ALLOC_LIMBS (d, s->size, s->align_yp); \
|
||||
SPEED_TMP_ALLOC_LIMBS (q, s->size+1, s->align_wp); \
|
||||
\
|
||||
MPN_COPY (a, s->xp, s->size); \
|
||||
MPN_COPY (a+s->size, s->xp, s->size); \
|
||||
\
|
||||
MPN_COPY (d, s->yp, s->size); \
|
||||
\
|
||||
/* normalize the data */ \
|
||||
d[s->size-1] |= GMP_NUMB_HIGHBIT; \
|
||||
a[2*s->size-1] = d[s->size-1] - 1; \
|
||||
\
|
||||
speed_operand_src (s, a, 2*s->size); \
|
||||
speed_operand_src (s, d, s->size); \
|
||||
speed_operand_dst (s, q, s->size+1); \
|
||||
speed_cache_fill (s); \
|
||||
\
|
||||
invert_1(inv, d[s->size-1], d[s->size-2]); \
|
||||
\
|
||||
speed_starttime (); \
|
||||
i = s->reps; \
|
||||
do \
|
||||
function(q, a, 2*s->size, d, s->size, inv); \
|
||||
while (--i != 0); \
|
||||
t = speed_endtime (); \
|
||||
\
|
||||
TMP_FREE; \
|
||||
return t; \
|
||||
}
|
||||
|
||||
#define SPEED_ROUTINE_MPN_DC_DIV_SMALL_Q(function) \
|
||||
{ \
|
||||
unsigned i; \
|
||||
mp_ptr a, d, q; \
|
||||
mp_limb_t inv; \
|
||||
double t; \
|
||||
TMP_DECL; \
|
||||
\
|
||||
SPEED_RESTRICT_COND (s->size >= 2); \
|
||||
\
|
||||
TMP_MARK; \
|
||||
SPEED_TMP_ALLOC_LIMBS (a, 3*s->size, s->align_xp); \
|
||||
SPEED_TMP_ALLOC_LIMBS (d, 2*s->size, s->align_yp); \
|
||||
SPEED_TMP_ALLOC_LIMBS (q, s->size+1, s->align_wp); \
|
||||
\
|
||||
MPN_COPY (a, s->xp, s->size); \
|
||||
MPN_COPY (a+s->size, s->xp, s->size); \
|
||||
MPN_COPY (a+2*s->size, s->xp, s->size); \
|
||||
\
|
||||
MPN_COPY (d, s->yp, s->size); \
|
||||
MPN_COPY (d+s->size, s->yp, s->size); \
|
||||
\
|
||||
/* normalize the data */ \
|
||||
d[2*s->size-1] |= GMP_NUMB_HIGHBIT; \
|
||||
a[3*s->size-1] = d[2*s->size-1] - 1; \
|
||||
\
|
||||
speed_operand_src (s, a, 3*s->size); \
|
||||
speed_operand_src (s, d, 2*s->size); \
|
||||
speed_operand_dst (s, q, s->size+1); \
|
||||
speed_cache_fill (s); \
|
||||
\
|
||||
invert_1(inv, d[2*s->size-1], d[2*s->size-2]); \
|
||||
\
|
||||
speed_starttime (); \
|
||||
i = s->reps; \
|
||||
do \
|
||||
function(q, a, 3*s->size, d, 2*s->size, inv); \
|
||||
while (--i != 0); \
|
||||
t = speed_endtime (); \
|
||||
\
|
||||
TMP_FREE; \
|
||||
return t; \
|
||||
}
|
||||
|
||||
#define SPEED_ROUTINE_MPN_INV_DIV_N(function) \
|
||||
{ \
|
||||
unsigned i; \
|
||||
mp_ptr a, d, q, inv; \
|
||||
double t; \
|
||||
TMP_DECL; \
|
||||
\
|
||||
SPEED_RESTRICT_COND (s->size >= 2); \
|
||||
\
|
||||
TMP_MARK; \
|
||||
SPEED_TMP_ALLOC_LIMBS (a, 2*s->size, s->align_xp); \
|
||||
SPEED_TMP_ALLOC_LIMBS (d, s->size, s->align_yp); \
|
||||
SPEED_TMP_ALLOC_LIMBS (q, s->size+1, s->align_wp); \
|
||||
SPEED_TMP_ALLOC_LIMBS (inv, s->size, s->align_wp2); \
|
||||
\
|
||||
MPN_COPY (a, s->xp, s->size); \
|
||||
MPN_COPY (a+s->size, s->xp, s->size); \
|
||||
\
|
||||
MPN_COPY (d, s->yp, s->size); \
|
||||
\
|
||||
/* normalize the data */ \
|
||||
d[s->size-1] |= GMP_NUMB_HIGHBIT; \
|
||||
a[2*s->size-1] = d[s->size-1] - 1; \
|
||||
\
|
||||
speed_operand_src (s, a, 2*s->size); \
|
||||
speed_operand_src (s, d, s->size); \
|
||||
speed_operand_dst (s, q, s->size+1); \
|
||||
speed_cache_fill (s); \
|
||||
\
|
||||
mpn_invert(inv, d, s->size); \
|
||||
\
|
||||
speed_starttime (); \
|
||||
i = s->reps; \
|
||||
do \
|
||||
function(q, a, d, s->size, inv); \
|
||||
while (--i != 0); \
|
||||
t = speed_endtime (); \
|
||||
\
|
||||
TMP_FREE; \
|
||||
return t; \
|
||||
}
|
||||
|
||||
#define SPEED_ROUTINE_MPN_INV_DIV(function) \
|
||||
{ \
|
||||
unsigned i; \
|
||||
mp_ptr a, d, q, inv; \
|
||||
double t; \
|
||||
TMP_DECL; \
|
||||
\
|
||||
SPEED_RESTRICT_COND (s->size >= 2); \
|
||||
\
|
||||
TMP_MARK; \
|
||||
SPEED_TMP_ALLOC_LIMBS (a, 2*s->size, s->align_xp); \
|
||||
SPEED_TMP_ALLOC_LIMBS (d, s->size, s->align_yp); \
|
||||
SPEED_TMP_ALLOC_LIMBS (q, s->size+1, s->align_wp); \
|
||||
SPEED_TMP_ALLOC_LIMBS (inv, s->size, s->align_wp2); \
|
||||
\
|
||||
MPN_COPY (a, s->xp, s->size); \
|
||||
MPN_COPY (a+s->size, s->xp, s->size); \
|
||||
\
|
||||
MPN_COPY (d, s->yp, s->size); \
|
||||
\
|
||||
/* normalize the data */ \
|
||||
d[s->size-1] |= GMP_NUMB_HIGHBIT; \
|
||||
a[2*s->size-1] = d[s->size-1] - 1; \
|
||||
\
|
||||
speed_operand_src (s, a, 2*s->size); \
|
||||
speed_operand_src (s, d, s->size); \
|
||||
speed_operand_dst (s, q, s->size+1); \
|
||||
speed_cache_fill (s); \
|
||||
\
|
||||
mpn_invert(inv, d, s->size); \
|
||||
\
|
||||
speed_starttime (); \
|
||||
i = s->reps; \
|
||||
do \
|
||||
function(q, a, 2*s->size, d, s->size, inv); \
|
||||
while (--i != 0); \
|
||||
t = speed_endtime (); \
|
||||
\
|
||||
TMP_FREE; \
|
||||
return t; \
|
||||
}
|
||||
|
||||
#define SPEED_ROUTINE_MPN_INV_DIV_SMALL_Q(function) \
|
||||
{ \
|
||||
unsigned i; \
|
||||
mp_ptr a, d, q, inv; \
|
||||
double t; \
|
||||
TMP_DECL; \
|
||||
\
|
||||
SPEED_RESTRICT_COND (s->size >= 2); \
|
||||
\
|
||||
TMP_MARK; \
|
||||
SPEED_TMP_ALLOC_LIMBS (a, 3*s->size, s->align_xp); \
|
||||
SPEED_TMP_ALLOC_LIMBS (d, 2*s->size, s->align_yp); \
|
||||
SPEED_TMP_ALLOC_LIMBS (q, s->size+1, s->align_wp); \
|
||||
SPEED_TMP_ALLOC_LIMBS (inv, 2*s->size, s->align_wp2); \
|
||||
\
|
||||
MPN_COPY (a, s->xp, s->size); \
|
||||
MPN_COPY (a+s->size, s->xp, s->size); \
|
||||
MPN_COPY (a+2*s->size, s->xp, s->size); \
|
||||
\
|
||||
MPN_COPY (d, s->yp, s->size); \
|
||||
MPN_COPY (d+s->size, s->yp, s->size); \
|
||||
\
|
||||
/* normalize the data */ \
|
||||
d[2*s->size-1] |= GMP_NUMB_HIGHBIT; \
|
||||
a[3*s->size-1] = d[2*s->size-1] - 1; \
|
||||
\
|
||||
speed_operand_src (s, a, 3*s->size); \
|
||||
speed_operand_src (s, d, 2*s->size); \
|
||||
speed_operand_dst (s, q, s->size+1); \
|
||||
speed_cache_fill (s); \
|
||||
\
|
||||
mpn_invert(inv, d, 2*s->size); \
|
||||
\
|
||||
speed_starttime (); \
|
||||
i = s->reps; \
|
||||
do \
|
||||
function(q, a, 3*s->size, d, 2*s->size, inv); \
|
||||
while (--i != 0); \
|
||||
t = speed_endtime (); \
|
||||
\
|
||||
TMP_FREE; \
|
||||
return t; \
|
||||
}
|
||||
|
||||
#define SPEED_ROUTINE_MPN_DC_DIVREM_N(function) \
|
||||
SPEED_ROUTINE_MPN_DC_DIVREM_CALL((*function) (q, a, d, s->size))
|
||||
|
||||
|
147
tune/tuneup.c
147
tune/tuneup.c
@ -24,40 +24,16 @@ MA 02110-1301, USA. */
|
||||
|
||||
-t turns on some diagnostic traces, a second -t turns on more traces.
|
||||
|
||||
Notes:
|
||||
|
||||
The code here isn't a vision of loveliness, mainly because it's subject
|
||||
to ongoing changes according to new things wanting to be tuned, and
|
||||
practical requirements of systems tested.
|
||||
|
||||
Sometimes running the program twice produces slightly different results.
|
||||
This is probably because there's so little separating algorithms near
|
||||
their crossover, and on that basis it should make little or no difference
|
||||
to the final speed of the relevant routines, but nothing has been done to
|
||||
check that carefully.
|
||||
|
||||
Algorithm:
|
||||
|
||||
The thresholds are determined as follows. A crossover may not be a
|
||||
single size but rather a range where it oscillates between method A or
|
||||
method B faster. If the threshold is set making B used where A is faster
|
||||
(or vice versa) that's bad. Badness is the percentage time lost and
|
||||
total badness is the sum of this over all sizes measured. The threshold
|
||||
is set to minimize total badness.
|
||||
The thresholds are determined as follows: two algorithms A and B are
|
||||
compared over a range of sizes. At each point, we define "badness" to
|
||||
be the percentage time lost if algorithm B is chosen over algorithm A.
|
||||
Then total badness is the sum of this over all sizes measured. The
|
||||
threshold is set to minimize total badness.
|
||||
|
||||
Suppose, as sizes increase, method B becomes faster than method A. The
|
||||
effect of the rule is that, as you look at increasing sizes, isolated
|
||||
points where B is faster are ignored, but when it's consistently faster,
|
||||
or faster on balance, then the threshold is set there. The same result
|
||||
is obtained thinking in the other direction of A becoming faster at
|
||||
smaller sizes.
|
||||
|
||||
In practice the thresholds tend to be chosen to bring on the next
|
||||
algorithm fairly quickly.
|
||||
|
||||
This rule is attractive because it's got a basis in reason and is fairly
|
||||
easy to implement, but no work has been done to actually compare it in
|
||||
absolute terms to other possibilities.
|
||||
In practice the thresholds tend to be chosen to bring on algorithm B
|
||||
fairly quickly.
|
||||
|
||||
Implementation:
|
||||
|
||||
@ -97,6 +73,57 @@ MA 02110-1301, USA. */
|
||||
so each k is probably being brought on a touch early. This isn't likely
|
||||
to make a difference, and the simpler probing means fewer tests.
|
||||
|
||||
Code:
|
||||
- main : checks for various command line options and calls all()
|
||||
- all : prints the tuneup message, date and compiler, then calls
|
||||
each of the individual tuning functions in turn, e.g.
|
||||
tune_mul()
|
||||
- tune_blah() : tunes function of type blah, e.g. tune_mul() tunes the
|
||||
karatsuba and toom cutoffs. It sets up a param struct with the
|
||||
following parameters:
|
||||
a) name : the name of the threshold being tuned, e.g.
|
||||
MUL_TOOM3_THRESHOLD
|
||||
b) function : the first function being compared (this must be
|
||||
of the form speed_blah and the function speed_blah will
|
||||
exist in speed.h and speed.c
|
||||
c) function2 : the second function being compared (if set to
|
||||
NULL, this is automatically set to equal function
|
||||
d) step_factor : the size of the step between sizes,
|
||||
set to 0.01 by default, i.e. 1% increments
|
||||
e) function_fudge : multiplier for the speed of function, used
|
||||
to adjust for overheads, by default set to 1.0
|
||||
f) stop_since_change is a stop condition. If the threshold
|
||||
has not changed for this many iterations, then stop. This
|
||||
is set to 80 iterations by default.
|
||||
g) stop_factor : this is another stop factor. If method B
|
||||
becomes faster by at least this factor, then stop. By
|
||||
default this is set to 1.2, i.e. 20% faster.
|
||||
h) min_size : the minimum size to start comparing from.
|
||||
i) min_is_always : if this is set to 1, then if the threshold
|
||||
just ends up being min_size, then the threshold is actually
|
||||
set to 0, i.e. algorithm B is always used.
|
||||
j) max_size : the maximum size to compare up to. By default this
|
||||
is set to DEFAULT_MAX_SIZE which is 1000 limbs.
|
||||
h) check_size : if set, will check that the given starting size
|
||||
is valid for both algorithms and that algorithm B is at least
|
||||
4% slower than algorithm A at that point.
|
||||
i) size_extra : this is a bias added to each size when doing
|
||||
measurements. It is subtracted off after each measurement.
|
||||
It is basically used for shifting a threshold from the
|
||||
measured value.
|
||||
j) data_high : if set to 1, the high limb of xp and yp are set to
|
||||
be less than s->r, if set to 2, the high limb of xp and yp are
|
||||
set to be greater than or equal to s->r
|
||||
k) noprint : if set, the threshold is computed but not printed.
|
||||
|
||||
After setting all the appropriate parameters, the function one() is
|
||||
called. It takes a reference to a parameter, e.g. mul_toom3_threshold
|
||||
which is defined in a table below. That threshold will have been given
|
||||
some initial value (usually MP_SIZE_T_MAX) in the table. It also takes
|
||||
a reference to the param struct.
|
||||
- one() : does repeated timings over the given range of sizes always setting
|
||||
the threshold to size+1 for function and size for function2.
|
||||
|
||||
*/
|
||||
|
||||
#define TUNE_PROGRAM_BUILD 1 /* for gmp-impl.h */
|
||||
@ -172,7 +199,10 @@ mp_size_t mulhigh_basecase_threshold = MP_SIZE_T_MAX;
|
||||
mp_size_t mulhigh_dc_threshold = MP_SIZE_T_MAX;
|
||||
mp_size_t mulhigh_mul_threshold = MP_SIZE_T_MAX;
|
||||
mp_size_t div_sb_preinv_threshold = MP_SIZE_T_MAX;
|
||||
mp_size_t div_dc_threshold = MP_SIZE_T_MAX;
|
||||
mp_size_t dc_div_qr_threshold = MP_SIZE_T_MAX;
|
||||
mp_size_t dc_divappr_q_n_threshold = MP_SIZE_T_MAX;
|
||||
mp_size_t inv_div_qr_threshold = MP_SIZE_T_MAX;
|
||||
mp_size_t inv_divappr_q_n_threshold = MP_SIZE_T_MAX;
|
||||
mp_size_t powm_threshold = MP_SIZE_T_MAX;
|
||||
mp_size_t fac_ui_threshold = MP_SIZE_T_MAX;
|
||||
mp_size_t gcd_accel_threshold = MP_SIZE_T_MAX;
|
||||
@ -1130,13 +1160,45 @@ tune_sb_preinv (gmp_randstate_t rands)
|
||||
|
||||
|
||||
void
|
||||
tune_dc (gmp_randstate_t rands)
|
||||
tune_dc_div (gmp_randstate_t rands)
|
||||
{
|
||||
{
|
||||
static struct param_t param;
|
||||
param.name = "DIV_DC_THRESHOLD";
|
||||
param.function = speed_mpn_dc_tdiv_qr;
|
||||
param.name = "DC_DIV_QR_THRESHOLD";
|
||||
param.function = speed_mpn_dc_div_qr_n;
|
||||
param.stop_factor = 1.1;
|
||||
param.step_factor = 0.02;
|
||||
one (&div_dc_threshold, rands, ¶m);
|
||||
one (&dc_div_qr_threshold, rands, ¶m);
|
||||
}
|
||||
|
||||
{
|
||||
static struct param_t param;
|
||||
param.name = "DC_DIVAPPR_Q_N_THRESHOLD";
|
||||
param.function = speed_mpn_dc_divappr_q;
|
||||
param.stop_factor = 1.1;
|
||||
param.step_factor = 0.02;
|
||||
one (&dc_divappr_q_n_threshold, rands, ¶m);
|
||||
}
|
||||
|
||||
{
|
||||
static struct param_t param;
|
||||
param.name = "INV_DIV_QR_THRESHOLD";
|
||||
param.function = speed_mpn_inv_div_qr;
|
||||
param.min_size = dc_div_qr_threshold*4;
|
||||
param.stop_factor = 2.0;
|
||||
param.step_factor = 0.2;
|
||||
one (&inv_div_qr_threshold, rands, ¶m);
|
||||
}
|
||||
|
||||
{
|
||||
static struct param_t param;
|
||||
param.name = "INV_DIVAPPR_Q_N_THRESHOLD";
|
||||
param.function = speed_mpn_inv_divappr_q;
|
||||
param.min_size = dc_divappr_q_n_threshold*4;
|
||||
param.stop_factor = 2.0;
|
||||
param.step_factor = 0.2;
|
||||
one (&inv_divappr_q_n_threshold, rands, ¶m);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -1866,7 +1928,18 @@ all (gmp_randstate_t rands)
|
||||
printf("\n");
|
||||
|
||||
tune_sb_preinv (rands);
|
||||
tune_dc (rands);
|
||||
|
||||
/* dc_div_qr_n, dc_divappr_q, inv_div_qr, inv_divappr_q */
|
||||
tune_dc_div (rands);
|
||||
|
||||
/* mpn_tdiv_q : balanced
|
||||
tune_dc_div_q (rands);
|
||||
tune_inv_div_q (rands);
|
||||
|
||||
/* mpn_tdiv_q : small quotient
|
||||
tune_dc_divappr_q (rands);
|
||||
tune_inv_divappr_q (rands); */
|
||||
|
||||
tune_powm (rands);
|
||||
tune_fac_ui(rands);
|
||||
printf("\n");
|
||||
|
Loading…
Reference in New Issue
Block a user