/* Speed measuring program. Copyright 1999, 2000, 2001, 2002, 2003, 2005, 2006 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ /* Usage message is in the code below, run with no arguments to print it. See README for interesting applications. To add a new routine foo(), create a speed_foo() function in the style of the existing ones and add an entry in the routine[] array. Put FLAG_R if speed_foo() wants an "r" parameter. The routines don't have help messages or descriptions, but most have suggestive names. See the source code for full details. */ #include "config.h" #include #include #include #include #if HAVE_UNISTD_H || _MSC_VER #include /* for getpid, R_OK */ #endif #if TIME_WITH_SYS_TIME # include /* for struct timeval */ # include #else # if HAVE_SYS_TIME_H # include # else # include # endif #endif #if HAVE_SYS_RESOURCE_H #include /* for getrusage() */ #endif #include "mpir.h" #include "gmp-impl.h" #include "longlong.h" /* for the benefit of speed-many.c */ #include "tests.h" #include "speed.h" #if 1 && defined( _MSC_VER) #include "win_timing.h" #endif #if !HAVE_DECL_OPTARG extern char *optarg; extern int optind, opterr; #endif #if !HAVE_STRTOUL #define strtoul(p,e,b) (unsigned long) strtol(p,e,b) #endif #ifdef SPEED_EXTRA_PROTOS SPEED_EXTRA_PROTOS #endif #ifdef SPEED_EXTRA_PROTOS2 SPEED_EXTRA_PROTOS2 #endif #define MPN_FILL(ptr, size, n) \ do { \ mp_size_t __i; \ ASSERT ((size) >= 0); \ for (__i = 0; __i < (size); __i++) \ (ptr)[__i] = (n); \ } while (0) #if BITS_PER_MP_LIMB == 32 #define GMP_NUMB_0xAA (CNST_LIMB(0xAAAAAAAA) & GMP_NUMB_MASK) #endif #if BITS_PER_MP_LIMB == 64 #define GMP_NUMB_0xAA (CNST_LIMB(0xAAAAAAAAAAAAAAAA) & GMP_NUMB_MASK) #endif #define CMP_ABSOLUTE 1 #define CMP_RATIO 2 #define CMP_DIFFERENCE 3 #define CMP_DIFFPREV 4 int option_cmp = CMP_ABSOLUTE; int option_cmp_pos = 0; #define UNIT_SECONDS 1 #define UNIT_CYCLES 2 #define UNIT_CYCLESPERLIMB 3 int option_unit = UNIT_SECONDS; #define DATA_RANDOM 1 #define DATA_RANDOM2 2 #define DATA_ZEROS 3 #define DATA_AAS 4 #define DATA_FFS 5 #define DATA_2FD 6 int option_data = DATA_RANDOM; int option_square = 0; double option_factor = 0.0; mp_size_t option_step = 1; int option_gnuplot = 0; char *option_gnuplot_basename; struct size_array_t { mp_size_t start, end, inc; } *size_array = NULL; mp_size_t size_num = 0; mp_size_t size_allocnum = 0; int option_resource_usage = 0; long option_seed = 123456789; #define XCOLMAX 16 int xcoln,xcol[XCOLMAX]; struct speed_params sp; #define COLUMN_WIDTH 13 /* for the free-form output */ #define FLAG_R (1<<0) /* require ".r" */ #define FLAG_R_OPTIONAL (1<<1) /* optional ".r" */ #define FLAG_RSIZE (1<<2) #define FLAG_NODATA (1<<3) /* don't alloc xp, yp */ const struct routine_t { /* constants */ const char *name; speed_function_t fun; int flag; } routine[] = { { "noop", speed_noop }, { "noop_wxs", speed_noop_wxs }, { "noop_wxys", speed_noop_wxys }, { "mpn_add_n", speed_mpn_add_n, FLAG_R_OPTIONAL }, { "mpn_sub_n", speed_mpn_sub_n, FLAG_R_OPTIONAL }, #if HAVE_NATIVE_mpn_sumdiff_n { "mpn_sumdiff_n", speed_mpn_sumdiff_n, FLAG_R_OPTIONAL }, #endif #if HAVE_NATIVE_mpn_addadd_n { "mpn_addadd_n", speed_mpn_addadd_n, }, #endif #if HAVE_NATIVE_mpn_subadd_n { "mpn_subadd_n", speed_mpn_subadd_n, }, #endif #if HAVE_NATIVE_mpn_addsub_n { "mpn_addsub_n", speed_mpn_addsub_n, }, #endif #if HAVE_NATIVE_mpn_karaadd { "mpn_karaadd", speed_mpn_karaadd, }, #endif #if HAVE_NATIVE_mpn_karasub { "mpn_karasub", speed_mpn_karasub, }, #endif { "mpn_addmul_1", speed_mpn_addmul_1, FLAG_R }, { "mpn_submul_1", speed_mpn_submul_1, FLAG_R }, #if HAVE_NATIVE_mpn_addmul_2 { "mpn_addmul_2", speed_mpn_addmul_2, FLAG_R_OPTIONAL }, #endif #if HAVE_NATIVE_mpn_addmul_3 { "mpn_addmul_3", speed_mpn_addmul_3, FLAG_R_OPTIONAL }, #endif #if HAVE_NATIVE_mpn_addmul_4 { "mpn_addmul_4", speed_mpn_addmul_4, FLAG_R_OPTIONAL }, #endif #if HAVE_NATIVE_mpn_addmul_5 { "mpn_addmul_5", speed_mpn_addmul_5, FLAG_R_OPTIONAL }, #endif #if HAVE_NATIVE_mpn_addmul_6 { "mpn_addmul_6", speed_mpn_addmul_6, FLAG_R_OPTIONAL }, #endif #if HAVE_NATIVE_mpn_addmul_7 { "mpn_addmul_7", speed_mpn_addmul_7, FLAG_R_OPTIONAL }, #endif #if HAVE_NATIVE_mpn_addmul_8 { "mpn_addmul_8", speed_mpn_addmul_8, FLAG_R_OPTIONAL }, #endif { "mpn_mul_1", speed_mpn_mul_1, FLAG_R }, { "mpn_mul_1_inplace", speed_mpn_mul_1_inplace, FLAG_R }, #if HAVE_NATIVE_mpn_mul_2 { "mpn_mul_2", speed_mpn_mul_2, FLAG_R_OPTIONAL }, #endif { "mpn_divrem_euclidean_qr_1", speed_mpn_divrem_euclidean_qr_1, FLAG_R }, { "mpn_divrem_euclidean_qr_2", speed_mpn_divrem_euclidean_qr_2, }, { "mpn_divrem_euclidean_r_1", speed_mpn_divrem_euclidean_r_1, FLAG_R }, { "mpn_divrem_hensel_qr_1", speed_mpn_divrem_hensel_qr_1, FLAG_R}, { "mpn_divrem_hensel_qr_1_1", speed_mpn_divrem_hensel_qr_1_1, FLAG_R}, { "mpn_divrem_hensel_qr_1_2", speed_mpn_divrem_hensel_qr_1_2, FLAG_R}, { "mpn_divrem_hensel_r_1", speed_mpn_divrem_hensel_r_1, FLAG_R }, { "mpn_rsh_divrem_hensel_qr_1", speed_mpn_rsh_divrem_hensel_qr_1, FLAG_R}, { "mpn_rsh_divrem_hensel_qr_1_1", speed_mpn_rsh_divrem_hensel_qr_1_1, FLAG_R}, { "mpn_rsh_divrem_hensel_qr_1_2", speed_mpn_rsh_divrem_hensel_qr_1_2, FLAG_R}, { "mpn_divrem_hensel_rsh_qr_1", speed_mpn_divrem_hensel_rsh_qr_1, FLAG_R}, { "mpn_divrem_1", speed_mpn_divrem_1, FLAG_R }, { "mpn_divrem_1f", speed_mpn_divrem_1f, FLAG_R }, #if HAVE_NATIVE_mpn_divrem_1c { "mpn_divrem_1c", speed_mpn_divrem_1c, FLAG_R }, { "mpn_divrem_1cf", speed_mpn_divrem_1cf,FLAG_R }, #endif { "mpn_mod_1", speed_mpn_mod_1, FLAG_R }, { "mpn_mod_1_1", speed_mpn_mod_1_1,}, { "mpn_mod_1_2", speed_mpn_mod_1_2,}, { "mpn_mod_1_3", speed_mpn_mod_1_3,}, { "mpn_mod_1_k", speed_mpn_mod_1_k, FLAG_R }, #if HAVE_NATIVE_mpn_mod_1c { "mpn_mod_1c", speed_mpn_mod_1c, FLAG_R }, #endif { "mpn_preinv_divrem_1", speed_mpn_preinv_divrem_1, FLAG_R }, { "mpn_preinv_divrem_1f", speed_mpn_preinv_divrem_1f, FLAG_R }, { "mpn_preinv_mod_1", speed_mpn_preinv_mod_1, FLAG_R }, { "mpn_add_err1_n", speed_mpn_add_err1_n, }, { "mpn_sub_err1_n", speed_mpn_sub_err1_n, }, { "mpn_inv_divappr_q", speed_mpn_inv_divappr_q, }, { "mpn_inv_div_qr", speed_mpn_inv_div_qr, }, { "mpn_dc_divappr_q", speed_mpn_dc_divappr_q, }, { "mpn_dc_div_qr_n", speed_mpn_dc_div_qr_n, }, { "mpn_divrem_1_inv", speed_mpn_divrem_1_inv, FLAG_R }, { "mpn_divrem_1f_div", speed_mpn_divrem_1f_div, FLAG_R }, { "mpn_divrem_1f_inv", speed_mpn_divrem_1f_inv, FLAG_R }, { "mpn_mod_1_div", speed_mpn_mod_1_div, FLAG_R }, { "mpn_mod_1_inv", speed_mpn_mod_1_inv, FLAG_R }, { "mpn_divrem_2", speed_mpn_divrem_2, }, { "mpn_divrem_2_div", speed_mpn_divrem_2_div, }, { "mpn_divrem_2_inv", speed_mpn_divrem_2_inv, }, { "mpn_divexact_1", speed_mpn_divexact_1, FLAG_R }, { "mpn_divexact_by3", speed_mpn_divexact_by3 }, { "mpn_divexact_byff", speed_mpn_divexact_byff }, { "mpn_divexact_byfobm1", speed_mpn_divexact_byfobm1, FLAG_R }, #if HAVE_NATIVE_mpn_modexact_1_odd { "mpn_modexact_1_odd", speed_mpn_modexact_1_odd, FLAG_R }, #endif { "mpn_modexact_1c_odd", speed_mpn_modexact_1c_odd, FLAG_R }, #if GMP_NUMB_BITS % 4 == 0 { "mpn_mod_34lsub1", speed_mpn_mod_34lsub1 }, #endif { "mpn_dc_tdiv_qr", speed_mpn_dc_tdiv_qr }, { "mpn_lshift", speed_mpn_lshift, FLAG_R }, { "mpn_rshift", speed_mpn_rshift, FLAG_R }, #if HAVE_NATIVE_mpn_lshiftc { "mpn_lshiftc", speed_mpn_lshiftc, FLAG_R }, #endif { "mpn_lshift1", speed_mpn_lshift1, }, { "mpn_rshift1", speed_mpn_rshift1, }, { "mpn_double", speed_mpn_double, }, { "mpn_half", speed_mpn_half, }, { "mpn_lshift2", speed_mpn_lshift2, }, { "mpn_rshift2", speed_mpn_rshift2, }, { "mpn_and_n", speed_mpn_and_n, FLAG_R_OPTIONAL }, { "mpn_andn_n", speed_mpn_andn_n, FLAG_R_OPTIONAL }, { "mpn_nand_n", speed_mpn_nand_n, FLAG_R_OPTIONAL }, { "mpn_ior_n", speed_mpn_ior_n, FLAG_R_OPTIONAL }, { "mpn_iorn_n", speed_mpn_iorn_n, FLAG_R_OPTIONAL }, { "mpn_nior_n", speed_mpn_nior_n, FLAG_R_OPTIONAL }, { "mpn_xor_n", speed_mpn_xor_n, FLAG_R_OPTIONAL }, { "mpn_xnor_n", speed_mpn_xnor_n, FLAG_R_OPTIONAL }, { "mpn_com_n", speed_mpn_com_n }, { "mpn_not", speed_mpn_not }, { "mpn_popcount", speed_mpn_popcount }, { "mpn_hamdist", speed_mpn_hamdist }, { "mpn_gcd_1", speed_mpn_gcd_1, FLAG_R_OPTIONAL }, { "mpn_gcd_1N", speed_mpn_gcd_1N, FLAG_R_OPTIONAL }, { "mpn_hgcd", speed_mpn_hgcd }, { "mpn_gcd", speed_mpn_gcd }, { "mpn_gcdext", speed_mpn_gcdext }, #if 0 { "mpn_gcdext_lehmer", speed_mpn_gcdext_lehmer }, #endif { "mpz_jacobi", speed_mpz_jacobi }, { "mpn_jacobi_base", speed_mpn_jacobi_base }, { "mpn_jacobi_base_1", speed_mpn_jacobi_base_1 }, { "mpn_jacobi_base_2", speed_mpn_jacobi_base_2 }, { "mpn_jacobi_base_3", speed_mpn_jacobi_base_3 }, { "mpn_mul_basecase", speed_mpn_mul_basecase, FLAG_R_OPTIONAL }, { "mpn_sqr_basecase", speed_mpn_sqr_basecase }, #if HAVE_NATIVE_mpn_sqr_diagonal { "mpn_sqr_diagonal", speed_mpn_sqr_diagonal }, #endif { "mpn_mul_n", speed_mpn_mul_n }, { "mpn_sqr", speed_mpn_sqr }, { "mpn_kara_mul_n", speed_mpn_kara_mul_n }, { "mpn_kara_sqr_n", speed_mpn_kara_sqr_n }, { "mpn_toom3_mul_n", speed_mpn_toom3_mul_n }, { "mpn_toom4_mul_n", speed_mpn_toom4_mul_n }, { "mpn_toom8h_mul", speed_mpn_toom8h_mul }, { "mpn_toom3_sqr_n", speed_mpn_toom3_sqr_n }, { "mpn_toom4_sqr_n", speed_mpn_toom4_sqr_n }, { "mpn_mul_fft_full", speed_mpn_mul_fft_full }, { "mpn_mul_fft_full_sqr", speed_mpn_mul_fft_full_sqr }, { "mpn_mul_fft", speed_mpn_mul_fft, FLAG_R_OPTIONAL }, { "mpn_mul_fft_sqr", speed_mpn_mul_fft_sqr, FLAG_R_OPTIONAL }, { "mpn_mullow_n", speed_mpn_mullow_n }, { "mpn_mullow_n_basecase", speed_mpn_mullow_n_basecase}, { "mpn_mulhigh_n", speed_mpn_mulhigh_n }, { "mpn_mulmod_2expm1", speed_mpn_mulmod_2expm1 }, { "mpn_mulmod_2expp1", speed_mpn_mulmod_2expp1 }, { "mpn_get_str", speed_mpn_get_str, FLAG_R_OPTIONAL }, { "mpn_set_str", speed_mpn_set_str, FLAG_R_OPTIONAL }, { "mpn_set_str_basecase", speed_mpn_set_str_basecase, FLAG_R_OPTIONAL }, { "mpn_set_str_subquad", speed_mpn_set_str_subquad, FLAG_R_OPTIONAL }, { "mpn_sqrtrem", speed_mpn_sqrtrem }, { "mpn_rootrem", speed_mpn_rootrem, FLAG_R }, { "mpn_fib2_ui", speed_mpn_fib2_ui, FLAG_NODATA }, { "mpz_fib_ui", speed_mpz_fib_ui, FLAG_NODATA }, { "mpz_fib2_ui", speed_mpz_fib2_ui, FLAG_NODATA }, { "mpz_lucnum_ui", speed_mpz_lucnum_ui, FLAG_NODATA }, { "mpz_lucnum2_ui", speed_mpz_lucnum2_ui, FLAG_NODATA }, { "mpz_add", speed_mpz_add }, { "mpz_bin_uiui", speed_mpz_bin_uiui, FLAG_NODATA | FLAG_R_OPTIONAL }, { "mpz_fac_ui", speed_mpz_fac_ui, FLAG_NODATA }, { "mpz_fac_ui_small", speed_mpz_fac_ui_small, FLAG_NODATA }, { "mpz_fac_ui_large", speed_mpz_fac_ui_large, FLAG_NODATA }, { "mpz_powm", speed_mpz_powm }, { "mpz_powm_mod", speed_mpz_powm_mod }, { "mpz_powm_redc", speed_mpz_powm_redc }, { "mpz_powm_ui", speed_mpz_powm_ui }, { "mpz_mod", speed_mpz_mod }, { "redc", speed_redc }, { "mpn_store", speed_mpn_store }, { "MPN_COPY", speed_MPN_COPY }, { "MPN_COPY_INCR", speed_MPN_COPY_INCR }, { "MPN_COPY_DECR", speed_MPN_COPY_DECR }, { "memcpy", speed_memcpy }, #if HAVE_NATIVE_mpn_copyi { "mpn_copyi", speed_mpn_copyi }, #endif #if HAVE_NATIVE_mpn_copyd { "mpn_copyd", speed_mpn_copyd }, #endif #if HAVE_NATIVE_mpn_addlsh1_n { "mpn_addlsh1_n", speed_mpn_addlsh1_n }, #endif #if HAVE_NATIVE_mpn_sublsh1_n { "mpn_sublsh1_n", speed_mpn_sublsh1_n }, #endif #if HAVE_NATIVE_mpn_addlsh_n { "mpn_addlsh_n", speed_mpn_addlsh_n,FLAG_R }, #endif #if HAVE_NATIVE_mpn_sublsh_n { "mpn_sublsh_n", speed_mpn_sublsh_n,FLAG_R }, #endif #if HAVE_NATIVE_mpn_inclsh_n { "mpn_inclsh_n", speed_mpn_inclsh_n,FLAG_R }, #endif #if HAVE_NATIVE_mpn_declsh_n { "mpn_declsh_n", speed_mpn_declsh_n,FLAG_R }, #endif #if HAVE_NATIVE_mpn_rsh1add_n { "mpn_rsh1add_n", speed_mpn_rsh1add_n }, #endif #if HAVE_NATIVE_mpn_rsh1sub_n { "mpn_rsh1sub_n", speed_mpn_rsh1sub_n }, #endif { "MPN_ZERO", speed_MPN_ZERO }, { "modlimb_invert", speed_modlimb_invert, FLAG_NODATA }, { "modlimb_invert_mul1", speed_modlimb_invert_mul1, FLAG_NODATA }, { "modlimb_invert_loop", speed_modlimb_invert_loop, FLAG_NODATA }, { "modlimb_invert_cond", speed_modlimb_invert_cond, FLAG_NODATA }, { "modlimb_invert_arith", speed_modlimb_invert_arith, FLAG_NODATA }, { "malloc_free", speed_malloc_free }, { "malloc_realloc_free", speed_malloc_realloc_free }, { "gmp_allocate_free", speed_gmp_allocate_free }, { "gmp_allocate_reallocate_free", speed_gmp_allocate_reallocate_free }, { "mpz_init_clear", speed_mpz_init_clear }, { "mpq_init_clear", speed_mpq_init_clear }, { "mpf_init_clear", speed_mpf_init_clear }, { "mpz_init_realloc_clear", speed_mpz_init_realloc_clear }, { "umul_ppmm", speed_umul_ppmm, FLAG_R_OPTIONAL }, #if HAVE_NATIVE_mpn_umul_ppmm { "mpn_umul_ppmm", speed_mpn_umul_ppmm, FLAG_R_OPTIONAL }, #endif #if HAVE_NATIVE_mpn_umul_ppmm_r { "mpn_umul_ppmm_r", speed_mpn_umul_ppmm_r, FLAG_R_OPTIONAL }, #endif { "count_leading_zeros", speed_count_leading_zeros, FLAG_NODATA | FLAG_R_OPTIONAL }, { "count_trailing_zeros", speed_count_trailing_zeros, FLAG_NODATA | FLAG_R_OPTIONAL }, { "udiv_qrnnd", speed_udiv_qrnnd, FLAG_R_OPTIONAL }, { "udiv_qrnnd_preinv1", speed_udiv_qrnnd_preinv1, FLAG_R_OPTIONAL }, { "udiv_qrnnd_preinv2", speed_udiv_qrnnd_preinv2, FLAG_R_OPTIONAL }, { "udiv_qrnnd_c", speed_udiv_qrnnd_c, FLAG_R_OPTIONAL }, #if HAVE_NATIVE_mpn_udiv_qrnnd { "mpn_udiv_qrnnd", speed_mpn_udiv_qrnnd, FLAG_R_OPTIONAL }, #endif #if HAVE_NATIVE_mpn_udiv_qrnnd_r { "mpn_udiv_qrnnd_r", speed_mpn_udiv_qrnnd_r, FLAG_R_OPTIONAL }, #endif { "invert_limb", speed_invert_limb, FLAG_R_OPTIONAL }, { "operator_div", speed_operator_div, FLAG_R_OPTIONAL }, { "operator_mod", speed_operator_mod, FLAG_R_OPTIONAL }, { "gmp_randseed", speed_gmp_randseed, FLAG_R_OPTIONAL }, { "gmp_randseed_ui", speed_gmp_randseed_ui, FLAG_R_OPTIONAL | FLAG_NODATA }, { "mpz_urandomb", speed_mpz_urandomb, FLAG_R_OPTIONAL | FLAG_NODATA }, #ifdef SPEED_EXTRA_ROUTINES SPEED_EXTRA_ROUTINES #endif #ifdef SPEED_EXTRA_ROUTINES2 SPEED_EXTRA_ROUTINES2 #endif }; #define SUMMAX 4 struct choice_t { const struct routine_t *p; mp_limb_t r; double scale; double time; int no_time; double prev_time; const char *name; char *filename; int colfile; int nsum; int sum[SUMMAX]; }; struct choice_t *choice; int num_choices = 0; void data_fill (mp_ptr ptr, mp_size_t size) { switch (option_data) { case DATA_RANDOM: mpn_random (ptr, size); break; case DATA_RANDOM2: mpn_random2 (ptr, size); break; case DATA_ZEROS: MPN_ZERO (ptr, size); break; case DATA_AAS: MPN_FILL (ptr, size, GMP_NUMB_0xAA); break; case DATA_FFS: MPN_FILL (ptr, size, GMP_NUMB_MAX); break; case DATA_2FD: MPN_FILL (ptr, size, GMP_NUMB_MAX); ptr[0] -= 2; break; default: abort(); /*NOTREACHED*/ } } /* The code here handling the various combinations of output options isn't too attractive, but it works and is fairly clean. */ #define SIZE_TO_DIVISOR(n) \ (option_square == 1 ? (n)*(n) \ : option_square == 2 ? (n)*((n)+1)/2 \ : (n)) void run_one (FILE *fp, struct speed_params *s, mp_size_t prev_size) { const char *first_open_fastest, *first_open_notfastest, *first_close; int j,i, fastest, want_data; double fastest_time; TMP_DECL; TMP_MARK; /* allocate data, unless all routines are NODATA */ want_data = 0; for (i = 0; i < num_choices; i++) want_data |= ((choice[i].p->flag & FLAG_NODATA) == 0); if (want_data) { SPEED_TMP_ALLOC_LIMBS (sp.xp, s->size, s->align_xp); SPEED_TMP_ALLOC_LIMBS (sp.yp, s->size, s->align_yp); data_fill (s->xp, s->size); data_fill (s->yp, s->size); } else { sp.xp = NULL; sp.yp = NULL; } if (prev_size == -1 && option_cmp == CMP_DIFFPREV) { first_open_fastest = "(#"; first_open_notfastest = " ("; first_close = ")"; } else { first_open_fastest = "#"; first_open_notfastest = " "; first_close = ""; } fastest = -1; fastest_time = -1.0; for (i = 0; i < num_choices; i++) { if( choice[i].nsum!=0)continue; s->r = choice[i].r; if( choice[i].colfile==-1) {choice[i].time = speed_measure (choice[i].p->fun, s);} else {FILE *fp;char buf[1024],buf2[1024],*p;int got=0; choice[i].time=-1.0; fp=fopen(choice[i].filename,"rt"); if(fp==0){printf("Cant open %s\n",choice[i].filename);exit(1);} while(fgets(buf,1024,fp)!=0) if(atoi(buf)==s->size) {p=buf; for(j=0;j<=choice[i].colfile;j++) {if(sscanf(p," %s",buf2)!=1)break; p=strstr(p,buf2)+strlen(buf2);} if(j==choice[i].colfile+1) {while((p=strstr(buf2,"#"))!=0)*p=' ';// exclude # choice[i].time=atof(buf2); } break;} fclose(fp); } choice[i].no_time = (choice[i].time == -1.0); if (! choice[i].no_time) choice[i].time *= choice[i].scale; /* Apply the effect of CMP_DIFFPREV, but the new choice[i].prev_time is before any differences. */ if(choice[i].colfile==-1) { double t; t = choice[i].time; if (t != -1.0 && option_cmp == CMP_DIFFPREV && prev_size != -1) { if (choice[i].prev_time == -1.0) choice[i].no_time = 1; else choice[i].time = choice[i].time - choice[i].prev_time; } choice[i].prev_time = t; } } for (i = 0; i < num_choices; i++) {if(choice[i].nsum==0)continue; choice[i].time=0;choice[i].no_time=0; for(j=0;jsize) - SIZE_TO_DIVISOR(prev_size))); } } else { if (option_unit == UNIT_CYCLES) choice[i].time /= speed_cycletime; else if (option_unit == UNIT_CYCLESPERLIMB) choice[i].time /= (speed_cycletime * SIZE_TO_DIVISOR(s->size)); } } for (i = 0; i < num_choices; i++) { if (choice[i].no_time) continue; /* Look for the fastest after CMP_DIFFPREV has been applied, but before CMP_RATIO or CMP_DIFFERENCE. There's only a fastest shown if there's more than one routine. */ for(j=0;j 1 && (fastest == -1 || choice[i].time < fastest_time)) { fastest = i; fastest_time = choice[i].time; } } for (i = 0; i < num_choices; i++) { if (choice[i].no_time ) continue; if (option_cmp != CMP_DIFFPREV) { if (option_cmp == CMP_RATIO && i != option_cmp_pos) { /* A ratio isn't affected by the units chosen. */ if (choice[option_cmp_pos].no_time || choice[option_cmp_pos].time == 0.0) choice[i].no_time = 1; else choice[i].time /= choice[option_cmp_pos].time; } else if (option_cmp == CMP_DIFFERENCE && i != option_cmp_pos) { if (choice[option_cmp_pos].no_time) { choice[i].no_time = 1; continue; } choice[i].time -= choice[option_cmp_pos].time; } } } if (option_gnuplot) { /* In CMP_DIFFPREV, don't print anything for the first size, start with the second where an actual difference is available. In CMP_RATIO, print the "first" ie option_cmp_pos column as 1.0. The 9 decimals printed is much more than the expected precision of the measurements actually. */ if (! (option_cmp == CMP_DIFFPREV && prev_size == -1)) { fprintf (fp, "%-6ld ", s->size); for (i = 0; i < num_choices; i++) fprintf (fp, " %.9e", choice[i].no_time ? 0.0 : (option_cmp == CMP_RATIO && i == option_cmp_pos) ? 1.0 : choice[i].time); fprintf (fp, "\n"); } } else { fprintf (fp, "%-6ld ", s->size); for (i = 0; i < num_choices; i++) { char buf[128]; int decimals; if (choice[i].no_time) { fprintf (fp, " %*s", COLUMN_WIDTH, "n/a"); } else {if (option_unit == UNIT_CYCLESPERLIMB || (option_cmp == CMP_RATIO && i != option_cmp_pos)) decimals = 4; else if (option_unit == UNIT_CYCLES) decimals = 2; else decimals = 9; sprintf (buf, "%s%.*f%s", i == fastest ? first_open_fastest : first_open_notfastest, decimals, choice[i].time, first_close); fprintf (fp, " %*s", COLUMN_WIDTH, buf); } } fprintf (fp, "\n"); } TMP_FREE; } void run_all (FILE *fp) { mp_size_t prev_size; int i; TMP_DECL; TMP_MARK; SPEED_TMP_ALLOC_LIMBS (sp.xp_block, SPEED_BLOCK_SIZE, sp.align_xp); SPEED_TMP_ALLOC_LIMBS (sp.yp_block, SPEED_BLOCK_SIZE, sp.align_yp); data_fill (sp.xp_block, SPEED_BLOCK_SIZE); data_fill (sp.yp_block, SPEED_BLOCK_SIZE); for (i = 0; i < size_num; i++) { sp.size = size_array[i].start; prev_size = -1; for (;;) { mp_size_t step; if (option_data == DATA_2FD && sp.size >= 2) sp.xp[sp.size-1] = 2; run_one (fp, &sp, prev_size); prev_size = sp.size; if (option_data == DATA_2FD && sp.size >= 2) sp.xp[sp.size-1] = MP_LIMB_T_MAX; if (option_factor != 0.0) { step = (mp_size_t) (sp.size * option_factor - sp.size); if (step < 1) step = 1; } else if(size_array[i].inc > 0) step = size_array[i].inc; else step = 1; if (step < option_step) step = option_step; sp.size += step; if (sp.size > size_array[i].end) break; } } TMP_FREE; } FILE * fopen_for_write (const char *filename) { FILE *fp; if ((fp = fopen (filename, "w")) == NULL) { fprintf (stderr, "Cannot create %s\n", filename); exit(1); } return fp; } void fclose_written (FILE *fp, const char *filename) { int err; err = ferror (fp); err |= fclose (fp); if (err) { fprintf (stderr, "Error writing %s\n", filename); exit(1); } } void run_gnuplot (int argc, char *argv[]) { char *plot_filename; char *data_filename; FILE *fp; int i; plot_filename = (char *) (*__gmp_allocate_func) (strlen (option_gnuplot_basename) + 20); data_filename = (char *) (*__gmp_allocate_func) (strlen (option_gnuplot_basename) + 20); sprintf (plot_filename, "%s.gnuplot", option_gnuplot_basename); sprintf (data_filename, "%s.data", option_gnuplot_basename); fp = fopen_for_write (plot_filename); fprintf (fp, "# Generated with:\n"); fprintf (fp, "#"); for (i = 0; i < argc; i++) fprintf (fp, " %s", argv[i]); fprintf (fp, "\n"); fprintf (fp, "\n"); fprintf (fp, "reset\n"); /* Putting the key at the top left is usually good, and you can change it interactively if it's not. */ fprintf (fp, "set key left\n"); /* designed to make it possible to see crossovers easily */ fprintf (fp, "set data style lines\n"); fprintf (fp, "plot "); for (i = 0; i < num_choices; i++) { fprintf (fp, " \"%s\" using 1:%d", data_filename, i+2); fprintf (fp, " title \"%s\"", choice[i].name); if (i != num_choices-1) fprintf (fp, ", \\"); fprintf (fp, "\n"); } fprintf (fp, "load \"-\"\n"); fclose_written (fp, plot_filename); fp = fopen_for_write (data_filename); /* Unbuffered so you can see where the program was up to if it crashes or you kill it. */ setbuf (fp, NULL); run_all (fp); fclose_written (fp, data_filename); } /* Return a limb with n many one bits (starting from the least significant) */ #define LIMB_ONES(n) \ ((n) == BITS_PER_MP_LIMB ? MP_LIMB_T_MAX \ : (n) == 0 ? CNST_LIMB(0) \ : (CNST_LIMB(1) << (n)) - 1) mp_limb_t r_string (const char *s) { const char *s_orig = s; long n; if (strcmp (s, "aas") == 0) return GMP_NUMB_0xAA; { mpz_t z; mp_limb_t l; int set, siz; mpz_init (z); set = mpz_set_str (z, s, 0); siz = SIZ(z); l = (siz == 0 ? 0 : siz > 0 ? PTR(z)[0] : -PTR(z)[0]); mpz_clear (z); if (set == 0) { if (siz > 1 || siz < -1) printf ("Warning, r parameter %s truncated to %d bits\n", s_orig, BITS_PER_MP_LIMB); return l; } } if (s[0] == '0' && (s[1] == 'x' || s[1] == 'X')) n = strtoul (s+2, (char **) &s, 16); else n = strtol (s, (char **) &s, 10); if (strcmp (s, "bits") == 0) { mp_limb_t l; if (n > BITS_PER_MP_LIMB) { fprintf (stderr, "%ld bit parameter invalid (max %d bits)\n", n, BITS_PER_MP_LIMB); exit (1); } mpn_random (&l, 1); return (l | (CNST_LIMB(1) << (n-1))) & LIMB_ONES(n); } else if (strcmp (s, "ones") == 0) { if (n > BITS_PER_MP_LIMB) { fprintf (stderr, "%ld bit parameter invalid (max %d bits)\n", n, BITS_PER_MP_LIMB); exit (1); } return LIMB_ONES (n); } else if (*s != '\0') { fprintf (stderr, "invalid r parameter: %s\n", s_orig); exit (1); } return n; } void routine_find (struct choice_t *c, const char *s_orig) { const char *s; int i; size_t nlen; c->name = s_orig; c->nsum=0; c->colfile=-1; s = strchr (s_orig, '*'); if (s != NULL) { c->scale = atof(s_orig); s++; } else { c->scale = 1.0; s = s_orig; } for (i = 0; i < numberof (routine); i++) { nlen = strlen (routine[i].name); if (memcmp (s, routine[i].name, nlen) != 0) continue; if (s[nlen] == '.') { /* match, with a .r parameter */ if (! (routine[i].flag & (FLAG_R|FLAG_R_OPTIONAL))) { fprintf (stderr, "Choice %s bad: doesn't take a \".\" parameter\n", s_orig); exit (1); } c->p = &routine[i]; c->r = r_string (s + nlen + 1); return; } if (s[nlen] == '\0') { /* match, with no parameter */ if (routine[i].flag & FLAG_R) { fprintf (stderr, "Choice %s bad: needs a \".\" parameter\n", s_orig); exit (1); } c->p = &routine[i]; c->r = 0; return; } } #if SUMMAX > 4 #error Change line below as summax > 4 #endif c->nsum=sscanf(s,"colsum=%d+%d+%d+%d",&c->sum[0],&c->sum[1],&c->sum[2],&c->sum[3]); for(i=0;insum;i++)c->sum[i]--; for(i=0;insum;i++) {xcol[xcoln++]=c->sum[i]; if(xcoln>XCOLMAX){fprintf(stderr,"XCOLMAX not big enough\n");exit(1);}} if(c->nsum!=0)return; c->filename=malloc(1024);// bit of a over kill , but remember what if we have two colfile's , these pointer can get lost if(sscanf(s,"colfile=%d,%s",&i,c->filename)==2) {c->colfile=i;return;} fprintf (stderr, "Choice %s unrecognised\n", s_orig); exit (1); } void usage (void) { int i; speed_time_init (); printf ("Usage: speed [-options] -s size ...\n"); printf ("Measure the speed of some routines.\n"); printf ("Times are in seconds, accuracy is shown.\n"); printf ("\n"); printf (" -p num set precision as number of time units each routine must run\n"); printf (" -s x [,x]... sizes to measure: x = | | \n"); printf (" single sizes or ranges, sep with comma or use multiple -s\n"); printf (" -t step step through sizes by given amount\n"); printf (" -f factor step through sizes by given factor (eg. 1.05)\n"); #if _GNU_SOURCE printf (" -r [col] show times as ratios of the routine in column col(default column 1)\n"); printf (" -d [col] show times as difference from the routine in column col(default column 1)\n"); #else printf (" -r col show times as ratios of the routine in column col\n"); printf (" -d col show times as difference from the routine in column col\n"); #endif printf (" -D show times as difference from previous size shown\n"); printf (" -c show times in CPU cycles\n"); printf (" -C show times in cycles per limb\n"); printf (" -u print resource usage (memory) at end\n"); printf (" -P name output plot files \"name.gnuplot\" and \"name.data\"\n"); printf (" -a use given data: random(default), random2, zeros, aas, ffs, 2fd\n"); printf (" -x, -y, -w, -W specify data alignments, sources and dests\n"); printf (" -o addrs print addresses of data blocks\n"); printf (" colsum=A+B+...+Z Sums the columns A,B,.. upto a max of %d columns\n",SUMMAX); printf (" colfile=col,filename Use the data in column col from file filename\n"); printf ("\n"); printf ("float*routine prefix can be used to multiply a column by a scale factor float.\n\n"); printf ("If both -t and -f are used, it means step by the factor or the step, whichever\n"); printf ("is greater.\n"); printf ("If both -C and -D are used, it means cycles per however many limbs between a\n"); printf ("size and the previous size.\n"); printf ("\n"); printf ("After running with -P, plots can be viewed with Gnuplot or Quickplot.\n"); printf ("\"gnuplot name.gnuplot\" (use \"set logscale xy; replot\" at the prompt for\n"); printf ("a log/log plot).\n"); printf ("\"quickplot -s name.data\" (has interactive zooming, and note -s is important\n"); printf ("when viewing more than one routine, it means same axis scales for all data).\n"); printf ("\n"); printf ("The available routines are as follows.\n"); printf ("\n"); for (i = 0; i < numberof (routine); i++) { if (routine[i].flag & FLAG_R) printf ("\t%s.r\n", routine[i].name); else if (routine[i].flag & FLAG_R_OPTIONAL) printf ("\t%s (optional .r)\n", routine[i].name); else printf ("\t%s\n", routine[i].name); } printf ("\n"); printf ("Routines with a \".r\" need an extra parameter, for example mpn_lshift.6\n"); printf ("r should be in decimal, or use 0xN for hexadecimal.\n"); printf ("\n"); printf ("Special forms for r are \"bits\" for a random N bit number, \"ones\" for\n"); printf ("N one bits, or \"aas\" for 0xAA..AA.\n"); printf ("\n"); printf ("Times for sizes out of the range accepted by a routine are shown as 0.\n"); printf ("The fastest routine at each size is marked with a # (free form output only)\n"); printf ("but columns that are summed are excluded.\n"); printf ("\n"); printf ("%s", speed_time_string); printf ("\n"); printf ("Gnuplot home page http://www.cs.dartmouth.edu/gnuplot_info.html\n"); printf ("Quickplot home page http://www.kachinatech.com/~quickplot\n"); } void check_align_option (const char *name, mp_size_t align) { if (align < 0 || align > SPEED_TMP_ALLOC_ADJUST_MASK) { fprintf (stderr, "Alignment request out of range: %s %ld\n", name, (long) align); fprintf (stderr, " should be 0 to %d (limbs), inclusive\n", SPEED_TMP_ALLOC_ADJUST_MASK); exit (1); } } int main (int argc, char *argv[]) { int i; int opt; /* Unbuffered so output goes straight out when directed to a pipe or file and isn't lost on killing the program half way. */ setbuf (stdout, NULL); for (;;) { #if _GNU_SOURCE opt = getopt(argc, argv, "a:CcDd::EFf:o:p:P:r::Rs:t:ux:y:w:W:z"); #else opt = getopt(argc, argv, "a:CcDd:EFf:o:p:P:r:Rs:t:ux:y:w:W:z"); #endif if (opt == EOF) break; switch (opt) { case 'a': if (strcmp (optarg, "random") == 0) option_data = DATA_RANDOM; else if (strcmp (optarg, "random2") == 0) option_data = DATA_RANDOM2; else if (strcmp (optarg, "zeros") == 0) option_data = DATA_ZEROS; else if (strcmp (optarg, "aas") == 0) option_data = DATA_AAS; else if (strcmp (optarg, "ffs") == 0) option_data = DATA_FFS; else if (strcmp (optarg, "2fd") == 0) option_data = DATA_2FD; else { fprintf (stderr, "unrecognised data option: %s\n", optarg); exit (1); } break; case 'C': if (option_unit != UNIT_SECONDS) goto bad_unit; option_unit = UNIT_CYCLESPERLIMB; break; case 'c': if (option_unit != UNIT_SECONDS) { bad_unit: fprintf (stderr, "cannot use more than one of -c, -C\n"); exit (1); } option_unit = UNIT_CYCLES; break; case 'D': if (option_cmp != CMP_ABSOLUTE) goto bad_cmp; option_cmp = CMP_DIFFPREV; break; case 'd': if (option_cmp != CMP_ABSOLUTE) { bad_cmp: fprintf (stderr, "cannot use more than one of -d, -D, -r\n"); exit (1); } option_cmp = CMP_DIFFERENCE; option_cmp_pos=0; if(optarg!=0)option_cmp_pos=atoi(optarg)-1; break; case 'E': option_square = 1; break; case 'F': option_square = 2; break; case 'f': option_factor = atof (optarg); if (option_factor <= 1.0) { fprintf (stderr, "-f factor must be > 1.0\n"); exit (1); } break; case 'o': speed_option_set (optarg); break; case 'P': option_gnuplot = 1; option_gnuplot_basename = optarg; break; case 'p': speed_precision = atoi (optarg); break; case 'R': option_seed = time (NULL); break; case 'r': if (option_cmp != CMP_ABSOLUTE) goto bad_cmp; option_cmp = CMP_RATIO; option_cmp_pos=0; if(optarg!=0)option_cmp_pos = atoi(optarg)-1; break; case 's': { char *s; for (s = strtok (optarg, ","); s != NULL; s = strtok (NULL, ",")) { if (size_num == size_allocnum) { size_array = (struct size_array_t *) __gmp_allocate_or_reallocate (size_array, size_allocnum * sizeof(size_array[0]), (size_allocnum+10) * sizeof(size_array[0])); size_allocnum += 10; } size_array[size_num].inc = 0; if (sscanf (s, "%ld(%ld)%ld", &size_array[size_num].start, &size_array[size_num].inc, &size_array[size_num].end) != 3) { if (sscanf (s, "%ld-%ld", &size_array[size_num].start, &size_array[size_num].end) != 2) { size_array[size_num].start = size_array[size_num].end = atol (s); } } if (size_array[size_num].start < 0 || size_array[size_num].end < 0 || size_array[size_num].start > size_array[size_num].end) { fprintf (stderr, "invalid size parameter: %s\n", s); exit (1); } size_num++; } } break; case 't': option_step = atol (optarg); if (option_step < 1) { fprintf (stderr, "-t step must be >= 1\n"); exit (1); } break; case 'u': option_resource_usage = 1; break; case 'z': sp.cache = 1; break; case 'x': sp.align_xp = atol (optarg); check_align_option ("-x", sp.align_xp); break; case 'y': sp.align_yp = atol (optarg); check_align_option ("-y", sp.align_yp); break; case 'w': sp.align_wp = atol (optarg); check_align_option ("-w", sp.align_wp); break; case 'W': sp.align_wp2 = atol (optarg); check_align_option ("-W", sp.align_wp2); break; case '?': exit(1); } } if (optind >= argc) { usage (); exit (1); } if (size_num == 0) { fprintf (stderr, "-s must be specified\n"); exit (1); } gmp_randinit_default (__gmp_rands); __gmp_rands_initialized = 1; gmp_randseed_ui (__gmp_rands, option_seed); choice = (struct choice_t *) (*__gmp_allocate_func) ((argc - optind) * sizeof(choice[0])); for ( ; optind < argc; optind++) { struct choice_t c; routine_find (&c, argv[optind]); choice[num_choices] = c; num_choices++; } if ((option_cmp == CMP_RATIO || option_cmp == CMP_DIFFERENCE) && num_choices < 2) { fprintf (stderr, "WARNING, -d or -r does nothing when only one routine requested\n"); } speed_time_init (); if (option_unit == UNIT_CYCLES || option_unit == UNIT_CYCLESPERLIMB) speed_cycletime_need_cycles (); else speed_cycletime_need_seconds (); if (option_gnuplot) { run_gnuplot (argc, argv); } else { if (option_unit == UNIT_SECONDS) printf ("overhead %.9f secs", speed_measure (speed_noop, NULL)); else printf ("overhead %.2f cycles", speed_measure (speed_noop, NULL) / speed_cycletime); printf (", precision %d units of %.2e secs", speed_precision, speed_unittime); if (speed_cycletime == 1.0 || speed_cycletime == 0.0) printf (", CPU freq unknown\n"); else printf (", CPU freq %.2f MHz\n", 1e-6/speed_cycletime); printf (" "); for (i = 0; i < num_choices; i++) printf (" %*s", COLUMN_WIDTH, choice[i].name); printf ("\n"); run_all (stdout); } if (option_resource_usage) { #if HAVE_GETRUSAGE { /* This doesn't give data sizes on linux 2.0.x, only utime. */ struct rusage r; if (getrusage (RUSAGE_SELF, &r) != 0) perror ("getrusage"); else printf ("getrusage(): utime %ld.%06ld data %ld stack %ld maxresident %ld\n", r.ru_utime.tv_sec, r.ru_utime.tv_usec, r.ru_idrss, r.ru_isrss, r.ru_ixrss); } #else printf ("getrusage() not available\n"); #endif /* Linux kernel. */ { char buf[128]; sprintf (buf, "/proc/%d/status", getpid()); if (access (buf, R_OK) == 0) { sprintf (buf, "cat /proc/%d/status", getpid()); system (buf); } } } return 0; }