From 6668b221cbab1638cd8085d2022b6d09d4b56daa Mon Sep 17 00:00:00 2001 From: jasonmoxham Date: Mon, 3 Aug 2009 23:09:30 +0000 Subject: [PATCH] Add Robert Gerbicz mpz factorial code + tuning threshold --- gmp-impl.h | 9 +++ mpz/fac_ui.c | 149 +++++++++++++++++++++++++++++++++++++++++++- tune/Makefile.am | 1 + tune/Makefile.in | 35 ++++++----- tune/common.c | 11 ++++ tune/fac_ui_large.c | 31 +++++++++ tune/fac_ui_small.c | 31 +++++++++ tune/speed.c | 2 + tune/speed.h | 4 ++ tune/tuneup.c | 17 +++++ 10 files changed, 275 insertions(+), 15 deletions(-) create mode 100644 tune/fac_ui_large.c create mode 100644 tune/fac_ui_small.c diff --git a/gmp-impl.h b/gmp-impl.h index 93763b6e..71cd3ee7 100644 --- a/gmp-impl.h +++ b/gmp-impl.h @@ -1516,6 +1516,11 @@ __GMP_DECLSPEC extern const mp_limb_t __gmp_fib_table[]; #define MULHIGH_MUL_THRESHOLD 8192 #endif +#ifndef FAC_UI_THRESHOLD +#define FAC_UI_THRESHOLD 8192 +#endif + + /* MUL_KARATSUBA_THRESHOLD_LIMIT is the maximum for MUL_KARATSUBA_THRESHOLD. In a normal build MUL_KARATSUBA_THRESHOLD is a constant and we use that. In a fat binary or tune program build MUL_KARATSUBA_THRESHOLD is a @@ -3942,6 +3947,10 @@ extern mp_size_t div_dc_threshold; #define POWM_THRESHOLD powm_threshold extern mp_size_t powm_threshold; +#undef FAC_UI_THRESHOLD +#define FAC_UI_THRESHOLD fac_ui_threshold +extern mp_size_t fac_ui_threshold; + #undef GCD_ACCEL_THRESHOLD #define GCD_ACCEL_THRESHOLD gcd_accel_threshold extern mp_size_t gcd_accel_threshold; diff --git a/mpz/fac_ui.c b/mpz/fac_ui.c index c5c977eb..55642bc3 100644 --- a/mpz/fac_ui.c +++ b/mpz/fac_ui.c @@ -29,7 +29,9 @@ MA 02110-1301, USA. */ static void odd_product _PROTO ((unsigned long low, unsigned long high, mpz_t * st)); static void ap_product_small _PROTO ((mpz_t ret, mp_limb_t start, mp_limb_t step, unsigned long count, unsigned long nm)); - +static void binary_splitting _PROTO ((mpz_ptr result,unsigned long int *a,unsigned long int L)); +static void large_mpz_fac_ui _PROTO ((mpz_ptr result, unsigned long int n)); +static unsigned long int ulsqrt _PROTO ((unsigned long int n)); /* must be >=2 */ #define APCONST 5 @@ -189,6 +191,8 @@ mpz_fac_ui (mpz_ptr x, unsigned long n) return; } + if (ABOVE_THRESHOLD(n,FAC_UI_THRESHOLD)){large_mpz_fac_ui(x,n);return;} + count_leading_zeros (stt, (mp_limb_t) n); stt = GMP_LIMB_BITS - stt + 1 - APCONST; @@ -396,3 +400,146 @@ odd_product (unsigned long low, unsigned long high, mpz_t * st) ASSERT (stn == 1); return; } + + +// Computation of n factorial by computing the prime factoriation of n!, +// using iterated squaring and multiplication by a "small" number idea and binary splitting +// written by Robert Gerbicz + + +/* mpz_fac_ui(result, n) -- Set RESULT to N!. + +Copyright 1991, 1993, 1994, 1995, 2000, 2001, 2002, 2003 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, +MA 02110-1301, USA. */ + + +static void binary_splitting (mpz_ptr result,unsigned long int *a,unsigned long int L) { +// mulptiplication by binary splitting + unsigned long int i,L0,L1; + mpz_t temp; + + if(L==0) { + mpz_set_ui (result, 1); + return; + } + + if(L<=3) { + mpz_set_ui(result,a[0]); + for(i=1;i>1,n64=(n>>6)+1,memalloc; + int h,expo; + mpz_t temp; + mpz_init(temp); + isprime=__GMP_ALLOCATE_FUNC_TYPE(n64,unsigned long int); + ASSERT(n>=2);// cant handle n<2 + Bit[0]=1; + for(i=1;i<32;i++) Bit[i]=Bit[i-1]<<1; // Bit[i]=2^i + + // determine all odd primes up to n by sieve + for(i=0;i>6]&Bit[(p>>1)&31]) { + for(i=(p*p-1)>>1;i<=n2;i+=p) isprime[i>>5]&=~Bit[i&31]; + } + } + + primepi=0; + for(i=0;i<=n2;i++) + primepi+=((isprime[i>>5]&Bit[i&31])>0); + + memalloc=primepi; + primes=__GMP_ALLOCATE_FUNC_TYPE(memalloc,unsigned long int); + S=__GMP_ALLOCATE_FUNC_TYPE(memalloc,unsigned long int); + exponent=__GMP_ALLOCATE_FUNC_TYPE(memalloc,unsigned long int); + + primepi=0; + // 1 is not prime and hasn't cancelled, so start from 1*2+1=3 + for(i=1;i<=n2;i++) + if((isprime[i>>5]&Bit[i&31])>0) { + p=2*i+1; + N=n; + e=0; + while(N) N/=p,e+=N; + primes[primepi]=p; // store prime + exponent[primepi]=e; // exponent of p in the factorization of n! + primepi++; + } + + __GMP_FREE_FUNC_TYPE(isprime,n64,unsigned long int); + + mpz_set_ui(result,1); + + expo=0,p2=1; + N=n; + while(N) N>>=1,p2<<=1,expo++; + for(h=expo;h>=0;h--) { + // collect all primes for which in the factorization of n! the primes[g]'s exponent's h-th bit is 1 + count=0; + // note that exponent[] is a decreasing array + for(g=0;(g=p2);g++) + if(((exponent[g]>>h)&1)==1) { + S[count]=primes[g]; + count++; + } + binary_splitting(temp,S,count); // build the product by binary splitting + mpz_pow_ui(result,result,2); // squaring + mpz_mul(result,result,temp); // multiplcation by a not so large number + p2>>=1; + } + + N=n; + e=0; + while(N) N>>=1,e+=N; + mpz_mul_2exp(result,result,e); // shift the number to finally get n! + + __GMP_FREE_FUNC_TYPE(S,memalloc,unsigned long int); + __GMP_FREE_FUNC_TYPE(primes,memalloc,unsigned long int); + __GMP_FREE_FUNC_TYPE(exponent,memalloc,unsigned long int); + mpz_clear(temp); + return; +} + + diff --git a/tune/Makefile.am b/tune/Makefile.am index e643a96d..4fa87d6b 100644 --- a/tune/Makefile.am +++ b/tune/Makefile.am @@ -48,6 +48,7 @@ libspeed_la_SOURCES = \ jacbase1.c jacbase2.c jacbase3.c \ mod_1_div.c mod_1_inv.c modlinv.c \ noop.c powm_mod.c powm_redc.c pre_divrem_1.c \ + fac_ui_large.c fac_ui_small.c \ set_strb.c set_strs.c time.c \ sb_div.c sb_inv.c diff --git a/tune/Makefile.in b/tune/Makefile.in index a6e5c23f..6b28915a 100644 --- a/tune/Makefile.in +++ b/tune/Makefile.in @@ -100,7 +100,8 @@ am_libspeed_la_OBJECTS = common$U.lo divrem1div$U.lo divrem1inv$U.lo \ gcdextod$U.lo gcdextos$U.lo jacbase1$U.lo jacbase2$U.lo \ jacbase3$U.lo mod_1_div$U.lo mod_1_inv$U.lo modlinv$U.lo \ noop$U.lo powm_mod$U.lo powm_redc$U.lo pre_divrem_1$U.lo \ - set_strb$U.lo set_strs$U.lo time$U.lo sb_div$U.lo sb_inv$U.lo + fac_ui_large$U.lo fac_ui_small$U.lo set_strb$U.lo \ + set_strs$U.lo time$U.lo sb_div$U.lo sb_inv$U.lo libspeed_la_OBJECTS = $(am_libspeed_la_OBJECTS) am_speed_OBJECTS = speed$U.$(OBJEXT) speed_OBJECTS = $(am_speed_OBJECTS) @@ -326,6 +327,7 @@ libspeed_la_SOURCES = \ jacbase1.c jacbase2.c jacbase3.c \ mod_1_div.c mod_1_inv.c modlinv.c \ noop.c powm_mod.c powm_redc.c pre_divrem_1.c \ + fac_ui_large.c fac_ui_small.c \ set_strb.c set_strs.c time.c \ sb_div.c sb_inv.c @@ -490,6 +492,10 @@ divrem_1_.c: divrem_1.c $(ANSI2KNR) $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/divrem_1.c; then echo $(srcdir)/divrem_1.c; else echo divrem_1.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@ divrem_2_.c: divrem_2.c $(ANSI2KNR) $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/divrem_2.c; then echo $(srcdir)/divrem_2.c; else echo divrem_2.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@ +fac_ui_large_.c: fac_ui_large.c $(ANSI2KNR) + $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/fac_ui_large.c; then echo $(srcdir)/fac_ui_large.c; else echo fac_ui_large.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@ +fac_ui_small_.c: fac_ui_small.c $(ANSI2KNR) + $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/fac_ui_small.c; then echo $(srcdir)/fac_ui_small.c; else echo fac_ui_small.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@ freq_.c: freq.c $(ANSI2KNR) $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/freq.c; then echo $(srcdir)/freq.c; else echo freq.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@ gcd_.c: gcd.c $(ANSI2KNR) @@ -572,19 +578,20 @@ common_.$(OBJEXT) common_.lo dc_divrem_n_.$(OBJEXT) dc_divrem_n_.lo \ divrem1div_.$(OBJEXT) divrem1div_.lo divrem1inv_.$(OBJEXT) \ divrem1inv_.lo divrem2div_.$(OBJEXT) divrem2div_.lo \ divrem2inv_.$(OBJEXT) divrem2inv_.lo divrem_1_.$(OBJEXT) divrem_1_.lo \ -divrem_2_.$(OBJEXT) divrem_2_.lo freq_.$(OBJEXT) freq_.lo \ -gcd_.$(OBJEXT) gcd_.lo gcd_bin_.$(OBJEXT) gcd_bin_.lo \ -gcd_finda_gen_.$(OBJEXT) gcd_finda_gen_.lo gcdext_.$(OBJEXT) \ -gcdext_.lo gcdext_double_.$(OBJEXT) gcdext_double_.lo \ -gcdext_single_.$(OBJEXT) gcdext_single_.lo gcdextod_.$(OBJEXT) \ -gcdextod_.lo gcdextos_.$(OBJEXT) gcdextos_.lo get_str_.$(OBJEXT) \ -get_str_.lo jacbase1_.$(OBJEXT) jacbase1_.lo jacbase2_.$(OBJEXT) \ -jacbase2_.lo jacbase3_.$(OBJEXT) jacbase3_.lo mod_1_.$(OBJEXT) \ -mod_1_.lo mod_1_div_.$(OBJEXT) mod_1_div_.lo mod_1_inv_.$(OBJEXT) \ -mod_1_inv_.lo modlinv_.$(OBJEXT) modlinv_.lo mul_.$(OBJEXT) mul_.lo \ -mul_fft_.$(OBJEXT) mul_fft_.lo mul_n_.$(OBJEXT) mul_n_.lo \ -mulhigh_n_.$(OBJEXT) mulhigh_n_.lo mullow_n_.$(OBJEXT) mullow_n_.lo \ -noop_.$(OBJEXT) noop_.lo powm_mod_.$(OBJEXT) powm_mod_.lo \ +divrem_2_.$(OBJEXT) divrem_2_.lo fac_ui_large_.$(OBJEXT) \ +fac_ui_large_.lo fac_ui_small_.$(OBJEXT) fac_ui_small_.lo \ +freq_.$(OBJEXT) freq_.lo gcd_.$(OBJEXT) gcd_.lo gcd_bin_.$(OBJEXT) \ +gcd_bin_.lo gcd_finda_gen_.$(OBJEXT) gcd_finda_gen_.lo \ +gcdext_.$(OBJEXT) gcdext_.lo gcdext_double_.$(OBJEXT) \ +gcdext_double_.lo gcdext_single_.$(OBJEXT) gcdext_single_.lo \ +gcdextod_.$(OBJEXT) gcdextod_.lo gcdextos_.$(OBJEXT) gcdextos_.lo \ +get_str_.$(OBJEXT) get_str_.lo jacbase1_.$(OBJEXT) jacbase1_.lo \ +jacbase2_.$(OBJEXT) jacbase2_.lo jacbase3_.$(OBJEXT) jacbase3_.lo \ +mod_1_.$(OBJEXT) mod_1_.lo mod_1_div_.$(OBJEXT) mod_1_div_.lo \ +mod_1_inv_.$(OBJEXT) mod_1_inv_.lo modlinv_.$(OBJEXT) modlinv_.lo \ +mul_.$(OBJEXT) mul_.lo mul_fft_.$(OBJEXT) mul_fft_.lo mul_n_.$(OBJEXT) \ +mul_n_.lo mulhigh_n_.$(OBJEXT) mulhigh_n_.lo mullow_n_.$(OBJEXT) \ +mullow_n_.lo noop_.$(OBJEXT) noop_.lo powm_mod_.$(OBJEXT) powm_mod_.lo \ powm_redc_.$(OBJEXT) powm_redc_.lo pre_divrem_1_.$(OBJEXT) \ pre_divrem_1_.lo sb_div_.$(OBJEXT) sb_div_.lo sb_divrem_mn_.$(OBJEXT) \ sb_divrem_mn_.lo sb_inv_.$(OBJEXT) sb_inv_.lo set_strb_.$(OBJEXT) \ diff --git a/tune/common.c b/tune/common.c index a5dfd13c..91ff5b15 100644 --- a/tune/common.c +++ b/tune/common.c @@ -1200,6 +1200,17 @@ speed_mpz_fac_ui (struct speed_params *s) { SPEED_ROUTINE_MPZ_FAC_UI (mpz_fac_ui); } +double +speed_mpz_fac_ui_small (struct speed_params *s) +{ + SPEED_ROUTINE_MPZ_FAC_UI (mpz_fac_ui_small); +} + +double +speed_mpz_fac_ui_large (struct speed_params *s) +{ + SPEED_ROUTINE_MPZ_FAC_UI (mpz_fac_ui_large); +} double diff --git a/tune/fac_ui_large.c b/tune/fac_ui_large.c new file mode 100644 index 00000000..ae09804a --- /dev/null +++ b/tune/fac_ui_large.c @@ -0,0 +1,31 @@ +/* mpz/fac_ui.c forced to use large algorithm . */ + +/* +Copyright 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, +MA 02110-1301, USA. +*/ + +#include "mpir.h" +#include "gmp-impl.h" + +#undef FAC_UI_THRESHOLD +#define FAC_UI_THRESHOLD 1 +#define __gmpz_fac_ui mpz_fac_ui_large + +#include "../mpz/fac_ui.c" diff --git a/tune/fac_ui_small.c b/tune/fac_ui_small.c new file mode 100644 index 00000000..33b2cdc3 --- /dev/null +++ b/tune/fac_ui_small.c @@ -0,0 +1,31 @@ +/* mpz/fac_ui.c forced to use small algorithm */ + +/* +Copyright 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, +MA 02110-1301, USA. +*/ + +#include "mpir.h" +#include "gmp-impl.h" + +#undef FAC_UI_THRESHOLD +#define FAC_UI_THRESHOLD MP_SIZE_T_MAX +#define __gmpz_fac_ui mpz_fac_ui_small + +#include "../mpz/fac_ui.c" diff --git a/tune/speed.c b/tune/speed.c index c7876fba..b9343663 100644 --- a/tune/speed.c +++ b/tune/speed.c @@ -348,6 +348,8 @@ const struct routine_t { { "mpz_add", speed_mpz_add }, { "mpz_bin_uiui", speed_mpz_bin_uiui, FLAG_NODATA | FLAG_R_OPTIONAL }, { "mpz_fac_ui", speed_mpz_fac_ui, FLAG_NODATA }, + { "mpz_fac_ui_small", speed_mpz_fac_ui_small, FLAG_NODATA }, + { "mpz_fac_ui_large", speed_mpz_fac_ui_large, FLAG_NODATA }, { "mpz_powm", speed_mpz_powm }, { "mpz_powm_mod", speed_mpz_powm_mod }, { "mpz_powm_redc", speed_mpz_powm_redc }, diff --git a/tune/speed.h b/tune/speed.h index c73aad15..f026c94f 100644 --- a/tune/speed.h +++ b/tune/speed.h @@ -284,6 +284,8 @@ double speed_mpq_init_clear _PROTO ((struct speed_params *s)); double speed_mpz_add _PROTO ((struct speed_params *s)); double speed_mpz_bin_uiui _PROTO ((struct speed_params *s)); double speed_mpz_fac_ui _PROTO ((struct speed_params *s)); +double speed_mpz_fac_ui_small _PROTO ((struct speed_params *s)); +double speed_mpz_fac_ui_large _PROTO ((struct speed_params *s)); double speed_mpz_fib_ui _PROTO ((struct speed_params *s)); double speed_mpz_fib2_ui _PROTO ((struct speed_params *s)); double speed_mpz_init_clear _PROTO ((struct speed_params *s)); @@ -462,6 +464,8 @@ void mpz_powm_redc _PROTO ((mpz_ptr res, mpz_srcptr base, mpz_srcptr e, mpz_srcptr mod)); void redc _PROTO ((mp_ptr cp, mp_srcptr mp, mp_size_t n, mp_limb_t Nprim, mp_ptr tp)); +void mpz_fac_ui_small _PROTO ((mpz_ptr,unsigned long)); +void mpz_fac_ui_large _PROTO ((mpz_ptr,unsigned long)); int speed_routine_count_zeros_setup _PROTO ((struct speed_params *s, mp_ptr xp, int leading, diff --git a/tune/tuneup.c b/tune/tuneup.c index f12669e9..37b1bdb1 100644 --- a/tune/tuneup.c +++ b/tune/tuneup.c @@ -171,6 +171,7 @@ mp_size_t mulhigh_mul_threshold = MP_SIZE_T_MAX; mp_size_t div_sb_preinv_threshold = MP_SIZE_T_MAX; mp_size_t div_dc_threshold = MP_SIZE_T_MAX; mp_size_t powm_threshold = MP_SIZE_T_MAX; +mp_size_t fac_ui_threshold = MP_SIZE_T_MAX; mp_size_t gcd_accel_threshold = MP_SIZE_T_MAX; mp_size_t gcdext_threshold = MP_SIZE_T_MAX; mp_size_t divrem_1_norm_threshold = MP_SIZE_T_MAX; @@ -1072,6 +1073,21 @@ tune_powm (void) one (&powm_threshold, ¶m); } +void +tune_fac_ui (void) +{ + static struct param_t param; + param.name = "FAC_UI_THRESHOLD"; + param.function = speed_mpz_fac_ui_small; + param.function2 = speed_mpz_fac_ui_large; + //param.step_factor = 0.03; + //param.stop_factor = 1.1; + param.min_size = 1024; + param.max_size = 32768; + //param.min_is_always=1; + one (&fac_ui_threshold, ¶m); +} + void tune_gcd_accel (void) { @@ -1761,6 +1777,7 @@ all (void) tune_sb_preinv (); tune_dc (); tune_powm (); + tune_fac_ui(); printf("\n"); tune_gcd_accel ();