Add Robert Gerbicz mpz factorial code + tuning threshold

This commit is contained in:
jasonmoxham 2009-08-03 23:09:30 +00:00
parent 497662212c
commit 6668b221cb
10 changed files with 275 additions and 15 deletions

View File

@ -1516,6 +1516,11 @@ __GMP_DECLSPEC extern const mp_limb_t __gmp_fib_table[];
#define MULHIGH_MUL_THRESHOLD 8192
#endif
#ifndef FAC_UI_THRESHOLD
#define FAC_UI_THRESHOLD 8192
#endif
/* MUL_KARATSUBA_THRESHOLD_LIMIT is the maximum for MUL_KARATSUBA_THRESHOLD.
In a normal build MUL_KARATSUBA_THRESHOLD is a constant and we use that.
In a fat binary or tune program build MUL_KARATSUBA_THRESHOLD is a
@ -3942,6 +3947,10 @@ extern mp_size_t div_dc_threshold;
#define POWM_THRESHOLD powm_threshold
extern mp_size_t powm_threshold;
#undef FAC_UI_THRESHOLD
#define FAC_UI_THRESHOLD fac_ui_threshold
extern mp_size_t fac_ui_threshold;
#undef GCD_ACCEL_THRESHOLD
#define GCD_ACCEL_THRESHOLD gcd_accel_threshold
extern mp_size_t gcd_accel_threshold;

View File

@ -29,7 +29,9 @@ MA 02110-1301, USA. */
static void odd_product _PROTO ((unsigned long low, unsigned long high, mpz_t * st));
static void ap_product_small _PROTO ((mpz_t ret, mp_limb_t start, mp_limb_t step, unsigned long count, unsigned long nm));
static void binary_splitting _PROTO ((mpz_ptr result,unsigned long int *a,unsigned long int L));
static void large_mpz_fac_ui _PROTO ((mpz_ptr result, unsigned long int n));
static unsigned long int ulsqrt _PROTO ((unsigned long int n));
/* must be >=2 */
#define APCONST 5
@ -189,6 +191,8 @@ mpz_fac_ui (mpz_ptr x, unsigned long n)
return;
}
if (ABOVE_THRESHOLD(n,FAC_UI_THRESHOLD)){large_mpz_fac_ui(x,n);return;}
count_leading_zeros (stt, (mp_limb_t) n);
stt = GMP_LIMB_BITS - stt + 1 - APCONST;
@ -396,3 +400,146 @@ odd_product (unsigned long low, unsigned long high, mpz_t * st)
ASSERT (stn == 1);
return;
}
// Computation of n factorial by computing the prime factoriation of n!,
// using iterated squaring and multiplication by a "small" number idea and binary splitting
// written by Robert Gerbicz
/* mpz_fac_ui(result, n) -- Set RESULT to N!.
Copyright 1991, 1993, 1994, 1995, 2000, 2001, 2002, 2003 Free Software
Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or (at your
option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
License for more details.
You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library; see the file COPYING.LIB. If not, write to
the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA. */
static void binary_splitting (mpz_ptr result,unsigned long int *a,unsigned long int L) {
// mulptiplication by binary splitting
unsigned long int i,L0,L1;
mpz_t temp;
if(L==0) {
mpz_set_ui (result, 1);
return;
}
if(L<=3) {
mpz_set_ui(result,a[0]);
for(i=1;i<L;i++)
mpz_mul_ui(result,result,a[i]);
return;
}
L0=L/2;
L1=L-L0;
binary_splitting(result,a,L1);
mpz_init(temp);
binary_splitting(temp,a+L1,L0);
mpz_mul(result,result,temp);
mpz_clear(temp);
return;
}
static unsigned long int ulsqrt(unsigned long int n)
{unsigned long int x,y;
x=y=n;
do{x=y;
y=(n/x+x)/2;
}while(y<x);
return x;}
static void large_mpz_fac_ui(mpz_ptr result, unsigned long int n) {
unsigned long int Bit[32],e,N,g,p2,*S,*exponent,*isprime,*primes,count,i,p,primepi,sq,n2=(n-1)>>1,n64=(n>>6)+1,memalloc;
int h,expo;
mpz_t temp;
mpz_init(temp);
isprime=__GMP_ALLOCATE_FUNC_TYPE(n64,unsigned long int);
ASSERT(n>=2);// cant handle n<2
Bit[0]=1;
for(i=1;i<32;i++) Bit[i]=Bit[i-1]<<1; // Bit[i]=2^i
// determine all odd primes up to n by sieve
for(i=0;i<n64;i++) isprime[i]=0xffffffff;
sq=ulsqrt(n)+1;
for(p=3;p<=sq;p+=2) {
if(isprime[p>>6]&Bit[(p>>1)&31]) {
for(i=(p*p-1)>>1;i<=n2;i+=p) isprime[i>>5]&=~Bit[i&31];
}
}
primepi=0;
for(i=0;i<=n2;i++)
primepi+=((isprime[i>>5]&Bit[i&31])>0);
memalloc=primepi;
primes=__GMP_ALLOCATE_FUNC_TYPE(memalloc,unsigned long int);
S=__GMP_ALLOCATE_FUNC_TYPE(memalloc,unsigned long int);
exponent=__GMP_ALLOCATE_FUNC_TYPE(memalloc,unsigned long int);
primepi=0;
// 1 is not prime and hasn't cancelled, so start from 1*2+1=3
for(i=1;i<=n2;i++)
if((isprime[i>>5]&Bit[i&31])>0) {
p=2*i+1;
N=n;
e=0;
while(N) N/=p,e+=N;
primes[primepi]=p; // store prime
exponent[primepi]=e; // exponent of p in the factorization of n!
primepi++;
}
__GMP_FREE_FUNC_TYPE(isprime,n64,unsigned long int);
mpz_set_ui(result,1);
expo=0,p2=1;
N=n;
while(N) N>>=1,p2<<=1,expo++;
for(h=expo;h>=0;h--) {
// collect all primes for which in the factorization of n! the primes[g]'s exponent's h-th bit is 1
count=0;
// note that exponent[] is a decreasing array
for(g=0;(g<primepi)&&(exponent[g]>=p2);g++)
if(((exponent[g]>>h)&1)==1) {
S[count]=primes[g];
count++;
}
binary_splitting(temp,S,count); // build the product by binary splitting
mpz_pow_ui(result,result,2); // squaring
mpz_mul(result,result,temp); // multiplcation by a not so large number
p2>>=1;
}
N=n;
e=0;
while(N) N>>=1,e+=N;
mpz_mul_2exp(result,result,e); // shift the number to finally get n!
__GMP_FREE_FUNC_TYPE(S,memalloc,unsigned long int);
__GMP_FREE_FUNC_TYPE(primes,memalloc,unsigned long int);
__GMP_FREE_FUNC_TYPE(exponent,memalloc,unsigned long int);
mpz_clear(temp);
return;
}

View File

@ -48,6 +48,7 @@ libspeed_la_SOURCES = \
jacbase1.c jacbase2.c jacbase3.c \
mod_1_div.c mod_1_inv.c modlinv.c \
noop.c powm_mod.c powm_redc.c pre_divrem_1.c \
fac_ui_large.c fac_ui_small.c \
set_strb.c set_strs.c time.c \
sb_div.c sb_inv.c

View File

@ -100,7 +100,8 @@ am_libspeed_la_OBJECTS = common$U.lo divrem1div$U.lo divrem1inv$U.lo \
gcdextod$U.lo gcdextos$U.lo jacbase1$U.lo jacbase2$U.lo \
jacbase3$U.lo mod_1_div$U.lo mod_1_inv$U.lo modlinv$U.lo \
noop$U.lo powm_mod$U.lo powm_redc$U.lo pre_divrem_1$U.lo \
set_strb$U.lo set_strs$U.lo time$U.lo sb_div$U.lo sb_inv$U.lo
fac_ui_large$U.lo fac_ui_small$U.lo set_strb$U.lo \
set_strs$U.lo time$U.lo sb_div$U.lo sb_inv$U.lo
libspeed_la_OBJECTS = $(am_libspeed_la_OBJECTS)
am_speed_OBJECTS = speed$U.$(OBJEXT)
speed_OBJECTS = $(am_speed_OBJECTS)
@ -326,6 +327,7 @@ libspeed_la_SOURCES = \
jacbase1.c jacbase2.c jacbase3.c \
mod_1_div.c mod_1_inv.c modlinv.c \
noop.c powm_mod.c powm_redc.c pre_divrem_1.c \
fac_ui_large.c fac_ui_small.c \
set_strb.c set_strs.c time.c \
sb_div.c sb_inv.c
@ -490,6 +492,10 @@ divrem_1_.c: divrem_1.c $(ANSI2KNR)
$(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/divrem_1.c; then echo $(srcdir)/divrem_1.c; else echo divrem_1.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
divrem_2_.c: divrem_2.c $(ANSI2KNR)
$(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/divrem_2.c; then echo $(srcdir)/divrem_2.c; else echo divrem_2.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
fac_ui_large_.c: fac_ui_large.c $(ANSI2KNR)
$(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/fac_ui_large.c; then echo $(srcdir)/fac_ui_large.c; else echo fac_ui_large.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
fac_ui_small_.c: fac_ui_small.c $(ANSI2KNR)
$(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/fac_ui_small.c; then echo $(srcdir)/fac_ui_small.c; else echo fac_ui_small.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
freq_.c: freq.c $(ANSI2KNR)
$(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/freq.c; then echo $(srcdir)/freq.c; else echo freq.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@
gcd_.c: gcd.c $(ANSI2KNR)
@ -572,19 +578,20 @@ common_.$(OBJEXT) common_.lo dc_divrem_n_.$(OBJEXT) dc_divrem_n_.lo \
divrem1div_.$(OBJEXT) divrem1div_.lo divrem1inv_.$(OBJEXT) \
divrem1inv_.lo divrem2div_.$(OBJEXT) divrem2div_.lo \
divrem2inv_.$(OBJEXT) divrem2inv_.lo divrem_1_.$(OBJEXT) divrem_1_.lo \
divrem_2_.$(OBJEXT) divrem_2_.lo freq_.$(OBJEXT) freq_.lo \
gcd_.$(OBJEXT) gcd_.lo gcd_bin_.$(OBJEXT) gcd_bin_.lo \
gcd_finda_gen_.$(OBJEXT) gcd_finda_gen_.lo gcdext_.$(OBJEXT) \
gcdext_.lo gcdext_double_.$(OBJEXT) gcdext_double_.lo \
gcdext_single_.$(OBJEXT) gcdext_single_.lo gcdextod_.$(OBJEXT) \
gcdextod_.lo gcdextos_.$(OBJEXT) gcdextos_.lo get_str_.$(OBJEXT) \
get_str_.lo jacbase1_.$(OBJEXT) jacbase1_.lo jacbase2_.$(OBJEXT) \
jacbase2_.lo jacbase3_.$(OBJEXT) jacbase3_.lo mod_1_.$(OBJEXT) \
mod_1_.lo mod_1_div_.$(OBJEXT) mod_1_div_.lo mod_1_inv_.$(OBJEXT) \
mod_1_inv_.lo modlinv_.$(OBJEXT) modlinv_.lo mul_.$(OBJEXT) mul_.lo \
mul_fft_.$(OBJEXT) mul_fft_.lo mul_n_.$(OBJEXT) mul_n_.lo \
mulhigh_n_.$(OBJEXT) mulhigh_n_.lo mullow_n_.$(OBJEXT) mullow_n_.lo \
noop_.$(OBJEXT) noop_.lo powm_mod_.$(OBJEXT) powm_mod_.lo \
divrem_2_.$(OBJEXT) divrem_2_.lo fac_ui_large_.$(OBJEXT) \
fac_ui_large_.lo fac_ui_small_.$(OBJEXT) fac_ui_small_.lo \
freq_.$(OBJEXT) freq_.lo gcd_.$(OBJEXT) gcd_.lo gcd_bin_.$(OBJEXT) \
gcd_bin_.lo gcd_finda_gen_.$(OBJEXT) gcd_finda_gen_.lo \
gcdext_.$(OBJEXT) gcdext_.lo gcdext_double_.$(OBJEXT) \
gcdext_double_.lo gcdext_single_.$(OBJEXT) gcdext_single_.lo \
gcdextod_.$(OBJEXT) gcdextod_.lo gcdextos_.$(OBJEXT) gcdextos_.lo \
get_str_.$(OBJEXT) get_str_.lo jacbase1_.$(OBJEXT) jacbase1_.lo \
jacbase2_.$(OBJEXT) jacbase2_.lo jacbase3_.$(OBJEXT) jacbase3_.lo \
mod_1_.$(OBJEXT) mod_1_.lo mod_1_div_.$(OBJEXT) mod_1_div_.lo \
mod_1_inv_.$(OBJEXT) mod_1_inv_.lo modlinv_.$(OBJEXT) modlinv_.lo \
mul_.$(OBJEXT) mul_.lo mul_fft_.$(OBJEXT) mul_fft_.lo mul_n_.$(OBJEXT) \
mul_n_.lo mulhigh_n_.$(OBJEXT) mulhigh_n_.lo mullow_n_.$(OBJEXT) \
mullow_n_.lo noop_.$(OBJEXT) noop_.lo powm_mod_.$(OBJEXT) powm_mod_.lo \
powm_redc_.$(OBJEXT) powm_redc_.lo pre_divrem_1_.$(OBJEXT) \
pre_divrem_1_.lo sb_div_.$(OBJEXT) sb_div_.lo sb_divrem_mn_.$(OBJEXT) \
sb_divrem_mn_.lo sb_inv_.$(OBJEXT) sb_inv_.lo set_strb_.$(OBJEXT) \

View File

@ -1200,6 +1200,17 @@ speed_mpz_fac_ui (struct speed_params *s)
{
SPEED_ROUTINE_MPZ_FAC_UI (mpz_fac_ui);
}
double
speed_mpz_fac_ui_small (struct speed_params *s)
{
SPEED_ROUTINE_MPZ_FAC_UI (mpz_fac_ui_small);
}
double
speed_mpz_fac_ui_large (struct speed_params *s)
{
SPEED_ROUTINE_MPZ_FAC_UI (mpz_fac_ui_large);
}
double

31
tune/fac_ui_large.c Normal file
View File

@ -0,0 +1,31 @@
/* mpz/fac_ui.c forced to use large algorithm . */
/*
Copyright 2000 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or (at your
option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
License for more details.
You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library; see the file COPYING.LIB. If not, write to
the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA.
*/
#include "mpir.h"
#include "gmp-impl.h"
#undef FAC_UI_THRESHOLD
#define FAC_UI_THRESHOLD 1
#define __gmpz_fac_ui mpz_fac_ui_large
#include "../mpz/fac_ui.c"

31
tune/fac_ui_small.c Normal file
View File

@ -0,0 +1,31 @@
/* mpz/fac_ui.c forced to use small algorithm */
/*
Copyright 2000 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or (at your
option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
License for more details.
You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library; see the file COPYING.LIB. If not, write to
the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA.
*/
#include "mpir.h"
#include "gmp-impl.h"
#undef FAC_UI_THRESHOLD
#define FAC_UI_THRESHOLD MP_SIZE_T_MAX
#define __gmpz_fac_ui mpz_fac_ui_small
#include "../mpz/fac_ui.c"

View File

@ -348,6 +348,8 @@ const struct routine_t {
{ "mpz_add", speed_mpz_add },
{ "mpz_bin_uiui", speed_mpz_bin_uiui, FLAG_NODATA | FLAG_R_OPTIONAL },
{ "mpz_fac_ui", speed_mpz_fac_ui, FLAG_NODATA },
{ "mpz_fac_ui_small", speed_mpz_fac_ui_small, FLAG_NODATA },
{ "mpz_fac_ui_large", speed_mpz_fac_ui_large, FLAG_NODATA },
{ "mpz_powm", speed_mpz_powm },
{ "mpz_powm_mod", speed_mpz_powm_mod },
{ "mpz_powm_redc", speed_mpz_powm_redc },

View File

@ -284,6 +284,8 @@ double speed_mpq_init_clear _PROTO ((struct speed_params *s));
double speed_mpz_add _PROTO ((struct speed_params *s));
double speed_mpz_bin_uiui _PROTO ((struct speed_params *s));
double speed_mpz_fac_ui _PROTO ((struct speed_params *s));
double speed_mpz_fac_ui_small _PROTO ((struct speed_params *s));
double speed_mpz_fac_ui_large _PROTO ((struct speed_params *s));
double speed_mpz_fib_ui _PROTO ((struct speed_params *s));
double speed_mpz_fib2_ui _PROTO ((struct speed_params *s));
double speed_mpz_init_clear _PROTO ((struct speed_params *s));
@ -462,6 +464,8 @@ void mpz_powm_redc _PROTO ((mpz_ptr res, mpz_srcptr base, mpz_srcptr e,
mpz_srcptr mod));
void redc _PROTO ((mp_ptr cp, mp_srcptr mp, mp_size_t n, mp_limb_t Nprim,
mp_ptr tp));
void mpz_fac_ui_small _PROTO ((mpz_ptr,unsigned long));
void mpz_fac_ui_large _PROTO ((mpz_ptr,unsigned long));
int speed_routine_count_zeros_setup _PROTO ((struct speed_params *s,
mp_ptr xp, int leading,

View File

@ -171,6 +171,7 @@ mp_size_t mulhigh_mul_threshold = MP_SIZE_T_MAX;
mp_size_t div_sb_preinv_threshold = MP_SIZE_T_MAX;
mp_size_t div_dc_threshold = MP_SIZE_T_MAX;
mp_size_t powm_threshold = MP_SIZE_T_MAX;
mp_size_t fac_ui_threshold = MP_SIZE_T_MAX;
mp_size_t gcd_accel_threshold = MP_SIZE_T_MAX;
mp_size_t gcdext_threshold = MP_SIZE_T_MAX;
mp_size_t divrem_1_norm_threshold = MP_SIZE_T_MAX;
@ -1072,6 +1073,21 @@ tune_powm (void)
one (&powm_threshold, &param);
}
void
tune_fac_ui (void)
{
static struct param_t param;
param.name = "FAC_UI_THRESHOLD";
param.function = speed_mpz_fac_ui_small;
param.function2 = speed_mpz_fac_ui_large;
//param.step_factor = 0.03;
//param.stop_factor = 1.1;
param.min_size = 1024;
param.max_size = 32768;
//param.min_is_always=1;
one (&fac_ui_threshold, &param);
}
void
tune_gcd_accel (void)
{
@ -1761,6 +1777,7 @@ all (void)
tune_sb_preinv ();
tune_dc ();
tune_powm ();
tune_fac_ui();
printf("\n");
tune_gcd_accel ();