123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709 |
- /* statistic_tests.hpp header file
- *
- * Copyright Jens Maurer 2000
- * Distributed under the Boost Software License, Version 1.0. (See
- * accompanying file LICENSE_1_0.txt or copy at
- * http://www.boost.org/LICENSE_1_0.txt)
- *
- * $Id$
- *
- */
- #ifndef STATISTIC_TESTS_HPP
- #define STATISTIC_TESTS_HPP
- #include <stdexcept>
- #include <iterator>
- #include <vector>
- #include <boost/limits.hpp>
- #include <algorithm>
- #include <cmath>
- #include <boost/config.hpp>
- #include <boost/bind.hpp>
- #include <boost/random/uniform_01.hpp>
- #include <boost/random/variate_generator.hpp>
- #include "integrate.hpp"
- #if defined(BOOST_MSVC) && BOOST_MSVC <= 1300
- namespace std
- {
- inline double pow(double a, double b) { return ::pow(a,b); }
- inline double ceil(double x) { return ::ceil(x); }
- } // namespace std
- #endif
- template<class T>
- inline T fac(int k)
- {
- T result = 1;
- for(T i = 2; i <= k; ++i)
- result *= i;
- return result;
- }
- template<class T>
- T binomial(int n, int k)
- {
- if(k < n/2)
- k = n-k;
- T result = 1;
- for(int i = k+1; i<= n; ++i)
- result *= i;
- return result / fac<T>(n-k);
- }
- template<class T>
- T stirling2(int n, int m)
- {
- T sum = 0;
- for(int k = 0; k <= m; ++k)
- sum += binomial<T>(m, k) * std::pow(double(k), n) *
- ( (m-k)%2 == 0 ? 1 : -1);
- return sum / fac<T>(m);
- }
- /*
- * Experiments which create an empirical distribution in classes,
- * suitable for the chi-square test.
- */
- // std::floor(gen() * classes)
- class experiment_base
- {
- public:
- experiment_base(int cls) : _classes(cls) { }
- unsigned int classes() const { return _classes; }
- protected:
- unsigned int _classes;
- };
- class equidistribution_experiment : public experiment_base
- {
- public:
- explicit equidistribution_experiment(unsigned int classes)
- : experiment_base(classes) { }
-
- template<class NumberGenerator, class Counter>
- void run(NumberGenerator & f, Counter & count, int n) const
- {
- assert((f.min)() == 0 &&
- static_cast<unsigned int>((f.max)()) == classes()-1);
- for(int i = 0; i < n; ++i)
- count(f());
- }
- double probability(int /*i*/) const { return 1.0/classes(); }
- };
- // two-dimensional equidistribution experiment
- class equidistribution_2d_experiment : public equidistribution_experiment
- {
- public:
- explicit equidistribution_2d_experiment(unsigned int classes)
- : equidistribution_experiment(classes) { }
- template<class NumberGenerator, class Counter>
- void run(NumberGenerator & f, Counter & count, int n) const
- {
- unsigned int range = (f.max)()+1;
- assert((f.min)() == 0 && range*range == classes());
- for(int i = 0; i < n; ++i) {
- int y1 = f();
- int y2 = f();
- count(y1 + range * y2);
- }
- }
- };
- // distribution experiment: assume a probability density and
- // count events so that an equidistribution results.
- class distribution_experiment : public equidistribution_experiment
- {
- public:
- template<class Distribution>
- distribution_experiment(Distribution dist , unsigned int classes)
- : equidistribution_experiment(classes), limit(classes)
- {
- for(unsigned int i = 0; i < classes-1; ++i)
- limit[i] = quantile(dist, (i+1)*0.05);
- limit[classes-1] = std::numeric_limits<double>::infinity();
- if(limit[classes-1] < (std::numeric_limits<double>::max)())
- limit[classes-1] = (std::numeric_limits<double>::max)();
- #if 0
- std::cout << __PRETTY_FUNCTION__ << ": ";
- for(unsigned int i = 0; i < classes; ++i)
- std::cout << limit[i] << " ";
- std::cout << std::endl;
- #endif
- }
- template<class NumberGenerator, class Counter>
- void run(NumberGenerator & f, Counter & count, int n) const
- {
- for(int i = 0; i < n; ++i) {
- limits_type::const_iterator it =
- std::lower_bound(limit.begin(), limit.end(), f());
- count(it-limit.begin());
- }
- }
- private:
- typedef std::vector<double> limits_type;
- limits_type limit;
- };
- template<bool up, bool is_float>
- struct runs_direction_helper
- {
- template<class T>
- static T init(T)
- {
- return (std::numeric_limits<T>::max)();
- }
- };
- template<>
- struct runs_direction_helper<true, true>
- {
- template<class T>
- static T init(T)
- {
- return -(std::numeric_limits<T>::max)();
- }
- };
- template<>
- struct runs_direction_helper<true, false>
- {
- template<class T>
- static T init(T)
- {
- return (std::numeric_limits<T>::min)();
- }
- };
- // runs-up/runs-down experiment
- template<bool up>
- class runs_experiment : public experiment_base
- {
- public:
- explicit runs_experiment(unsigned int classes) : experiment_base(classes) { }
-
- template<class NumberGenerator, class Counter>
- void run(NumberGenerator & f, Counter & count, int n) const
- {
- typedef typename NumberGenerator::result_type result_type;
- result_type init =
- runs_direction_helper<
- up,
- !std::numeric_limits<result_type>::is_integer
- >::init(result_type());
- result_type previous = init;
- unsigned int length = 0;
- for(int i = 0; i < n; ++i) {
- result_type val = f();
- if(up ? previous <= val : previous >= val) {
- previous = val;
- ++length;
- } else {
- count((std::min)(length, classes())-1);
- length = 0;
- previous = init;
- // don't use this value, so that runs are independent
- }
- }
- }
- double probability(unsigned int r) const
- {
- if(r == classes()-1)
- return 1.0/fac<double>(classes());
- else
- return static_cast<double>(r+1)/fac<double>(r+2);
- }
- };
- // gap length experiment
- class gap_experiment : public experiment_base
- {
- public:
- template<class Dist>
- gap_experiment(unsigned int classes, const Dist & dist, double alpha, double beta)
- : experiment_base(classes), alpha(alpha), beta(beta), low(quantile(dist, alpha)), high(quantile(dist, beta)) {}
-
- template<class NumberGenerator, class Counter>
- void run(NumberGenerator & f, Counter & count, int n) const
- {
- typedef typename NumberGenerator::result_type result_type;
- unsigned int length = 0;
- for(int i = 0; i < n; ) {
- result_type value = f();
- if(value < low || value > high)
- ++length;
- else {
- count((std::min)(length, classes()-1));
- length = 0;
- ++i;
- }
- }
- }
- double probability(unsigned int r) const
- {
- double p = beta-alpha;
- if(r == classes()-1)
- return std::pow(1-p, static_cast<double>(r));
- else
- return p * std::pow(1-p, static_cast<double>(r));
- }
- private:
- double alpha, beta;
- double low, high;
- };
- // poker experiment
- class poker_experiment : public experiment_base
- {
- public:
- poker_experiment(unsigned int d, unsigned int k)
- : experiment_base(k), range(d)
- {
- assert(range > 1);
- }
- template<class UniformRandomNumberGenerator, class Counter>
- void run(UniformRandomNumberGenerator & f, Counter & count, int n) const
- {
- typedef typename UniformRandomNumberGenerator::result_type result_type;
- assert(std::numeric_limits<result_type>::is_integer);
- assert((f.min)() == 0);
- assert((f.max)() == static_cast<result_type>(range-1));
- std::vector<result_type> v(classes());
- for(int i = 0; i < n; ++i) {
- for(unsigned int j = 0; j < classes(); ++j)
- v[j] = f();
- std::sort(v.begin(), v.end());
- result_type prev = v[0];
- int r = 1; // count different values in v
- for(unsigned int i = 1; i < classes(); ++i) {
- if(prev != v[i]) {
- prev = v[i];
- ++r;
- }
- }
- count(r-1);
- }
- }
- double probability(unsigned int r) const
- {
- ++r; // transform to 1 <= r <= 5
- double result = range;
- for(unsigned int i = 1; i < r; ++i)
- result *= range-i;
- return result / std::pow(range, static_cast<double>(classes())) *
- stirling2<double>(classes(), r);
- }
- private:
- unsigned int range;
- };
- // coupon collector experiment
- class coupon_collector_experiment : public experiment_base
- {
- public:
- coupon_collector_experiment(unsigned int d, unsigned int cls)
- : experiment_base(cls), d(d)
- {
- assert(d > 1);
- }
- template<class UniformRandomNumberGenerator, class Counter>
- void run(UniformRandomNumberGenerator & f, Counter & count, int n) const
- {
- typedef typename UniformRandomNumberGenerator::result_type result_type;
- assert(std::numeric_limits<result_type>::is_integer);
- assert((f.min)() == 0);
- assert((f.max)() == static_cast<result_type>(d-1));
- std::vector<bool> occurs(d);
- for(int i = 0; i < n; ++i) {
- occurs.assign(d, false);
- unsigned int r = 0; // length of current sequence
- int q = 0; // number of non-duplicates in current set
- for(;;) {
- result_type val = f();
- ++r;
- if(!occurs[val]) { // new set element
- occurs[val] = true;
- ++q;
- if(q == d)
- break; // one complete set
- }
- }
- count((std::min)(r-d, classes()-1));
- }
- }
- double probability(unsigned int r) const
- {
- if(r == classes()-1)
- return 1-fac<double>(d)/
- std::pow(static_cast<double>(d), static_cast<double>(d+classes()-2)) *
- stirling2<double>(d+classes()-2, d);
- else
- return fac<double>(d)/
- std::pow(static_cast<double>(d), static_cast<double>(d+r)) *
- stirling2<double>(d+r-1, d-1);
- }
- private:
- int d;
- };
- // permutation test
- class permutation_experiment : public equidistribution_experiment
- {
- public:
- permutation_experiment(unsigned int t)
- : equidistribution_experiment(fac<int>(t)), t(t)
- {
- assert(t > 1);
- }
- template<class UniformRandomNumberGenerator, class Counter>
- void run(UniformRandomNumberGenerator & f, Counter & count, int n) const
- {
- typedef typename UniformRandomNumberGenerator::result_type result_type;
- std::vector<result_type> v(t);
- for(int i = 0; i < n; ++i) {
- for(int j = 0; j < t; ++j) {
- v[j] = f();
- }
- int x = 0;
- for(int r = t-1; r > 0; r--) {
- typename std::vector<result_type>::iterator it =
- std::max_element(v.begin(), v.begin()+r+1);
- x = (r+1)*x + (it-v.begin());
- std::iter_swap(it, v.begin()+r);
- }
- count(x);
- }
- }
- private:
- int t;
- };
- // birthday spacing experiment test
- class birthday_spacing_experiment : public experiment_base
- {
- public:
- birthday_spacing_experiment(unsigned int d, int n, int m)
- : experiment_base(d), n(n), m(m)
- {
- }
- template<class UniformRandomNumberGenerator, class Counter>
- void run(UniformRandomNumberGenerator & f, Counter & count, int n_total) const
- {
- typedef typename UniformRandomNumberGenerator::result_type result_type;
- assert(std::numeric_limits<result_type>::is_integer);
- assert((f.min)() == 0);
- assert((f.max)() == static_cast<result_type>(m-1));
-
- for(int j = 0; j < n_total; j++) {
- std::vector<result_type> v(n);
- std::generate_n(v.begin(), n, f);
- std::sort(v.begin(), v.end());
- std::vector<result_type> spacing(n);
- for(int i = 0; i < n-1; i++)
- spacing[i] = v[i+1]-v[i];
- spacing[n-1] = v[0] + m - v[n-1];
- std::sort(spacing.begin(), spacing.end());
- unsigned int k = 0;
- for(int i = 0; i < n-1; ++i) {
- if(spacing[i] == spacing[i+1])
- ++k;
- }
- count((std::min)(k, classes()-1));
- }
- }
- double probability(unsigned int r) const
- {
- assert(classes() == 4);
- assert(m == (1<<25));
- assert(n == 512);
- static const double prob[] = { 0.368801577, 0.369035243, 0.183471182,
- 0.078691997 };
- return prob[r];
- }
- private:
- int n, m;
- };
- /*
- * Misc. helper functions.
- */
- template<class Float>
- struct distribution_function
- {
- typedef Float result_type;
- typedef Float argument_type;
- typedef Float first_argument_type;
- typedef Float second_argument_type;
- };
- // computes P(K_n <= t) or P(t1 <= K_n <= t2). See Knuth, 3.3.1
- class kolmogorov_smirnov_probability : public distribution_function<double>
- {
- public:
- kolmogorov_smirnov_probability(int n)
- : approx(n > 50), n(n), sqrt_n(std::sqrt(double(n)))
- {
- if(!approx)
- n_n = std::pow(static_cast<double>(n), n);
- }
-
- double cdf(double t) const
- {
- if(approx) {
- return 1-std::exp(-2*t*t)*(1-2.0/3.0*t/sqrt_n);
- } else {
- t *= sqrt_n;
- double sum = 0;
- for(int k = static_cast<int>(std::ceil(t)); k <= n; k++)
- sum += binomial<double>(n, k) * std::pow(k-t, k) *
- std::pow(t+n-k, n-k-1);
- return 1 - t/n_n * sum;
- }
- }
- //double operator()(double t1, double t2) const
- //{ return operator()(t2) - operator()(t1); }
- private:
- bool approx;
- int n;
- double sqrt_n;
- double n_n;
- };
- inline double cdf(const kolmogorov_smirnov_probability& dist, double val)
- {
- return dist.cdf(val);
- }
- inline double quantile(const kolmogorov_smirnov_probability& dist, double val)
- {
- return invert_monotone_inc(boost::bind(&cdf, dist, _1), val, 0.0, 1000.0);
- }
- /*
- * Experiments for generators with continuous distribution functions
- */
- class kolmogorov_experiment
- {
- public:
- kolmogorov_experiment(int n) : n(n), ksp(n) { }
- template<class NumberGenerator, class Distribution>
- double run(NumberGenerator & gen, Distribution distrib) const
- {
- const int m = n;
- typedef std::vector<double> saved_temp;
- saved_temp a(m,1.0), b(m,0);
- std::vector<int> c(m,0);
- for(int i = 0; i < n; ++i) {
- double val = static_cast<double>(gen());
- double y = cdf(distrib, val);
- int k = static_cast<int>(std::floor(m*y));
- if(k >= m)
- --k; // should not happen
- a[k] = (std::min)(a[k], y);
- b[k] = (std::max)(b[k], y);
- ++c[k];
- }
- double kplus = 0, kminus = 0;
- int j = 0;
- for(int k = 0; k < m; ++k) {
- if(c[k] > 0) {
- kminus = (std::max)(kminus, a[k]-j/static_cast<double>(n));
- j += c[k];
- kplus = (std::max)(kplus, j/static_cast<double>(n) - b[k]);
- }
- }
- kplus *= std::sqrt(double(n));
- kminus *= std::sqrt(double(n));
- // std::cout << "k+ " << kplus << " k- " << kminus << std::endl;
- return kplus;
- }
- double probability(double x) const
- {
- return cdf(ksp, x);
- }
- private:
- int n;
- kolmogorov_smirnov_probability ksp;
- };
- struct power_distribution
- {
- power_distribution(double t) : t(t) {}
- double t;
- };
- double cdf(const power_distribution& dist, double val)
- {
- return std::pow(val, dist.t);
- }
- // maximum-of-t test (KS-based)
- template<class UniformRandomNumberGenerator>
- class maximum_experiment
- {
- public:
- typedef UniformRandomNumberGenerator base_type;
- maximum_experiment(base_type & f, int n, int t) : f(f), ke(n), t(t)
- { }
- double operator()() const
- {
- generator gen(f, t);
- return ke.run(gen, power_distribution(t));
- }
- private:
- struct generator {
- generator(base_type & f, int t) : f(f, boost::uniform_01<>()), t(t) { }
- double operator()()
- {
- double mx = f();
- for(int i = 1; i < t; ++i)
- mx = (std::max)(mx, f());
- return mx;
- }
- private:
- boost::variate_generator<base_type&, boost::uniform_01<> > f;
- int t;
- };
- base_type & f;
- kolmogorov_experiment ke;
- int t;
- };
- // compute a chi-square value for the distribution approximation error
- template<class ForwardIterator, class UnaryFunction>
- typename UnaryFunction::result_type
- chi_square_value(ForwardIterator first, ForwardIterator last,
- UnaryFunction probability)
- {
- typedef std::iterator_traits<ForwardIterator> iter_traits;
- typedef typename iter_traits::value_type counter_type;
- typedef typename UnaryFunction::result_type result_type;
- unsigned int classes = std::distance(first, last);
- result_type sum = 0;
- counter_type n = 0;
- for(unsigned int i = 0; i < classes; ++first, ++i) {
- counter_type count = *first;
- n += count;
- sum += (count/probability(i)) * count; // avoid overflow
- }
- #if 0
- for(unsigned int i = 0; i < classes; ++i) {
- // std::cout << (n*probability(i)) << " ";
- if(n * probability(i) < 5)
- std::cerr << "Not enough test runs for slot " << i
- << " p=" << probability(i) << ", n=" << n
- << std::endl;
- }
- #endif
- // std::cout << std::endl;
- // throw std::invalid_argument("not enough test runs");
- return sum/n - n;
- }
- template<class RandomAccessContainer>
- class generic_counter
- {
- public:
- explicit generic_counter(unsigned int classes) : container(classes, 0) { }
- void operator()(int i)
- {
- assert(i >= 0);
- assert(static_cast<unsigned int>(i) < container.size());
- ++container[i];
- }
- typename RandomAccessContainer::const_iterator begin() const
- { return container.begin(); }
- typename RandomAccessContainer::const_iterator end() const
- { return container.end(); }
- private:
- RandomAccessContainer container;
- };
- // chi_square test
- template<class Experiment, class Generator>
- double run_experiment(const Experiment & experiment, Generator & gen, int n)
- {
- generic_counter<std::vector<int> > v(experiment.classes());
- experiment.run(gen, v, n);
- return chi_square_value(v.begin(), v.end(),
- boost::bind(&Experiment::probability,
- experiment, boost::placeholders::_1));
- }
- // chi_square test
- template<class Experiment, class Generator>
- double run_experiment(const Experiment & experiment, const Generator & gen, int n)
- {
- generic_counter<std::vector<int> > v(experiment.classes());
- experiment.run(gen, v, n);
- return chi_square_value(v.begin(), v.end(),
- boost::bind(&Experiment::probability,
- experiment, boost::placeholders::_1));
- }
- // number generator with experiment results (for nesting)
- template<class Experiment, class Generator>
- class experiment_generator_t
- {
- public:
- experiment_generator_t(const Experiment & exper, Generator & gen, int n)
- : experiment(exper), generator(gen), n(n) { }
- double operator()() const { return run_experiment(experiment, generator, n); }
- private:
- const Experiment & experiment;
- Generator & generator;
- int n;
- };
- template<class Experiment, class Generator>
- experiment_generator_t<Experiment, Generator>
- experiment_generator(const Experiment & e, Generator & gen, int n)
- {
- return experiment_generator_t<Experiment, Generator>(e, gen, n);
- }
- template<class Experiment, class Generator, class Distribution>
- class ks_experiment_generator_t
- {
- public:
- ks_experiment_generator_t(const Experiment & exper, Generator & gen,
- const Distribution & distrib)
- : experiment(exper), generator(gen), distribution(distrib) { }
- double operator()() const { return experiment.run(generator, distribution); }
- private:
- const Experiment & experiment;
- Generator & generator;
- Distribution distribution;
- };
- template<class Experiment, class Generator, class Distribution>
- ks_experiment_generator_t<Experiment, Generator, Distribution>
- ks_experiment_generator(const Experiment & e, Generator & gen,
- const Distribution & distrib)
- {
- return ks_experiment_generator_t<Experiment, Generator, Distribution>
- (e, gen, distrib);
- }
- #endif /* STATISTIC_TESTS_HPP */
|