/* statistic_tests.hpp header file * * Copyright Jens Maurer 2000 * Distributed under the Boost Software License, Version 1.0. (See * accompanying file LICENSE_1_0.txt or copy at * http://www.boost.org/LICENSE_1_0.txt) * * $Id$ * */ #ifndef STATISTIC_TESTS_HPP #define STATISTIC_TESTS_HPP #include #include #include #include #include #include #include #include #include #include #include "integrate.hpp" #if defined(BOOST_MSVC) && BOOST_MSVC <= 1300 namespace std { inline double pow(double a, double b) { return ::pow(a,b); } inline double ceil(double x) { return ::ceil(x); } } // namespace std #endif template inline T fac(int k) { T result = 1; for(T i = 2; i <= k; ++i) result *= i; return result; } template T binomial(int n, int k) { if(k < n/2) k = n-k; T result = 1; for(int i = k+1; i<= n; ++i) result *= i; return result / fac(n-k); } template T stirling2(int n, int m) { T sum = 0; for(int k = 0; k <= m; ++k) sum += binomial(m, k) * std::pow(double(k), n) * ( (m-k)%2 == 0 ? 1 : -1); return sum / fac(m); } /* * Experiments which create an empirical distribution in classes, * suitable for the chi-square test. */ // std::floor(gen() * classes) class experiment_base { public: experiment_base(int cls) : _classes(cls) { } unsigned int classes() const { return _classes; } protected: unsigned int _classes; }; class equidistribution_experiment : public experiment_base { public: explicit equidistribution_experiment(unsigned int classes) : experiment_base(classes) { } template void run(NumberGenerator & f, Counter & count, int n) const { assert((f.min)() == 0 && static_cast((f.max)()) == classes()-1); for(int i = 0; i < n; ++i) count(f()); } double probability(int /*i*/) const { return 1.0/classes(); } }; // two-dimensional equidistribution experiment class equidistribution_2d_experiment : public equidistribution_experiment { public: explicit equidistribution_2d_experiment(unsigned int classes) : equidistribution_experiment(classes) { } template void run(NumberGenerator & f, Counter & count, int n) const { unsigned int range = (f.max)()+1; assert((f.min)() == 0 && range*range == classes()); for(int i = 0; i < n; ++i) { int y1 = f(); int y2 = f(); count(y1 + range * y2); } } }; // distribution experiment: assume a probability density and // count events so that an equidistribution results. class distribution_experiment : public equidistribution_experiment { public: template distribution_experiment(Distribution dist , unsigned int classes) : equidistribution_experiment(classes), limit(classes) { for(unsigned int i = 0; i < classes-1; ++i) limit[i] = quantile(dist, (i+1)*0.05); limit[classes-1] = std::numeric_limits::infinity(); if(limit[classes-1] < (std::numeric_limits::max)()) limit[classes-1] = (std::numeric_limits::max)(); #if 0 std::cout << __PRETTY_FUNCTION__ << ": "; for(unsigned int i = 0; i < classes; ++i) std::cout << limit[i] << " "; std::cout << std::endl; #endif } template void run(NumberGenerator & f, Counter & count, int n) const { for(int i = 0; i < n; ++i) { limits_type::const_iterator it = std::lower_bound(limit.begin(), limit.end(), f()); count(it-limit.begin()); } } private: typedef std::vector limits_type; limits_type limit; }; template struct runs_direction_helper { template static T init(T) { return (std::numeric_limits::max)(); } }; template<> struct runs_direction_helper { template static T init(T) { return -(std::numeric_limits::max)(); } }; template<> struct runs_direction_helper { template static T init(T) { return (std::numeric_limits::min)(); } }; // runs-up/runs-down experiment template class runs_experiment : public experiment_base { public: explicit runs_experiment(unsigned int classes) : experiment_base(classes) { } template void run(NumberGenerator & f, Counter & count, int n) const { typedef typename NumberGenerator::result_type result_type; result_type init = runs_direction_helper< up, !std::numeric_limits::is_integer >::init(result_type()); result_type previous = init; unsigned int length = 0; for(int i = 0; i < n; ++i) { result_type val = f(); if(up ? previous <= val : previous >= val) { previous = val; ++length; } else { count((std::min)(length, classes())-1); length = 0; previous = init; // don't use this value, so that runs are independent } } } double probability(unsigned int r) const { if(r == classes()-1) return 1.0/fac(classes()); else return static_cast(r+1)/fac(r+2); } }; // gap length experiment class gap_experiment : public experiment_base { public: template gap_experiment(unsigned int classes, const Dist & dist, double alpha, double beta) : experiment_base(classes), alpha(alpha), beta(beta), low(quantile(dist, alpha)), high(quantile(dist, beta)) {} template void run(NumberGenerator & f, Counter & count, int n) const { typedef typename NumberGenerator::result_type result_type; unsigned int length = 0; for(int i = 0; i < n; ) { result_type value = f(); if(value < low || value > high) ++length; else { count((std::min)(length, classes()-1)); length = 0; ++i; } } } double probability(unsigned int r) const { double p = beta-alpha; if(r == classes()-1) return std::pow(1-p, static_cast(r)); else return p * std::pow(1-p, static_cast(r)); } private: double alpha, beta; double low, high; }; // poker experiment class poker_experiment : public experiment_base { public: poker_experiment(unsigned int d, unsigned int k) : experiment_base(k), range(d) { assert(range > 1); } template void run(UniformRandomNumberGenerator & f, Counter & count, int n) const { typedef typename UniformRandomNumberGenerator::result_type result_type; assert(std::numeric_limits::is_integer); assert((f.min)() == 0); assert((f.max)() == static_cast(range-1)); std::vector v(classes()); for(int i = 0; i < n; ++i) { for(unsigned int j = 0; j < classes(); ++j) v[j] = f(); std::sort(v.begin(), v.end()); result_type prev = v[0]; int r = 1; // count different values in v for(unsigned int i = 1; i < classes(); ++i) { if(prev != v[i]) { prev = v[i]; ++r; } } count(r-1); } } double probability(unsigned int r) const { ++r; // transform to 1 <= r <= 5 double result = range; for(unsigned int i = 1; i < r; ++i) result *= range-i; return result / std::pow(range, static_cast(classes())) * stirling2(classes(), r); } private: unsigned int range; }; // coupon collector experiment class coupon_collector_experiment : public experiment_base { public: coupon_collector_experiment(unsigned int d, unsigned int cls) : experiment_base(cls), d(d) { assert(d > 1); } template void run(UniformRandomNumberGenerator & f, Counter & count, int n) const { typedef typename UniformRandomNumberGenerator::result_type result_type; assert(std::numeric_limits::is_integer); assert((f.min)() == 0); assert((f.max)() == static_cast(d-1)); std::vector occurs(d); for(int i = 0; i < n; ++i) { occurs.assign(d, false); unsigned int r = 0; // length of current sequence int q = 0; // number of non-duplicates in current set for(;;) { result_type val = f(); ++r; if(!occurs[val]) { // new set element occurs[val] = true; ++q; if(q == d) break; // one complete set } } count((std::min)(r-d, classes()-1)); } } double probability(unsigned int r) const { if(r == classes()-1) return 1-fac(d)/ std::pow(static_cast(d), static_cast(d+classes()-2)) * stirling2(d+classes()-2, d); else return fac(d)/ std::pow(static_cast(d), static_cast(d+r)) * stirling2(d+r-1, d-1); } private: int d; }; // permutation test class permutation_experiment : public equidistribution_experiment { public: permutation_experiment(unsigned int t) : equidistribution_experiment(fac(t)), t(t) { assert(t > 1); } template void run(UniformRandomNumberGenerator & f, Counter & count, int n) const { typedef typename UniformRandomNumberGenerator::result_type result_type; std::vector v(t); for(int i = 0; i < n; ++i) { for(int j = 0; j < t; ++j) { v[j] = f(); } int x = 0; for(int r = t-1; r > 0; r--) { typename std::vector::iterator it = std::max_element(v.begin(), v.begin()+r+1); x = (r+1)*x + (it-v.begin()); std::iter_swap(it, v.begin()+r); } count(x); } } private: int t; }; // birthday spacing experiment test class birthday_spacing_experiment : public experiment_base { public: birthday_spacing_experiment(unsigned int d, int n, int m) : experiment_base(d), n(n), m(m) { } template void run(UniformRandomNumberGenerator & f, Counter & count, int n_total) const { typedef typename UniformRandomNumberGenerator::result_type result_type; assert(std::numeric_limits::is_integer); assert((f.min)() == 0); assert((f.max)() == static_cast(m-1)); for(int j = 0; j < n_total; j++) { std::vector v(n); std::generate_n(v.begin(), n, f); std::sort(v.begin(), v.end()); std::vector spacing(n); for(int i = 0; i < n-1; i++) spacing[i] = v[i+1]-v[i]; spacing[n-1] = v[0] + m - v[n-1]; std::sort(spacing.begin(), spacing.end()); unsigned int k = 0; for(int i = 0; i < n-1; ++i) { if(spacing[i] == spacing[i+1]) ++k; } count((std::min)(k, classes()-1)); } } double probability(unsigned int r) const { assert(classes() == 4); assert(m == (1<<25)); assert(n == 512); static const double prob[] = { 0.368801577, 0.369035243, 0.183471182, 0.078691997 }; return prob[r]; } private: int n, m; }; /* * Misc. helper functions. */ template struct distribution_function { typedef Float result_type; typedef Float argument_type; typedef Float first_argument_type; typedef Float second_argument_type; }; // computes P(K_n <= t) or P(t1 <= K_n <= t2). See Knuth, 3.3.1 class kolmogorov_smirnov_probability : public distribution_function { public: kolmogorov_smirnov_probability(int n) : approx(n > 50), n(n), sqrt_n(std::sqrt(double(n))) { if(!approx) n_n = std::pow(static_cast(n), n); } double cdf(double t) const { if(approx) { return 1-std::exp(-2*t*t)*(1-2.0/3.0*t/sqrt_n); } else { t *= sqrt_n; double sum = 0; for(int k = static_cast(std::ceil(t)); k <= n; k++) sum += binomial(n, k) * std::pow(k-t, k) * std::pow(t+n-k, n-k-1); return 1 - t/n_n * sum; } } //double operator()(double t1, double t2) const //{ return operator()(t2) - operator()(t1); } private: bool approx; int n; double sqrt_n; double n_n; }; inline double cdf(const kolmogorov_smirnov_probability& dist, double val) { return dist.cdf(val); } inline double quantile(const kolmogorov_smirnov_probability& dist, double val) { return invert_monotone_inc(boost::bind(&cdf, dist, _1), val, 0.0, 1000.0); } /* * Experiments for generators with continuous distribution functions */ class kolmogorov_experiment { public: kolmogorov_experiment(int n) : n(n), ksp(n) { } template double run(NumberGenerator & gen, Distribution distrib) const { const int m = n; typedef std::vector saved_temp; saved_temp a(m,1.0), b(m,0); std::vector c(m,0); for(int i = 0; i < n; ++i) { double val = static_cast(gen()); double y = cdf(distrib, val); int k = static_cast(std::floor(m*y)); if(k >= m) --k; // should not happen a[k] = (std::min)(a[k], y); b[k] = (std::max)(b[k], y); ++c[k]; } double kplus = 0, kminus = 0; int j = 0; for(int k = 0; k < m; ++k) { if(c[k] > 0) { kminus = (std::max)(kminus, a[k]-j/static_cast(n)); j += c[k]; kplus = (std::max)(kplus, j/static_cast(n) - b[k]); } } kplus *= std::sqrt(double(n)); kminus *= std::sqrt(double(n)); // std::cout << "k+ " << kplus << " k- " << kminus << std::endl; return kplus; } double probability(double x) const { return cdf(ksp, x); } private: int n; kolmogorov_smirnov_probability ksp; }; struct power_distribution { power_distribution(double t) : t(t) {} double t; }; double cdf(const power_distribution& dist, double val) { return std::pow(val, dist.t); } // maximum-of-t test (KS-based) template class maximum_experiment { public: typedef UniformRandomNumberGenerator base_type; maximum_experiment(base_type & f, int n, int t) : f(f), ke(n), t(t) { } double operator()() const { generator gen(f, t); return ke.run(gen, power_distribution(t)); } private: struct generator { generator(base_type & f, int t) : f(f, boost::uniform_01<>()), t(t) { } double operator()() { double mx = f(); for(int i = 1; i < t; ++i) mx = (std::max)(mx, f()); return mx; } private: boost::variate_generator > f; int t; }; base_type & f; kolmogorov_experiment ke; int t; }; // compute a chi-square value for the distribution approximation error template typename UnaryFunction::result_type chi_square_value(ForwardIterator first, ForwardIterator last, UnaryFunction probability) { typedef std::iterator_traits iter_traits; typedef typename iter_traits::value_type counter_type; typedef typename UnaryFunction::result_type result_type; unsigned int classes = std::distance(first, last); result_type sum = 0; counter_type n = 0; for(unsigned int i = 0; i < classes; ++first, ++i) { counter_type count = *first; n += count; sum += (count/probability(i)) * count; // avoid overflow } #if 0 for(unsigned int i = 0; i < classes; ++i) { // std::cout << (n*probability(i)) << " "; if(n * probability(i) < 5) std::cerr << "Not enough test runs for slot " << i << " p=" << probability(i) << ", n=" << n << std::endl; } #endif // std::cout << std::endl; // throw std::invalid_argument("not enough test runs"); return sum/n - n; } template class generic_counter { public: explicit generic_counter(unsigned int classes) : container(classes, 0) { } void operator()(int i) { assert(i >= 0); assert(static_cast(i) < container.size()); ++container[i]; } typename RandomAccessContainer::const_iterator begin() const { return container.begin(); } typename RandomAccessContainer::const_iterator end() const { return container.end(); } private: RandomAccessContainer container; }; // chi_square test template double run_experiment(const Experiment & experiment, Generator & gen, int n) { generic_counter > v(experiment.classes()); experiment.run(gen, v, n); return chi_square_value(v.begin(), v.end(), boost::bind(&Experiment::probability, experiment, boost::placeholders::_1)); } // chi_square test template double run_experiment(const Experiment & experiment, const Generator & gen, int n) { generic_counter > v(experiment.classes()); experiment.run(gen, v, n); return chi_square_value(v.begin(), v.end(), boost::bind(&Experiment::probability, experiment, boost::placeholders::_1)); } // number generator with experiment results (for nesting) template class experiment_generator_t { public: experiment_generator_t(const Experiment & exper, Generator & gen, int n) : experiment(exper), generator(gen), n(n) { } double operator()() const { return run_experiment(experiment, generator, n); } private: const Experiment & experiment; Generator & generator; int n; }; template experiment_generator_t experiment_generator(const Experiment & e, Generator & gen, int n) { return experiment_generator_t(e, gen, n); } template class ks_experiment_generator_t { public: ks_experiment_generator_t(const Experiment & exper, Generator & gen, const Distribution & distrib) : experiment(exper), generator(gen), distribution(distrib) { } double operator()() const { return experiment.run(generator, distribution); } private: const Experiment & experiment; Generator & generator; Distribution distribution; }; template ks_experiment_generator_t ks_experiment_generator(const Experiment & e, Generator & gen, const Distribution & distrib) { return ks_experiment_generator_t (e, gen, distrib); } #endif /* STATISTIC_TESTS_HPP */