statistic_tests.hpp 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709
  1. /* statistic_tests.hpp header file
  2. *
  3. * Copyright Jens Maurer 2000
  4. * Distributed under the Boost Software License, Version 1.0. (See
  5. * accompanying file LICENSE_1_0.txt or copy at
  6. * http://www.boost.org/LICENSE_1_0.txt)
  7. *
  8. * $Id$
  9. *
  10. */
  11. #ifndef STATISTIC_TESTS_HPP
  12. #define STATISTIC_TESTS_HPP
  13. #include <stdexcept>
  14. #include <iterator>
  15. #include <vector>
  16. #include <boost/limits.hpp>
  17. #include <algorithm>
  18. #include <cmath>
  19. #include <boost/config.hpp>
  20. #include <boost/bind.hpp>
  21. #include <boost/random/uniform_01.hpp>
  22. #include <boost/random/variate_generator.hpp>
  23. #include "integrate.hpp"
  24. #if defined(BOOST_MSVC) && BOOST_MSVC <= 1300
  25. namespace std
  26. {
  27. inline double pow(double a, double b) { return ::pow(a,b); }
  28. inline double ceil(double x) { return ::ceil(x); }
  29. } // namespace std
  30. #endif
  31. template<class T>
  32. inline T fac(int k)
  33. {
  34. T result = 1;
  35. for(T i = 2; i <= k; ++i)
  36. result *= i;
  37. return result;
  38. }
  39. template<class T>
  40. T binomial(int n, int k)
  41. {
  42. if(k < n/2)
  43. k = n-k;
  44. T result = 1;
  45. for(int i = k+1; i<= n; ++i)
  46. result *= i;
  47. return result / fac<T>(n-k);
  48. }
  49. template<class T>
  50. T stirling2(int n, int m)
  51. {
  52. T sum = 0;
  53. for(int k = 0; k <= m; ++k)
  54. sum += binomial<T>(m, k) * std::pow(double(k), n) *
  55. ( (m-k)%2 == 0 ? 1 : -1);
  56. return sum / fac<T>(m);
  57. }
  58. /*
  59. * Experiments which create an empirical distribution in classes,
  60. * suitable for the chi-square test.
  61. */
  62. // std::floor(gen() * classes)
  63. class experiment_base
  64. {
  65. public:
  66. experiment_base(int cls) : _classes(cls) { }
  67. unsigned int classes() const { return _classes; }
  68. protected:
  69. unsigned int _classes;
  70. };
  71. class equidistribution_experiment : public experiment_base
  72. {
  73. public:
  74. explicit equidistribution_experiment(unsigned int classes)
  75. : experiment_base(classes) { }
  76. template<class NumberGenerator, class Counter>
  77. void run(NumberGenerator & f, Counter & count, int n) const
  78. {
  79. assert((f.min)() == 0 &&
  80. static_cast<unsigned int>((f.max)()) == classes()-1);
  81. for(int i = 0; i < n; ++i)
  82. count(f());
  83. }
  84. double probability(int /*i*/) const { return 1.0/classes(); }
  85. };
  86. // two-dimensional equidistribution experiment
  87. class equidistribution_2d_experiment : public equidistribution_experiment
  88. {
  89. public:
  90. explicit equidistribution_2d_experiment(unsigned int classes)
  91. : equidistribution_experiment(classes) { }
  92. template<class NumberGenerator, class Counter>
  93. void run(NumberGenerator & f, Counter & count, int n) const
  94. {
  95. unsigned int range = (f.max)()+1;
  96. assert((f.min)() == 0 && range*range == classes());
  97. for(int i = 0; i < n; ++i) {
  98. int y1 = f();
  99. int y2 = f();
  100. count(y1 + range * y2);
  101. }
  102. }
  103. };
  104. // distribution experiment: assume a probability density and
  105. // count events so that an equidistribution results.
  106. class distribution_experiment : public equidistribution_experiment
  107. {
  108. public:
  109. template<class Distribution>
  110. distribution_experiment(Distribution dist , unsigned int classes)
  111. : equidistribution_experiment(classes), limit(classes)
  112. {
  113. for(unsigned int i = 0; i < classes-1; ++i)
  114. limit[i] = quantile(dist, (i+1)*0.05);
  115. limit[classes-1] = std::numeric_limits<double>::infinity();
  116. if(limit[classes-1] < (std::numeric_limits<double>::max)())
  117. limit[classes-1] = (std::numeric_limits<double>::max)();
  118. #if 0
  119. std::cout << __PRETTY_FUNCTION__ << ": ";
  120. for(unsigned int i = 0; i < classes; ++i)
  121. std::cout << limit[i] << " ";
  122. std::cout << std::endl;
  123. #endif
  124. }
  125. template<class NumberGenerator, class Counter>
  126. void run(NumberGenerator & f, Counter & count, int n) const
  127. {
  128. for(int i = 0; i < n; ++i) {
  129. limits_type::const_iterator it =
  130. std::lower_bound(limit.begin(), limit.end(), f());
  131. count(it-limit.begin());
  132. }
  133. }
  134. private:
  135. typedef std::vector<double> limits_type;
  136. limits_type limit;
  137. };
  138. template<bool up, bool is_float>
  139. struct runs_direction_helper
  140. {
  141. template<class T>
  142. static T init(T)
  143. {
  144. return (std::numeric_limits<T>::max)();
  145. }
  146. };
  147. template<>
  148. struct runs_direction_helper<true, true>
  149. {
  150. template<class T>
  151. static T init(T)
  152. {
  153. return -(std::numeric_limits<T>::max)();
  154. }
  155. };
  156. template<>
  157. struct runs_direction_helper<true, false>
  158. {
  159. template<class T>
  160. static T init(T)
  161. {
  162. return (std::numeric_limits<T>::min)();
  163. }
  164. };
  165. // runs-up/runs-down experiment
  166. template<bool up>
  167. class runs_experiment : public experiment_base
  168. {
  169. public:
  170. explicit runs_experiment(unsigned int classes) : experiment_base(classes) { }
  171. template<class NumberGenerator, class Counter>
  172. void run(NumberGenerator & f, Counter & count, int n) const
  173. {
  174. typedef typename NumberGenerator::result_type result_type;
  175. result_type init =
  176. runs_direction_helper<
  177. up,
  178. !std::numeric_limits<result_type>::is_integer
  179. >::init(result_type());
  180. result_type previous = init;
  181. unsigned int length = 0;
  182. for(int i = 0; i < n; ++i) {
  183. result_type val = f();
  184. if(up ? previous <= val : previous >= val) {
  185. previous = val;
  186. ++length;
  187. } else {
  188. count((std::min)(length, classes())-1);
  189. length = 0;
  190. previous = init;
  191. // don't use this value, so that runs are independent
  192. }
  193. }
  194. }
  195. double probability(unsigned int r) const
  196. {
  197. if(r == classes()-1)
  198. return 1.0/fac<double>(classes());
  199. else
  200. return static_cast<double>(r+1)/fac<double>(r+2);
  201. }
  202. };
  203. // gap length experiment
  204. class gap_experiment : public experiment_base
  205. {
  206. public:
  207. template<class Dist>
  208. gap_experiment(unsigned int classes, const Dist & dist, double alpha, double beta)
  209. : experiment_base(classes), alpha(alpha), beta(beta), low(quantile(dist, alpha)), high(quantile(dist, beta)) {}
  210. template<class NumberGenerator, class Counter>
  211. void run(NumberGenerator & f, Counter & count, int n) const
  212. {
  213. typedef typename NumberGenerator::result_type result_type;
  214. unsigned int length = 0;
  215. for(int i = 0; i < n; ) {
  216. result_type value = f();
  217. if(value < low || value > high)
  218. ++length;
  219. else {
  220. count((std::min)(length, classes()-1));
  221. length = 0;
  222. ++i;
  223. }
  224. }
  225. }
  226. double probability(unsigned int r) const
  227. {
  228. double p = beta-alpha;
  229. if(r == classes()-1)
  230. return std::pow(1-p, static_cast<double>(r));
  231. else
  232. return p * std::pow(1-p, static_cast<double>(r));
  233. }
  234. private:
  235. double alpha, beta;
  236. double low, high;
  237. };
  238. // poker experiment
  239. class poker_experiment : public experiment_base
  240. {
  241. public:
  242. poker_experiment(unsigned int d, unsigned int k)
  243. : experiment_base(k), range(d)
  244. {
  245. assert(range > 1);
  246. }
  247. template<class UniformRandomNumberGenerator, class Counter>
  248. void run(UniformRandomNumberGenerator & f, Counter & count, int n) const
  249. {
  250. typedef typename UniformRandomNumberGenerator::result_type result_type;
  251. assert(std::numeric_limits<result_type>::is_integer);
  252. assert((f.min)() == 0);
  253. assert((f.max)() == static_cast<result_type>(range-1));
  254. std::vector<result_type> v(classes());
  255. for(int i = 0; i < n; ++i) {
  256. for(unsigned int j = 0; j < classes(); ++j)
  257. v[j] = f();
  258. std::sort(v.begin(), v.end());
  259. result_type prev = v[0];
  260. int r = 1; // count different values in v
  261. for(unsigned int i = 1; i < classes(); ++i) {
  262. if(prev != v[i]) {
  263. prev = v[i];
  264. ++r;
  265. }
  266. }
  267. count(r-1);
  268. }
  269. }
  270. double probability(unsigned int r) const
  271. {
  272. ++r; // transform to 1 <= r <= 5
  273. double result = range;
  274. for(unsigned int i = 1; i < r; ++i)
  275. result *= range-i;
  276. return result / std::pow(range, static_cast<double>(classes())) *
  277. stirling2<double>(classes(), r);
  278. }
  279. private:
  280. unsigned int range;
  281. };
  282. // coupon collector experiment
  283. class coupon_collector_experiment : public experiment_base
  284. {
  285. public:
  286. coupon_collector_experiment(unsigned int d, unsigned int cls)
  287. : experiment_base(cls), d(d)
  288. {
  289. assert(d > 1);
  290. }
  291. template<class UniformRandomNumberGenerator, class Counter>
  292. void run(UniformRandomNumberGenerator & f, Counter & count, int n) const
  293. {
  294. typedef typename UniformRandomNumberGenerator::result_type result_type;
  295. assert(std::numeric_limits<result_type>::is_integer);
  296. assert((f.min)() == 0);
  297. assert((f.max)() == static_cast<result_type>(d-1));
  298. std::vector<bool> occurs(d);
  299. for(int i = 0; i < n; ++i) {
  300. occurs.assign(d, false);
  301. unsigned int r = 0; // length of current sequence
  302. int q = 0; // number of non-duplicates in current set
  303. for(;;) {
  304. result_type val = f();
  305. ++r;
  306. if(!occurs[val]) { // new set element
  307. occurs[val] = true;
  308. ++q;
  309. if(q == d)
  310. break; // one complete set
  311. }
  312. }
  313. count((std::min)(r-d, classes()-1));
  314. }
  315. }
  316. double probability(unsigned int r) const
  317. {
  318. if(r == classes()-1)
  319. return 1-fac<double>(d)/
  320. std::pow(static_cast<double>(d), static_cast<double>(d+classes()-2)) *
  321. stirling2<double>(d+classes()-2, d);
  322. else
  323. return fac<double>(d)/
  324. std::pow(static_cast<double>(d), static_cast<double>(d+r)) *
  325. stirling2<double>(d+r-1, d-1);
  326. }
  327. private:
  328. int d;
  329. };
  330. // permutation test
  331. class permutation_experiment : public equidistribution_experiment
  332. {
  333. public:
  334. permutation_experiment(unsigned int t)
  335. : equidistribution_experiment(fac<int>(t)), t(t)
  336. {
  337. assert(t > 1);
  338. }
  339. template<class UniformRandomNumberGenerator, class Counter>
  340. void run(UniformRandomNumberGenerator & f, Counter & count, int n) const
  341. {
  342. typedef typename UniformRandomNumberGenerator::result_type result_type;
  343. std::vector<result_type> v(t);
  344. for(int i = 0; i < n; ++i) {
  345. for(int j = 0; j < t; ++j) {
  346. v[j] = f();
  347. }
  348. int x = 0;
  349. for(int r = t-1; r > 0; r--) {
  350. typename std::vector<result_type>::iterator it =
  351. std::max_element(v.begin(), v.begin()+r+1);
  352. x = (r+1)*x + (it-v.begin());
  353. std::iter_swap(it, v.begin()+r);
  354. }
  355. count(x);
  356. }
  357. }
  358. private:
  359. int t;
  360. };
  361. // birthday spacing experiment test
  362. class birthday_spacing_experiment : public experiment_base
  363. {
  364. public:
  365. birthday_spacing_experiment(unsigned int d, int n, int m)
  366. : experiment_base(d), n(n), m(m)
  367. {
  368. }
  369. template<class UniformRandomNumberGenerator, class Counter>
  370. void run(UniformRandomNumberGenerator & f, Counter & count, int n_total) const
  371. {
  372. typedef typename UniformRandomNumberGenerator::result_type result_type;
  373. assert(std::numeric_limits<result_type>::is_integer);
  374. assert((f.min)() == 0);
  375. assert((f.max)() == static_cast<result_type>(m-1));
  376. for(int j = 0; j < n_total; j++) {
  377. std::vector<result_type> v(n);
  378. std::generate_n(v.begin(), n, f);
  379. std::sort(v.begin(), v.end());
  380. std::vector<result_type> spacing(n);
  381. for(int i = 0; i < n-1; i++)
  382. spacing[i] = v[i+1]-v[i];
  383. spacing[n-1] = v[0] + m - v[n-1];
  384. std::sort(spacing.begin(), spacing.end());
  385. unsigned int k = 0;
  386. for(int i = 0; i < n-1; ++i) {
  387. if(spacing[i] == spacing[i+1])
  388. ++k;
  389. }
  390. count((std::min)(k, classes()-1));
  391. }
  392. }
  393. double probability(unsigned int r) const
  394. {
  395. assert(classes() == 4);
  396. assert(m == (1<<25));
  397. assert(n == 512);
  398. static const double prob[] = { 0.368801577, 0.369035243, 0.183471182,
  399. 0.078691997 };
  400. return prob[r];
  401. }
  402. private:
  403. int n, m;
  404. };
  405. /*
  406. * Misc. helper functions.
  407. */
  408. template<class Float>
  409. struct distribution_function
  410. {
  411. typedef Float result_type;
  412. typedef Float argument_type;
  413. typedef Float first_argument_type;
  414. typedef Float second_argument_type;
  415. };
  416. // computes P(K_n <= t) or P(t1 <= K_n <= t2). See Knuth, 3.3.1
  417. class kolmogorov_smirnov_probability : public distribution_function<double>
  418. {
  419. public:
  420. kolmogorov_smirnov_probability(int n)
  421. : approx(n > 50), n(n), sqrt_n(std::sqrt(double(n)))
  422. {
  423. if(!approx)
  424. n_n = std::pow(static_cast<double>(n), n);
  425. }
  426. double cdf(double t) const
  427. {
  428. if(approx) {
  429. return 1-std::exp(-2*t*t)*(1-2.0/3.0*t/sqrt_n);
  430. } else {
  431. t *= sqrt_n;
  432. double sum = 0;
  433. for(int k = static_cast<int>(std::ceil(t)); k <= n; k++)
  434. sum += binomial<double>(n, k) * std::pow(k-t, k) *
  435. std::pow(t+n-k, n-k-1);
  436. return 1 - t/n_n * sum;
  437. }
  438. }
  439. //double operator()(double t1, double t2) const
  440. //{ return operator()(t2) - operator()(t1); }
  441. private:
  442. bool approx;
  443. int n;
  444. double sqrt_n;
  445. double n_n;
  446. };
  447. inline double cdf(const kolmogorov_smirnov_probability& dist, double val)
  448. {
  449. return dist.cdf(val);
  450. }
  451. inline double quantile(const kolmogorov_smirnov_probability& dist, double val)
  452. {
  453. return invert_monotone_inc(boost::bind(&cdf, dist, _1), val, 0.0, 1000.0);
  454. }
  455. /*
  456. * Experiments for generators with continuous distribution functions
  457. */
  458. class kolmogorov_experiment
  459. {
  460. public:
  461. kolmogorov_experiment(int n) : n(n), ksp(n) { }
  462. template<class NumberGenerator, class Distribution>
  463. double run(NumberGenerator & gen, Distribution distrib) const
  464. {
  465. const int m = n;
  466. typedef std::vector<double> saved_temp;
  467. saved_temp a(m,1.0), b(m,0);
  468. std::vector<int> c(m,0);
  469. for(int i = 0; i < n; ++i) {
  470. double val = static_cast<double>(gen());
  471. double y = cdf(distrib, val);
  472. int k = static_cast<int>(std::floor(m*y));
  473. if(k >= m)
  474. --k; // should not happen
  475. a[k] = (std::min)(a[k], y);
  476. b[k] = (std::max)(b[k], y);
  477. ++c[k];
  478. }
  479. double kplus = 0, kminus = 0;
  480. int j = 0;
  481. for(int k = 0; k < m; ++k) {
  482. if(c[k] > 0) {
  483. kminus = (std::max)(kminus, a[k]-j/static_cast<double>(n));
  484. j += c[k];
  485. kplus = (std::max)(kplus, j/static_cast<double>(n) - b[k]);
  486. }
  487. }
  488. kplus *= std::sqrt(double(n));
  489. kminus *= std::sqrt(double(n));
  490. // std::cout << "k+ " << kplus << " k- " << kminus << std::endl;
  491. return kplus;
  492. }
  493. double probability(double x) const
  494. {
  495. return cdf(ksp, x);
  496. }
  497. private:
  498. int n;
  499. kolmogorov_smirnov_probability ksp;
  500. };
  501. struct power_distribution
  502. {
  503. power_distribution(double t) : t(t) {}
  504. double t;
  505. };
  506. double cdf(const power_distribution& dist, double val)
  507. {
  508. return std::pow(val, dist.t);
  509. }
  510. // maximum-of-t test (KS-based)
  511. template<class UniformRandomNumberGenerator>
  512. class maximum_experiment
  513. {
  514. public:
  515. typedef UniformRandomNumberGenerator base_type;
  516. maximum_experiment(base_type & f, int n, int t) : f(f), ke(n), t(t)
  517. { }
  518. double operator()() const
  519. {
  520. generator gen(f, t);
  521. return ke.run(gen, power_distribution(t));
  522. }
  523. private:
  524. struct generator {
  525. generator(base_type & f, int t) : f(f, boost::uniform_01<>()), t(t) { }
  526. double operator()()
  527. {
  528. double mx = f();
  529. for(int i = 1; i < t; ++i)
  530. mx = (std::max)(mx, f());
  531. return mx;
  532. }
  533. private:
  534. boost::variate_generator<base_type&, boost::uniform_01<> > f;
  535. int t;
  536. };
  537. base_type & f;
  538. kolmogorov_experiment ke;
  539. int t;
  540. };
  541. // compute a chi-square value for the distribution approximation error
  542. template<class ForwardIterator, class UnaryFunction>
  543. typename UnaryFunction::result_type
  544. chi_square_value(ForwardIterator first, ForwardIterator last,
  545. UnaryFunction probability)
  546. {
  547. typedef std::iterator_traits<ForwardIterator> iter_traits;
  548. typedef typename iter_traits::value_type counter_type;
  549. typedef typename UnaryFunction::result_type result_type;
  550. unsigned int classes = std::distance(first, last);
  551. result_type sum = 0;
  552. counter_type n = 0;
  553. for(unsigned int i = 0; i < classes; ++first, ++i) {
  554. counter_type count = *first;
  555. n += count;
  556. sum += (count/probability(i)) * count; // avoid overflow
  557. }
  558. #if 0
  559. for(unsigned int i = 0; i < classes; ++i) {
  560. // std::cout << (n*probability(i)) << " ";
  561. if(n * probability(i) < 5)
  562. std::cerr << "Not enough test runs for slot " << i
  563. << " p=" << probability(i) << ", n=" << n
  564. << std::endl;
  565. }
  566. #endif
  567. // std::cout << std::endl;
  568. // throw std::invalid_argument("not enough test runs");
  569. return sum/n - n;
  570. }
  571. template<class RandomAccessContainer>
  572. class generic_counter
  573. {
  574. public:
  575. explicit generic_counter(unsigned int classes) : container(classes, 0) { }
  576. void operator()(int i)
  577. {
  578. assert(i >= 0);
  579. assert(static_cast<unsigned int>(i) < container.size());
  580. ++container[i];
  581. }
  582. typename RandomAccessContainer::const_iterator begin() const
  583. { return container.begin(); }
  584. typename RandomAccessContainer::const_iterator end() const
  585. { return container.end(); }
  586. private:
  587. RandomAccessContainer container;
  588. };
  589. // chi_square test
  590. template<class Experiment, class Generator>
  591. double run_experiment(const Experiment & experiment, Generator & gen, int n)
  592. {
  593. generic_counter<std::vector<int> > v(experiment.classes());
  594. experiment.run(gen, v, n);
  595. return chi_square_value(v.begin(), v.end(),
  596. boost::bind(&Experiment::probability,
  597. experiment, boost::placeholders::_1));
  598. }
  599. // chi_square test
  600. template<class Experiment, class Generator>
  601. double run_experiment(const Experiment & experiment, const Generator & gen, int n)
  602. {
  603. generic_counter<std::vector<int> > v(experiment.classes());
  604. experiment.run(gen, v, n);
  605. return chi_square_value(v.begin(), v.end(),
  606. boost::bind(&Experiment::probability,
  607. experiment, boost::placeholders::_1));
  608. }
  609. // number generator with experiment results (for nesting)
  610. template<class Experiment, class Generator>
  611. class experiment_generator_t
  612. {
  613. public:
  614. experiment_generator_t(const Experiment & exper, Generator & gen, int n)
  615. : experiment(exper), generator(gen), n(n) { }
  616. double operator()() const { return run_experiment(experiment, generator, n); }
  617. private:
  618. const Experiment & experiment;
  619. Generator & generator;
  620. int n;
  621. };
  622. template<class Experiment, class Generator>
  623. experiment_generator_t<Experiment, Generator>
  624. experiment_generator(const Experiment & e, Generator & gen, int n)
  625. {
  626. return experiment_generator_t<Experiment, Generator>(e, gen, n);
  627. }
  628. template<class Experiment, class Generator, class Distribution>
  629. class ks_experiment_generator_t
  630. {
  631. public:
  632. ks_experiment_generator_t(const Experiment & exper, Generator & gen,
  633. const Distribution & distrib)
  634. : experiment(exper), generator(gen), distribution(distrib) { }
  635. double operator()() const { return experiment.run(generator, distribution); }
  636. private:
  637. const Experiment & experiment;
  638. Generator & generator;
  639. Distribution distribution;
  640. };
  641. template<class Experiment, class Generator, class Distribution>
  642. ks_experiment_generator_t<Experiment, Generator, Distribution>
  643. ks_experiment_generator(const Experiment & e, Generator & gen,
  644. const Distribution & distrib)
  645. {
  646. return ks_experiment_generator_t<Experiment, Generator, Distribution>
  647. (e, gen, distrib);
  648. }
  649. #endif /* STATISTIC_TESTS_HPP */