Thrill  0.1
hypergeometric_distribution.hpp
Go to the documentation of this file.
1 /*******************************************************************************
2  * thrill/common/hypergeometric_distribution.hpp
3  *
4  * A hypergeomitric distribution random generator adapted from NumPy.
5  *
6  * The implementation of loggam(), rk_hypergeometric(), rk_hypergeometric_hyp(),
7  * and rk_hypergeometric_hrua() were adapted from NumPy's
8  * numpy/random/mtrand/distributions.c which has this license:
9  *
10  * Copyright 2005 Robert Kern ([email protected])
11  *
12  * Permission is hereby granted, free of charge, to any person obtaining a
13  * copy of this software and associated documentation files (the
14  * "Software"), to deal in the Software without restriction, including
15  * without limitation the rights to use, copy, modify, merge, publish,
16  * distribute, sublicense, and/or sell copies of the Software, and to
17  * permit persons to whom the Software is furnished to do so, subject to
18  * the following conditions:
19  *
20  * The above copyright notice and this permission notice shall be included
21  * in all copies or substantial portions of the Software.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
24  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
26  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
27  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
28  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
29  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
30  *
31  *
32  * The implementations of rk_hypergeometric_hyp(), rk_hypergeometric_hrua(),
33  * and rk_triangular() were adapted from Ivan Frohne's rv.py which has this
34  * license:
35  *
36  * Copyright 1998 by Ivan Frohne; Wasilla, Alaska, U.S.A.
37  * All Rights Reserved
38  *
39  * Permission to use, copy, modify and distribute this software and its
40  * documentation for any purpose, free of charge, is granted subject to the
41  * following conditions:
42  * The above copyright notice and this permission notice shall be included in
43  * all copies or substantial portions of the software.
44  *
45  * THE SOFTWARE AND DOCUMENTATION IS PROVIDED WITHOUT WARRANTY OF ANY KIND,
46  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO MERCHANTABILITY, FITNESS
47  * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHOR
48  * OR COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM OR DAMAGES IN A CONTRACT
49  * ACTION, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
50  * SOFTWARE OR ITS DOCUMENTATION.
51  *
52  *
53  * Part of Project Thrill - http://project-thrill.org
54  *
55  * Copyright (C) 2017 Lorenz Hübschle-Schneider <[email protected]>
56  *
57  * All rights reserved. Published under the BSD-2 license in the LICENSE file.
58  ******************************************************************************/
59 
60 #pragma once
61 #ifndef THRILL_COMMON_HYPERGEOMETRIC_DISTRIBUTION_HEADER
62 #define THRILL_COMMON_HYPERGEOMETRIC_DISTRIBUTION_HEADER
63 
64 #include <thrill/common/logger.hpp>
65 
66 #include <cmath>
67 #include <random>
68 
69 namespace thrill {
70 namespace common {
71 
72 template <typename int_t = int64_t, typename fp_t = double>
74 {
75 public:
76  using result_type = int_t;
77  using real_type = fp_t;
78 
80  : rng(seed != 0 ? seed : std::random_device { }
81  ()) { }
82 
83  int_t operator () (int_t good, int_t bad, int_t sample) {
84  if (sample < 1) {
85  sLOG1 << "hypergeometric distribution error: sample < 1:" << sample;
86  return static_cast<int_t>(-1);
87  }
88  if (good + bad < sample) {
89  sLOG1 << "hypergeometric distribution error: good + bad < sample:"
90  << "good:" << good << "bad:" << bad << "sample:" << sample;
91  return static_cast<int_t>(-1);
92  }
93  return rk_hypergeometric(good, bad, sample);
94  }
95 
96  void seed(size_t seed_val) {
97  rng.seed(seed_val);
98  }
99 
100 private:
101  /*
102  * log-gamma function to support some of these distributions. The
103  * algorithm comes from SPECFUN by Shanjie Zhang and Jianming Jin and their
104  * book "Computation of Special Functions", 1996, John Wiley & Sons, Inc.
105  */
106  fp_t loggam(fp_t x) const {
107  fp_t x0, x2, xp, gl, gl0;
108  int_t k, n;
109 
110  static fp_t a[10] = {
111  8.333333333333333e-02, -2.777777777777778e-03,
112  7.936507936507937e-04, -5.952380952380952e-04,
113  8.417508417508418e-04, -1.917526917526918e-03,
114  6.410256410256410e-03, -2.955065359477124e-02,
115  1.796443723688307e-01, -1.39243221690590e+00
116  };
117  x0 = x;
118  n = 0;
119  if ((x == 1.0) || (x == 2.0))
120  {
121  return 0.0;
122  }
123  else if (x <= 7.0)
124  {
125  n = static_cast<int_t>(7 - x);
126  x0 = x + n;
127  }
128  x2 = 1.0 / (x0 * x0);
129  xp = 2 * M_PI;
130  gl0 = a[9];
131  for (k = 8; k >= 0; k--)
132  {
133  gl0 *= x2;
134  gl0 += a[k];
135  }
136  gl = gl0 / x0 + 0.5 * std::log(xp) + (x0 - 0.5) * std::log(x0) - x0;
137  if (x <= 7.0)
138  {
139  for (k = 1; k <= n; k++)
140  {
141  gl -= std::log(x0 - 1.0);
142  x0 -= 1.0;
143  }
144  }
145  return gl;
146  }
147 
148  /* D1 = 2*sqrt(2/e) */
149  /* D2 = 3 - 2*sqrt(3/e) */
150  static constexpr fp_t D1 = 1.7155277699214135;
151  static constexpr fp_t D2 = 0.8989161620588988;
152 
153  int_t rk_hypergeometric_hyp(int_t good, int_t bad, int_t sample) {
154  int_t d1, K, Z;
155  fp_t d2, U, Y;
156 
157  d1 = bad + good - sample;
158  d2 = static_cast<fp_t>(std::min(bad, good));
159 
160  Y = d2;
161  K = sample;
162  while (Y > 0.0) {
163  U = dist(rng); // rk_double(state);
164  Y -= static_cast<int_t>(floor(U + Y / (d1 + K)));
165  K--;
166  if (K == 0) break;
167  }
168  Z = static_cast<int_t>(d2 - Y);
169  if (good > bad) Z = sample - Z;
170  return Z;
171  }
172 
173  int_t rk_hypergeometric_hrua(int_t good, int_t bad, int_t sample) {
174  int_t mingoodbad, maxgoodbad, popsize, m, d9;
175  fp_t d4, d5, d6, d7, d8, d10, d11;
176  int_t Z;
177  fp_t T, W, X, Y;
178 
179  mingoodbad = std::min(good, bad);
180  popsize = good + bad;
181  maxgoodbad = std::max(good, bad);
182  m = std::min(sample, popsize - sample);
183  d4 = static_cast<fp_t>(mingoodbad) / popsize;
184  d5 = 1.0 - d4;
185  d6 = m * d4 + 0.5;
186  d7 = sqrt(static_cast<fp_t>(popsize - m) * sample * d4 * d5 /
187  (popsize - 1) + 0.5);
188  d8 = D1 * d7 + D2;
189  d9 = static_cast<int_t>(floor(static_cast<fp_t>(m + 1) *
190  (mingoodbad + 1) / (popsize + 2)));
191  d10 =
192  (loggam(d9 + 1) + loggam(mingoodbad - d9 + 1) + loggam(m - d9 + 1) +
193  loggam(maxgoodbad - m + d9 + 1));
194  d11 = std::min(std::min(m, mingoodbad) + 1.0, floor(d6 + 16 * d7));
195  /* 16 for 16-decimal-digit precision in D1 and D2 */
196 
197  while (1) {
198  X = dist(rng); // rk_double(state);
199  Y = dist(rng); // rk_double(state);
200  W = d6 + d8 * (Y - 0.5) / X;
201 
202  /* fast rejection: */
203  if ((W < 0.0) || (W >= d11)) continue;
204 
205  Z = static_cast<int_t>(floor(W));
206  T = d10 - (loggam(Z + 1) + loggam(mingoodbad - Z + 1) +
207  loggam(m - Z + 1) + loggam(maxgoodbad - m + Z + 1));
208 
209  /* fast acceptance: */
210  if ((X * (4.0 - X) - 3.0) <= T) break;
211 
212  /* fast rejection: */
213  if (X * (X - T) >= 1) continue;
214 
215  if (2.0 * std::log(X) <= T) break; /* acceptance */
216  }
217 
218  /* this is a correction to HRUA* by Ivan Frohne in rv.py */
219  if (good > bad) Z = m - Z;
220 
221  /* another fix from rv.py to allow sample to exceed popsize/2 */
222  if (m < sample) Z = good - Z;
223 
224  return Z;
225  }
226 
227  int_t rk_hypergeometric(int_t good, int_t bad, int_t sample) {
228  if (sample > 10) {
229  return rk_hypergeometric_hrua(good, bad, sample);
230  }
231  else {
232  return rk_hypergeometric_hyp(good, bad, sample);
233  }
234  }
235 
236  // Data members:
237  std::mt19937 rng; // random number generator
238  std::uniform_real_distribution<fp_t> dist; // [0.0...1.0) distribution
239 };
240 
242 
243 } // namespace common
244 } // namespace thrill
245 
246 #endif // !THRILL_COMMON_HYPERGEOMETRIC_DISTRIBUTION_HEADER
247 
248 /******************************************************************************/
static uint_pair max()
return an uint_pair instance containing the largest value possible
Definition: uint_types.hpp:226
double T
STL namespace.
#define sLOG1
Definition: logger.hpp:38
int_t operator()(int_t good, int_t bad, int_t sample)
list x
Definition: gen_data.py:39
int_t rk_hypergeometric_hrua(int_t good, int_t bad, int_t sample)
static const uint64_t K[80]
Definition: sha512.cpp:32
static uint_pair min()
return an uint_pair instance containing the smallest value possible
Definition: uint_types.hpp:217
int_t rk_hypergeometric_hyp(int_t good, int_t bad, int_t sample)
int_t rk_hypergeometric(int_t good, int_t bad, int_t sample)