Thrill  0.1
sampling.hpp
Go to the documentation of this file.
1 /*******************************************************************************
2  * thrill/common/sampling.hpp
3  *
4  * Part of Project Thrill - http://project-thrill.org
5  *
6  * Copyright (C) 2016 Sebastian Lamm <[email protected]>
7  * Copyright (C) 2017-2018 Lorenz Hübschle-Schneider <[email protected]>
8  *
9  * All rights reserved. Published under the BSD-2 license in the LICENSE file.
10  ******************************************************************************/
11 
12 #pragma once
13 #ifndef THRILL_COMMON_SAMPLING_HEADER
14 #define THRILL_COMMON_SAMPLING_HEADER
15 
17 #include <thrill/common/logger.hpp>
18 
19 #include <tlx/define.hpp>
20 #include <tlx/math.hpp>
21 
22 #include <cmath>
23 #include <type_traits>
24 #include <vector>
25 
26 namespace thrill {
27 namespace common {
28 
29 //! Sampling without replacement, implementing Algorithm R from Sanders, Lamm,
30 //! Hübschle-Schneider, Schrade, Dachsbacher, ACM TOMS 2017: Efficient Random
31 //! Sampling - Parallel, Vectorized, Cache-Efficient, and Online
32 template <typename RNG = std::mt19937_64>
33 class Sampling
34 {
35 public:
36  static constexpr bool debug = false;
37 
38  explicit Sampling(RNG& rng) : rng_(rng), hyp_(rng()) { }
39 
40  template <typename Iterator,
41  typename Type = typename std::iterator_traits<Iterator>::value_type>
42  void operator () (Iterator begin, Iterator end, size_t size,
43  std::vector<Type>& samples) {
44  samples.resize(size);
45  do_sample(begin, end, size, samples.begin());
46  }
47 
48  template <typename Iterator, typename OutputIterator>
49  void operator () (Iterator begin, Iterator end, size_t size,
50  OutputIterator out_begin) {
51  do_sample(begin, end, size, out_begin);
52  }
53 
54 private:
55  template <typename Iterator, typename OutputIterator>
56  void do_sample(Iterator begin, Iterator end, size_t size,
57  OutputIterator out_begin) {
58  if (size == 0) return; // degenerate
59 
60  const size_t insize = end - begin;
61  if (size == insize) { // degenerate
62  std::copy(begin, end, out_begin);
63  }
64  else if (insize > 1024) { // recursive step
65  size_t left_size = insize / 2;
66  size_t left = hyp_(left_size, insize - left_size, size);
67  sLOG << "Splitting input of size" << insize << "into two, left"
68  << left_size << "elements get" << left << "of"
69  << size << "samples";
70  do_sample(begin, begin + left_size, left, out_begin);
71  do_sample(begin + left_size, end, size - left, out_begin + left);
72  }
73  else if (insize > 32 && size > 8) { // hash base case
74  sLOG << "Base case for size" << insize << "and" << size << "samples";
75  hash_sample(begin, end, size, out_begin);
76  }
77  else { // mini base case
78  sLOG << "Mini case for size" << insize << "and" << size << "samples";
79  std::vector<size_t> sample;
80  sample.reserve(size);
81  std::uniform_int_distribution<size_t> dist(0, insize - 1);
82  size_t remaining = size;
83  while (remaining > 0) {
84  size_t elem = dist(rng_);
85  if (std::find(sample.begin(), sample.end(), elem) == sample.end()) {
86  sample.push_back(elem);
87  *(out_begin + size - remaining) = *(begin + elem);
88  --remaining;
89  }
90  }
91  }
92  }
93 
94  template <typename Iterator, typename OutIterator,
95  typename Type = typename std::iterator_traits<Iterator>::value_type>
96  void hash_sample(Iterator begin, Iterator end, size_t size,
97  OutIterator out_begin) {
98  const size_t insize = end - begin;
99  if (insize <= size) {
100  // degenerate case
101  std::copy(begin, end, out_begin);
102  return;
103  }
104  sLOG << "HashSampling" << size << "of" << insize << "elements";
105 
106  std::uniform_int_distribution<size_t> dist(0, insize - 1);
107  const size_t dummy = -1;
108  const size_t population_lg = tlx::integer_log2_ceil(insize);
109  const size_t table_lg = 3 + tlx::integer_log2_ceil(size);
110  const size_t table_size = 1ULL << table_lg;
111  const size_t address_mask = (table_lg >= population_lg) ? 0 :
112  population_lg - table_lg;
113 
114  sLOG << "Table size:" << table_size << "(lg:" << table_lg << " pop_lg:"
115  << population_lg << " mask:" << address_mask << ")";
116 
117  if (table_size > hash_table.size()) {
118  sLOG << "Resizing table from" << hash_table.size() << "to"
119  << table_size;
120  hash_table.resize(table_size, dummy);
121  }
122  indices.reserve(table_size);
123 
124  size_t remaining = size;
125  while (remaining > 0) {
126  size_t variate, index;
127  while (true) {
128  // Take sample
129  variate = dist(rng_); // N * randblock[array_index++];
130  index = variate >> address_mask;
131  assert(index < table_size);
132  size_t hash_elem = hash_table[index];
133 
134  // Table lookup
135  if (TLX_LIKELY(hash_elem == dummy)) break; // done
136  else if (hash_elem == variate) continue; // already sampled
137  else {
138 increment:
139  ++index;
140  index &= (table_size - 1);
141  hash_elem = hash_table[index];
142  if (hash_elem == dummy) break; // done
143  else if (hash_elem == variate) continue; // already sampled
144  goto increment; // keep incrementing
145  }
146  }
147  // Add sample
148  hash_table[index] = variate;
149  *(out_begin + size - remaining) = *(begin + variate);
150  sLOG << "sample no" << size - remaining << "= elem" << variate;
151  indices.push_back(index);
152  remaining--;
153  }
154 
155  // clear table
156  for (size_t index : indices)
157  hash_table[index] = dummy;
158  indices.clear();
159  }
160 
161  RNG& rng_;
163  std::vector<size_t> hash_table, indices;
164 };
165 
166 } // namespace common
167 } // namespace thrill
168 
169 #endif // !THRILL_COMMON_SAMPLING_HEADER
170 
171 /******************************************************************************/
std::vector< size_t > hash_table
Definition: sampling.hpp:163
void operator()(Iterator begin, Iterator end, size_t size, std::vector< Type > &samples)
Definition: sampling.hpp:42
#define sLOG
Default logging method: output if the local debug variable is true.
Definition: logger.hpp:34
Type
VFS object type.
Definition: file_io.hpp:52
void hash_sample(Iterator begin, Iterator end, size_t size, OutIterator out_begin)
Definition: sampling.hpp:96
std::vector< size_t > indices
Definition: sampling.hpp:163
#define TLX_LIKELY(c)
Definition: likely.hpp:23
common::hypergeometric hyp_
Definition: sampling.hpp:162
void do_sample(Iterator begin, Iterator end, size_t size, OutputIterator out_begin)
Definition: sampling.hpp:56
static unsigned integer_log2_ceil(int i)
calculate the log2 floor of an integer type
static constexpr bool debug
Definition: sampling.hpp:36