13 #ifndef THRILL_COMMON_SAMPLING_HEADER 14 #define THRILL_COMMON_SAMPLING_HEADER 23 #include <type_traits> 32 template <
typename RNG = std::mt19937_64>
36 static constexpr
bool debug =
false;
40 template <
typename Iterator,
41 typename Type =
typename std::iterator_traits<Iterator>::value_type>
43 std::vector<Type>& samples) {
45 do_sample(begin, end, size, samples.begin());
48 template <
typename Iterator,
typename OutputIterator>
50 OutputIterator out_begin) {
55 template <
typename Iterator,
typename OutputIterator>
56 void do_sample(Iterator begin, Iterator end,
size_t size,
57 OutputIterator out_begin) {
58 if (size == 0)
return;
60 const size_t insize = end - begin;
62 std::copy(begin, end, out_begin);
64 else if (insize > 1024) {
65 size_t left_size = insize / 2;
66 size_t left =
hyp_(left_size, insize - left_size, size);
67 sLOG <<
"Splitting input of size" << insize <<
"into two, left" 68 << left_size <<
"elements get" << left <<
"of" 70 do_sample(begin, begin + left_size, left, out_begin);
71 do_sample(begin + left_size, end, size - left, out_begin + left);
73 else if (insize > 32 && size > 8) {
74 sLOG <<
"Base case for size" << insize <<
"and" << size <<
"samples";
78 sLOG <<
"Mini case for size" << insize <<
"and" << size <<
"samples";
79 std::vector<size_t> sample;
81 std::uniform_int_distribution<size_t> dist(0, insize - 1);
82 size_t remaining = size;
83 while (remaining > 0) {
84 size_t elem = dist(
rng_);
85 if (std::find(sample.begin(), sample.end(), elem) == sample.end()) {
86 sample.push_back(elem);
87 *(out_begin + size - remaining) = *(begin + elem);
94 template <
typename Iterator,
typename OutIterator,
95 typename Type =
typename std::iterator_traits<Iterator>::value_type>
97 OutIterator out_begin) {
98 const size_t insize = end - begin;
101 std::copy(begin, end, out_begin);
104 sLOG <<
"HashSampling" << size <<
"of" << insize <<
"elements";
106 std::uniform_int_distribution<size_t> dist(0, insize - 1);
107 const size_t dummy = -1;
110 const size_t table_size = 1ULL << table_lg;
111 const size_t address_mask = (table_lg >= population_lg) ? 0 :
112 population_lg - table_lg;
114 sLOG <<
"Table size:" << table_size <<
"(lg:" << table_lg <<
" pop_lg:" 115 << population_lg <<
" mask:" << address_mask <<
")";
124 size_t remaining = size;
125 while (remaining > 0) {
126 size_t variate, index;
129 variate = dist(
rng_);
130 index = variate >> address_mask;
131 assert(index < table_size);
136 else if (hash_elem == variate)
continue;
140 index &= (table_size - 1);
142 if (hash_elem == dummy)
break;
143 else if (hash_elem == variate)
continue;
149 *(out_begin + size - remaining) = *(begin + variate);
150 sLOG <<
"sample no" << size - remaining <<
"= elem" << variate;
169 #endif // !THRILL_COMMON_SAMPLING_HEADER std::vector< size_t > hash_table
void operator()(Iterator begin, Iterator end, size_t size, std::vector< Type > &samples)
#define sLOG
Default logging method: output if the local debug variable is true.
void hash_sample(Iterator begin, Iterator end, size_t size, OutIterator out_begin)
std::vector< size_t > indices
common::hypergeometric hyp_
void do_sample(Iterator begin, Iterator end, size_t size, OutputIterator out_begin)
static unsigned integer_log2_ceil(int i)
calculate the log2 floor of an integer type
static constexpr bool debug