13 #ifndef THRILL_COMMON_RESERVOIR_SAMPLING_HEADER 14 #define THRILL_COMMON_RESERVOIR_SAMPLING_HEADER 31 template <
typename Type,
typename RNG = std::default_random_engine>
84 template <
typename Type,
typename RNG = std::default_random_engine>
93 W_ = std::exp(std::log(uniform(
rng_)) / size);
94 gap_ = std::floor(std::log(uniform(
rng_)) / std::log(1 - W_));
116 else if (gap_ == 0) {
153 std::uniform_real_distribution<double>
uniform;
157 W_ *= std::exp(std::log(uniform(rng_)) / size_);
158 gap_ = std::log(uniform(rng_)) / std::log(1 - W_);
173 template <
typename Type,
typename RNG = std::default_random_engine>
176 static constexpr
bool debug =
false;
182 double desired_imbalance = 0.05)
183 :
samples_(samples),
rng_(rng), desired_imbalance_(desired_imbalance)
188 sLOG0 <<
"ReservoirSamplingGrow::add" 197 if (steps_to_resize == 0)
200 size_t target_size = calc_sample_size();
201 steps_to_resize = calc_steps_to_next_resize();
203 sLOG <<
"steps_to_resize" << steps_to_resize
204 <<
"target_size" << target_size
206 <<
"expanded_by" << target_size -
size_;
207 assert(target_size >= size_);
210 while (size_ < target_size) {
211 size_t x =
rng_() % (size_ + 1);
224 if (debug &&
size_ != calc_sample_size())
225 LOG0 <<
"delta: " << (
int)
size_ - (int)calc_sample_size()
241 gap_ = calc_next_gap();
244 else if (gap_ == 0) {
250 gap_ = calc_next_gap();
271 size_t s =
static_cast<size_t>(
273 * (1.0 / (desired_imbalance_ * desired_imbalance_)));
279 return calc_sample_size(
count_);
290 size_t steps_to_resize = 0;
305 return std::geometric_distribution<size_t>(
306 static_cast<double>(
size_) / static_cast<double>(count_))(rng_);
310 double p =
static_cast<double>(
size_) / static_cast<double>(count_);
311 double u = std::uniform_real_distribution<double>()(rng_);
312 return std::floor(std::log(u) / std::log(1 - p));
321 std::pow(2.0, desired_imbalance_ * desired_imbalance_) - 1.0));
328 #endif // !THRILL_COMMON_RESERVOIR_SAMPLING_HEADER size_t calc_sample_size(size_t count) const
calculate desired sample size
RNG & rng_
source of randomness
size_t size_
size of reservoir
#define sLOG
Default logging method: output if the local debug variable is true.
static uint_pair max()
return an uint_pair instance containing the largest value possible
const std::vector< Type > & samples() const
access to samples
void add(const Type &item)
visit item, maybe add it to the sample.
std::uniform_real_distribution< double > uniform
uniform [0.0, 1.0) distribution
#define LOG0
Override default output: never or always output log.
Implementation of a fast approximation of adaptive reservoir sampling using http://erikerlandson.github.io/blog/2015/11/20/very-fast-reservoir-sampling/ The reservoir size grows logarithmically with the number given to the sampler, new items replace old ones such that all items in the stream are sampled with the same approximately uniform probability.
size_t count() const
number of items seen
ReservoirSamplingGrow(std::vector< Type > &samples, RNG &rng, double desired_imbalance=0.05)
initialize reservoir sampler
const double desired_imbalance_
void add(const Type &item)
visit item, maybe add it to the sample.
void add(const Type &item)
visit item, maybe add it to the sample.
void calc_next_gap()
draw gap size from geometric distribution with p = size_ / count_
size_t size() const
size of reservoir
Implementation of reservoir sampling using Vitter's Algorithm R.
size_t size_
size of reservoir
size_t count() const
number of items seen
ReservoirSampling(size_t size, std::vector< Type > &samples, RNG &rng)
initialize reservoir sampler
Fast exact implementation of reservoir sampling using skip values.
double desired_imbalance() const
desired imbalance
#define sLOG0
Override default output: never or always output log.
ReservoirSamplingFast(size_t size, std::vector< Type > &samples, RNG &rng)
initialize reservoir sampler
RNG & rng_
source of randomness
size_t calc_sample_size() const
calculate desired sample size
size_t calc_steps_to_next_resize() const
static constexpr bool debug
size_t size() const
size of reservoir
size_t size() const
size of reservoir
RNG & rng_
source of randomness
size_t count() const
number of items seen
size_t gap_
number of items to skip until next sample
const std::vector< Type > & samples() const
access to samples
size_t calc_next_gap() const
draw gap size from geometric distribution with p = size_ / count_
std::vector< Type > & samples_
reservoir
size_t count_
number of items seen
std::vector< Type > & samples_
reservoir
std::vector< Type > & samples_
reservoir
const std::vector< Type > & samples() const
access to samples