Thrill  0.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
aggregate.hpp
Go to the documentation of this file.
1 /*******************************************************************************
2  * thrill/common/aggregate.hpp
3  *
4  * Part of Project Thrill - http://project-thrill.org
5  *
6  * Copyright (C) 2015 Timo Bingmann <[email protected]>
7  *
8  * All rights reserved. Published under the BSD-2 license in the LICENSE file.
9  ******************************************************************************/
10 
11 #pragma once
12 #ifndef THRILL_COMMON_AGGREGATE_HEADER
13 #define THRILL_COMMON_AGGREGATE_HEADER
14 
15 #include <tlx/define/likely.hpp>
16 
17 #include <algorithm>
18 #include <cassert>
19 #include <cmath>
20 #include <limits>
21 
22 namespace thrill {
23 namespace common {
24 
25 /*!
26  * Calculate running aggregate statistics: feed it with values, and it will keep
27  * the minimum, the maximum, the average, the value number, and the standard
28  * deviation is values.
29  */
30 template <typename Type_>
31 class Aggregate
32 {
33 public:
34  using Type = Type_;
35 
36  //! default constructor
37  Aggregate() = default;
38 
39  //! initializing constructor
40  Aggregate(size_t count, const double& mean, const double nvar,
41  const Type& min, const Type& max) noexcept
42  : count_(count), mean_(mean), nvar_(nvar),
43  min_(min), max_(max) { }
44 
45  //! add a value to the running aggregation
46  Aggregate& Add(const Type& value) noexcept {
47  count_++;
48  min_ = std::min(min_, value);
49  max_ = std::max(max_, value);
50  if (TLX_UNLIKELY(count_ == 1)) {
51  mean_ = value;
52  }
53  else {
54  // Single-pass numerically stable mean and standard deviation
55  // calculation as described in Donald Knuth: The Art of Computer
56  // Programming, Volume 2, Chapter 4.2.2, Equations 15 & 16
57  double delta = value - mean_;
58  mean_ += delta / count_;
59  nvar_ += delta * (value - mean_);
60  }
61  return *this;
62  }
63 
64  //! return number of values aggregated
65  size_t Count() const noexcept { return count_; }
66 
67  //! return sum over all values aggregated
68  // can't make noexcept since Type_'s conversion is allowed to throw
69  const Type Total() const { return static_cast<Type>(count_ * mean_); }
70 
71  //! return the average over all values aggregated
72  double Average() const noexcept { return mean_; }
73 
74  //! return the average over all values aggregated
75  double Avg() const noexcept { return Average(); }
76 
77  //! return the average over all values aggregated
78  double Mean() const noexcept { return Average(); }
79 
80  //! return minimum over all values aggregated
81  const Type& Min() const noexcept { return min_; }
82 
83  //! return maximum over all values aggregated
84  const Type& Max() const noexcept { return max_; }
85 
86  //! return the standard deviation of all values aggregated
87  double StandardDeviation(size_t ddof = 1) const {
88  if (count_ <= 1) return 0.0;
89  // ddof = delta degrees of freedom
90  // Set to 0 if you have the entire distribution
91  // Set to 1 if you have a sample (to correct for bias)
92  return std::sqrt(nvar_ / static_cast<double>(count_ - ddof));
93  }
94 
95  //! return the standard deviation of all values aggregated
96  double StDev(size_t ddof = 1) const { return StandardDeviation(ddof); }
97 
98  //! operator +
99  Aggregate operator + (const Aggregate& a) const noexcept {
100  return Aggregate(
101  // count
102  count_ + a.count_,
103  // mean
104  merged_means(a),
105  // merging variance is a bit complicated
106  merged_variance(a),
107  std::min(min_, a.min_), std::max(max_, a.max_)); // min, max
108  }
109 
110  //! operator +=
111  Aggregate& operator += (const Aggregate& a) noexcept {
112  mean_ = merged_means(a);
113  min_ = std::min(min_, a.min_);
114  max_ = std::max(max_, a.max_);
115  nvar_ = merged_variance(a);
116  count_ += a.count_;
117  return *this;
118  }
119 
120  //! serialization with Thrill's serializer
121  template <typename Archive>
122  void ThrillSerialize(Archive& ar) const {
123  ar.template Put<size_t>(count_);
124  ar.template Put<double>(mean_);
125  ar.template Put<double>(nvar_);
126  ar.template Put<Type>(min_);
127  ar.template Put<Type>(max_);
128  }
129 
130  //! deserialization with Thrill's serializer
131  template <typename Archive>
132  static Aggregate ThrillDeserialize(Archive& ar) {
133  Aggregate agg;
134  agg.count_ = ar.template Get<size_t>();
135  agg.mean_ = ar.template Get<double>();
136  agg.nvar_ = ar.template Get<double>();
137  agg.min_ = ar.template Get<Type>();
138  agg.max_ = ar.template Get<Type>();
139  return agg;
140  }
141 
142  static constexpr bool thrill_is_fixed_size = true;
143  static constexpr size_t thrill_fixed_size =
144  sizeof(size_t) + 2 * sizeof(double) + 2 * sizeof(Type);
145 
146 private:
147  double merged_means(const Aggregate& a) const noexcept {
148  // check if either count is zero, this fixes problems with NaN
149  if (count_ == 0)
150  return a.mean_;
151  if (a.count_ == 0)
152  return mean_;
153  return (mean_ * count_ + a.mean_ * a.count_) / (count_ + a.count_);
154  }
155 
156  // T. Chan et al 1979, "Updating Formulae and a Pairwise Algorithm for
157  // Computing Sample Variances"
158  double merged_variance(const Aggregate& other) const noexcept {
159  double delta = mean_ - other.mean_;
160  return nvar_ + other.nvar_ + (delta * delta) *
161  (count_ * other.count_) / (count_ + other.count_);
162  }
163 
164  //! number of values aggregated
165  size_t count_ = 0;
166 
167  //! mean of values
168  double mean_ = 0;
169 
170  //! approximate count * variance; stddev = sqrt(nvar / (count-1))
171  double nvar_ = 0.0;
172 
173  //! minimum value
175 
176  //! maximum value
177  Type max_ = std::numeric_limits<Type>::lowest();
178 };
179 
180 } // namespace common
181 } // namespace thrill
182 
183 #endif // !THRILL_COMMON_AGGREGATE_HEADER
184 
185 /******************************************************************************/
Type min_
minimum value
Definition: aggregate.hpp:174
double nvar_
approximate count * variance; stddev = sqrt(nvar / (count-1))
Definition: aggregate.hpp:171
static uint_pair max()
return an uint_pair instance containing the largest value possible
Definition: uint_types.hpp:226
Aggregate(size_t count, const double &mean, const double nvar, const Type &min, const Type &max) noexcept
initializing constructor
Definition: aggregate.hpp:40
double merged_variance(const Aggregate &other) const noexcept
Definition: aggregate.hpp:158
static constexpr double delta
Definition: select.hpp:35
static constexpr bool thrill_is_fixed_size
Definition: aggregate.hpp:142
const Type & Min() const noexcept
return minimum over all values aggregated
Definition: aggregate.hpp:81
static constexpr size_t thrill_fixed_size
Definition: aggregate.hpp:143
void ThrillSerialize(Archive &ar) const
serialization with Thrill's serializer
Definition: aggregate.hpp:122
#define TLX_UNLIKELY(c)
Definition: likely.hpp:24
Calculate running aggregate statistics: feed it with values, and it will keep the minimum...
Definition: aggregate.hpp:31
double StandardDeviation(size_t ddof=1) const
return the standard deviation of all values aggregated
Definition: aggregate.hpp:87
double merged_means(const Aggregate &a) const noexcept
Definition: aggregate.hpp:147
double mean_
mean of values
Definition: aggregate.hpp:168
int value
Definition: gen_data.py:41
size_t count_
number of values aggregated
Definition: aggregate.hpp:165
size_t Count() const noexcept
return number of values aggregated
Definition: aggregate.hpp:65
Aggregate & Add(const Type &value) noexcept
add a value to the running aggregation
Definition: aggregate.hpp:46
Type max_
maximum value
Definition: aggregate.hpp:177
const Type & Max() const noexcept
return maximum over all values aggregated
Definition: aggregate.hpp:84
double Average() const noexcept
return the average over all values aggregated
Definition: aggregate.hpp:72
Aggregate & operator+=(const Aggregate &a) noexcept
operator +=
Definition: aggregate.hpp:111
static uint_pair min()
return an uint_pair instance containing the smallest value possible
Definition: uint_types.hpp:217
Aggregate()=default
default constructor
double Avg() const noexcept
return the average over all values aggregated
Definition: aggregate.hpp:75
Aggregate operator+(const Aggregate &a) const noexcept
operator +
Definition: aggregate.hpp:99
static constexpr const T & min(const T &a, const T &b)
template for constexpr min, because std::min is not good enough.
Definition: functional.hpp:59
double Mean() const noexcept
return the average over all values aggregated
Definition: aggregate.hpp:78
static Aggregate ThrillDeserialize(Archive &ar)
deserialization with Thrill's serializer
Definition: aggregate.hpp:132
double StDev(size_t ddof=1) const
return the standard deviation of all values aggregated
Definition: aggregate.hpp:96
const Type Total() const
return sum over all values aggregated
Definition: aggregate.hpp:69
static constexpr const T & max(const T &a, const T &b)
template for constexpr max, because std::max is not good enough.
Definition: functional.hpp:65