Thrill  0.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
k-means.hpp
Go to the documentation of this file.
1 /*******************************************************************************
2  * examples/k-means/k-means.hpp
3  *
4  * Part of Project Thrill - http://project-thrill.org
5  *
6  * Copyright (C) 2015 Matthias Stumpp <[email protected]>
7  * Copyright (C) 2016 Timo Bingmann <[email protected]>
8  *
9  * All rights reserved. Published under the BSD-2 license in the LICENSE file.
10  ******************************************************************************/
11 
12 #pragma once
13 #ifndef THRILL_EXAMPLES_K_MEANS_K_MEANS_HEADER
14 #define THRILL_EXAMPLES_K_MEANS_K_MEANS_HEADER
15 
17 #include <thrill/api/cache.hpp>
18 #include <thrill/api/collapse.hpp>
20 #include <thrill/api/sample.hpp>
21 #include <thrill/api/sum.hpp>
22 #include <thrill/common/vector.hpp>
23 
24 #include <cereal/types/vector.hpp>
26 
27 #include <limits>
28 #include <string>
29 #include <utility>
30 #include <vector>
31 
32 namespace examples {
33 namespace k_means {
34 
35 using thrill::DIA;
36 
37 //! Compile-Time Fixed-Dimensional Points
38 template <size_t D>
40 
41 //! A variable D-dimensional point with double precision
43 
44 template <typename Point>
45 using PointClusterId = std::pair<Point, size_t>;
46 
47 //! A point which contains "count" accumulated vectors.
48 template <typename Point>
51  size_t count;
52 
53  template <typename Archive>
54  void serialize(Archive& archive) {
55  archive(p, count);
56  }
57 };
58 
59 //! Assignment of a point to a cluster, which is the input to
60 template <typename Point>
62  size_t cluster_id;
64 
65  template <typename Archive>
66  void serialize(Archive& archive) {
67  archive(cluster_id, center);
68  }
69 };
70 
71 //! Model returned by KMeans algorithm containing results.
72 template <typename Point>
74 {
75 public:
77  const std::vector<Point>& centroids)
78  : dimensions_(dimensions), num_clusters_(num_clusters),
79  iterations_(iterations), centroids_(centroids)
80  { }
81 
82  //! \name Accessors
83  //! \{
84 
85  //! Returns dimensions_
86  size_t dimensions() const { return dimensions_; }
87 
88  //! Returns number of clusters
89  size_t num_clusters() const { return num_clusters_; }
90 
91  //! Returns iterations_
92  size_t iterations() const { return iterations_; }
93 
94  //! Returns centroids_
95  const std::vector<Point>& centroids() const { return centroids_; }
96 
97  //! \}
98 
99  //! \name Classification
100  //! \{
101 
102  //! Calculate closest cluster to point
103  size_t Classify(const Point& p) const {
104  double min_dist = p.DistanceSquare(centroids_[0]);
105  size_t closest_id = 0;
106  for (size_t i = 1; i < centroids_.size(); ++i) {
107  double dist = p.DistanceSquare(centroids_[i]);
108  if (dist < min_dist) {
109  min_dist = dist;
110  closest_id = i;
111  }
112  }
113  return closest_id;
114  }
115 
116  //! Calculate closest cluster to all points, returns DIA containing only the
117  //! cluster ids.
118  template <typename PointDIA>
119  auto Classify(const PointDIA& points) const {
120  return points
121  .Map([this](const Point& p) { return Classify(p); });
122  }
123 
124  //! Calculate closest cluster to all points, returns DIA contains pairs of
125  //! points and their cluster id.
126  template <typename PointDIA>
127  auto ClassifyPairs(const PointDIA& points) const {
128  return points
129  .Map([this](const Point& p) {
130  return PointClusterId<Point>(p, Classify(p));
131  });
132  }
133 
134  //! Calculate the k-means cost: the squared distance to the nearest center.
135  double ComputeCost(const Point& p) const {
136  double min_dist = p.DistanceSquare(centroids_[0]);
137  for (size_t i = 1; i < centroids_.size(); ++i) {
138  double dist = p.DistanceSquare(centroids_[i]);
139  if (dist < min_dist) {
140  min_dist = dist;
141  }
142  }
143  return min_dist;
144  }
145 
146  //! Calculate the overall k-means cost: the sum of squared distances to
147  //! their nearest center.
148  template <typename PointDIA>
149  double ComputeCost(const PointDIA& points) const {
150  return points
151  .Map([this](const Point& p) { return ComputeCost(p); })
152  .Sum();
153  }
154 
155  //! \}
156 
157 private:
158  //! dimensions of space
159  size_t dimensions_;
160 
161  //! number of clusters
163 
164  //! number of iterations
165  size_t iterations_;
166 
167  //! computed centroids in cluster id order
168  std::vector<Point> centroids_;
169 };
170 
171 //! Calculate k-Means using Lloyd's Algorithm. The DIA centroids is both an
172 //! input and an output parameter. The method returns a std::pair<Point2D,
173 //! size_t> = Point2DClusterId into the centroids for each input point.
174 template <typename Point, typename InStack>
175 auto KMeans(const DIA<Point, InStack>& input_points,
176  size_t dimensions, size_t num_clusters, size_t iterations) {
177 
178  auto points = input_points.Cache();
179 
182 
183  DIA<Point> centroids = points.Keep().Sample(num_clusters);
184 
185  for (size_t iter = 0; iter < iterations; ++iter) {
186 
187  // handling this local variable is difficult: it is calculated as an
188  // Action here, but must exist later when the Map() is
189  // processed. Hhence, we move it into the closure.
190  std::vector<Point> local_centroids = centroids.AllGather();
191 
192  // calculate the closest centroid for each point
193  auto closest = points.Keep().Map(
194  [local_centroids = std::move(local_centroids)](const Point& p) {
195  assert(local_centroids.size());
196  double min_dist = p.DistanceSquare(local_centroids[0]);
197  size_t closest_id = 0;
198 
199  for (size_t i = 1; i < local_centroids.size(); ++i) {
200  double dist = p.DistanceSquare(local_centroids[i]);
201  if (dist < min_dist) {
202  min_dist = dist;
203  closest_id = i;
204  }
205  }
206  return ClosestCentroid {
207  closest_id, CentroidAccumulated { p, 1 }
208  };
209  });
210 
211  // Calculate new centroids as the mean of all points associated with it.
212  centroids =
213  closest
214  .ReduceByKey(
215  [](const ClosestCentroid& cc) { return cc.cluster_id; },
216  [](const ClosestCentroid& a, const ClosestCentroid& b) {
217  return ClosestCentroid {
218  a.cluster_id,
219  CentroidAccumulated { a.center.p + b.center.p,
220  a.center.count + b.center.count }
221  };
222  })
223  .Map([](const ClosestCentroid& cc) {
224  return cc.center.p / static_cast<double>(cc.center.count);
225  })
226  .Collapse();
227  }
228 
229  return KMeansModel<Point>(
230  dimensions, num_clusters, iterations,
231  centroids.AllGather());
232 }
233 
234 } // namespace k_means
235 } // namespace examples
236 
237 #endif // !THRILL_EXAMPLES_K_MEANS_K_MEANS_HEADER
238 
239 /******************************************************************************/
double ComputeCost(const PointDIA &points) const
Definition: k-means.hpp:149
Model returned by KMeans algorithm containing results.
Definition: k-means.hpp:73
auto Classify(const PointDIA &points) const
Definition: k-means.hpp:119
size_t iterations_
number of iterations
Definition: k-means.hpp:165
A point which contains "count" accumulated vectors.
Definition: k-means.hpp:49
A variable-length D-dimensional point with double precision.
Definition: vector.hpp:123
void serialize(Archive &archive)
Definition: k-means.hpp:54
KMeansModel(size_t dimensions, size_t num_clusters, size_t iterations, const std::vector< Point > &centroids)
Definition: k-means.hpp:76
auto ClassifyPairs(const PointDIA &points) const
Definition: k-means.hpp:127
double ComputeCost(const Point &p) const
Calculate the k-means cost: the squared distance to the nearest center.
Definition: k-means.hpp:135
std::pair< Point, size_t > PointClusterId
Definition: k-means.hpp:45
size_t num_clusters_
number of clusters
Definition: k-means.hpp:162
std::vector< Point > centroids_
computed centroids in cluster id order
Definition: k-means.hpp:168
void serialize(Archive &archive)
Definition: k-means.hpp:66
size_t Classify(const Point &p) const
Calculate closest cluster to point.
Definition: k-means.hpp:103
size_t iterations() const
Returns iterations_.
Definition: k-means.hpp:92
CentroidAccumulated< Point > center
Definition: k-means.hpp:63
Assignment of a point to a cluster, which is the input to.
Definition: k-means.hpp:61
Type DistanceSquare(const Vector &b) const
Definition: vector.hpp:64
const std::vector< Point > & centroids() const
Returns centroids_.
Definition: k-means.hpp:95
size_t dimensions() const
Returns dimensions_.
Definition: k-means.hpp:86
size_t dimensions_
dimensions of space
Definition: k-means.hpp:159
size_t num_clusters() const
Returns number of clusters.
Definition: k-means.hpp:89
auto KMeans(const DIA< Point, InStack > &input_points, size_t dimensions, size_t num_clusters, size_t iterations)
Definition: k-means.hpp:175