Thrill  0.1
k-means.hpp
Go to the documentation of this file.
1 /*******************************************************************************
2  * examples/k-means/k-means.hpp
3  *
4  * Part of Project Thrill - http://project-thrill.org
5  *
6  * Copyright (C) 2015 Matthias Stumpp <[email protected]>
7  * Copyright (C) 2016 Timo Bingmann <[email protected]>
8  *
9  * All rights reserved. Published under the BSD-2 license in the LICENSE file.
10  ******************************************************************************/
11 
12 #pragma once
13 #ifndef THRILL_EXAMPLES_K_MEANS_K_MEANS_HEADER
14 #define THRILL_EXAMPLES_K_MEANS_K_MEANS_HEADER
15 
17 #include <thrill/api/cache.hpp>
18 #include <thrill/api/collapse.hpp>
20 #include <thrill/api/sample.hpp>
21 #include <thrill/api/sum.hpp>
22 #include <thrill/common/vector.hpp>
23 
24 #include <cereal/types/vector.hpp>
26 
27 #include <limits>
28 #include <string>
29 #include <utility>
30 #include <vector>
31 
32 namespace examples {
33 namespace k_means {
34 
35 using thrill::DIA;
36 
37 //! Compile-Time Fixed-Dimensional Points
38 template <size_t D>
40 
41 //! A variable D-dimensional point with double precision
43 
44 template <typename Point>
45 using PointClusterId = std::pair<Point, size_t>;
46 
47 //! A point which contains "count" accumulated vectors.
48 template <typename Point>
51  size_t count;
52 
53  template <typename Archive>
54  void serialize(Archive& archive) {
55  archive(p, count);
56  }
57 };
58 
59 //! Assignment of a point to a cluster, which is the input to
60 template <typename Point>
62  size_t cluster_id;
64 
65  template <typename Archive>
66  void serialize(Archive& archive) {
67  archive(cluster_id, center);
68  }
69 };
70 
71 //! Model returned by KMeans algorithm containing results.
72 template <typename Point>
74 {
75 public:
76  KMeansModel(size_t dimensions, size_t num_clusters, size_t iterations,
77  const std::vector<Point>& centroids)
78  : dimensions_(dimensions), num_clusters_(num_clusters),
79  iterations_(iterations), centroids_(centroids)
80  { }
81 
82  //! \name Accessors
83  //! \{
84 
85  //! Returns dimensions_
86  size_t dimensions() const { return dimensions_; }
87 
88  //! Returns number of clusters
89  size_t num_clusters() const { return num_clusters_; }
90 
91  //! Returns iterations_
92  size_t iterations() const { return iterations_; }
93 
94  //! Returns centroids_
95  const std::vector<Point>& centroids() const { return centroids_; }
96 
97  //! \}
98 
99  //! \name Classification
100  //! \{
101 
102  //! Calculate closest cluster to point
103  size_t Classify(const Point& p) const {
104  double min_dist = p.DistanceSquare(centroids_[0]);
105  size_t closest_id = 0;
106  for (size_t i = 1; i < centroids_.size(); ++i) {
107  double dist = p.DistanceSquare(centroids_[i]);
108  if (dist < min_dist) {
109  min_dist = dist;
110  closest_id = i;
111  }
112  }
113  return closest_id;
114  }
115 
116  //! Calculate closest cluster to all points, returns DIA containing only the
117  //! cluster ids.
118  template <typename PointDIA>
119  auto Classify(const PointDIA& points) const {
120  return points
121  .Map([this](const Point& p) { return Classify(p); });
122  }
123 
124  //! Calculate closest cluster to all points, returns DIA contains pairs of
125  //! points and their cluster id.
126  template <typename PointDIA>
127  auto ClassifyPairs(const PointDIA& points) const {
128  return points
129  .Map([this](const Point& p) {
130  return PointClusterId<Point>(p, Classify(p));
131  });
132  }
133 
134  //! Calculate the k-means cost: the squared distance to the nearest center.
135  double ComputeCost(const Point& p) const {
136  double min_dist = p.DistanceSquare(centroids_[0]);
137  for (size_t i = 1; i < centroids_.size(); ++i) {
138  double dist = p.DistanceSquare(centroids_[i]);
139  if (dist < min_dist) {
140  min_dist = dist;
141  }
142  }
143  return min_dist;
144  }
145 
146  //! Calculate the overall k-means cost: the sum of squared distances to
147  //! their nearest center.
148  template <typename PointDIA>
149  double ComputeCost(const PointDIA& points) const {
150  return points
151  .Map([this](const Point& p) { return ComputeCost(p); })
152  .Sum();
153  }
154 
155  //! \}
156 
157 private:
158  //! dimensions of space
159  size_t dimensions_;
160 
161  //! number of clusters
163 
164  //! number of iterations
165  size_t iterations_;
166 
167  //! computed centroids in cluster id order
168  std::vector<Point> centroids_;
169 };
170 
171 //! Calculate k-Means using Lloyd's Algorithm. The DIA centroids is both an
172 //! input and an output parameter. The method returns a std::pair<Point2D,
173 //! size_t> = Point2DClusterId into the centroids for each input point.
174 template <typename Point, typename InStack>
175 auto KMeans(const DIA<Point, InStack>& input_points, size_t dimensions,
176  size_t num_clusters, size_t iterations, double epsilon = 0.0) {
177 
178  auto points = input_points.Cache();
179 
180  bool break_condition = false;
181 
184 
185  std::vector<Point> local_centroids =
186  points.Keep().Sample(num_clusters).AllGather();
187 
188  for (size_t iter = 0; iter < iterations && !break_condition; ++iter) {
189 
190  std::vector<Point> old_centroids = local_centroids;
191 
192  // calculate the closest centroid for each point
193  auto closest = points.Keep().Map(
194  [&local_centroids](const Point& p) {
195  assert(local_centroids.size());
196  double min_dist = p.DistanceSquare(local_centroids[0]);
197  size_t closest_id = 0;
198 
199  for (size_t i = 1; i < local_centroids.size(); ++i) {
200  double dist = p.DistanceSquare(local_centroids[i]);
201  if (dist < min_dist) {
202  min_dist = dist;
203  closest_id = i;
204  }
205  }
206  return ClosestCentroid {
207  closest_id, CentroidAccumulated { p, 1 }
208  };
209  });
210 
211  // Calculate new centroids as the mean of all points associated with it.
212  auto centroids =
213  closest
214  .ReduceByKey(
215  [](const ClosestCentroid& cc) { return cc.cluster_id; },
216  [](const ClosestCentroid& a, const ClosestCentroid& b) {
217  return ClosestCentroid {
218  a.cluster_id,
219  CentroidAccumulated { a.center.p + b.center.p,
220  a.center.count + b.center.count }
221  };
222  })
223  .Map([](const ClosestCentroid& cc) {
224  return CentroidAccumulated {
225  cc.center.p / static_cast<double>(cc.center.count),
226  cc.cluster_id
227  };
228  })
229  .Collapse();
230 
231  // collect centroids again, and put back into cluster order
232  for (const CentroidAccumulated& uc : centroids.AllGather()) {
233  local_centroids[uc.count] = uc.p;
234  }
235 
236  // Check whether centroid positions changed significantly, if yes do
237  // another iteration. only check if epsilon > 0, otherwise we run a
238  // fixed number of iterations.
239  if (epsilon > 0) {
240  break_condition = true;
241 
242  for (size_t i = 0; i < local_centroids.size(); ++i) {
243  if (local_centroids[i].Distance(old_centroids[i]) > epsilon) {
244  break_condition = false;
245  break;
246  }
247  }
248  }
249  }
250 
251  return KMeansModel<Point>(
252  dimensions, num_clusters, iterations,
253  local_centroids);
254 }
255 
256 //! Calculate k-Means using bisecting method
257 template <typename Point, typename InStack>
258 auto BisecKMeans(const DIA<Point, InStack>& input_points, size_t dimensions,
259  size_t num_clusters, size_t iterations, double epsilon) {
260 
263 
264  //! initial cluster size
265  size_t initial_size = num_clusters <= 2 ? num_clusters : 2;
266 
267  //! model that is steadily updated and returned to the calling function
268  auto result_model =
269  KMeans(input_points, dimensions, initial_size, iterations, epsilon);
270 
271  for (size_t size = initial_size; size < num_clusters; ++size) {
272 
273  // Classify all points for the current k-Means model
274  auto classified_points =
275  result_model.ClassifyPairs(input_points)
276  .Map([](const PointClusterId<Point>& pci) {
277  return ClosestCentroid {
278  pci.second,
279  CentroidAccumulated { pci.first, 1 }
280  };
281  });
282 
283  // Create a vector containing the cluster IDs and the number
284  // of closest points in order to determine the biggest cluster
285  size_t biggest_cluster_idx =
286  classified_points
287  .ReduceByKey(
288  [](const ClosestCentroid& cc) { return cc.cluster_id; },
289  [](const ClosestCentroid& a, const ClosestCentroid& b) {
290  return ClosestCentroid {
291  a.cluster_id,
293  a.center.count + b.center.count }
294  };
295  })
296  .AllReduce(
297  [](const ClosestCentroid& cc1, const ClosestCentroid& cc2) {
298  return cc1.center.count > cc2.center.count ? cc1 : cc2;
299  })
300  .cluster_id;
301 
302  // Filter the points of the biggest cluster for a further split
303  auto filtered_points =
304  classified_points
305  .Filter(
306  [biggest_cluster_idx](const ClosestCentroid& cc) {
307  return cc.cluster_id == biggest_cluster_idx;
308  })
309  .Map(
310  [](const ClosestCentroid& cc) {
311  return cc.center.p;
312  });
313 
314  // Compute two new cluster by splitting the biggest one
315  std::vector<Point> tmp_model_centroids =
316  KMeans(filtered_points, dimensions, 2, iterations, epsilon)
317  .centroids();
318 
319  // Delete the centroid of the biggest cluster
320  // Add two new centroids calculated in the previous step
321  std::vector<Point> result_model_centroids = result_model.centroids();
322  result_model_centroids.erase(
323  result_model_centroids.begin() + biggest_cluster_idx);
324  result_model_centroids.insert(
325  result_model_centroids.end(),
326  tmp_model_centroids.begin(), tmp_model_centroids.end());
327 
328  // Update centroids of the result_model
329  result_model = KMeansModel<Point>(
330  dimensions, num_clusters, iterations, result_model_centroids);
331  }
332 
333  return result_model;
334 }
335 
336 } // namespace k_means
337 } // namespace examples
338 
339 #endif // !THRILL_EXAMPLES_K_MEANS_K_MEANS_HEADER
340 
341 /******************************************************************************/
Model returned by KMeans algorithm containing results.
Definition: k-means.hpp:73
DIA is the interface between the user and the Thrill framework.
Definition: dia.hpp:141
auto BisecKMeans(const DIA< Point, InStack > &input_points, size_t dimensions, size_t num_clusters, size_t iterations, double epsilon)
Calculate k-Means using bisecting method.
Definition: k-means.hpp:258
size_t iterations_
number of iterations
Definition: k-means.hpp:165
A point which contains "count" accumulated vectors.
Definition: k-means.hpp:49
A variable-length D-dimensional point with double precision.
Definition: vector.hpp:123
void serialize(Archive &archive)
Definition: k-means.hpp:54
KMeansModel(size_t dimensions, size_t num_clusters, size_t iterations, const std::vector< Point > &centroids)
Definition: k-means.hpp:76
double ComputeCost(const PointDIA &points) const
Definition: k-means.hpp:149
size_t dimensions() const
Returns dimensions_.
Definition: k-means.hpp:86
Definition: bfs.hpp:21
size_t Classify(const Point &p) const
Calculate closest cluster to point.
Definition: k-means.hpp:103
std::pair< Point, size_t > PointClusterId
Definition: k-means.hpp:45
auto Classify(const PointDIA &points) const
Definition: k-means.hpp:119
size_t num_clusters_
number of clusters
Definition: k-means.hpp:162
const std::vector< Point > & centroids() const
Returns centroids_.
Definition: k-means.hpp:95
Type DistanceSquare(const Vector &b) const
Definition: vector.hpp:64
std::vector< Point > centroids_
computed centroids in cluster id order
Definition: k-means.hpp:168
void serialize(Archive &archive)
Definition: k-means.hpp:66
auto KMeans(const DIA< Point, InStack > &input_points, size_t dimensions, size_t num_clusters, size_t iterations, double epsilon=0.0)
Definition: k-means.hpp:175
DIA< ValueType > Cache() const
Create a CacheNode which contains all items of a DIA in calculated plain format.
Definition: cache.hpp:93
size_t iterations() const
Returns iterations_.
Definition: k-means.hpp:92
CentroidAccumulated< Point > center
Definition: k-means.hpp:63
auto ClassifyPairs(const PointDIA &points) const
Definition: k-means.hpp:127
Assignment of a point to a cluster, which is the input to.
Definition: k-means.hpp:61
size_t num_clusters() const
Returns number of clusters.
Definition: k-means.hpp:89
double ComputeCost(const Point &p) const
Calculate the k-means cost: the squared distance to the nearest center.
Definition: k-means.hpp:135
size_t dimensions_
dimensions of space
Definition: k-means.hpp:159