Thrill  0.1
k-means_step3.cpp
Go to the documentation of this file.
1 /*******************************************************************************
2  * examples/tutorial/k-means_step3.cpp
3  *
4  * Part of Project Thrill - http://project-thrill.org
5  *
6  * Copyright (C) 2016 Timo Bingmann <[email protected]>
7  *
8  * All rights reserved. Published under the BSD-2 license in the LICENSE file.
9  ******************************************************************************/
10 
11 //! \example examples/tutorial/k-means_step3.cpp
12 //!
13 //! This example is part of the k-means tutorial. See \ref kmeans_tutorial_step3
14 
16 #include <thrill/api/cache.hpp>
17 #include <thrill/api/generate.hpp>
18 #include <thrill/api/print.hpp>
20 #include <thrill/api/sample.hpp>
21 
22 #include <ostream>
23 #include <random>
24 #include <vector>
25 
26 //! [Point class]
27 //! A 2-dimensional point with double precision
28 struct Point {
29  //! point coordinates
30  double x, y;
31 
32  double DistanceSquare(const Point& b) const {
33  return (x - b.x) * (x - b.x) + (y - b.y) * (y - b.y);
34  }
35  Point operator + (const Point& b) const {
36  return Point { x + b.x, y + b.y };
37  }
38  Point operator / (double s) const {
39  return Point { x / s, y / s };
40  }
41 };
42 //! [Point class]
43 
44 //! make ostream-able for Print()
45 std::ostream& operator << (std::ostream& os, const Point& p) {
46  return os << '(' << p.x << ',' << p.y << ')';
47 }
48 
49 //! [ClosestCenter class]
50 //! Assignment of a point to a cluster.
51 struct ClosestCenter {
52  size_t cluster_id;
53  Point point;
54  size_t count;
55 };
56 //! make ostream-able for Print()
57 std::ostream& operator << (std::ostream& os, const ClosestCenter& cc) {
58  return os << '(' << cc.cluster_id
59  << ':' << cc.point << ':' << cc.count << ')';
60 }
61 //! [ClosestCenter class]
62 
63 //! our main processing method
65 
66  std::default_random_engine rng(std::random_device { } ());
67  std::uniform_real_distribution<double> dist(0.0, 1000.0);
68 
69  // generate 100 random points using uniform distribution
70  auto points =
71  Generate(
72  ctx, /* size */ 100,
73  [&](const size_t&) {
74  return Point { dist(rng), dist(rng) };
75  })
76  .Cache();
77 
78  // print out the points
79  points.Print("points");
80 
81  // pick some initial random cluster centers
82  auto centers = points.Sample(/* num_clusters */ 10);
83 
84  // collect centers in a local vector on each worker
85  std::vector<Point> local_centers = centers.AllGather();
86 
87  // calculate the closest center for each point
88  auto closest = points.Map(
89  [local_centers](const Point& p) {
90  double min_dist = p.DistanceSquare(local_centers[0]);
91  size_t cluster_id = 0;
92 
93  for (size_t i = 1; i < local_centers.size(); ++i) {
94  double dist = p.DistanceSquare(local_centers[i]);
95  if (dist < min_dist)
96  min_dist = dist, cluster_id = i;
97  }
98  //! [step3 count 1]
99  return ClosestCenter { cluster_id, p, /* count */ 1 };
100  //! [step3 count 1]
101  });
102 
103  closest.Print("closest");
104 
105  //! [step3 ReduceByKey]
106  // calculate new centers as the mean of all points associated with its id
107  auto reduced_centers =
108  closest
109  .ReduceByKey(
110  // key extractor: the cluster id
111  [](const ClosestCenter& cc) { return cc.cluster_id; },
112  // reduction: add points and the counter
113  [](const ClosestCenter& a, const ClosestCenter& b) {
114  return ClosestCenter {
115  a.cluster_id, a.point + b.point, a.count + b.count
116  };
117  });
118 
119  reduced_centers.Print("reduced_centers");
120  //! [step3 ReduceByKey]
121 
122  //! [step3 ReduceByKey divide by count]
123  auto new_centers =
124  reduced_centers
125  .Map([](const ClosestCenter& cc) {
126  return cc.point / cc.count;
127  });
128 
129  new_centers.Print("new_centers");
130  //! [step3 ReduceByKey divide by count]
131 }
132 
133 int main() {
134  // launch Thrill program: the lambda function will be run on each worker.
135  return thrill::Run(
136  [&](thrill::Context& ctx) { Process(ctx); });
137 }
138 
139 /******************************************************************************/
auto Generate(Context &ctx, size_t size, const GenerateFunction &generate_function)
Generate is a Source-DOp, which creates a DIA of given size using a generator function.
Definition: generate.hpp:87
int Run(const std::function< void(Context &)> &job_startpoint)
Runs the given job startpoint with a Context instance.
Definition: context.cpp:947
thrill::common::Vector< D, double > Point
Compile-Time Fixed-Dimensional Points.
Definition: k-means.hpp:39
The Context of a job is a unique instance per worker which holds references to all underlying parts o...
Definition: context.hpp:221
void Process(thrill::Context &ctx)
[ClosestCenter class]
list x
Definition: gen_data.py:39
int main()
std::ostream & operator<<(std::ostream &os, const Point &p)
[Point class]