Thrill  0.1
k-means_step2.cpp
Go to the documentation of this file.
1 /*******************************************************************************
2  * examples/tutorial/k-means_step2.cpp
3  *
4  * Part of Project Thrill - http://project-thrill.org
5  *
6  * Copyright (C) 2016 Timo Bingmann <[email protected]>
7  *
9  ******************************************************************************/
10
11 //! \example examples/tutorial/k-means_step2.cpp
12 //!
13 //! This example is part of the k-means tutorial. See \ref kmeans_tutorial_step2
14
16 #include <thrill/api/cache.hpp>
17 #include <thrill/api/generate.hpp>
18 #include <thrill/api/print.hpp>
19 #include <thrill/api/sample.hpp>
20
21 #include <ostream>
22 #include <random>
23 #include <vector>
24
25 //! [Point class]
26 //! A 2-dimensional point with double precision
struct Point {
    //! x coordinate
    double x;
    //! y coordinate
    double y;

    //! Squared Euclidean distance between this point and \p b. The square
    //! root is not taken; the squared value suffices for comparing distances.
    double DistanceSquare(const Point& b) const {
        const double dx = x - b.x;
        const double dy = y - b.y;
        return dx * dx + dy * dy;
    }
};
35 //! [Point class]
36
37 //! make ostream-able for Print()
38 std::ostream& operator << (std::ostream& os, const Point& p) {
39  return os << '(' << p.x << ',' << p.y << ')';
40 }
41
42 //! [ClosestCenter class]
43 //! Assignment of a point to a cluster.
44 struct ClosestCenter {
45  size_t cluster_id;
46  Point point;
47 };
48 //! make ostream-able for Print()
49 std::ostream& operator << (std::ostream& os, const ClosestCenter& cc) {
50  return os << '(' << cc.cluster_id << ':' << cc.point << ')';
51 }
52 //! [ClosestCenter class]
53
54 void Process(thrill::Context& ctx) {
55
56  std::default_random_engine rng(std::random_device { } ());
57  std::uniform_real_distribution<double> dist(0.0, 1000.0);
58
59  // generate 100 random points using uniform distribution
60  auto points =
61  Generate(
62  ctx, /* size */ 100,
63  [&](const size_t&) {
64  return Point { dist(rng), dist(rng) };
65  })
66  .Cache();
67
68  // print out the points
69  points.Print("points");
70
71  //! [step2 sample]
72  // pick some initial random cluster centers
73  auto centers = points.Sample(/* num_clusters */ 10);
74  //! [step2 sample]
75
76  //! [step2 classify]
77  // collect centers in a local vector on each worker
78  std::vector<Point> local_centers = centers.AllGather();
79
80  // calculate the closest center for each point
81  auto closest = points.Map(
82  [local_centers](const Point& p) {
83  double min_dist = p.DistanceSquare(local_centers[0]);
84  size_t cluster_id = 0;
85
86  for (size_t i = 1; i < local_centers.size(); ++i) {
87  double dist = p.DistanceSquare(local_centers[i]);
88  if (dist < min_dist)
89  min_dist = dist, cluster_id = i;
90  }
91  return ClosestCenter { cluster_id, p };
92  });
93
94  closest.Print("closest");
95  //! [step2 classify]
96 }
97
98 int main() {
99  // launch Thrill program: the lambda function will be run on each worker.
100  return thrill::Run(
101  [&](thrill::Context& ctx) { Process(ctx); });
102 }
103
104 /******************************************************************************/
auto Generate(Context &ctx, size_t size, const GenerateFunction &generate_function)
Generate is a Source-DOp, which creates a DIA of given size using a generator function.
Definition: generate.hpp:85
int Run(const std::function< void(Context &)> &job_startpoint)
Runs the given job startpoint with a Context instance.
Definition: context.cpp:863
thrill::common::Vector< D, double > Point
Compile-Time Fixed-Dimensional Points.
Definition: k-means.hpp:39
std::ostream & operator<<(std::ostream &os, const Point &p)
[Point class]
list x
Definition: gen_data.py:39
void Process(thrill::Context &ctx)
[ClosestCenter class]
int main()