Thrill  0.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
k-means_run.cpp
Go to the documentation of this file.
1 /*******************************************************************************
2  * examples/k-means/k-means_run.cpp
3  *
4  * Part of Project Thrill - http://project-thrill.org
5  *
6  * Copyright (C) 2015 Matthias Stumpp <[email protected]>
7  * Copyright (C) 2016 Timo Bingmann <[email protected]>
8  *
9  * All rights reserved. Published under the BSD-2 license in the LICENSE file.
10  ******************************************************************************/
11 
13 
14 #include <thrill/api/gather.hpp>
15 #include <thrill/api/generate.hpp>
17 #include <thrill/common/logger.hpp>
18 #include <thrill/common/string.hpp>
19 #include <tlx/cmdline_parser.hpp>
20 
21 #include <algorithm>
22 #include <iomanip>
23 #include <random>
24 #include <string>
25 #include <utility>
26 #include <vector>
27 
28 using namespace examples::k_means; // NOLINT
29 
30 //! Output a #rrggbb color for each cluster index
31 class SVGColor
32 {
33 public:
34  explicit SVGColor(size_t cluster) : cluster_(cluster) { }
35  size_t cluster_;
36 };
37 
38 std::ostream& operator << (std::ostream& os, const SVGColor& c) {
39  os << "#" << std::hex << std::setfill('0') << std::setw(2)
40  << unsigned(static_cast<double>(3 * (c.cluster_ + 1) % 11) / 11.0 * 256)
41  << unsigned(static_cast<double>(7 * (c.cluster_ + 1) % 11) / 11.0 * 256)
42  << unsigned(static_cast<double>(9 * (c.cluster_ + 1) % 11) / 11.0 * 256);
43  return os;
44 }
45 
46 //! Output the points and centroids as a SVG drawing.
47 template <typename Point>
48 void OutputSVG(const std::string& svg_path, double svg_scale,
49  const DIA<Point>& list,
50  const KMeansModel<Point>& model) {
51  tlx::unused(svg_path);
52  tlx::unused(svg_scale);
53  tlx::unused(list);
54  tlx::unused(model);
55 }
56 
57 //! Output the points and centroids as a 2-D SVG drawing
58 template <>
59 void OutputSVG(const std::string& svg_path, double svg_scale,
60  const DIA<Point<2> >& point_dia,
61  const KMeansModel<Point<2> >& model) {
62  double width = 0, height = 0;
63 
64  using Point2D = Point<2>;
65 
66  const std::vector<Point2D>& centroids = model.centroids();
67  std::vector<PointClusterId<Point2D> > list =
68  model.ClassifyPairs(point_dia).Gather();
69 
70  for (const PointClusterId<Point2D>& p : list) {
71  width = std::max(width, p.first.x[0]);
72  height = std::max(height, p.first.x[1]);
73  }
74 
75  if (point_dia.context().my_rank() != 0) return;
76 
77  std::ofstream os(svg_path);
78 
79  os << "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n";
80  os << "<svg\n";
81  os << " xmlns:dc=\"http://purl.org/dc/elements/1.1/\"\n";
82  os << " xmlns:cc=\"http://creativecommons.org/ns#\"\n";
83  os << " xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"\n";
84  os << " xmlns:svg=\"http://www.w3.org/2000/svg\"\n";
85  os << " xmlns=\"http://www.w3.org/2000/svg\"\n";
86  os << " version=\"1.1\" id=\"svg2\" width=\"" << width * svg_scale
87  << "\" height=\"" << height * svg_scale << "\">\n";
88  os << " <g id=\"layer1\">\n";
89 
90  for (const PointClusterId<Point2D>& p : list) {
91  os << " <circle r=\"1\" cx=\"" << p.first.x[0] * svg_scale
92  << "\" cy=\"" << p.first.x[1] * svg_scale
93  << "\" style=\"stroke:none;stroke-opacity:1;fill:"
94  << SVGColor(p.second) << ";fill-opacity:1\" />\n";
95  }
96  for (size_t i = 0; i < centroids.size(); ++i) {
97  const Point2D& p = centroids[i];
98  os << " <circle r=\"4\" cx=\"" << p.x[0] * svg_scale
99  << "\" cy=\"" << p.x[1] * svg_scale
100  << "\" style=\"stroke:black;stroke-opacity:1;fill:"
101  << SVGColor(i) << ";fill-opacity:1\" />\n";
102  }
103  os << " </g>\n";
104  os << "</svg>\n";
105 }
106 
107 template <typename Point>
108 static void RunKMeansGenerated(
109  thrill::Context& ctx, bool bisecting,
110  size_t dimensions, size_t num_clusters, size_t iterations, double eps,
111  const std::string& svg_path, double svg_scale,
112  const std::vector<std::string>& input_paths) {
113 
115 
116  std::default_random_engine rng(123456);
117  std::uniform_real_distribution<float> dist(0.0, 1000.0);
118 
119  size_t num_points;
120  if (input_paths.size() != 1 ||
121  !thrill::common::from_str<size_t>(input_paths[0], num_points))
122  die("For generated data, set input_path to the number of points.");
123 
124  auto points =
125  Generate(
126  ctx, num_points,
127  [&](const size_t& /* index */) {
128  return Point::Random(dimensions, dist, rng);
129  })
130  .Cache().KeepForever();
131 
132  auto result = bisecting ?
133  BisecKMeans(points.Keep(), dimensions, num_clusters, iterations, eps) :
134  KMeans(points.Keep(), dimensions, num_clusters, iterations, eps);
135 
136  double cost = result.ComputeCost(points);
137  if (ctx.my_rank() == 0)
138  LOG1 << "k-means cost: " << cost;
139 
140  if (svg_path.size() && dimensions == 2) {
141  OutputSVG(svg_path, svg_scale, points, result);
142  }
143 
144  ctx.net.Barrier();
145  if (ctx.my_rank() == 0) {
146  auto traffic = ctx.net_manager().Traffic();
147  LOG1 << "RESULT"
148  << " benchmark=k-means"
149  << " bisecting=" << bisecting
150  << " dimensions=" << dimensions
151  << " num_clusters=" << num_clusters
152  << " iterations=" << iterations
153  << " eps=" << eps
154  << " cost=" << cost
155  << " time=" << timer
156  << " traffic=" << traffic.total()
157  << " hosts=" << ctx.num_hosts();
158  }
159 }
160 
161 template <typename Point>
162 static void RunKMeansFile(
163  thrill::Context& ctx, bool bisecting,
164  size_t dimensions, size_t num_clusters, size_t iterations, double eps,
165  const std::string& svg_path, double svg_scale,
166  const std::vector<std::string>& input_paths) {
167 
169 
170  auto points =
171  ReadLines(ctx, input_paths).Map(
172  [dimensions](const std::string& input) {
173  // parse "<pt> <pt> <pt> ..." lines
174  Point p = Point::Make(dimensions);
175  char* endptr = const_cast<char*>(input.c_str());
176  for (size_t i = 0; i < dimensions; ++i) {
177  while (*endptr == ' ') ++endptr;
178  p.x[i] = std::strtod(endptr, &endptr);
179  if (!endptr || (*endptr != ' ' && i != dimensions - 1)) {
180  die("Could not parse point coordinates: " << input);
181  }
182  }
183  while (*endptr == ' ') ++endptr;
184  if (!endptr || *endptr != 0) {
185  die("Could not parse point coordinates: " << input);
186  }
187  return p;
188  });
189 
190  auto result = bisecting ?
191  BisecKMeans(points.Keep(), dimensions, num_clusters, iterations, eps) :
192  KMeans(points.Keep(), dimensions, num_clusters, iterations, eps);
193 
194  double cost = result.ComputeCost(points.Keep());
195  if (ctx.my_rank() == 0)
196  LOG1 << "k-means cost: " << cost;
197 
198  if (svg_path.size() && dimensions == 2) {
199  OutputSVG(svg_path, svg_scale, points.Collapse(), result);
200  }
201 
202  ctx.net.Barrier();
203  if (ctx.my_rank() == 0) {
204  auto traffic = ctx.net_manager().Traffic();
205  LOG1 << "RESULT"
206  << " benchmark=k-means"
207  << " bisecting=" << bisecting
208  << " dimensions=" << dimensions
209  << " num_clusters=" << num_clusters
210  << " iterations=" << iterations
211  << " eps=" << eps
212  << " cost=" << cost
213  << " time=" << timer
214  << " traffic=" << traffic.total()
215  << " hosts=" << ctx.num_hosts();
216  }
217 }
218 
219 int main(int argc, char* argv[]) {
220 
221  tlx::CmdlineParser clp;
222 
223  bool generate = false;
224  clp.add_bool('g', "generate", generate,
225  "generate random data, set input = #points");
226 
227  bool bisecting = false;
228  clp.add_bool('b', "bisecting", bisecting,
229  "enable bisecting k-Means");
230 
231  size_t iterations = 10;
232  clp.add_size_t('n', "iterations", iterations,
233  "iterations, default: 10");
234 
235  size_t dimensions = 2;
236  clp.add_param_size_t("dim", dimensions,
237  "dimensions of points 2-10, default: 2");
238 
239  size_t num_clusters;
240  clp.add_param_size_t("clusters", num_clusters, "Number of clusters");
241 
242  double epsilon = 0;
243  clp.add_double('e', "epsilon", epsilon,
244  "centroid position delta for break condition, default: 0");
245 
246  std::string svg_path;
247  clp.add_string('s', "svg", svg_path,
248  "output path for svg drawing (only for dim = 2)");
249 
250  double svg_scale = 1;
251  clp.add_double('S', "svg-scale", svg_scale,
252  "scale coordinates for svg output, default: 1");
253 
254  std::vector<std::string> input_paths;
255  clp.add_param_stringlist("input", input_paths,
256  "input file pattern(s)");
257 
258  if (!clp.process(argc, argv)) {
259  return -1;
260  }
261 
262  clp.print_result();
263 
264  auto start_func =
265  [&](thrill::Context& ctx) {
266  ctx.enable_consume();
267 
268  if (generate) {
269  switch (dimensions) {
270  case 0:
271  die("Zero dimensional clustering is easy.");
272  case 2:
273  RunKMeansGenerated<Point<2> >(
274  ctx, bisecting, dimensions, num_clusters, iterations,
275  epsilon, svg_path, svg_scale, input_paths);
276  break;
277  case 3:
278  RunKMeansGenerated<Point<3> >(
279  ctx, bisecting, dimensions, num_clusters, iterations,
280  epsilon, svg_path, svg_scale, input_paths);
281  break;
282  default:
283  RunKMeansGenerated<VPoint>(
284  ctx, bisecting, dimensions, num_clusters, iterations,
285  epsilon, svg_path, svg_scale, input_paths);
286  }
287  }
288  else {
289  switch (dimensions) {
290  case 0:
291  die("Zero dimensional clustering is easy.");
292  case 2:
293  RunKMeansFile<Point<2> >(
294  ctx, bisecting, dimensions, num_clusters, iterations,
295  epsilon, svg_path, svg_scale, input_paths);
296  break;
297  case 3:
298  RunKMeansFile<Point<3> >(
299  ctx, bisecting, dimensions, num_clusters, iterations,
300  epsilon, svg_path, svg_scale, input_paths);
301  break;
302  default:
303  RunKMeansFile<VPoint>(
304  ctx, bisecting, dimensions, num_clusters, iterations,
305  epsilon, svg_path, svg_scale, input_paths);
306  }
307  }
308  };
309 
310  return thrill::Run(start_func);
311 }
312 
313 /******************************************************************************/
static Vector Make(size_t D_)
Definition: vector.hpp:39
Model returned by KMeans algorithm containing results.
Definition: k-means.hpp:73
void add_size_t(char key, const std::string &longkey, const std::string &keytype, size_t &dest, const std::string &desc)
auto BisecKMeans(const DIA< Point, InStack > &input_points, size_t dimensions, size_t num_clusters, size_t iterations, double epsilon)
Calculate k-Means using bisecting method.
Definition: k-means.hpp:258
auto Generate(Context &ctx, size_t size, const GenerateFunction &generate_function)
Generate is a Source-DOp, which creates a DIA of given size using a generator function.
Definition: generate.hpp:85
#define LOG1
Definition: logger.hpp:176
void add_double(char key, const std::string &longkey, const std::string &keytype, double &dest, const std::string &desc)
int Run(const std::function< void(Context &)> &job_startpoint)
Runs the given job startpoint with a Context instance.
Definition: context.cpp:887
static void RunKMeansFile(thrill::Context &ctx, bool bisecting, size_t dimensions, size_t num_clusters, size_t iterations, double eps, const std::string &svg_path, double svg_scale, const std::vector< std::string > &input_paths)
thrill::common::Vector< D, double > Point
Compile-Time Fixed-Dimensional Points.
Definition: k-means.hpp:39
void add_param_size_t(const std::string &name, size_t &dest, const std::string &desc)
add size_t parameter [name] with description and store to dest
#define die(msg)
Instead of abort(), throw the output the message via an exception.
Definition: die.hpp:42
void add_string(char key, const std::string &longkey, const std::string &keytype, std::string &dest, const std::string &desc)
add string option -key, –longkey [keytype] and store to dest
std::pair< Point, size_t > PointClusterId
Definition: k-means.hpp:45
static Vector Random(size_t dim, Distribution &dist, Generator &gen)
Definition: vector.hpp:53
DIA< std::string > ReadLines(Context &ctx, const std::string &filepath)
ReadLines is a DOp, which reads a file from the file system and creates an ordered DIA according to a...
Definition: read_lines.hpp:452
void unused(Types &&...)
Definition: unused.hpp:20
int main(int argc, char *argv[])
void OutputSVG(const std::string &svg_path, double svg_scale, const DIA< Point > &list, const KMeansModel< Point > &model)
Output the points and centroids as a SVG drawing.
Definition: k-means_run.cpp:48
void print_result(std::ostream &os)
print nicely formatted result of processing
auto KMeans(const DIA< Point, InStack > &input_points, size_t dimensions, size_t num_clusters, size_t iterations, double epsilon=0.0)
Definition: k-means.hpp:175
std::basic_string< char, std::char_traits< char >, Allocator< char > > string
string with Manager tracking
Definition: allocator.hpp:220
Command line parser which automatically fills variables and prints nice usage messages.
static void RunKMeansGenerated(thrill::Context &ctx, bool bisecting, size_t dimensions, size_t num_clusters, size_t iterations, double eps, const std::string &svg_path, double svg_scale, const std::vector< std::string > &input_paths)
std::ostream & operator<<(std::ostream &os, const SVGColor &c)
Definition: k-means_run.cpp:38
void add_param_stringlist(const std::string &name, std::vector< std::string > &dest, const std::string &desc)
void add_bool(char key, const std::string &longkey, const std::string &keytype, bool &dest, const std::string &desc)
bool process(int argc, const char *const *argv, std::ostream &os)
static constexpr const T & max(const T &a, const T &b)
template for constexpr max, because std::max is not good enough.
Definition: functional.hpp:65