Thrill  0.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
k-means_run.cpp
Go to the documentation of this file.
1 /*******************************************************************************
2  * examples/k-means/k-means_run.cpp
3  *
4  * Part of Project Thrill - http://project-thrill.org
5  *
6  * Copyright (C) 2015 Matthias Stumpp <[email protected]>
7  * Copyright (C) 2016 Timo Bingmann <[email protected]>
8  *
9  * All rights reserved. Published under the BSD-2 license in the LICENSE file.
10  ******************************************************************************/
11 
13 
14 #include <thrill/api/gather.hpp>
15 #include <thrill/api/generate.hpp>
17 #include <thrill/common/logger.hpp>
18 #include <thrill/common/string.hpp>
19 #include <tlx/cmdline_parser.hpp>
20 
21 #include <algorithm>
22 #include <fstream>
23 #include <iomanip>
24 #include <random>
25 #include <string>
26 #include <utility>
27 #include <vector>
28 
29 using namespace examples::k_means; // NOLINT
30 
31 //! Output a #rrggbb color for each cluster index
32 class SVGColor
33 {
34 public:
35  explicit SVGColor(size_t cluster) : cluster_(cluster) { }
36  size_t cluster_;
37 };
38 
39 std::ostream& operator << (std::ostream& os, const SVGColor& c) {
40  os << "#" << std::hex << std::setfill('0') << std::setw(2)
41  << unsigned(static_cast<double>(3 * (c.cluster_ + 1) % 11) / 11.0 * 256)
42  << unsigned(static_cast<double>(7 * (c.cluster_ + 1) % 11) / 11.0 * 256)
43  << unsigned(static_cast<double>(9 * (c.cluster_ + 1) % 11) / 11.0 * 256);
44  return os;
45 }
46 
47 //! Output the points and centroids as a SVG drawing.
48 template <typename Point>
49 void OutputSVG(const std::string& svg_path, double svg_scale,
50  const DIA<Point>& list,
51  const KMeansModel<Point>& model) {
52  tlx::unused(svg_path);
53  tlx::unused(svg_scale);
54  tlx::unused(list);
55  tlx::unused(model);
56 }
57 
58 //! Output the points and centroids as a 2-D SVG drawing
59 template <>
60 void OutputSVG(const std::string& svg_path, double svg_scale,
61  const DIA<Point<2> >& point_dia,
62  const KMeansModel<Point<2> >& model) {
63  double width = 0, height = 0;
64 
65  using Point2D = Point<2>;
66 
67  const std::vector<Point2D>& centroids = model.centroids();
68  std::vector<PointClusterId<Point2D> > list =
69  model.ClassifyPairs(point_dia).Gather();
70 
71  for (const PointClusterId<Point2D>& p : list) {
72  width = std::max(width, p.first.x[0]);
73  height = std::max(height, p.first.x[1]);
74  }
75 
76  if (point_dia.context().my_rank() != 0) return;
77 
78  std::ofstream os(svg_path);
79 
80  os << "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n";
81  os << "<svg\n";
82  os << " xmlns:dc=\"http://purl.org/dc/elements/1.1/\"\n";
83  os << " xmlns:cc=\"http://creativecommons.org/ns#\"\n";
84  os << " xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"\n";
85  os << " xmlns:svg=\"http://www.w3.org/2000/svg\"\n";
86  os << " xmlns=\"http://www.w3.org/2000/svg\"\n";
87  os << " version=\"1.1\" id=\"svg2\" width=\"" << width * svg_scale
88  << "\" height=\"" << height * svg_scale << "\">\n";
89  os << " <g id=\"layer1\">\n";
90 
91  for (const PointClusterId<Point2D>& p : list) {
92  os << " <circle r=\"1\" cx=\"" << p.first.x[0] * svg_scale
93  << "\" cy=\"" << p.first.x[1] * svg_scale
94  << "\" style=\"stroke:none;stroke-opacity:1;fill:"
95  << SVGColor(p.second) << ";fill-opacity:1\" />\n";
96  }
97  for (size_t i = 0; i < centroids.size(); ++i) {
98  const Point2D& p = centroids[i];
99  os << " <circle r=\"4\" cx=\"" << p.x[0] * svg_scale
100  << "\" cy=\"" << p.x[1] * svg_scale
101  << "\" style=\"stroke:black;stroke-opacity:1;fill:"
102  << SVGColor(i) << ";fill-opacity:1\" />\n";
103  }
104  os << " </g>\n";
105  os << "</svg>\n";
106 }
107 
108 template <typename Point>
109 static void RunKMeansGenerated(
110  thrill::Context& ctx, bool bisecting,
111  size_t dimensions, size_t num_clusters, size_t iterations, double eps,
112  const std::string& svg_path, double svg_scale,
113  const std::vector<std::string>& input_paths) {
114 
116 
117  std::default_random_engine rng(123456);
118  std::uniform_real_distribution<float> dist(0.0, 1000.0);
119 
120  size_t num_points;
121  if (input_paths.size() != 1 ||
122  !thrill::common::from_str<size_t>(input_paths[0], num_points))
123  die("For generated data, set input_path to the number of points.");
124 
125  auto points =
126  Generate(
127  ctx, num_points,
128  [&](const size_t& /* index */) {
129  return Point::Random(dimensions, dist, rng);
130  })
131  .Cache().KeepForever();
132 
133  auto result = bisecting ?
134  BisecKMeans(points.Keep(), dimensions, num_clusters, iterations, eps) :
135  KMeans(points.Keep(), dimensions, num_clusters, iterations, eps);
136 
137  double cost = result.ComputeCost(points);
138  if (ctx.my_rank() == 0)
139  LOG1 << "k-means cost: " << cost;
140 
141  if (svg_path.size() && dimensions == 2) {
142  OutputSVG(svg_path, svg_scale, points, result);
143  }
144 
145  ctx.net.Barrier();
146  if (ctx.my_rank() == 0) {
147  auto traffic = ctx.net_manager().Traffic();
148  LOG1 << "RESULT"
149  << " benchmark=k-means"
150  << " bisecting=" << bisecting
151  << " dimensions=" << dimensions
152  << " num_clusters=" << num_clusters
153  << " iterations=" << iterations
154  << " eps=" << eps
155  << " cost=" << cost
156  << " time=" << timer
157  << " traffic=" << traffic.total()
158  << " hosts=" << ctx.num_hosts();
159  }
160 }
161 
162 template <typename Point>
163 static void RunKMeansFile(
164  thrill::Context& ctx, bool bisecting,
165  size_t dimensions, size_t num_clusters, size_t iterations, double eps,
166  const std::string& svg_path, double svg_scale,
167  const std::vector<std::string>& input_paths) {
168 
170 
171  auto points =
172  ReadLines(ctx, input_paths).Map(
173  [dimensions](const std::string& input) {
174  // parse "<pt> <pt> <pt> ..." lines
175  Point p = Point::Make(dimensions);
176  char* endptr = const_cast<char*>(input.c_str());
177  for (size_t i = 0; i < dimensions; ++i) {
178  while (*endptr == ' ') ++endptr;
179  p.x[i] = std::strtod(endptr, &endptr);
180  if (!endptr || (*endptr != ' ' && i != dimensions - 1)) {
181  die("Could not parse point coordinates: " << input);
182  }
183  }
184  while (*endptr == ' ') ++endptr;
185  if (!endptr || *endptr != 0) {
186  die("Could not parse point coordinates: " << input);
187  }
188  return p;
189  });
190 
191  auto result = bisecting ?
192  BisecKMeans(points.Keep(), dimensions, num_clusters, iterations, eps) :
193  KMeans(points.Keep(), dimensions, num_clusters, iterations, eps);
194 
195  double cost = result.ComputeCost(points.Keep());
196  if (ctx.my_rank() == 0)
197  LOG1 << "k-means cost: " << cost;
198 
199  if (svg_path.size() && dimensions == 2) {
200  OutputSVG(svg_path, svg_scale, points.Collapse(), result);
201  }
202 
203  ctx.net.Barrier();
204  if (ctx.my_rank() == 0) {
205  auto traffic = ctx.net_manager().Traffic();
206  LOG1 << "RESULT"
207  << " benchmark=k-means"
208  << " bisecting=" << bisecting
209  << " dimensions=" << dimensions
210  << " num_clusters=" << num_clusters
211  << " iterations=" << iterations
212  << " eps=" << eps
213  << " cost=" << cost
214  << " time=" << timer
215  << " traffic=" << traffic.total()
216  << " hosts=" << ctx.num_hosts();
217  }
218 }
219 
220 int main(int argc, char* argv[]) {
221 
222  tlx::CmdlineParser clp;
223 
224  bool generate = false;
225  clp.add_bool('g', "generate", generate,
226  "generate random data, set input = #points");
227 
228  bool bisecting = false;
229  clp.add_bool('b', "bisecting", bisecting,
230  "enable bisecting k-Means");
231 
232  size_t iterations = 10;
233  clp.add_size_t('n', "iterations", iterations,
234  "iterations, default: 10");
235 
236  size_t dimensions = 2;
237  clp.add_param_size_t("dim", dimensions,
238  "dimensions of points 2-10, default: 2");
239 
240  size_t num_clusters;
241  clp.add_param_size_t("clusters", num_clusters, "Number of clusters");
242 
243  double epsilon = 0;
244  clp.add_double('e', "epsilon", epsilon,
245  "centroid position delta for break condition, default: 0");
246 
247  std::string svg_path;
248  clp.add_string('s', "svg", svg_path,
249  "output path for svg drawing (only for dim = 2)");
250 
251  double svg_scale = 1;
252  clp.add_double('S', "svg-scale", svg_scale,
253  "scale coordinates for svg output, default: 1");
254 
255  std::vector<std::string> input_paths;
256  clp.add_param_stringlist("input", input_paths,
257  "input file pattern(s)");
258 
259  if (!clp.process(argc, argv)) {
260  return -1;
261  }
262 
263  clp.print_result();
264 
265  auto start_func =
266  [&](thrill::Context& ctx) {
267  ctx.enable_consume();
268 
269  if (generate) {
270  switch (dimensions) {
271  case 0:
272  die("Zero dimensional clustering is easy.");
273  case 2:
274  RunKMeansGenerated<Point<2> >(
275  ctx, bisecting, dimensions, num_clusters, iterations,
276  epsilon, svg_path, svg_scale, input_paths);
277  break;
278  case 3:
279  RunKMeansGenerated<Point<3> >(
280  ctx, bisecting, dimensions, num_clusters, iterations,
281  epsilon, svg_path, svg_scale, input_paths);
282  break;
283  default:
284  RunKMeansGenerated<VPoint>(
285  ctx, bisecting, dimensions, num_clusters, iterations,
286  epsilon, svg_path, svg_scale, input_paths);
287  }
288  }
289  else {
290  switch (dimensions) {
291  case 0:
292  die("Zero dimensional clustering is easy.");
293  case 2:
294  RunKMeansFile<Point<2> >(
295  ctx, bisecting, dimensions, num_clusters, iterations,
296  epsilon, svg_path, svg_scale, input_paths);
297  break;
298  case 3:
299  RunKMeansFile<Point<3> >(
300  ctx, bisecting, dimensions, num_clusters, iterations,
301  epsilon, svg_path, svg_scale, input_paths);
302  break;
303  default:
304  RunKMeansFile<VPoint>(
305  ctx, bisecting, dimensions, num_clusters, iterations,
306  epsilon, svg_path, svg_scale, input_paths);
307  }
308  }
309  };
310 
311  return thrill::Run(start_func);
312 }
313 
314 /******************************************************************************/
static Vector Make(size_t D_)
Definition: vector.hpp:39
Model returned by KMeans algorithm containing results.
Definition: k-means.hpp:73
static uint_pair max()
return an uint_pair instance containing the largest value possible
Definition: uint_types.hpp:226
void add_size_t(char key, const std::string &longkey, const std::string &keytype, size_t &dest, const std::string &desc)
auto BisecKMeans(const DIA< Point, InStack > &input_points, size_t dimensions, size_t num_clusters, size_t iterations, double epsilon)
Calculate k-Means using bisecting method.
Definition: k-means.hpp:258
auto Generate(Context &ctx, size_t size, const GenerateFunction &generate_function)
Generate is a Source-DOp, which creates a DIA of given size using a generator function.
Definition: generate.hpp:85
#define LOG1
Definition: logger.hpp:145
void add_double(char key, const std::string &longkey, const std::string &keytype, double &dest, const std::string &desc)
int Run(const std::function< void(Context &)> &job_startpoint)
Runs the given job startpoint with a Context instance.
Definition: context.cpp:863
static void RunKMeansFile(thrill::Context &ctx, bool bisecting, size_t dimensions, size_t num_clusters, size_t iterations, double eps, const std::string &svg_path, double svg_scale, const std::vector< std::string > &input_paths)
thrill::common::Vector< D, double > Point
Compile-Time Fixed-Dimensional Points.
Definition: k-means.hpp:39
void add_param_size_t(const std::string &name, size_t &dest, const std::string &desc)
add size_t parameter [name] with description and store to dest
#define die(msg)
Instead of std::terminate(), throw the output the message via an exception.
Definition: die.hpp:43
void add_string(char key, const std::string &longkey, const std::string &keytype, std::string &dest, const std::string &desc)
add string option -key, –longkey [keytype] and store to dest
std::pair< Point, size_t > PointClusterId
Definition: k-means.hpp:45
static Vector Random(size_t dim, Distribution &dist, Generator &gen)
Definition: vector.hpp:53
DIA< std::string > ReadLines(Context &ctx, const std::string &filepath)
ReadLines is a DOp, which reads a file from the file system and creates an ordered DIA according to a...
Definition: read_lines.hpp:452
void unused(Types &&...)
Definition: unused.hpp:20
int main(int argc, char *argv[])
void OutputSVG(const std::string &svg_path, double svg_scale, const DIA< Point > &list, const KMeansModel< Point > &model)
Output the points and centroids as a SVG drawing.
Definition: k-means_run.cpp:49
void print_result(std::ostream &os)
print nicely formatted result of processing
auto KMeans(const DIA< Point, InStack > &input_points, size_t dimensions, size_t num_clusters, size_t iterations, double epsilon=0.0)
Definition: k-means.hpp:175
std::basic_string< char, std::char_traits< char >, Allocator< char > > string
string with Manager tracking
Definition: allocator.hpp:220
Command line parser which automatically fills variables and prints nice usage messages.
static void RunKMeansGenerated(thrill::Context &ctx, bool bisecting, size_t dimensions, size_t num_clusters, size_t iterations, double eps, const std::string &svg_path, double svg_scale, const std::vector< std::string > &input_paths)
std::ostream & operator<<(std::ostream &os, const SVGColor &c)
Definition: k-means_run.cpp:39
void add_param_stringlist(const std::string &name, std::vector< std::string > &dest, const std::string &desc)
void add_bool(char key, const std::string &longkey, const std::string &keytype, bool &dest, const std::string &desc)
bool process(int argc, const char *const *argv, std::ostream &os)