35 explicit SVGColor(
size_t cluster) : cluster_(cluster) { }
39 std::ostream&
operator << (std::ostream& os,
const SVGColor& c) {
40 os <<
"#" << std::hex << std::setfill(
'0') << std::setw(2)
41 << unsigned(static_cast<double>(3 * (c.cluster_ + 1) % 11) / 11.0 * 256)
42 << unsigned(static_cast<double>(7 * (c.cluster_ + 1) % 11) / 11.0 * 256)
43 << unsigned(static_cast<double>(9 * (c.cluster_ + 1) % 11) / 11.0 * 256);
48 template <
typename Po
int>
61 const DIA<Point<2> >& point_dia,
63 double width = 0, height = 0;
65 using Point2D = Point<2>;
67 const std::vector<Point2D>& centroids = model.centroids();
68 std::vector<PointClusterId<Point2D> > list =
69 model.ClassifyPairs(point_dia).Gather();
72 width =
std::max(width, p.first.x[0]);
73 height =
std::max(height, p.first.x[1]);
76 if (point_dia.context().my_rank() != 0)
return;
78 std::ofstream os(svg_path);
80 os <<
"<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n";
82 os <<
" xmlns:dc=\"http://purl.org/dc/elements/1.1/\"\n";
83 os <<
" xmlns:cc=\"http://creativecommons.org/ns#\"\n";
84 os <<
" xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"\n";
85 os <<
" xmlns:svg=\"http://www.w3.org/2000/svg\"\n";
86 os <<
" xmlns=\"http://www.w3.org/2000/svg\"\n";
87 os <<
" version=\"1.1\" id=\"svg2\" width=\"" << width * svg_scale
88 <<
"\" height=\"" << height * svg_scale <<
"\">\n";
89 os <<
" <g id=\"layer1\">\n";
92 os <<
" <circle r=\"1\" cx=\"" << p.first.x[0] * svg_scale
93 <<
"\" cy=\"" << p.first.x[1] * svg_scale
94 <<
"\" style=\"stroke:none;stroke-opacity:1;fill:" 95 << SVGColor(p.second) <<
";fill-opacity:1\" />\n";
97 for (
size_t i = 0; i < centroids.size(); ++i) {
98 const Point2D& p = centroids[i];
99 os <<
" <circle r=\"4\" cx=\"" << p.x[0] * svg_scale
100 <<
"\" cy=\"" << p.x[1] * svg_scale
101 <<
"\" style=\"stroke:black;stroke-opacity:1;fill:" 102 << SVGColor(i) <<
";fill-opacity:1\" />\n";
108 template <
typename Po
int>
111 size_t dimensions,
size_t num_clusters,
size_t iterations,
double eps,
113 const std::vector<std::string>& input_paths) {
117 std::default_random_engine rng(123456);
118 std::uniform_real_distribution<float> dist(0.0, 1000.0);
121 if (input_paths.size() != 1 ||
122 !thrill::common::from_str<size_t>(input_paths[0], num_points))
123 die(
"For generated data, set input_path to the number of points.");
128 [&](
const size_t& ) {
131 .Cache().KeepForever();
133 auto result = bisecting ?
134 BisecKMeans(points.Keep(), dimensions, num_clusters, iterations, eps) :
135 KMeans(points.Keep(), dimensions, num_clusters, iterations, eps);
137 double cost = result.ComputeCost(points);
139 LOG1 <<
"k-means cost: " << cost;
141 if (svg_path.size() && dimensions == 2) {
142 OutputSVG(svg_path, svg_scale, points, result);
149 <<
" benchmark=k-means" 150 <<
" bisecting=" << bisecting
151 <<
" dimensions=" << dimensions
152 <<
" num_clusters=" << num_clusters
153 <<
" iterations=" << iterations
157 <<
" traffic=" << traffic.total()
162 template <
typename Po
int>
165 size_t dimensions,
size_t num_clusters,
size_t iterations,
double eps,
167 const std::vector<std::string>& input_paths) {
176 char* endptr =
const_cast<char*
>(input.c_str());
177 for (
size_t i = 0; i < dimensions; ++i) {
178 while (*endptr ==
' ') ++endptr;
179 p.x[i] = std::strtod(endptr, &endptr);
180 if (!endptr || (*endptr !=
' ' && i != dimensions - 1)) {
181 die(
"Could not parse point coordinates: " << input);
184 while (*endptr ==
' ') ++endptr;
185 if (!endptr || *endptr != 0) {
186 die(
"Could not parse point coordinates: " << input);
191 auto result = bisecting ?
192 BisecKMeans(points.Keep(), dimensions, num_clusters, iterations, eps) :
193 KMeans(points.Keep(), dimensions, num_clusters, iterations, eps);
195 double cost = result.ComputeCost(points.Keep());
197 LOG1 <<
"k-means cost: " << cost;
199 if (svg_path.size() && dimensions == 2) {
200 OutputSVG(svg_path, svg_scale, points.Collapse(), result);
207 <<
" benchmark=k-means" 208 <<
" bisecting=" << bisecting
209 <<
" dimensions=" << dimensions
210 <<
" num_clusters=" << num_clusters
211 <<
" iterations=" << iterations
215 <<
" traffic=" << traffic.total()
220 int main(
int argc,
char* argv[]) {
224 bool generate =
false;
225 clp.
add_bool(
'g',
"generate", generate,
226 "generate random data, set input = #points");
228 bool bisecting =
false;
229 clp.
add_bool(
'b',
"bisecting", bisecting,
230 "enable bisecting k-Means");
232 size_t iterations = 10;
234 "iterations, default: 10");
236 size_t dimensions = 2;
238 "dimensions of points 2-10, default: 2");
245 "centroid position delta for break condition, default: 0");
249 "output path for svg drawing (only for dim = 2)");
251 double svg_scale = 1;
253 "scale coordinates for svg output, default: 1");
255 std::vector<std::string> input_paths;
257 "input file pattern(s)");
259 if (!clp.
process(argc, argv)) {
267 ctx.enable_consume();
270 switch (dimensions) {
272 die(
"Zero dimensional clustering is easy.");
274 RunKMeansGenerated<Point<2> >(
275 ctx, bisecting, dimensions, num_clusters, iterations,
276 epsilon, svg_path, svg_scale, input_paths);
279 RunKMeansGenerated<Point<3> >(
280 ctx, bisecting, dimensions, num_clusters, iterations,
281 epsilon, svg_path, svg_scale, input_paths);
284 RunKMeansGenerated<VPoint>(
285 ctx, bisecting, dimensions, num_clusters, iterations,
286 epsilon, svg_path, svg_scale, input_paths);
290 switch (dimensions) {
292 die(
"Zero dimensional clustering is easy.");
294 RunKMeansFile<Point<2> >(
295 ctx, bisecting, dimensions, num_clusters, iterations,
296 epsilon, svg_path, svg_scale, input_paths);
299 RunKMeansFile<Point<3> >(
300 ctx, bisecting, dimensions, num_clusters, iterations,
301 epsilon, svg_path, svg_scale, input_paths);
304 RunKMeansFile<VPoint>(
305 ctx, bisecting, dimensions, num_clusters, iterations,
306 epsilon, svg_path, svg_scale, input_paths);
net::FlowControlChannel & net
static Vector Make(size_t D_)
Model returned by KMeans algorithm containing results.
DIA is the interface between the user and the Thrill framework.
static uint_pair max()
return an uint_pair instance containing the largest value possible
auto BisecKMeans(const DIA< Point, InStack > &input_points, size_t dimensions, size_t num_clusters, size_t iterations, double epsilon)
Calculate k-Means using bisecting method.
auto Generate(Context &ctx, size_t size, const GenerateFunction &generate_function)
Generate is a Source-DOp, which creates a DIA of given size using a generator function.
size_t num_hosts() const
Returns the total number of hosts.
int Run(const std::function< void(Context &)> &job_startpoint)
Runs the given job startpoint with a Context instance.
void Barrier()
A trivial global barrier.
static void RunKMeansFile(thrill::Context &ctx, bool bisecting, size_t dimensions, size_t num_clusters, size_t iterations, double eps, const std::string &svg_path, double svg_scale, const std::vector< std::string > &input_paths)
thrill::common::Vector< D, double > Point
Compile-Time Fixed-Dimensional Points.
void add_param_size_t(const std::string &name, size_t &dest, const std::string &desc)
add size_t parameter [name] with description and store to dest
void add_size_t(char key, const std::string &longkey, size_t &dest, const std::string &desc)
add size_t option -key, --longkey with description and store to dest
#define die(msg)
Instead of calling std::terminate(), throw the output message via an exception.
The Context of a job is a unique instance per worker which holds references to all underlying parts o...
std::pair< Point, size_t > PointClusterId
static Vector Random(size_t dim, Distribution &dist, Generator &gen)
DIA< std::string > ReadLines(Context &ctx, const std::string &filepath)
ReadLines is a DOp, which reads a file from the file system and creates an ordered DIA according to a...
int main(int argc, char *argv[])
void OutputSVG(const std::string &svg_path, double svg_scale, const DIA< Point > &list, const KMeansModel< Point > &model)
Output the points and centroids as a SVG drawing.
void print_result(std::ostream &os)
print nicely formatted result of processing
auto KMeans(const DIA< Point, InStack > &input_points, size_t dimensions, size_t num_clusters, size_t iterations, double epsilon=0.0)
std::basic_string< char, std::char_traits< char >, Allocator< char > > string
string with Manager tracking
Command line parser which automatically fills variables and prints nice usage messages.
void add_string(char key, const std::string &longkey, std::string &dest, const std::string &desc)
add string option -key, --longkey and store to dest
net::Traffic Traffic() const
calculate overall traffic for final stats
size_t my_rank() const
Global rank of this worker among all other workers in the system.
static void RunKMeansGenerated(thrill::Context &ctx, bool bisecting, size_t dimensions, size_t num_clusters, size_t iterations, double eps, const std::string &svg_path, double svg_scale, const std::vector< std::string > &input_paths)
std::ostream & operator<<(std::ostream &os, const SVGColor &c)
void add_double(char key, const std::string &longkey, double &dest, const std::string &desc)
add double option -key, --longkey with description and store to dest
void add_param_stringlist(const std::string &name, std::vector< std::string > &dest, const std::string &desc)
void add_bool(char key, const std::string &longkey, bool &dest, const std::string &desc)
net::Manager & net_manager()
bool process(int argc, const char *const *argv, std::ostream &os)