Thrill  0.1
stochastic_gradient_descent_run.cpp
Go to the documentation of this file.
1 /*******************************************************************************
2  * examples/stochastic_gradient_descent/stochastic_gradient_descent_run.cpp
3  *
4  * Part of Project Thrill - http://project-thrill.org
5  *
6  * Copyright (C) 2017 Alina Saalfeld <[email protected]>
7  * Copyright (C) 2017 Clemens Wallrath <[email protected]>
8  *
9  * All rights reserved. Published under the BSD-2 license in the LICENSE file.
10  ******************************************************************************/
11 
12 #include <thrill/api/cache.hpp>
13 #include <thrill/api/context.hpp>
14 #include <thrill/api/gather.hpp>
15 #include <thrill/api/generate.hpp>
17 
18 #include <thrill/common/logger.hpp>
19 #include <tlx/cmdline_parser.hpp>
20 
21 #include <algorithm>
22 #include <fstream>
23 #include <iomanip>
24 #include <string>
25 #include <vector>
26 
28 
29 using namespace examples::stochastic_gradient_descent; // NOLINT
30 
31 //! Output the points and the fitted linear function as a 2-D SVG drawing
32 template <typename Vector>
33 void OutputSVG(const std::string& svg_path, double svg_scale,
34  const DIA<DataPoint<Vector> >& point_dia,
35  const Vector& model) {
36  double width = 0, height = 0, min_vert = 0, max_vert = 0, min_hor = 0, max_hor = 0;
37 
38  std::vector<DataPoint<Vector> > list = point_dia.Gather();
39 
40  for (const DataPoint<Vector>& p : list) {
41  min_hor = std::min(min_hor, p.data.x[0]);
42  max_hor = std::max(max_hor, p.data.x[0]);
43  min_vert = std::min(min_vert, p.label);
44  max_vert = std::max(max_vert, p.label);
45  }
46 
47  double weight = model.x[0];
48  double y1 = min_hor * weight;
49  double y2 = max_hor * weight;
50  min_vert = std::min(min_vert, y1);
51  min_vert = std::min(min_vert, y2);
52  max_vert = std::max(max_vert, y1);
53  max_vert = std::max(max_vert, y2);
54 
55  width = max_hor - min_hor;
56  height = max_vert - min_vert;
57 
58  std::ofstream os(svg_path);
59 
60  os << "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n";
61  os << "<svg\n";
62  os << " xmlns:dc=\"http://purl.org/dc/elements/1.1/\"\n";
63  os << " xmlns:cc=\"http://creativecommons.org/ns#\"\n";
64  os << " xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"\n";
65  os << " xmlns:svg=\"http://www.w3.org/2000/svg\"\n";
66  os << " xmlns=\"http://www.w3.org/2000/svg\"\n";
67  os << " version=\"1.1\" id=\"svg2\" width=\"" << width * svg_scale
68  << "\" height=\"" << height * svg_scale << "\">\n";
69  os << " <g id=\"layer1\">\n";
70 
71  // Draw grid
72  os << " <line x1=\"0\" y1=\"" << (height + min_vert) * svg_scale
73  << "\" x2=\"" << width * svg_scale << "\" y2=\"" << (height + min_vert) * svg_scale
74  << "\" stroke-width=\"1\" stroke=\"#777777\" style=\"stroke-opacity:0.3\" />\n";
75  os << " <line x1=\"" << -min_hor * svg_scale << "\" y1=\"0\""
76  << " x2=\"" << -min_hor * svg_scale << "\" y2=\"" << height * svg_scale
77  << "\" stroke-width=\"1\" stroke=\"#777777\" style=\"stroke-opacity:0.3\" />\n";
78 
79  // Draw points
80  for (const DataPoint<Vector>& p : list) {
81  os << " <circle r=\"1\" cx=\"" << (p.data.x[0] - min_hor) * svg_scale
82  << "\" cy=\"" << (height - p.label + min_vert) * svg_scale
83  << "\" style=\"stroke:none;stroke-opacity:1;fill:#45a2d1;fill-opacity:1\" />\n";
84  }
85 
86  // Draw line
87  os << " <line x1=\"0\" y1=\"" << (height - y1 + min_vert) * svg_scale
88  << "\" x2=\"" << width * svg_scale << "\" y2=\"" << (height - y2 + min_vert) * svg_scale
89  << "\" stroke-width=\"1\" stroke=\"#ff9900\" />\n";
90 
91  os << " </g>\n";
92  os << "</svg>\n";
93 }
94 
95 template <typename Vector>
97  thrill::Context& ctx, size_t dimensions, size_t iterations,
98  size_t num_points, double mini_batch_fraction,
99  double step_size, double tolerance,
100  const std::string& svg_path, double svg_scale, size_t repetitions) {
101 
102  std::default_random_engine rng(2342);
103  std::uniform_real_distribution<double> uni_dist(-100.0, 100.0);
104  std::normal_distribution<double> norm_dist(1.0, 0.1);
105  std::normal_distribution<double> weight_dist(1.0, 5);
106 
107  Vector weights = Vector::Random(dimensions, weight_dist, rng);
108  if (ctx.my_rank() == 0)
109  LOG1 << "Generated weights: " << weights;
110 
111  auto points =
112  Generate(
113  ctx, num_points,
114  [&](const size_t& /* index */) {
115  auto x = Vector::Random(dimensions, uni_dist, rng);
116  auto y = weights.dot(x) * norm_dist(rng);
117  return DataPoint<Vector>({ x, y });
118  })
119  .Cache().KeepForever().Execute();
120 
121  auto start = std::chrono::high_resolution_clock::now();
122 
123  Vector result;
124 
125  for (size_t r = 0; r < repetitions; r++) {
126  auto grad_descent = StochasticGradientDescent<Vector>(
127  iterations, mini_batch_fraction, step_size, tolerance);
128 
129  auto initial_weights = Vector::Make(dimensions).fill(1.0);
130  result = grad_descent.optimize(points, initial_weights);
131  }
132 
133  auto end = std::chrono::high_resolution_clock::now();
134  if (ctx.my_rank() == 0) {
135  LOG1 << "Estimated weights: " << result;
136  LOG1 << "Computation time: " << (std::chrono::duration_cast<std::chrono::duration<double> >(end - start)).count() / repetitions << "s";
137  }
138 
139  if (svg_path.size() && dimensions == 1) {
140  OutputSVG(svg_path, svg_scale, points.Collapse(), result);
141  }
142 }
143 
144 template <typename Vector>
146  thrill::Context& ctx, size_t dimensions, size_t iterations,
147  double mini_batch_fraction, double step_size, double tolerance,
148  const std::string& svg_path, double svg_scale,
149  const std::string& input_path, size_t repetitions) {
150 
151  auto points =
152  ReadLines(ctx, input_path)
153  .Filter(
154  [](const std::string& input) {
155  // filter empty lines and comments
156  return (!input.empty() && input.at(0) != '#');
157  })
158  .Map(
159  [dimensions](const std::string& input) {
160  // parse "<pt> <pt> <pt> ... <lbl>" lines
161  Vector v = Vector::Make(dimensions);
162  double l;
163  char* endptr = const_cast<char*>(input.c_str());
164  for (size_t i = 0; i < dimensions; ++i) {
165  while (*endptr == ' ') ++endptr;
166  v.x[i] = std::strtod(endptr, &endptr);
167  if (!endptr || *endptr != ' ') {
168  die("Could not parse point coordinates: " << input);
169  }
170  }
171  while (*endptr == ' ') ++endptr;
172  l = std::strtod(endptr, &endptr);
173  if (!endptr) {
174  die("Could not parse point coordinates: " << input);
175  }
176  while (*endptr == ' ') ++endptr;
177  if (!endptr || *endptr != 0) {
178  die("Could not parse point coordinates: " << input);
179  }
180  return DataPoint<Vector>({ v, l });
181  })
182  .Cache().KeepForever().Execute();
183 
184  auto start = std::chrono::high_resolution_clock::now();
185 
186  Vector result;
187 
188  for (size_t r = 0; r < repetitions; r++) {
189  auto grad_descent = StochasticGradientDescent<Vector>(
190  iterations, mini_batch_fraction, step_size, tolerance);
191 
192  auto initial_weights = Vector::Make(dimensions).fill(1.0);
193  result = grad_descent.optimize(points, initial_weights);
194  }
195 
196  auto end = std::chrono::high_resolution_clock::now();
197 
198  if (ctx.my_rank() == 0) {
199  LOG1 << "Estimated weights: " << result;
200  LOG1 << "Computation time: " << (std::chrono::duration_cast<std::chrono::duration<double> >(end - start)).count() / repetitions << "s";
201  }
202 
203  if (svg_path.size() && dimensions == 1) {
204  OutputSVG(svg_path, svg_scale, points.Collapse(), result);
205  }
206 }
207 
208 int main(int argc, char* argv[]) {
209 
211 
212  bool generate = false;
213  cp.add_flag('g', "generate", generate,
214  "generate random data, set num = #points");
215 
216  size_t num = 100;
217  cp.add_size_t('n', "num", num,
218  "number of points to generate");
219 
220  size_t dimensions = 1;
221  cp.add_size_t('d', "dim", dimensions,
222  "dimensions of weights 1-10, default: 1");
223 
224  size_t iterations = 100;
225  cp.add_size_t('i', "iterations", iterations,
226  "iterations, default: 100");
227 
228  size_t repetitions = 1;
229  cp.add_size_t('r', "repetitions", repetitions,
230  "repetitions, for timing purpose only.");
231 
232  double mini_batch_fraction = 1;
233  cp.add_double('f', "frac", mini_batch_fraction,
234  "mini_batch_fraction, default: 1");
235 
236  double step_size = 0.001;
237  cp.add_double('s', "step", step_size,
238  "stepsize, default: 0.001");
239 
240  double tolerance = 0.01;
241  cp.add_double('t', "tolerance", tolerance,
242  "tolerance, default: 0.01");
243 
244  std::string input_path = "";
245  cp.add_string('p', "paths", input_path,
246  "input file");
247 
248  std::string svg_path = "";
249  cp.add_string('o', "output", svg_path,
250  "output path for svg drawing (only for dim = 2)");
251 
252  double svg_scale = 1;
253  cp.add_double('S', "svg-scale", svg_scale,
254  "scale coordinates for svg output, default: 1");
255 
256  if (!cp.process(argc, argv)) {
257  return -1;
258  }
259 
260  cp.print_result();
261 
262  if (!generate && input_path == "") {
263  die("Please use -g to generate input data or -p to load files");
264  }
265 
266  auto start_func =
267  [&](thrill::Context& ctx) {
268  ctx.enable_consume();
269  if (generate) {
270  switch (dimensions) {
271  case 0:
272  die("Zero dimensional gradient descent doesn't seem very useful.");
273  break;
274  case 1:
275  RunStochasticGradGenerated<Vector<1> >(ctx, dimensions, iterations, num, mini_batch_fraction, step_size, tolerance, svg_path, svg_scale, repetitions);
276  break;
277  case 2:
278  RunStochasticGradGenerated<Vector<2> >(ctx, dimensions, iterations, num, mini_batch_fraction, step_size, tolerance, svg_path, svg_scale, repetitions);
279  break;
280  default:
281  RunStochasticGradGenerated<VVector>(ctx, dimensions, iterations, num, mini_batch_fraction, step_size, tolerance, svg_path, svg_scale, repetitions);
282  break;
283  }
284  }
285  else {
286  switch (dimensions) {
287  case 0:
288  die("Zero dimensional gradient descent doesn't seem very useful.");
289  break;
290  case 1:
291  RunStochasticGradFile<Vector<1> >(ctx, dimensions, iterations, mini_batch_fraction, step_size, tolerance, svg_path, svg_scale, input_path, repetitions);
292  break;
293  case 2:
294  RunStochasticGradFile<Vector<2> >(ctx, dimensions, iterations, mini_batch_fraction, step_size, tolerance, svg_path, svg_scale, input_path, repetitions);
295  break;
296  default:
297  RunStochasticGradFile<VVector>(ctx, dimensions, iterations, mini_batch_fraction, step_size, tolerance, svg_path, svg_scale, input_path, repetitions);
298  break;
299  }
300  }
301  };
302  return thrill::Run(start_func);
303 }
304 
305 /******************************************************************************/
static void RunStochasticGradGenerated(thrill::Context &ctx, size_t dimensions, size_t iterations, size_t num_points, double mini_batch_fraction, double step_size, double tolerance, const std::string &svg_path, double svg_scale, size_t repetitions)
static Vector Make(size_t D_)
Definition: vector.hpp:39
DIA is the interface between the user and the Thrill framework.
Definition: dia.hpp:141
static uint_pair max()
return an uint_pair instance containing the largest value possible
Definition: uint_types.hpp:226
auto Generate(Context &ctx, size_t size, const GenerateFunction &generate_function)
Generate is a Source-DOp, which creates a DIA of given size using a generator function.
Definition: generate.hpp:87
#define LOG1
Definition: logger.hpp:28
int Run(const std::function< void(Context &)> &job_startpoint)
Runs the given job startpoint with a Context instance.
Definition: context.cpp:947
void OutputSVG(const std::string &svg_path, double svg_scale, const DIA< DataPoint< Vector > > &point_dia, const Vector &model)
Output the points and the fitted linear function as a 2-D SVG drawing.
static void RunStochasticGradFile(thrill::Context &ctx, size_t dimensions, size_t iterations, double mini_batch_fraction, double step_size, double tolerance, const std::string &svg_path, double svg_scale, const std::string &input_path, size_t repetitions)
void add_size_t(char key, const std::string &longkey, size_t &dest, const std::string &desc)
add size_t option -key, --longkey with description and store to dest
#define die(msg)
Instead of calling std::terminate(), throw the output message via an exception.
Definition: die.hpp:22
The Context of a job is a unique instance per worker which holds references to all underlying parts o...
Definition: context.hpp:221
static Vector Random(size_t dim, Distribution &dist, Generator &gen)
Definition: vector.hpp:53
DIA< std::string > ReadLines(Context &ctx, const std::string &filepath)
ReadLines is a DOp, which reads a file from the file system and creates an ordered DIA according to a...
Definition: read_lines.hpp:454
Type dot(const Vector &b) const
Definition: vector.hpp:100
Model for one point consisting of a d-dimensional position and a label.
void print_result(std::ostream &os)
print nicely formatted result of processing
Vector fill(const Type init_val)
Definition: vector.hpp:48
list x
Definition: gen_data.py:39
std::basic_string< char, std::char_traits< char >, Allocator< char > > string
string with Manager tracking
Definition: allocator.hpp:220
int main(int argc, char *argv[])
Command line parser which automatically fills variables and prints nice usage messages.
void add_string(char key, const std::string &longkey, std::string &dest, const std::string &desc)
add string option -key, --longkey and store to dest
size_t my_rank() const
Global rank of this worker among all other workers in the system.
Definition: context.hpp:243
void add_double(char key, const std::string &longkey, double &dest, const std::string &desc)
add double option -key, --longkey with description and store to dest
static uint_pair min()
return an uint_pair instance containing the smallest value possible
Definition: uint_types.hpp:217
void add_flag(char key, const std::string &longkey, bool &dest, const std::string &desc)
Type x[D]
coordinates array
Definition: vector.hpp:32
A compile-time fixed-length D-dimensional point with double precision.
Definition: vector.hpp:28
bool process(int argc, const char *const *argv, std::ostream &os)