Thrill  0.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
stochastic_gradient_descent_run.cpp
Go to the documentation of this file.
1 /*******************************************************************************
2  * examples/stochastic_gradient_descent/stochastic_gradient_descent_run.cpp
3  *
4  * Part of Project Thrill - http://project-thrill.org
5  *
6  * Copyright (C) 2017 Alina Saalfeld <[email protected]>
7  * Copyright (C) 2017 Clemens Wallrath <[email protected]>
8  *
9  * All rights reserved. Published under the BSD-2 license in the LICENSE file.
10  ******************************************************************************/
11 
12 #include <thrill/api/cache.hpp>
13 #include <thrill/api/context.hpp>
14 #include <thrill/api/gather.hpp>
15 #include <thrill/api/generate.hpp>
17 
18 #include <thrill/common/logger.hpp>
19 #include <tlx/cmdline_parser.hpp>
20 
21 #include <algorithm>
22 #include <iomanip>
23 #include <string>
24 #include <vector>
25 
27 
28 using namespace examples::stochastic_gradient_descent; // NOLINT
29 
30 //! Output the points and the fitted linear function as a 2-D SVG drawing
31 template <typename Vector>
32 void OutputSVG(const std::string& svg_path, double svg_scale,
33  const DIA<DataPoint<Vector> >& point_dia,
34  const Vector& model) {
35  double width = 0, height = 0, min_vert = 0, max_vert = 0, min_hor = 0, max_hor = 0;
36 
37  std::vector<DataPoint<Vector> > list = point_dia.Gather();
38 
39  for (const DataPoint<Vector>& p : list) {
40  min_hor = std::min(min_hor, p.data.x[0]);
41  max_hor = std::max(max_hor, p.data.x[0]);
42  min_vert = std::min(min_vert, p.label);
43  max_vert = std::max(max_vert, p.label);
44  }
45 
46  double weight = model.x[0];
47  double y1 = min_hor * weight;
48  double y2 = max_hor * weight;
49  min_vert = std::min(min_vert, y1);
50  min_vert = std::min(min_vert, y2);
51  max_vert = std::max(max_vert, y1);
52  max_vert = std::max(max_vert, y2);
53 
54  width = max_hor - min_hor;
55  height = max_vert - min_vert;
56 
57  std::ofstream os(svg_path);
58 
59  os << "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n";
60  os << "<svg\n";
61  os << " xmlns:dc=\"http://purl.org/dc/elements/1.1/\"\n";
62  os << " xmlns:cc=\"http://creativecommons.org/ns#\"\n";
63  os << " xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"\n";
64  os << " xmlns:svg=\"http://www.w3.org/2000/svg\"\n";
65  os << " xmlns=\"http://www.w3.org/2000/svg\"\n";
66  os << " version=\"1.1\" id=\"svg2\" width=\"" << width * svg_scale
67  << "\" height=\"" << height * svg_scale << "\">\n";
68  os << " <g id=\"layer1\">\n";
69 
70  // Draw grid
71  os << " <line x1=\"0\" y1=\"" << (height + min_vert) * svg_scale
72  << "\" x2=\"" << width * svg_scale << "\" y2=\"" << (height + min_vert) * svg_scale
73  << "\" stroke-width=\"1\" stroke=\"#777777\" style=\"stroke-opacity:0.3\" />\n";
74  os << " <line x1=\"" << -min_hor * svg_scale << "\" y1=\"0\""
75  << " x2=\"" << -min_hor * svg_scale << "\" y2=\"" << height * svg_scale
76  << "\" stroke-width=\"1\" stroke=\"#777777\" style=\"stroke-opacity:0.3\" />\n";
77 
78  // Draw points
79  for (const DataPoint<Vector>& p : list) {
80  os << " <circle r=\"1\" cx=\"" << (p.data.x[0] - min_hor) * svg_scale
81  << "\" cy=\"" << (height - p.label + min_vert) * svg_scale
82  << "\" style=\"stroke:none;stroke-opacity:1;fill:#45a2d1;fill-opacity:1\" />\n";
83  }
84 
85  // Draw line
86  os << " <line x1=\"0\" y1=\"" << (height - y1 + min_vert) * svg_scale
87  << "\" x2=\"" << width * svg_scale << "\" y2=\"" << (height - y2 + min_vert) * svg_scale
88  << "\" stroke-width=\"1\" stroke=\"#ff9900\" />\n";
89 
90  os << " </g>\n";
91  os << "</svg>\n";
92 }
93 
94 template <typename Vector>
96  thrill::Context& ctx, size_t dimensions, size_t iterations,
97  size_t num_points, double mini_batch_fraction,
98  double step_size, double tolerance,
99  const std::string& svg_path, double svg_scale, size_t repetitions) {
100 
101  std::default_random_engine rng(2342);
102  std::uniform_real_distribution<double> uni_dist(-100.0, 100.0);
103  std::normal_distribution<double> norm_dist(1.0, 0.1);
104  std::normal_distribution<double> weight_dist(1.0, 5);
105 
106  Vector weights = Vector::Random(dimensions, weight_dist, rng);
107  if (ctx.my_rank() == 0)
108  LOG1 << "Generated weights: " << weights;
109 
110  auto points =
111  Generate(
112  ctx, num_points,
113  [&](const size_t& /* index */) {
114  auto x = Vector::Random(dimensions, uni_dist, rng);
115  auto y = weights.dot(x) * norm_dist(rng);
116  return DataPoint<Vector>({ x, y });
117  })
118  .Cache().KeepForever().Execute();
119 
120  auto start = std::chrono::high_resolution_clock::now();
121 
122  Vector result;
123 
124  for (size_t r = 0; r < repetitions; r++) {
125  auto grad_descent = StochasticGradientDescent<Vector>(
126  iterations, mini_batch_fraction, step_size, tolerance);
127 
128  auto initial_weights = Vector::Make(dimensions).fill(1.0);
129  result = grad_descent.optimize(points, initial_weights);
130  }
131 
132  auto end = std::chrono::high_resolution_clock::now();
133  if (ctx.my_rank() == 0) {
134  LOG1 << "Estimated weights: " << result;
135  LOG1 << "Computation time: " << (std::chrono::duration_cast<std::chrono::duration<double> >(end - start)).count() / repetitions << "s";
136  }
137 
138  if (svg_path.size() && dimensions == 1) {
139  OutputSVG(svg_path, svg_scale, points.Collapse(), result);
140  }
141 }
142 
143 template <typename Vector>
145  thrill::Context& ctx, size_t dimensions, size_t iterations,
146  double mini_batch_fraction, double step_size, double tolerance,
147  const std::string& svg_path, double svg_scale,
148  const std::string& input_path, size_t repetitions) {
149 
150  auto points =
151  ReadLines(ctx, input_path)
152  .Filter(
153  [](const std::string& input) {
154  // filter empty lines and comments
155  return (!input.empty() && input.at(0) != '#');
156  })
157  .Map(
158  [dimensions](const std::string& input) {
159  // parse "<pt> <pt> <pt> ... <lbl>" lines
160  Vector v = Vector::Make(dimensions);
161  double l;
162  char* endptr = const_cast<char*>(input.c_str());
163  for (size_t i = 0; i < dimensions; ++i) {
164  while (*endptr == ' ') ++endptr;
165  v.x[i] = std::strtod(endptr, &endptr);
166  if (!endptr || *endptr != ' ') {
167  die("Could not parse point coordinates: " << input);
168  }
169  }
170  while (*endptr == ' ') ++endptr;
171  l = std::strtod(endptr, &endptr);
172  if (!endptr) {
173  die("Could not parse point coordinates: " << input);
174  }
175  while (*endptr == ' ') ++endptr;
176  if (!endptr || *endptr != 0) {
177  die("Could not parse point coordinates: " << input);
178  }
179  return DataPoint<Vector>({ v, l });
180  })
181  .Cache().KeepForever().Execute();
182 
183  auto start = std::chrono::high_resolution_clock::now();
184 
185  Vector result;
186 
187  for (size_t r = 0; r < repetitions; r++) {
188  auto grad_descent = StochasticGradientDescent<Vector>(
189  iterations, mini_batch_fraction, step_size, tolerance);
190 
191  auto initial_weights = Vector::Make(dimensions).fill(1.0);
192  result = grad_descent.optimize(points, initial_weights);
193  }
194 
195  auto end = std::chrono::high_resolution_clock::now();
196 
197  if (ctx.my_rank() == 0) {
198  LOG1 << "Estimated weights: " << result;
199  LOG1 << "Computation time: " << (std::chrono::duration_cast<std::chrono::duration<double> >(end - start)).count() / repetitions << "s";
200  }
201 
202  if (svg_path.size() && dimensions == 1) {
203  OutputSVG(svg_path, svg_scale, points.Collapse(), result);
204  }
205 }
206 
207 int main(int argc, char* argv[]) {
208 
210 
211  bool generate = false;
212  cp.add_flag('g', "generate", generate,
213  "generate random data, set num = #points");
214 
215  size_t num = 100;
216  cp.add_size_t('n', "num", num,
217  "number of points to generate");
218 
219  size_t dimensions = 1;
220  cp.add_size_t('d', "dim", dimensions,
221  "dimensions of weights 1-10, default: 1");
222 
223  size_t iterations = 100;
224  cp.add_size_t('i', "iterations", iterations,
225  "iterations, default: 100");
226 
227  size_t repetitions = 1;
228  cp.add_size_t('r', "repetitions", repetitions,
229  "repetitions, for timing purpose only.");
230 
231  double mini_batch_fraction = 1;
232  cp.add_double('f', "frac", mini_batch_fraction,
233  "mini_batch_fraction, default: 1");
234 
235  double step_size = 0.001;
236  cp.add_double('s', "step", step_size,
237  "stepsize, default: 0.001");
238 
239  double tolerance = 0.01;
240  cp.add_double('t', "tolerance", tolerance,
241  "tolerance, default: 0.01");
242 
243  std::string input_path = "";
244  cp.add_string('p', "paths", input_path,
245  "input file");
246 
247  std::string svg_path = "";
248  cp.add_string('o', "output", svg_path,
249  "output path for svg drawing (only for dim = 2)");
250 
251  double svg_scale = 1;
252  cp.add_double('S', "svg-scale", svg_scale,
253  "scale coordinates for svg output, default: 1");
254 
255  if (!cp.process(argc, argv)) {
256  return -1;
257  }
258 
259  cp.print_result();
260 
261  if (!generate && input_path == "") {
262  die("Please use -g to generate input data or -p to load files");
263  }
264 
265  auto start_func =
266  [&](thrill::Context& ctx) {
267  ctx.enable_consume();
268  if (generate) {
269  switch (dimensions) {
270  case 0:
271  die("Zero dimensional gradient descent doesn't seem very useful.");
272  break;
273  case 1:
274  RunStochasticGradGenerated<Vector<1> >(ctx, dimensions, iterations, num, mini_batch_fraction, step_size, tolerance, svg_path, svg_scale, repetitions);
275  break;
276  case 2:
277  RunStochasticGradGenerated<Vector<2> >(ctx, dimensions, iterations, num, mini_batch_fraction, step_size, tolerance, svg_path, svg_scale, repetitions);
278  break;
279  default:
280  RunStochasticGradGenerated<VVector>(ctx, dimensions, iterations, num, mini_batch_fraction, step_size, tolerance, svg_path, svg_scale, repetitions);
281  break;
282  }
283  }
284  else {
285  switch (dimensions) {
286  case 0:
287  die("Zero dimensional gradient descent doesn't seem very useful.");
288  break;
289  case 1:
290  RunStochasticGradFile<Vector<1> >(ctx, dimensions, iterations, mini_batch_fraction, step_size, tolerance, svg_path, svg_scale, input_path, repetitions);
291  break;
292  case 2:
293  RunStochasticGradFile<Vector<2> >(ctx, dimensions, iterations, mini_batch_fraction, step_size, tolerance, svg_path, svg_scale, input_path, repetitions);
294  break;
295  default:
296  RunStochasticGradFile<VVector>(ctx, dimensions, iterations, mini_batch_fraction, step_size, tolerance, svg_path, svg_scale, input_path, repetitions);
297  break;
298  }
299  }
300  };
301  return thrill::Run(start_func);
302 }
303 
304 /******************************************************************************/
static void RunStochasticGradGenerated(thrill::Context &ctx, size_t dimensions, size_t iterations, size_t num_points, double mini_batch_fraction, double step_size, double tolerance, const std::string &svg_path, double svg_scale, size_t repetitions)
static Vector Make(size_t D_)
Definition: vector.hpp:39
void add_size_t(char key, const std::string &longkey, const std::string &keytype, size_t &dest, const std::string &desc)
auto Generate(Context &ctx, size_t size, const GenerateFunction &generate_function)
Generate is a Source-DOp, which creates a DIA of given size using a generator function.
Definition: generate.hpp:85
#define LOG1
Definition: logger.hpp:176
void add_double(char key, const std::string &longkey, const std::string &keytype, double &dest, const std::string &desc)
int Run(const std::function< void(Context &)> &job_startpoint)
Runs the given job startpoint with a Context instance.
Definition: context.cpp:887
void OutputSVG(const std::string &svg_path, double svg_scale, const DIA< DataPoint< Vector > > &point_dia, const Vector &model)
Output the points and the fitted linear function as a 2-D SVG drawing.
static void RunStochasticGradFile(thrill::Context &ctx, size_t dimensions, size_t iterations, double mini_batch_fraction, double step_size, double tolerance, const std::string &svg_path, double svg_scale, const std::string &input_path, size_t repetitions)
Type dot(const Vector &b) const
Definition: vector.hpp:100
#define die(msg)
Instead of calling abort(), throw the output message as an exception.
Definition: die.hpp:42
void add_string(char key, const std::string &longkey, const std::string &keytype, std::string &dest, const std::string &desc)
add string option -key, --longkey [keytype] and store to dest
static Vector Random(size_t dim, Distribution &dist, Generator &gen)
Definition: vector.hpp:53
DIA< std::string > ReadLines(Context &ctx, const std::string &filepath)
ReadLines is a DOp, which reads a file from the file system and creates an ordered DIA according to a...
Definition: read_lines.hpp:452
void add_flag(char key, const std::string &longkey, const std::string &keytype, bool &dest, const std::string &desc)
Model for one point consisting of a d-dimensional position and a label.
void print_result(std::ostream &os)
print nicely formatted result of processing
Vector fill(const Type init_val)
Definition: vector.hpp:48
list x
Definition: gen_data.py:39
std::basic_string< char, std::char_traits< char >, Allocator< char > > string
string with Manager tracking
Definition: allocator.hpp:220
int main(int argc, char *argv[])
Command line parser which automatically fills variables and prints nice usage messages.
static constexpr const T & min(const T &a, const T &b)
template for constexpr min, because std::min is not good enough.
Definition: functional.hpp:59
Type x[D]
coordinates array
Definition: vector.hpp:32
bool process(int argc, const char *const *argv, std::ostream &os)
static constexpr const T & max(const T &a, const T &b)
template for constexpr max, because std::max is not good enough.
Definition: functional.hpp:65