32 template <
typename Vector>
36 double width = 0, height = 0, min_vert = 0, max_vert = 0, min_hor = 0, max_hor = 0;
38 std::vector<DataPoint<Vector> > list = point_dia.Gather();
41 min_hor =
std::min(min_hor, p.data.x[0]);
42 max_hor =
std::max(max_hor, p.data.x[0]);
43 min_vert =
std::min(min_vert, p.label);
44 max_vert =
std::max(max_vert, p.label);
47 double weight = model.
x[0];
48 double y1 = min_hor * weight;
49 double y2 = max_hor * weight;
55 width = max_hor - min_hor;
56 height = max_vert - min_vert;
58 std::ofstream os(svg_path);
60 os <<
"<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n";
62 os <<
" xmlns:dc=\"http://purl.org/dc/elements/1.1/\"\n";
63 os <<
" xmlns:cc=\"http://creativecommons.org/ns#\"\n";
64 os <<
" xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"\n";
65 os <<
" xmlns:svg=\"http://www.w3.org/2000/svg\"\n";
66 os <<
" xmlns=\"http://www.w3.org/2000/svg\"\n";
67 os <<
" version=\"1.1\" id=\"svg2\" width=\"" << width * svg_scale
68 <<
"\" height=\"" << height * svg_scale <<
"\">\n";
69 os <<
" <g id=\"layer1\">\n";
72 os <<
" <line x1=\"0\" y1=\"" << (height + min_vert) * svg_scale
73 <<
"\" x2=\"" << width * svg_scale <<
"\" y2=\"" << (height + min_vert) * svg_scale
74 <<
"\" stroke-width=\"1\" stroke=\"#777777\" style=\"stroke-opacity:0.3\" />\n";
75 os <<
" <line x1=\"" << -min_hor * svg_scale <<
"\" y1=\"0\"" 76 <<
" x2=\"" << -min_hor * svg_scale <<
"\" y2=\"" << height * svg_scale
77 <<
"\" stroke-width=\"1\" stroke=\"#777777\" style=\"stroke-opacity:0.3\" />\n";
81 os <<
" <circle r=\"1\" cx=\"" << (p.data.x[0] - min_hor) * svg_scale
82 <<
"\" cy=\"" << (height - p.label + min_vert) * svg_scale
83 <<
"\" style=\"stroke:none;stroke-opacity:1;fill:#45a2d1;fill-opacity:1\" />\n";
87 os <<
" <line x1=\"0\" y1=\"" << (height - y1 + min_vert) * svg_scale
88 <<
"\" x2=\"" << width * svg_scale <<
"\" y2=\"" << (height - y2 + min_vert) * svg_scale
89 <<
"\" stroke-width=\"1\" stroke=\"#ff9900\" />\n";
95 template <
typename Vector>
98 size_t num_points,
double mini_batch_fraction,
99 double step_size,
double tolerance,
100 const std::string& svg_path,
double svg_scale,
size_t repetitions) {
102 std::default_random_engine rng(2342);
103 std::uniform_real_distribution<double> uni_dist(-100.0, 100.0);
104 std::normal_distribution<double> norm_dist(1.0, 0.1);
105 std::normal_distribution<double> weight_dist(1.0, 5);
109 LOG1 <<
"Generated weights: " << weights;
114 [&](
const size_t& ) {
116 auto y = weights.
dot(
x) * norm_dist(rng);
119 .Cache().KeepForever().Execute();
121 auto start = std::chrono::high_resolution_clock::now();
125 for (
size_t r = 0; r < repetitions; r++) {
127 iterations, mini_batch_fraction, step_size, tolerance);
130 result = grad_descent.optimize(points, initial_weights);
133 auto end = std::chrono::high_resolution_clock::now();
135 LOG1 <<
"Estimated weights: " << result;
136 LOG1 <<
"Computation time: " << (std::chrono::duration_cast<std::chrono::duration<double> >(end - start)).count() / repetitions <<
"s";
139 if (svg_path.size() && dimensions == 1) {
140 OutputSVG(svg_path, svg_scale, points.Collapse(), result);
144 template <
typename Vector>
147 double mini_batch_fraction,
double step_size,
double tolerance,
149 const std::string& input_path,
size_t repetitions) {
156 return (!input.empty() && input.at(0) !=
'#');
163 char* endptr =
const_cast<char*
>(input.c_str());
164 for (
size_t i = 0; i < dimensions; ++i) {
165 while (*endptr ==
' ') ++endptr;
166 v.
x[i] = std::strtod(endptr, &endptr);
167 if (!endptr || *endptr !=
' ') {
168 die(
"Could not parse point coordinates: " << input);
171 while (*endptr ==
' ') ++endptr;
172 l = std::strtod(endptr, &endptr);
174 die(
"Could not parse point coordinates: " << input);
176 while (*endptr ==
' ') ++endptr;
177 if (!endptr || *endptr != 0) {
178 die(
"Could not parse point coordinates: " << input);
182 .Cache().KeepForever().Execute();
184 auto start = std::chrono::high_resolution_clock::now();
188 for (
size_t r = 0; r < repetitions; r++) {
190 iterations, mini_batch_fraction, step_size, tolerance);
193 result = grad_descent.optimize(points, initial_weights);
196 auto end = std::chrono::high_resolution_clock::now();
199 LOG1 <<
"Estimated weights: " << result;
200 LOG1 <<
"Computation time: " << (std::chrono::duration_cast<std::chrono::duration<double> >(end - start)).count() / repetitions <<
"s";
203 if (svg_path.size() && dimensions == 1) {
204 OutputSVG(svg_path, svg_scale, points.Collapse(), result);
208 int main(
int argc,
char* argv[]) {
212 bool generate =
false;
213 cp.
add_flag(
'g',
"generate", generate,
214 "generate random data, set num = #points");
218 "number of points to generate");
220 size_t dimensions = 1;
222 "dimensions of weights 1-10, default: 1");
224 size_t iterations = 100;
226 "iterations, default: 100");
228 size_t repetitions = 1;
229 cp.
add_size_t(
'r',
"repetitions", repetitions,
230 "repetitions, for timing purpose only.");
232 double mini_batch_fraction = 1;
233 cp.
add_double(
'f',
"frac", mini_batch_fraction,
234 "mini_batch_fraction, default: 1");
236 double step_size = 0.001;
238 "stepsize, default: 0.001");
240 double tolerance = 0.01;
242 "tolerance, default: 0.01");
250 "output path for svg drawing (only for dim = 2)");
252 double svg_scale = 1;
254 "scale coordinates for svg output, default: 1");
262 if (!generate && input_path ==
"") {
263 die(
"Please use -g to generate input data or -p to load files");
268 ctx.enable_consume();
270 switch (dimensions) {
272 die(
"Zero dimensional gradient descent doesn't seem very useful.");
275 RunStochasticGradGenerated<Vector<1> >(ctx, dimensions, iterations, num, mini_batch_fraction, step_size, tolerance, svg_path, svg_scale, repetitions);
278 RunStochasticGradGenerated<Vector<2> >(ctx, dimensions, iterations, num, mini_batch_fraction, step_size, tolerance, svg_path, svg_scale, repetitions);
281 RunStochasticGradGenerated<VVector>(ctx, dimensions, iterations, num, mini_batch_fraction, step_size, tolerance, svg_path, svg_scale, repetitions);
286 switch (dimensions) {
288 die(
"Zero dimensional gradient descent doesn't seem very useful.");
291 RunStochasticGradFile<Vector<1> >(ctx, dimensions, iterations, mini_batch_fraction, step_size, tolerance, svg_path, svg_scale, input_path, repetitions);
294 RunStochasticGradFile<Vector<2> >(ctx, dimensions, iterations, mini_batch_fraction, step_size, tolerance, svg_path, svg_scale, input_path, repetitions);
297 RunStochasticGradFile<VVector>(ctx, dimensions, iterations, mini_batch_fraction, step_size, tolerance, svg_path, svg_scale, input_path, repetitions);
static void RunStochasticGradGenerated(thrill::Context &ctx, size_t dimensions, size_t iterations, size_t num_points, double mini_batch_fraction, double step_size, double tolerance, const std::string &svg_path, double svg_scale, size_t repetitions)
static Vector Make(size_t D_)
DIA is the interface between the user and the Thrill framework.
static uint_pair max()
Return a uint_pair instance containing the largest possible value.
auto Generate(Context &ctx, size_t size, const GenerateFunction &generate_function)
Generate is a Source-DOp, which creates a DIA of given size using a generator function.
int Run(const std::function< void(Context &)> &job_startpoint)
Runs the given job startpoint with a Context instance.
void OutputSVG(const std::string &svg_path, double svg_scale, const DIA< DataPoint< Vector > > &point_dia, const Vector &model)
Output the points and the fitted linear function as a 2-D SVG drawing.
static void RunStochasticGradFile(thrill::Context &ctx, size_t dimensions, size_t iterations, double mini_batch_fraction, double step_size, double tolerance, const std::string &svg_path, double svg_scale, const std::string &input_path, size_t repetitions)
void add_size_t(char key, const std::string &longkey, size_t &dest, const std::string &desc)
Add a size_t option -key, --longkey with description and store the parsed value to dest.
#define die(msg)
Instead of calling std::terminate(), output the message by throwing it as an exception.
The Context of a job is a unique instance per worker which holds references to all underlying parts o...
static Vector Random(size_t dim, Distribution &dist, Generator &gen)
DIA< std::string > ReadLines(Context &ctx, const std::string &filepath)
ReadLines is a DOp, which reads a file from the file system and creates an ordered DIA according to a...
Type dot(const Vector &b) const
Model for one point consisting of a d-dimensional position and a label.
void print_result(std::ostream &os)
print nicely formatted result of processing
Vector fill(const Type init_val)
std::basic_string< char, std::char_traits< char >, Allocator< char > > string
string with Manager tracking
int main(int argc, char *argv[])
Command line parser which automatically fills variables and prints nice usage messages.
void add_string(char key, const std::string &longkey, std::string &dest, const std::string &desc)
Add a string option -key, --longkey and store the parsed value to dest.
size_t my_rank() const
Global rank of this worker among all other workers in the system.
void add_double(char key, const std::string &longkey, double &dest, const std::string &desc)
Add a double option -key, --longkey with description and store the parsed value to dest.
static uint_pair min()
Return a uint_pair instance containing the smallest possible value.
void add_flag(char key, const std::string &longkey, bool &dest, const std::string &desc)
Type x[D]
coordinates array
A compile-time fixed-length D-dimensional point with double precision.
bool process(int argc, const char *const *argv, std::ostream &os)