/*******************************************************************************
 * examples/stochastic_gradient_descent/stochastic_gradient_descent.hpp
 *
 * Part of Project Thrill - http://project-thrill.org
 *
 * Copyright (C) 2017 Clemens Wallrath <[email protected]>
 * Copyright (C) 2017 Alina Saalfeld <[email protected]>
 *
 * All rights reserved. Published under the BSD-2 license in the LICENSE file.
 ******************************************************************************/

#pragma once
#ifndef THRILL_EXAMPLES_STOCHASTIC_GRADIENT_DESCENT_STOCHASTIC_GRADIENT_DESCENT_HEADER
#define THRILL_EXAMPLES_STOCHASTIC_GRADIENT_DESCENT_STOCHASTIC_GRADIENT_DESCENT_HEADER

#include <thrill/api/bernoulli_sample.hpp>
#include <thrill/api/dia.hpp>
#include <thrill/api/sum.hpp>
#include <thrill/common/logger.hpp>
#include <thrill/common/vector.hpp>

#include <cereal/types/vector.hpp>

#include <algorithm>
#include <cmath>
#include <utility>

namespace examples {
namespace stochastic_gradient_descent {

using thrill::DIA;

template <size_t D>
using Vector = thrill::common::Vector<D, double>;

using VVector = thrill::common::VVector<double>;

//! Model for one point consisting of a d-dimensional position and a label
template <typename Vector>
struct DataPoint {
    Vector data;
    double label;

    template <typename Archive>
    void serialize(Archive& ar) {
        ar(data, label);
    }
};

template <typename Vector>
std::ostream& operator << (std::ostream& os, const DataPoint<Vector>& p) {
    return os << "data: " << p.data << ", label: " << p.label;
}

// gradient with respect to the weights, plus the accumulated loss
template <typename Vector>
struct GradientResult {
    Vector weights;
    double loss;

    GradientResult operator + (const GradientResult& b) const {
        return GradientResult { weights + b.weights, loss + b.loss };
    }

    template <typename Archive>
    void serialize(Archive& ar) {
        ar(weights, loss);
    }
};
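
// Note: operator + above merges two partial results associatively; the
// distributed Sum() reduction in optimize() below relies on exactly this
// when it combines gradients computed on different workers.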

// a GradientResult plus a count of the data points actually sampled
// (stored as a double for the division in the update step)
template <typename Vector>
struct SumResult {
    GradientResult<Vector> grad;
    double count;

    template <typename Archive>
    void serialize(Archive& ar) {
        ar(grad, count);
    }
};

//! simple implementation of a gradient computation class using a least
//! squares cost function Q(w) = 0.5 * (w . x - y)^2 and a linear model y = w . x
template <typename Vector>
class LeastSquaresGradient
{
public:
    static GradientResult<Vector> Compute(
        const Vector& data, double label, const Vector& weights) {
        auto diff = data.dot(weights) - label;
        auto loss = 0.5 * diff * diff;
        auto gradient = diff * data;
        return GradientResult<Vector>({ gradient, loss });
    }
};
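
// A worked example with made-up numbers (not from the original file): for
// data = (1, 2), weights = (0.5, 0.5) and label = 2, the prediction is
// data.dot(weights) = 1.5, so diff = -0.5, loss = 0.5 * (-0.5)^2 = 0.125,
// and gradient = diff * data = (-0.5, -1.0).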

template <typename Vector>
class StochasticGradientDescent
{
public:
    StochasticGradientDescent(
        size_t num_iterations, double mini_batch_fraction,
        double step_size, double tolerance)
        : num_iterations(num_iterations),
          mini_batch_fraction(mini_batch_fraction),
          step_size(step_size), tolerance(tolerance)
    { }

    //! do the actual computation
    Vector optimize(const DIA<DataPoint<Vector> >& input_points,
                    const Vector& initial_weights) {
        auto weights = initial_weights;
        bool converged = false;
        size_t i = 1;
        while (!converged && i <= num_iterations) {
            LOG1 << "weights: " << weights;
            auto old_weights = weights;
            auto sample = input_points.BernoulliSample(mini_batch_fraction);
            auto sum_result =
                sample
                .Map([&weights](const DataPoint<Vector>& p) {
                        auto grad = LeastSquaresGradient<Vector>::Compute(
                            p.data, p.label, weights);
                        return SumResult<Vector>({ grad, 1 });
                    })
                .Sum(
                    [](const SumResult<Vector>& a, const SumResult<Vector>& b) {
                        return SumResult<Vector>(
                            {
                                a.grad + b.grad,
                                // number of data points (BernoulliSample yields
                                // only an approximate fraction)
                                a.count + b.count
                            });
                    },
                    // neutral element: zero gradient, zero loss, zero count
                    SumResult<Vector>{
                        GradientResult<Vector>{
                            Vector::Make(weights.size()).fill(0.0), 0.0
                        }, 0
                    });

            auto weight_gradient_sum = sum_result.grad;

            LOG1 << "n: " << sum_result.count;
            LOG1 << "grad: " << weight_gradient_sum.weights;
            LOG1 << "loss: " << weight_gradient_sum.loss;

            // gradient descent step: w = w - eta_i * (sum_j grad Q_j(w)) / n,
            // with adaptive step size eta_i = step_size / sqrt(i) and Q_j the
            // loss of the j-th sampled point
            weights = weights -
                      (step_size / sqrt(i))
                      * weight_gradient_sum.weights / sum_result.count;
            ++i;
            converged = is_converged(old_weights, weights, tolerance);
        }
        LOG1 << "iterations: " << i;
        return weights;
    }
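
    // One iteration of optimize() in formula form (a restatement of the code
    // above, with S_i the sampled mini batch and n = sum_result.count):
    //   g_i     = sum_{p in S_i} (w_i . p.data - p.label) * p.data
    //   w_{i+1} = w_i - (step_size / sqrt(i)) * g_i / n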

private:
    size_t num_iterations;
    double mini_batch_fraction;
    double step_size;
    double tolerance;

    bool is_converged(const Vector& old, const Vector& curr, double tolerance) {
        return old.Distance(curr) < tolerance * std::max(curr.Norm(), 1.0);
    }
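
    // For example, with tolerance = 1e-6 and curr.Norm() = 10 the loop stops
    // once successive weight vectors are closer than 1e-5; the max(..., 1.0)
    // keeps the test absolute instead of relative while the weights are small.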
};

} // namespace stochastic_gradient_descent
} // namespace examples

#endif // !THRILL_EXAMPLES_STOCHASTIC_GRADIENT_DESCENT_STOCHASTIC_GRADIENT_DESCENT_HEADER

/******************************************************************************/
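
A minimal usage sketch, not part of the header above. It assumes thrill::Run and thrill::Generate (from <thrill/api/context.hpp> and <thrill/api/generate.hpp>) and element access through the public x[] member of thrill::common::Vector; the data, dimensions, and hyperparameters are made up for illustration. The points follow label = 2*x0 + 3*x1, so the learned weights should approach (2, 3).

#include <thrill/api/context.hpp>
#include <thrill/api/generate.hpp>

#include "stochastic_gradient_descent.hpp"

using namespace examples::stochastic_gradient_descent;

int main() {
    return thrill::Run([](thrill::Context& ctx) {
        using Vec = Vector<2>;  // fixed-size 2-dimensional double vector
        // deterministic synthetic points in [0, 1)^2 with noise-free labels
        auto points = thrill::Generate(
            ctx, /* size */ 10000,
            [](size_t index) {
                Vec x = Vec::Make(2);
                x.x[0] = static_cast<double>(index % 100) / 100.0;
                x.x[1] = static_cast<double>((7 * index) % 100) / 100.0;
                return DataPoint<Vec>{ x, 2.0 * x.x[0] + 3.0 * x.x[1] };
            });
        StochasticGradientDescent<Vec> sgd(
            /* num_iterations */ 1000, /* mini_batch_fraction */ 0.1,
            /* step_size */ 0.5, /* tolerance */ 1e-6);
        Vec w = sgd.optimize(points, Vec::Make(2).fill(0.0));
        LOG1 << "learned weights: " << w;
    });
}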