13 #ifndef THRILL_EXAMPLES_K_MEANS_K_MEANS_HEADER 14 #define THRILL_EXAMPLES_K_MEANS_K_MEANS_HEADER 24 #include <cereal/types/vector.hpp> 44 template <
typename Po
int>
48 template <
typename Po
int>
53 template <
typename Archive>
60 template <
typename Po
int>
65 template <
typename Archive>
67 archive(cluster_id, center);
72 template <
typename Po
int>
76 KMeansModel(
size_t dimensions,
size_t num_clusters,
size_t iterations,
77 const std::vector<Point>& centroids)
78 : dimensions_(dimensions), num_clusters_(num_clusters),
79 iterations_(iterations), centroids_(centroids)
95 const std::vector<Point>&
centroids()
const {
return centroids_; }
105 size_t closest_id = 0;
106 for (
size_t i = 1; i < centroids_.size(); ++i) {
108 if (dist < min_dist) {
118 template <
typename Po
intDIA>
121 .Map([
this](
const Point&
p) {
return Classify(p); });
126 template <
typename Po
intDIA>
129 .Map([
this](
const Point&
p) {
137 for (
size_t i = 1; i < centroids_.size(); ++i) {
139 if (dist < min_dist) {
148 template <
typename Po
intDIA>
151 .Map([
this](
const Point&
p) {
return ComputeCost(p); })
174 template <
typename Po
int,
typename InStack>
176 size_t num_clusters,
size_t iterations,
double epsilon = 0.0) {
178 auto points = input_points.
Cache();
180 bool break_condition =
false;
185 std::vector<Point> local_centroids =
186 points.Keep().Sample(num_clusters).AllGather();
188 for (
size_t iter = 0; iter < iterations && !break_condition; ++iter) {
190 std::vector<Point> old_centroids = local_centroids;
193 auto closest = points.Keep().Map(
194 [&local_centroids](
const Point&
p) {
195 assert(local_centroids.size());
197 size_t closest_id = 0;
199 for (
size_t i = 1; i < local_centroids.size(); ++i) {
201 if (dist < min_dist) {
220 a.
center.count + b.center.count }
233 local_centroids[uc.count] = uc.p;
240 break_condition =
true;
242 for (
size_t i = 0; i < local_centroids.size(); ++i) {
243 if (local_centroids[i].Distance(old_centroids[i]) > epsilon) {
244 break_condition =
false;
252 dimensions, num_clusters, iterations,
257 template <
typename Po
int,
typename InStack>
259 size_t num_clusters,
size_t iterations,
double epsilon) {
265 size_t initial_size = num_clusters <= 2 ? num_clusters : 2;
269 KMeans(input_points, dimensions, initial_size, iterations, epsilon);
271 for (
size_t size = initial_size; size < num_clusters; ++size) {
274 auto classified_points =
275 result_model.ClassifyPairs(input_points)
285 size_t biggest_cluster_idx =
293 a.
center.count + b.center.count }
303 auto filtered_points =
315 std::vector<Point> tmp_model_centroids =
316 KMeans(filtered_points, dimensions, 2, iterations, epsilon)
321 std::vector<Point> result_model_centroids = result_model.centroids();
322 result_model_centroids.erase(
323 result_model_centroids.begin() + biggest_cluster_idx);
324 result_model_centroids.insert(
325 result_model_centroids.end(),
326 tmp_model_centroids.begin(), tmp_model_centroids.end());
330 dimensions, num_clusters, iterations, result_model_centroids);
339 #endif // !THRILL_EXAMPLES_K_MEANS_K_MEANS_HEADER Model returned by KMeans algorithm containing results.
DIA is the interface between the user and the Thrill framework.
auto BisecKMeans(const DIA< Point, InStack > &input_points, size_t dimensions, size_t num_clusters, size_t iterations, double epsilon)
Calculate k-Means using bisecting method.
size_t iterations_
number of iterations
A point which contains "count" accumulated vectors.
A variable-length D-dimensional point with double precision.
void serialize(Archive &archive)
KMeansModel(size_t dimensions, size_t num_clusters, size_t iterations, const std::vector< Point > ¢roids)
double ComputeCost(const PointDIA &points) const
size_t dimensions() const
Returns dimensions_.
size_t Classify(const Point &p) const
Calculate closest cluster to point.
std::pair< Point, size_t > PointClusterId
auto Classify(const PointDIA &points) const
size_t num_clusters_
number of clusters
const std::vector< Point > & centroids() const
Returns centroids_.
Type DistanceSquare(const Vector &b) const
std::vector< Point > centroids_
computed centroids in cluster id order
void serialize(Archive &archive)
auto KMeans(const DIA< Point, InStack > &input_points, size_t dimensions, size_t num_clusters, size_t iterations, double epsilon=0.0)
DIA< ValueType > Cache() const
Create a CacheNode which contains all items of a DIA in calculated plain format.
size_t iterations() const
Returns iterations_.
CentroidAccumulated< Point > center
auto ClassifyPairs(const PointDIA &points) const
Assignment of a point to a cluster, which is the input to.
size_t num_clusters() const
Returns number of clusters.
double ComputeCost(const Point &p) const
Calculate the k-means cost: the squared distance to the nearest center.
size_t dimensions_
dimensions of space