Thrill  0.1
word_count.hpp
Go to the documentation of this file.
1 /*******************************************************************************
2  * examples/word_count/word_count.hpp
3  *
4  * This file contains the WordCount core example. See word_count_run.cpp for how
5  * to run it on different inputs.
6  *
7  * Part of Project Thrill - http://project-thrill.org
8  *
9  * Copyright (C) 2015 Alexander Noe <[email protected]>
10  * Copyright (C) 2016 Timo Bingmann <[email protected]>
11  *
12  * All rights reserved. Published under the BSD-2 license in the LICENSE file.
13  ******************************************************************************/
14 
15 #pragma once
16 #ifndef THRILL_EXAMPLES_WORD_COUNT_WORD_COUNT_HEADER
17 #define THRILL_EXAMPLES_WORD_COUNT_WORD_COUNT_HEADER
18 
21 
22 #include <string>
23 #include <utility>
24 
25 namespace examples {
26 namespace word_count {
27 
28 using namespace thrill; // NOLINT
29 
30 using WordCountPair = std::pair<std::string, size_t>;
31 
32 //! The most basic WordCount user program: reads a DIA containing std::string
33 //! words, and returns a DIA containing WordCountPairs.
34 template <typename InputStack>
36 
37  auto word_pairs = input.template FlatMap<WordCountPair>(
38  [](const std::string& line, auto emit) -> void {
39  /* map lambda: emit each word */
41  ' ', line, [&](const tlx::string_view& sv) {
42  if (sv.size() == 0) return;
43  emit(WordCountPair(sv.to_string(), 1));
44  });
45  });
46 
47  return word_pairs.ReduceByKey(
48  [](const WordCountPair& in) -> std::string {
49  /* reduction key: the word string */
50  return in.first;
51  },
52  [](const WordCountPair& a, const WordCountPair& b) -> WordCountPair {
53  /* associative reduction operator: add counters */
54  return WordCountPair(a.first, a.second + b.second);
55  });
56 }
57 
58 /******************************************************************************/
59 
//! A word together with its precomputed hash value, stored as (hash, word).
using HashWord = std::pair<size_t, std::string>;
//! A HashWord together with its occurrence counter.
using HashWordCount = std::pair<HashWord, size_t>;

//! Hash functor for HashWord keys: instead of hashing the string again, it
//! returns the hash value already stored in the first pair component.
// NOTE(review): the opening line of this struct was missing from the listing;
// the name HashWordHasher is reconstructed -- confirm against upstream.
struct HashWordHasher {
    //! \param w (hash, word) pair
    //! \return w.first, the stored hash value
    size_t operator () (const HashWord& w) const {
        // return first which is the hash of the word
        return w.first;
    }
};
69 
70 //! The second WordCount user program: reads a DIA containing std::string words,
71 //! creates hash values from the words prior to reducing by hash and
72 //! word. Returns a DIA containing WordCountPairs.
73 template <typename InputStack>
75 
76  std::hash<std::string> string_hasher;
77 
78  auto r =
79  input
80  .template FlatMap<std::string>(
81  [](const std::string& line, auto emit) {
82  /* map lambda: emit each word */
84  ' ', line, [&](const tlx::string_view& sv) {
85  if (sv.size() == 0) return;
86  emit(sv.to_string());
87  });
88  })
89  .Map([&](const std::string& word) {
90  return HashWordCount(HashWord(string_hasher(word), word), 1);
91  })
92  .ReduceByKey(
93  [](const HashWordCount& in) {
94  /* reduction key: the word string */
95  return in.first;
96  },
97  [](const HashWordCount& a, const HashWordCount& b) {
98  /* associative reduction operator: add counters */
99  return HashWordCount(a.first, a.second + b.second);
100  },
102  .Map([](const HashWordCount& in) {
103  return WordCountPair(in.first.second, in.second);
104  });
105  return r;
106 }
107 
108 } // namespace word_count
109 } // namespace examples
110 
111 #endif // !THRILL_EXAMPLES_WORD_COUNT_WORD_COUNT_HEADER
112 
113 /******************************************************************************/
DIA is the interface between the user and the Thrill framework.
Definition: dia.hpp:141
auto ReduceByKey(const KeyExtractor &key_extractor, const ReduceFunction &reduce_function, const ReduceConfig &reduce_config=ReduceConfig()) const
ReduceByKey is a DOp, which groups elements of the DIA with the key_extractor and reduces each key-bucket to a single element using the associative reduce_function.
std::pair< std::string, size_t > WordCountPair
Definition: word_count.hpp:30
Definition: bfs.hpp:21
auto WordCount(const DIA< std::string, InputStack > &input)
Definition: word_count.hpp:35
auto HashWordCountExample(const DIA< std::string, InputStack > &input)
Definition: word_count.hpp:74
std::pair< size_t, std::string > HashWord
Definition: word_count.hpp:60
size_t size() const noexcept
Returns the size of this StringView.
Definition: string_view.hpp:93
std::basic_string< char, std::char_traits< char >, Allocator< char > > string
string with Manager tracking
Definition: allocator.hpp:220
StringView is a reference to a part of a string, consisting of only a char pointer and a length...
Definition: string_view.hpp:32
std::string to_string() const
Returns the data of this StringView as a std::string.
static void split_view(char sep, const std::string &str, Functor &&callback, std::string::size_type limit=std::string::npos)
Split the given string at each separator character into distinct substrings, and call the given callback with a StringView for each substring.
Definition: split_view.hpp:38
std::pair< HashWord, size_t > HashWordCount
Definition: word_count.hpp:61