group_by_key.hpp
/*******************************************************************************
 * thrill/api/group_by_key.hpp
 *
 * DIANode for a groupby operation. Performs the actual groupby operation.
 *
 * Part of Project Thrill - http://project-thrill.org
 *
 * Copyright (C) 2015 Huyen Chau Nguyen <[email protected]>
 * Copyright (C) 2016 Alexander Noe <[email protected]>
 *
 * All rights reserved. Published under the BSD-2 license in the LICENSE file.
 ******************************************************************************/

#pragma once
#ifndef THRILL_API_GROUP_BY_KEY_HEADER
#define THRILL_API_GROUP_BY_KEY_HEADER

#include <thrill/api/dia.hpp>
#include <thrill/api/dop_node.hpp>
#include <thrill/api/group_by_iterator.hpp>
#include <thrill/common/function_traits.hpp>
#include <thrill/common/logger.hpp>
#include <thrill/core/location_detection.hpp>
#include <thrill/core/multiway_merge.hpp>
#include <thrill/data/file.hpp>

#include <algorithm>
#include <deque>
#include <functional>
#include <type_traits>
#include <typeinfo>
#include <unordered_map>
#include <utility>
#include <vector>

namespace thrill {
namespace api {

/*!
 * \ingroup api_layer
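 *
 * GroupByNode is a DOpNode which performs the actual groupby operation: in
 * the PreOp each item is routed to a worker based on the hash of its
 * extracted key, the received items are sorted into runs in MainOp, and
 * PushData merges the runs and calls the user's group function once per key.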
 */
template <typename ValueType,
          typename KeyExtractor, typename GroupFunction, typename HashFunction,
          bool UseLocationDetection>
class GroupByNode final : public DOpNode<ValueType>
{
private:
    static constexpr bool debug = false;

    using Super = DOpNode<ValueType>;
    using Super::context_;

    using Key = typename common::FunctionTraits<KeyExtractor>::result_type;
    using ValueOut = ValueType;
    using ValueIn =
        typename common::FunctionTraits<KeyExtractor>::template arg_plain<0>;

    struct ValueComparator {
    public:
        explicit ValueComparator(const GroupByNode& node) : node_(node) { }

        bool operator () (const ValueIn& a, const ValueIn& b) const {
            return node_.key_extractor_(a) < node_.key_extractor_(b);
        }

    private:
        const GroupByNode& node_;
    };

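    //! HashCount pairs the hash of a key with an 8-bit counter. It is the item
    //! type aggregated by the distributed location detection, whose Flush() in
    //! Execute() yields a target worker for every hash bucket.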
    class HashCount
    {
    public:
        using HashType = size_t;
        using CounterType = uint8_t;

        size_t hash;
        CounterType count;

        static constexpr size_t counter_bits_ = 8 * sizeof(CounterType);

        HashCount operator + (const HashCount& b) const {
            assert(hash == b.hash);
            return HashCount { hash, common::AddTruncToType(count, b.count) };
        }

        HashCount& operator += (const HashCount& b) {
            assert(hash == b.hash);
            count = common::AddTruncToType(count, b.count);
            return *this;
        }

        bool operator < (const HashCount& b) const { return hash < b.hash; }

        //! Method to check whether this hash count should be broadcast to all
        //! interested workers -- for GroupByKey: always.
        bool NeedBroadcast() const {
            return true;
        }

        //! Read count from BitReader
        template <typename BitReader>
        void ReadBits(BitReader& reader) {
            count = reader.GetBits(counter_bits_);
        }

        //! Write count to BitWriter
        template <typename BitWriter>
        void WriteBits(BitWriter& writer) const {
            writer.PutBits(count, counter_bits_);
        }
    };

public:
    /*!
     * Constructor for a GroupByNode. Sets the DataManager, parent, stack,
     * key_extractor and groupby_function.
     */
    template <typename ParentDIA>
    GroupByNode(const ParentDIA& parent,
                const KeyExtractor& key_extractor,
                const GroupFunction& groupby_function,
                const HashFunction& hash_function = HashFunction())
        : Super(parent.ctx(), "GroupByKey", { parent.id() }, { parent.node() }),
          key_extractor_(key_extractor),
          groupby_function_(groupby_function),
          hash_function_(hash_function),
          location_detection_(parent.ctx(), Super::id()),
          pre_file_(context_.GetFile(this))
    {
        // Hook PreOp
        auto pre_op_fn = [=](const ValueIn& input) {
                             PreOp(input);
                         };
        // close the function stack with our pre op and register it at
        // parent node for output
        auto lop_chain = parent.stack().push(pre_op_fn).fold();
        parent.node()->AddChild(this, lop_chain);
    }

    void StartPreOp(size_t /* id */) final {
        emitters_ = stream_->GetWriters();
        pre_writer_ = pre_file_.GetWriter();
        if (UseLocationDetection)
            location_detection_.Initialize(DIABase::mem_limit_);
    }

    //! Send all elements to their designated PEs
    void PreOp(const ValueIn& v) {
        size_t hash = hash_function_(key_extractor_(v));
        if (UseLocationDetection) {
            pre_writer_.Put(v);
            location_detection_.Insert(HashCount { hash, 1 });
        }
        else {
            const size_t recipient = hash % emitters_.size();
            emitters_[recipient].Put(v);
        }
    }

    void StopPreOp(size_t /* id */) final {
        pre_writer_.Close();
    }

    DIAMemUse PreOpMemUse() final {
        return DIAMemUse::Max();
    }

    DIAMemUse ExecuteMemUse() final {
        return DIAMemUse::Max();
    }

    DIAMemUse PushDataMemUse() final {
        if (files_.size() <= 1) {
            // direct push, no merge necessary
            return 0;
        }
        else {
            // need to perform multiway merging
            return DIAMemUse::Max();
        }
    }

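    //! With location detection enabled, Execute() flushes the collected
    //! HashCounts, obtains one target worker per (hash % max_hash) bucket, and
    //! re-reads the locally buffered items from pre_file_ to ship them to
    //! those workers; afterwards the emitters are closed and MainOp() runs.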
    void Execute() override {
        if (UseLocationDetection) {
            std::unordered_map<size_t, size_t> target_processors;
            size_t max_hash = location_detection_.Flush(target_processors);
            auto file_reader = pre_file_.GetConsumeReader();
            while (file_reader.HasNext()) {
                ValueIn in = file_reader.template Next<ValueIn>();
                Key key = key_extractor_(in);

                size_t hr = hash_function_(key) % max_hash;
                auto target_processor = target_processors.find(hr);
                emitters_[target_processor->second].Put(in);
            }
        }
        // data has been pushed during pre-op -> close emitters
        emitters_.Close();

        MainOp();
    }

    void PushData(bool consume) final {
        LOG << "sort data";
        common::StatsTimerStart timer;
        const size_t num_runs = files_.size();
        if (num_runs == 0) {
            // nothing to push
        }
        else if (num_runs == 1) {
            // if there's only one run, call user funcs
            RunUserFunc(files_[0], consume);
        }
        else {
            // otherwise sort all runs using multiway merge
            size_t merge_degree, prefetch;

            // merge batches of files if necessary
            while (std::tie(merge_degree, prefetch) =
                       context_.block_pool().MaxMergeDegreePrefetch(files_.size()),
                   files_.size() > merge_degree)
            {
                sLOG1 << "Partial multi-way-merge of"
                      << merge_degree << "files with prefetch" << prefetch;

                // create merger for first merge_degree Files
                std::vector<data::File::ConsumeReader> seq;
                seq.reserve(merge_degree);

                for (size_t t = 0; t < merge_degree; ++t) {
                    seq.emplace_back(
                        files_[t].GetConsumeReader(/* prefetch */ 0));
                }

                StartPrefetch(seq, prefetch);

                auto puller = core::make_multiway_merge_tree<ValueIn>(
                    seq.begin(), seq.end(), ValueComparator(*this));

                // create new File for merged items
                files_.emplace_back(context_.GetFile(this));
                auto writer = files_.back().GetWriter();

                while (puller.HasNext()) {
                    writer.Put(puller.Next());
                }
                writer.Close();

                // this clear is important to release references to the files.
                seq.clear();

                // remove merged files
                files_.erase(files_.begin(), files_.begin() + merge_degree);
            }

            std::vector<data::File::Reader> seq;
            seq.reserve(num_runs);

            for (size_t t = 0; t < num_runs; ++t) {
                seq.emplace_back(
                    files_[t].GetReader(consume, /* prefetch */ 0));
            }

            StartPrefetch(seq, prefetch);

            LOG << "start multiwaymerge for real";
            auto puller = core::make_multiway_merge_tree<ValueIn>(
                seq.begin(), seq.end(), ValueComparator(*this));

            LOG << "run user func";
            if (puller.HasNext()) {
                // create iterator to pass to user_function
                auto user_iterator = GroupByMultiwayMergeIterator<
                    ValueIn, KeyExtractor, ValueComparator>(
                    puller, key_extractor_);

                while (user_iterator.HasNextForReal()) {
                    // call user function
                    const ValueOut res = groupby_function_(
                        user_iterator, user_iterator.GetNextKey());
                    // push result to callback functions
                    this->PushItem(res);
                }
            }
        }
        timer.Stop();
        LOG << "RESULT"
            << " name=multiwaymerge"
            << " time=" << timer
            << " multiwaymerge=" << (num_runs > 1);
    }

    void Dispose() override { }

private:
    KeyExtractor key_extractor_;
    GroupFunction groupby_function_;
    HashFunction hash_function_;

    data::CatStreamPtr stream_ { context_.GetNewCatStream(Super::id()) };
    data::CatStream::Writers emitters_;

    std::deque<data::File> files_;
    //! total number of items stored in files_
    size_t totalsize_ = 0;

    //! location detection and associated files
    core::LocationDetection<HashCount> location_detection_;
    data::File pre_file_;
    data::File::Writer pre_writer_;

    void RunUserFunc(data::File& f, bool consume) {
        auto r = f.GetReader(consume);
        if (r.HasNext()) {
            // create iterator to pass to user_function
            LOG << "get iterator";
            auto user_iterator = GroupByIterator<
                ValueIn, KeyExtractor, ValueComparator>(r, key_extractor_);
            LOG << "start running user func";
            while (user_iterator.HasNextForReal()) {
                // call user function
                const ValueOut res = groupby_function_(user_iterator,
                                                       user_iterator.GetNextKey());
                // push result to callback functions
                this->PushItem(res);
            }
            LOG << "finished user func";
        }
    }

    //! Sort and store elements in a file
    void FlushVectorToFile(std::vector<ValueIn>& v) {
        // sort the run and write it to a file
        std::sort(v.begin(), v.end(), ValueComparator(*this));
        totalsize_ += v.size();

        files_.emplace_back(context_.GetFile(this));
        data::File::Writer w = files_.back().GetWriter();
        for (const ValueIn& e : v) {
            w.Put(e);
        }
        w.Close();
    }

    //! Receive elements from other workers.
    void MainOp() {
        LOG << "running group by main op";

        std::vector<ValueIn> incoming;

        common::StatsTimerStart timer;
        // get incoming elements
        auto reader = stream_->GetCatReader(/* consume */ true);
        while (reader.HasNext()) {
            // if memory is exceeded, spill the current run to disk
            if (mem::memory_exceeded) {
                FlushVectorToFile(incoming);
                incoming.clear();
            }
            // store incoming element
            incoming.emplace_back(reader.template Next<ValueIn>());
        }
        FlushVectorToFile(incoming);
        std::vector<ValueIn>().swap(incoming);
        LOG << "finished receiving elems";
        stream_.reset();

        timer.Stop();

        LOG << "RESULT"
            << " name=mainop"
            << " time=" << timer
            << " number_files=" << files_.size();
    }
};

/******************************************************************************/

template <typename ValueType, typename Stack>
template <typename ValueOut, bool LocationDetectionValue,
          typename KeyExtractor, typename GroupFunction, typename HashFunction>
auto DIA<ValueType, Stack>::GroupByKey(
    const LocationDetectionFlag<LocationDetectionValue>&,
    const KeyExtractor& key_extractor,
    const GroupFunction& groupby_function,
    const HashFunction& hash_function) const {

    static_assert(
        std::is_same<
            typename std::decay<typename common::FunctionTraits<KeyExtractor>
                                ::template arg<0> >::type,
            ValueType>::value,
        "KeyExtractor has the wrong input type");

    using GroupByNode = api::GroupByNode<
        ValueOut, KeyExtractor, GroupFunction, HashFunction,
        LocationDetectionValue>;

    auto node = tlx::make_counting<GroupByNode>(
        *this, key_extractor, groupby_function, hash_function);

    return DIA<ValueOut>(node);
}

template <typename ValueType, typename Stack>
template <typename ValueOut, typename KeyExtractor,
          typename GroupFunction, typename HashFunction>
auto DIA<ValueType, Stack>::GroupByKey(
    const KeyExtractor& key_extractor,
    const GroupFunction& groupby_function,
    const HashFunction& hash_function) const {
    // forward to other method _without_ location detection
    return GroupByKey<ValueOut>(
        NoLocationDetectionTag, key_extractor, groupby_function, hash_function);
}

template <typename ValueType, typename Stack>
template <typename ValueOut, typename KeyExtractor, typename GroupFunction>
auto DIA<ValueType, Stack>::GroupByKey(
    const KeyExtractor& key_extractor,
    const GroupFunction& groupby_function) const {
    // forward to other method _without_ location detection
    return GroupByKey<ValueOut>(
        NoLocationDetectionTag, key_extractor, groupby_function,
        std::hash<typename FunctionTraits<KeyExtractor>::result_type>());
}

} // namespace api
} // namespace thrill

#endif // !THRILL_API_GROUP_BY_KEY_HEADER

/******************************************************************************/
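
Usage sketch (not part of the header): GroupByKey() expects a key extractor and a group function; the group function receives an iterator over all values sharing one key together with the key itself. The Generate/AllGather pipeline, the modulus 10, and the lambda names below are illustrative assumptions, not code from this file.

    #include <thrill/api/all_gather.hpp>
    #include <thrill/api/context.hpp>
    #include <thrill/api/generate.hpp>
    #include <thrill/api/group_by_key.hpp>

    #include <vector>

    void GroupByKeyExample(thrill::Context& ctx) {
        // key: residue modulo 10 (illustrative choice)
        auto key_fn = [](size_t x) { return x % 10; };

        // group function: sum all values of one key group; the iterator
        // exposes HasNext()/Next() over the values of the current key
        auto sum_fn = [](auto& iter, const size_t& /* key */) {
            size_t sum = 0;
            while (iter.HasNext()) sum += iter.Next();
            return sum;
        };

        // numbers 0..999 grouped into 10 groups, each summed by its owner
        std::vector<size_t> sums =
            thrill::api::Generate(ctx, 1000)
            .GroupByKey<size_t>(key_fn, sum_fn)
            .AllGather();
    }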