docs/master/reduce__pre__phase_8hpp_source.html

 /*******************************************************************************
  * thrill/core/reduce_pre_phase.hpp
  *
  * Hash table with support for reduce and partitions.
  *
  * Part of Project Thrill - http://project-thrill.org
  *
  * Copyright (C) 2015 Matthias Stumpp <mstumpp@gmail.com>
  * Copyright (C) 2015 Alexander Noe <aleexnoe@gmail.com>
  * Copyright (C) 2015 Timo Bingmann <tb@panthema.net>
  *
  * All rights reserved. Published under the BSD-2 license in the LICENSE file.
  ******************************************************************************/

 #pragma once
 #ifndef THRILL_CORE_REDUCE_PRE_PHASE_HEADER
 #define THRILL_CORE_REDUCE_PRE_PHASE_HEADER

 #include <thrill/common/defines.hpp>
 #include <thrill/common/logger.hpp>
 #include <thrill/common/math.hpp>
 #include <thrill/core/duplicate_detection.hpp>
 #include <thrill/core/reduce_bucket_hash_table.hpp>
 #include <thrill/core/reduce_functional.hpp>
 #include <thrill/core/reduce_old_probing_hash_table.hpp>
 #include <thrill/core/reduce_probing_hash_table.hpp>
 #include <thrill/data/block_reader.hpp>
 #include <thrill/data/block_writer.hpp>
 #include <thrill/data/file.hpp>

 #include <algorithm>
 #include <cassert>
 #include <cmath>
 #include <functional>
 #include <string>
 #include <utility>
 #include <vector>

 namespace thrill {
 namespace core {

 //! Emitter implementation to plug into a reduce hash table for
 //! collecting/flushing items while reducing. Items flushed in the pre-phase are
 //! transmitted via a network Channel.
 //!
 //! The emitter does not own the writers: it keeps a reference to the vector
 //! handed to the constructor, so that vector must outlive the emitter.
 template <typename TableItem, bool VolatileKey, typename BlockWriter>
 class ReducePrePhaseEmitter
 {
     static constexpr bool debug = false;

 public:
     //! Bind to the given writers (one per partition) and start with a zeroed
     //! item counter for each of them.
     explicit ReducePrePhaseEmitter(std::vector<BlockWriter>& writer)
         : writer_(writer),
           stats_(writer.size(), 0) { }

     //! Output an element into a partition and count it in the statistics.
     //! \param partition_id index of the target writer, must be < writer count
     //! \param p item to put into that writer
     void Emit(const size_t& partition_id, const TableItem& p) {
         assert(partition_id < writer_.size());
         stats_[partition_id]++;
         writer_[partition_id].Put(p);
     }

     //! Flush the writer of a single partition.
     void Flush(size_t partition_id) {
         assert(partition_id < writer_.size());
         writer_[partition_id].Flush();
     }

     //! Close every writer and log how many items each one received.
     void CloseAll() {
         sLOG << "emit stats:";
         // The index is advanced by the loop header, never inside the log
         // expression: a side effect there would depend on operand evaluation
         // order and on whether the log statement evaluates its operands.
         for (size_t i = 0; i < writer_.size(); ++i) {
             writer_[i].Close();
             sLOG << "emitter" << i << "pushed" << stats_[i];
         }
     }

 public:
     //! Set of emitters, one per partition.
     std::vector<BlockWriter>& writer_;

     //! Emitter stats: number of items emitted per partition.
     std::vector<size_t> stats_;
 };

 //! Primary template of the reduce pre-phase. It is only declared here; the
 //! implementations are the two partial specializations in this file, selected
 //! by the trailing UseDuplicateDetection flag (false: plain pre-phase, true:
 //! pre-phase with duplicate detection). The default template arguments are
 //! given on this declaration only.
 template <typename TableItem, typename Key, typename Value,
           typename KeyExtractor, typename ReduceFunction,
           const bool VolatileKey,
           typename BlockWriter,
           typename ReduceConfig_ = DefaultReduceConfig,
           typename IndexFunction = ReduceByHash<Key>,
           typename KeyEqualFunction = std::equal_to<Key>,
           typename HashFunction = std::hash<Key>,
           bool UseDuplicateDetection = false>
 class ReducePrePhase;

 //! Reduce pre-phase without duplicate detection: values are turned into table
 //! items, inserted into a reduce hash table, and the table hands flushed items
 //! to a ReducePrePhaseEmitter built on the given block writers.
 template <typename TableItem, typename Key, typename Value,
           typename KeyExtractor, typename ReduceFunction,
           const bool VolatileKey, typename BlockWriter,
           typename ReduceConfig_,
           typename IndexFunction,
           typename KeyEqualFunction,
           typename HashFunction>
 class ReducePrePhase<TableItem, Key, Value,
                      KeyExtractor, ReduceFunction,
                      VolatileKey, BlockWriter,
                      ReduceConfig_,
                      IndexFunction,
                      KeyEqualFunction,
                      HashFunction,
                      false>
 {
     static constexpr bool debug = false;

 public:
     //! configuration type of the hash table
     using ReduceConfig = ReduceConfig_;
     //! emitter type the hash table flushes into
     using Emitter = ReducePrePhaseEmitter<TableItem, VolatileKey, BlockWriter>;
     //! helper converting a Value into a TableItem
     using MakeTableItem = ReduceMakeTableItem<Value, TableItem, VolatileKey>;

     //! hash table implementation, chosen via ReduceConfig::table_impl_
     using Table = typename ReduceTableSelect<
         ReduceConfig::table_impl_,
         TableItem, Key, Value,
         KeyExtractor, ReduceFunction, Emitter,
         VolatileKey, ReduceConfig, IndexFunction, KeyEqualFunction>::type;

     /*!
      * Construct the pre-phase: wraps the block writers in an emitter and sets
      * up the hash table on top of it.
      *
      * \param ctx                context passed through to the hash table
      * \param dia_id             DIA id passed through to the hash table
      * \param num_partitions     number of partitions, must equal emit.size()
      * \param key_extractor      function extracting the key from a value
      * \param reduce_function    function combining two values
      * \param emit               one block writer per partition, kept by
      *                           reference inside the emitter
      * \param config             hash table configuration
      * \param index_function     function mapping an item to its table index
      * \param key_equal_function key equality predicate
      * \param hash_function      accepted for interface symmetry, unused here
      * \param duplicates         the table receives the negation of this flag
      */
     ReducePrePhase(Context& ctx, size_t dia_id,
                    size_t num_partitions,
                    KeyExtractor key_extractor,
                    ReduceFunction reduce_function,
                    std::vector<BlockWriter>& emit,
                    const ReduceConfig& config = ReduceConfig(),
                    const IndexFunction& index_function = IndexFunction(),
                    const KeyEqualFunction& key_equal_function = KeyEqualFunction(),
                    const HashFunction hash_function = HashFunction(),
                    bool duplicates = false)
         : emit_(emit),
           key_extractor_(key_extractor),
           table_(ctx, dia_id,
                  key_extractor, reduce_function, emit_,
                  num_partitions, config, !duplicates,
                  index_function, key_equal_function) {

         // this specialization never hashes keys on its own
         tlx::unused(hash_function);

         sLOG << "creating ReducePrePhase with" << emit.size() << "output emitters";

         assert(num_partitions == emit.size());
     }

     //! non-copyable: delete copy-constructor
     ReducePrePhase(const ReducePrePhase&) = delete;
     //! non-copyable: delete assignment operator
     ReducePrePhase& operator = (const ReducePrePhase&) = delete;

     //! Forward the memory limit to the hash table's Initialize().
     void Initialize(size_t limit_memory_bytes) {
         table_.Initialize(limit_memory_bytes);
     }

     //! Forward to the hash table's InitializeSkip().
     void InitializeSkip() {
         table_.InitializeSkip();
     }

     //! Convert v into a table item and insert it into the hash table.
     //! \return whatever the table's Insert() reports
     bool Insert(const Value& v) {
         // for VolatileKey this makes std::pair and extracts the key
         return table_.Insert(MakeTableItem::Make(v, table_.key_extractor()));
     }

     //! Bypass the hash table: convert v into a table item and emit it
     //! directly to the partition computed by the table's index function.
     void InsertSkip(const Value& v) {
         TableItem item = MakeTableItem::Make(v, table_.key_extractor());
         const typename IndexFunction::Result index =
             table_.calculate_index(item);
         emit_.Emit(index.partition_id, item);
     }

     //! Flush all partitions
     void FlushAll() {
         const bool consume = true;
         const bool grow = false;
         for (size_t p = 0; p != table_.num_partitions(); ++p) {
             FlushPartition(p, consume, grow);
         }
     }

     //! Flushes a partition
     void FlushPartition(size_t partition_id, bool consume, bool grow) {
         // items go straight to the emitter, so no spilled data is handled here
         table_.FlushPartition(partition_id, consume, grow);
     }

     //! Closes all emitter
     void CloseAll() {
         emit_.CloseAll();
         table_.Dispose();
     }

     //! \name Accessors
     //! \{

     //! Returns the total num of items in the table.
     size_t num_items() const {
         return table_.num_items();
     }

     //! calculate key range for the given output partition
     common::Range key_range(size_t partition_id) {
         return table_.key_range(partition_id);
     }

     //! \}

 protected:
     //! Emitters used to parameterize hash table for output to network.
     Emitter emit_;

     //! extractor function which maps a value to its key
     KeyExtractor key_extractor_;

     //! the first-level hash table implementation
     Table table_;
 };

 //! Reduce pre-phase with duplicate detection. Extends the plain pre-phase:
 //! the hash of every key whose Insert() returned true is recorded, FlushAll()
 //! runs DuplicateDetection over those hashes, and each flushed item is then
 //! routed either to its regular partition or to this worker's own rank.
 template <typename TableItem, typename Key, typename Value,
           typename KeyExtractor, typename ReduceFunction,
           const bool VolatileKey, typename BlockWriter,
           typename ReduceConfig,
           typename IndexFunction,
           typename EqualToFunction,
           typename HashFunction>
 class ReducePrePhase<TableItem, Key, Value,
                      KeyExtractor,
                      ReduceFunction,
                      VolatileKey,
                      BlockWriter,
                      ReduceConfig,
                      IndexFunction,
                      EqualToFunction,
                      HashFunction,
                      true>
     : public ReducePrePhase<TableItem, Key, Value,
                             KeyExtractor,
                             ReduceFunction,
                             VolatileKey,
                             BlockWriter,
                             ReduceConfig,
                             IndexFunction,
                             EqualToFunction,
                             HashFunction,
                             false>
 {

 public:
     using Super = ReducePrePhase<TableItem, Key, Value, KeyExtractor,
                                  ReduceFunction, VolatileKey, BlockWriter,
                                  ReduceConfig,
                                  IndexFunction, EqualToFunction, HashFunction,
                                  false>;
     using KeyValuePair = std::pair<Key, Value>;

     //! Construct the base pre-phase with its duplicates flag set to true and
     //! keep a copy of the hash function for duplicate detection.
     ReducePrePhase(Context& ctx, size_t dia_id,
                    size_t num_partitions,
                    KeyExtractor key_extractor,
                    ReduceFunction reduce_function,
                    std::vector<BlockWriter>& emit,
                    const ReduceConfig& config = ReduceConfig(),
                    const IndexFunction& index_function = IndexFunction(),
                    const EqualToFunction& equal_to_function = EqualToFunction(),
                    const HashFunction hash_function = HashFunction())
         : Super(ctx, dia_id, num_partitions, key_extractor, reduce_function,
                 emit, config, index_function, equal_to_function, hash_function,
                 /*duplicates*/ true),
           hash_function_(hash_function) { }

     //! Insert v into the hash table; if the table's Insert() returns true,
     //! additionally record the hash of v's key for duplicate detection.
     void Insert(const Value& v) {
         if (Super::table_.Insert(
                 Super::MakeTableItem::Make(v, Super::table_.key_extractor()))) {
             hashes_.push_back(hash_function_(Super::key_extractor_(v)));
         }
     }

     //! Run duplicate detection over the collected hashes, then flush all
     //! partitions.
     void FlushAll() {
         DuplicateDetection dup_detect;
         max_hash_ = dup_detect.FindNonDuplicates(non_duplicates_,
                                                  hashes_,
                                                  Super::table_.ctx(),
                                                  Super::table_.dia_id());

         for (size_t id = 0; id < Super::table_.num_partitions(); ++id) {
             FlushPartition(id, /* consume */ true, /* grow */ false);
         }
     }

     //! Flush one partition: first the items held in the table, then any
     //! items spilled to the partition's file. Requires that FlushAll() has
     //! already run duplicate detection (max_hash_ must be set).
     void FlushPartition(size_t partition_id, bool consume, bool grow) {
         Super::table_.FlushPartitionEmit(
             partition_id, consume, grow,
             [this](const size_t& target_id, const TableItem& ti) {
                 EmitRouted(target_id, ti);
             });

         if (Super::table_.has_spilled_data_on_partition(partition_id)) {
             // NOTE(review): the spilled file is always read with
             // GetReader(true), independent of the consume argument -- confirm
             // this is intended.
             data::File::Reader reader =
                 Super::table_.partition_files()[partition_id].GetReader(true);
             while (reader.HasNext()) {
                 EmitRouted(partition_id, reader.Next<TableItem>());
             }
         }

         // flush elements pushed into emitter
         Super::emit_.Flush(partition_id);
         Super::emit_.Flush(Super::table_.ctx().my_rank());
     }

     //! \name Duplicate Detection
     //! \{

     //! Hash function applied to keys for duplicate detection.
     HashFunction hash_function_;
     //! Hashes of all keys.
     std::vector<size_t> hashes_;
     //! Result of duplicate detection, indexed by (key hash % max_hash_). Items
     //! whose entry is true are emitted to this worker's own rank, all others
     //! to their regular partition.
     std::vector<bool> non_duplicates_;
     //! Modulo for all hashes in duplicate detection to reduce hash space.
     //! Zero until FlushAll() has run duplicate detection.
     size_t max_hash_ = 0;

     //! Number of items emitted to their regular partition.
     size_t duplicated_elements_ = 0;
     //! Number of items emitted to this worker's own rank.
     size_t non_duplicate_elements_ = 0;

     //! \}

 private:
     //! Emit one item according to the duplicate detection result: to
     //! partition_id if its key hash is not marked in non_duplicates_,
     //! otherwise to this worker's own rank.
     void EmitRouted(size_t partition_id, const TableItem& ti) {
         // a zero modulus means duplicate detection has not run yet
         assert(max_hash_ != 0);

         Key key = Super::MakeTableItem::GetKey(
             ti, Super::table_.key_extractor());
         if (!non_duplicates_[hash_function_(key) % max_hash_]) {
             duplicated_elements_++;
             Super::emit_.Emit(partition_id, ti);
         }
         else {
             non_duplicate_elements_++;
             Super::emit_.Emit(Super::table_.ctx().my_rank(), ti);
         }
     }
 };

 } // namespace core
 } // namespace thrill

 #endif // !THRILL_CORE_REDUCE_PRE_PHASE_HEADER

 /******************************************************************************/
thrill::core::ReducePrePhase< TableItem, Key, Value, KeyExtractor, ReduceFunction, VolatileKey, BlockWriter, ReduceConfig_, IndexFunction, KeyEqualFunction, HashFunction, false >::key_range
common::Range key_range(size_t partition_id)
calculate key range for the given output partition
Definition: reduce_pre_phase.hpp:204

thrill::core::ReducePrePhase< TableItem, Key, Value, KeyExtractor, ReduceFunction, VolatileKey, BlockWriter, ReduceConfig, IndexFunction, EqualToFunction, HashFunction, true >::FlushPartition
void FlushPartition(size_t partition_id, bool consume, bool grow)
Definition: reduce_pre_phase.hpp:291

thrill::core::ReducePrePhaseEmitter::stats_
std::vector< size_t > stats_
Emitter stats.
Definition: reduce_pre_phase.hpp:82

thrill::core::ReducePrePhase< TableItem, Key, Value, KeyExtractor, ReduceFunction, VolatileKey, BlockWriter, ReduceConfig_, IndexFunction, KeyEqualFunction, HashFunction, false >::ReducePrePhase
ReducePrePhase(Context &ctx, size_t dia_id, size_t num_partitions, KeyExtractor key_extractor, ReduceFunction reduce_function, std::vector< BlockWriter > &emit, const ReduceConfig &config=ReduceConfig(), const IndexFunction &index_function=IndexFunction(), const KeyEqualFunction &key_equal_function=KeyEqualFunction(), const HashFunction hash_function=HashFunction(), bool duplicates=false)
A data structure which takes an arbitrary value and extracts a key using a key extractor function fro...
Definition: reduce_pre_phase.hpp:130

thrill::core::ReducePrePhase< TableItem, Key, Value, KeyExtractor, ReduceFunction, VolatileKey, BlockWriter, ReduceConfig_, IndexFunction, KeyEqualFunction, HashFunction, false >::FlushPartition
void FlushPartition(size_t partition_id, bool consume, bool grow)
Flushes a partition.
Definition: reduce_pre_phase.hpp:186

sLOG
#define sLOG
Default logging method: output if the local debug variable is true.
Definition: logger.hpp:34

thrill::core::ReducePrePhaseEmitter::Flush
void Flush(size_t partition_id)
Definition: reduce_pre_phase.hpp:63

block_writer.hpp

thrill::core::ReducePrePhase< TableItem, Key, Value, KeyExtractor, ReduceFunction, VolatileKey, BlockWriter, ReduceConfig, IndexFunction, EqualToFunction, HashFunction, true >::FlushAll
void FlushAll()
Flush all partitions.
Definition: reduce_pre_phase.hpp:279

thrill::core::ReducePrePhase< TableItem, Key, Value, KeyExtractor, ReduceFunction, VolatileKey, BlockWriter, ReduceConfig_, IndexFunction, KeyEqualFunction, HashFunction, false >::Table
typename ReduceTableSelect< ReduceConfig::table_impl_, TableItem, Key, Value, KeyExtractor, ReduceFunction, Emitter, VolatileKey, ReduceConfig, IndexFunction, KeyEqualFunction >::type Table
Definition: reduce_pre_phase.hpp:123

reduce_bucket_hash_table.hpp

thrill::core::ReducePrePhase< TableItem, Key, Value, KeyExtractor, ReduceFunction, VolatileKey, BlockWriter, ReduceConfig_, IndexFunction, KeyEqualFunction, HashFunction, false >::CloseAll
void CloseAll()
Closes all emitter.
Definition: reduce_pre_phase.hpp:192

thrill::core::ReducePrePhase< TableItem, Key, Value, KeyExtractor, ReduceFunction, VolatileKey, BlockWriter, ReduceConfig_, IndexFunction, KeyEqualFunction, HashFunction, false >::num_items
size_t num_items() const
Returns the total num of items in the table.
Definition: reduce_pre_phase.hpp:201

thrill::core::ReduceTableSelect
Type selection via ReduceTableImpl enum.
Definition: reduce_table.hpp:348

thrill::core::ReducePrePhase< TableItem, Key, Value, KeyExtractor, ReduceFunction, VolatileKey, BlockWriter, ReduceConfig_, IndexFunction, KeyEqualFunction, HashFunction, false >::InitializeSkip
void InitializeSkip()
Definition: reduce_pre_phase.hpp:163

thrill::core::ReducePrePhase< TableItem, Key, Value, KeyExtractor, ReduceFunction, VolatileKey, BlockWriter, ReduceConfig, IndexFunction, EqualToFunction, HashFunction, true >::non_duplicates_
std::vector< bool > non_duplicates_
Definition: reduce_pre_phase.hpp:340

defines.hpp

thrill::data::BlockReader::Next
TLX_ATTRIBUTE_ALWAYS_INLINE T Next()
Next() reads a complete item T.
Definition: block_reader.hpp:89

thrill::common::Range
represents a 1 dimensional range (interval) [begin,end)
Definition: math.hpp:41

logger.hpp

thrill::core::DuplicateDetection::FindNonDuplicates
size_t FindNonDuplicates(std::vector< bool > &non_duplicates, std::vector< size_t > &hashes, Context &context, size_t dia_id)
Identifies all hashes which occur on only a single worker.
Definition: duplicate_detection.hpp:150

thrill::core::ReducePrePhaseEmitter::Emit
void Emit(const size_t &partition_id, const TableItem &p)
Definition: reduce_pre_phase.hpp:57

thrill::api::Context
The Context of a job is a unique instance per worker which holds references to all underlying parts o...
Definition: context.hpp:221

thrill::core::ReducePrePhase< TableItem, Key, Value, KeyExtractor, ReduceFunction, VolatileKey, BlockWriter, ReduceConfig_, IndexFunction, KeyEqualFunction, HashFunction, false >::emit_
Emitter emit_
Emitters used to parameterize hash table for output to network.
Definition: reduce_pre_phase.hpp:211

thrill::core::ReducePrePhaseEmitter
Definition: reduce_pre_phase.hpp:46

thrill::core::ReducePrePhaseEmitter::writer_
std::vector< BlockWriter > & writer_
Set of emitters, one per partition.
Definition: reduce_pre_phase.hpp:79

tlx::unused
void unused(Types &&...)
Definition: unused.hpp:20

thrill::core::ReduceByHash
A reduce index function which returns a hash index and partition.
Definition: reduce_functional.hpp:31

reduce_old_probing_hash_table.hpp

block_reader.hpp

thrill::core::ReducePrePhase< TableItem, Key, Value, KeyExtractor, ReduceFunction, VolatileKey, BlockWriter, ReduceConfig, IndexFunction, EqualToFunction, HashFunction, true >::max_hash_
size_t max_hash_
Modulo for all hashes in duplicate detection to reduce hash space.
Definition: reduce_pre_phase.hpp:342

thrill::core::ReducePrePhase< TableItem, Key, Value, KeyExtractor, ReduceFunction, VolatileKey, BlockWriter, ReduceConfig_, IndexFunction, KeyEqualFunction, HashFunction, false >::Initialize
void Initialize(size_t limit_memory_bytes)
Definition: reduce_pre_phase.hpp:159

thrill::data::BlockReader
BlockReader takes Block objects from BlockSource and allows reading of a) serializable Items or b) ar...
Definition: block_reader.hpp:42

thrill::core::ReducePrePhase< TableItem, Key, Value, KeyExtractor, ReduceFunction, VolatileKey, BlockWriter, ReduceConfig_, IndexFunction, KeyEqualFunction, HashFunction, false >::key_extractor_
KeyExtractor key_extractor_
extractor function which maps a value to its key
Definition: reduce_pre_phase.hpp:214

thrill::core::DefaultReduceConfig
Configuration class to define operational parameters of reduce hash tables and reduce phases...
Definition: reduce_table.hpp:40

thrill::core::ReducePrePhase< TableItem, Key, Value, KeyExtractor, ReduceFunction, VolatileKey, BlockWriter, ReduceConfig_, IndexFunction, KeyEqualFunction, HashFunction, false >::ReduceConfig
ReduceConfig_ ReduceConfig
Definition: reduce_pre_phase.hpp:115

thrill::core::ReducePrePhase< TableItem, Key, Value, KeyExtractor, ReduceFunction, VolatileKey, BlockWriter, ReduceConfig, IndexFunction, EqualToFunction, HashFunction, true >::hashes_
std::vector< size_t > hashes_
Hashes of all keys.
Definition: reduce_pre_phase.hpp:337

thrill::core::ReducePrePhaseEmitter::CloseAll
void CloseAll()
Definition: reduce_pre_phase.hpp:68

thrill::core::ReducePrePhase< TableItem, Key, Value, KeyExtractor, ReduceFunction, VolatileKey, BlockWriter, ReduceConfig_, IndexFunction, KeyEqualFunction, HashFunction, false >::FlushAll
void FlushAll()
Flush all partitions.
Definition: reduce_pre_phase.hpp:179

reduce_probing_hash_table.hpp

math.hpp

thrill::core::ReducePrePhaseEmitter::debug
static constexpr bool debug
Definition: reduce_pre_phase.hpp:48

thrill::core::ReduceMakeTableItem
Definition: reduce_functional.hpp:157

duplicate_detection.hpp

file.hpp

thrill::core::ReducePrePhase< TableItem, Key, Value, KeyExtractor, ReduceFunction, VolatileKey, BlockWriter, ReduceConfig, IndexFunction, EqualToFunction, HashFunction, true >::Insert
void Insert(const Value &v)
Definition: reduce_pre_phase.hpp:271

thrill::core::ReducePrePhase< TableItem, Key, Value, KeyExtractor, ReduceFunction, VolatileKey, BlockWriter, ReduceConfig_, IndexFunction, KeyEqualFunction, HashFunction, false >::Insert
bool Insert(const Value &v)
Definition: reduce_pre_phase.hpp:167

reduce_functional.hpp

thrill
Definition: action_node.hpp:21

thrill::core::ReducePrePhase< TableItem, Key, Value, KeyExtractor, ReduceFunction, VolatileKey, BlockWriter, ReduceConfig_, IndexFunction, KeyEqualFunction, HashFunction, false >::InsertSkip
void InsertSkip(const Value &v)
Definition: reduce_pre_phase.hpp:172

thrill::data::BlockReader::HasNext
TLX_ATTRIBUTE_ALWAYS_INLINE bool HasNext()
HasNext() returns true if at least one more item is available.
Definition: block_reader.hpp:118

thrill::core::ReducePrePhase
Definition: reduce_pre_phase.hpp:94

thrill::core::ReducePrePhase< TableItem, Key, Value, KeyExtractor, ReduceFunction, VolatileKey, BlockWriter, ReduceConfig, IndexFunction, EqualToFunction, HashFunction, true >::ReducePrePhase
ReducePrePhase(Context &ctx, size_t dia_id, size_t num_partitions, KeyExtractor key_extractor, ReduceFunction reduce_function, std::vector< BlockWriter > &emit, const ReduceConfig &config=ReduceConfig(), const IndexFunction &index_function=IndexFunction(), const EqualToFunction &equal_to_function=EqualToFunction(), const HashFunction hash_function=HashFunction())
Definition: reduce_pre_phase.hpp:257

thrill::core::ReducePrePhaseEmitter::ReducePrePhaseEmitter
ReducePrePhaseEmitter(std::vector< BlockWriter > &writer)
Definition: reduce_pre_phase.hpp:51

thrill::core::DuplicateDetection
Duplicate detection to identify all elements occurring only on one worker.
Definition: duplicate_detection.hpp:46

thrill::core::ReducePrePhase< TableItem, Key, Value, KeyExtractor, ReduceFunction, VolatileKey, BlockWriter, ReduceConfig, IndexFunction, EqualToFunction, HashFunction, true >::hash_function_
HashFunction hash_function_
Definition: reduce_pre_phase.hpp:335

thrill::core::ReducePrePhase< TableItem, Key, Value, KeyExtractor, ReduceFunction, VolatileKey, BlockWriter, ReduceConfig, IndexFunction, EqualToFunction, HashFunction, true >::KeyValuePair
std::pair< Key, Value > KeyValuePair
Definition: reduce_pre_phase.hpp:255

thrill::core::ReducePrePhase< TableItem, Key, Value, KeyExtractor, ReduceFunction, VolatileKey, BlockWriter, ReduceConfig_, IndexFunction, KeyEqualFunction, HashFunction, false >::table_
Table table_
the first-level hash table implementation
Definition: reduce_pre_phase.hpp:217