Thrill  0.1
hyperloglog.hpp
Go to the documentation of this file.
1 /*******************************************************************************
2  * thrill/core/hyperloglog.hpp
3  *
4  * Part of Project Thrill - http://project-thrill.org
5  *
6  * Copyright (C) 2017 Moritz Kiefer <[email protected]>
7  * Copyright (C) 2017 Tino Fuhrmann <[email protected]>
8  *
9  * All rights reserved. Published under the BSD-2 license in the LICENSE file.
10  ******************************************************************************/
11 
12 #pragma once
13 #ifndef THRILL_CORE_HYPERLOGLOG_HEADER
14 #define THRILL_CORE_HYPERLOGLOG_HEADER
15 
17 #include <tlx/die.hpp>
18 #include <tlx/math/clz.hpp>
19 #include <tlx/siphash.hpp>
20 
21 #include <cmath>
22 #include <vector>
23 
24 namespace thrill {
25 namespace core {
26 
27 // The high 25 bit in this register are used for the index, the next 6 bits for
28 // the value and the last bit is currently unused
29 using HyperLogLogSparseRegister = uint32_t;
30 
32 
33 template <size_t p>
35 {
36 public:
38 
39  size_t size() const { return entries_.size(); }
40 
41  void toDense();
42 
43  bool shouldConvertToDense();
44  bool shouldMerge();
45 
46  template <typename ValueType>
47  void insert(const ValueType& value) {
48  // first p bits are the index
49  insert_hash(tlx::siphash(value));
50  }
51 
52  void insert_hash(const uint64_t& hash_value);
53  void mergeSparse();
54 
55  void mergeDense(const HyperLogLogRegisters<p>& b);
56 
57  //! calculate count estimation result adjusted for bias
58  double result();
59 
60  //! combine two HyperloglogRegisters, switches between sparse/dense
61  //! representations
62  HyperLogLogRegisters operator + (
63  const HyperLogLogRegisters<p>& registers2) const;
64 
65  //! declare friendship with serializers
66  template <typename Archive, typename T, typename Enable>
67  friend struct data::Serialization;
68 
69 private:
70  unsigned sparse_size_ = 0;
72 
73  // Register values are always smaller than 64. We thus need log2(64) = 6
74  // bits to store them. In particular an uint8_t is sufficient
75  std::vector<uint8_t> sparseListBuffer_;
76  std::vector<HyperLogLogSparseRegister> deltaSet_;
77  std::vector<uint8_t> entries_;
78 };
79 
80 /******************************************************************************/
81 // Additional Helpers, exposed mainly for testing
82 
83 namespace hyperloglog {
84 
85 template <size_t sparsePrecision, size_t densePrecision>
86 uint32_t encodeHash(uint64_t hash);
87 
88 template <size_t sparsePrecision, size_t densePrecision>
89 std::pair<size_t, uint8_t> decodeHash(HyperLogLogSparseRegister reg);
90 
91 //! Perform a varint and a difference encoding
92 std::vector<uint8_t> encodeSparseList(const std::vector<uint32_t>& sparseList);
93 std::vector<uint32_t> decodeSparseList(const std::vector<uint8_t>& sparseList);
94 
95 } // namespace hyperloglog
96 } // namespace core
97 
98 namespace data {
99 
100 template <typename Archive, size_t p>
101 struct Serialization<Archive, core::HyperLogLogRegisters<p> > {
102 
103  static void Serialize(const core::HyperLogLogRegisters<p>& x, Archive& ar);
104  static core::HyperLogLogRegisters<p> Deserialize(Archive& ar);
105 
106  static constexpr bool is_fixed_size = false;
107  static constexpr size_t fixed_size = 0;
108 };
109 
110 } // namespace data
111 } // namespace thrill
112 
113 #endif // !THRILL_CORE_HYPERLOGLOG_HEADER
114 
115 /******************************************************************************/
std::pair< size_t, uint8_t > decodeHash(HyperLogLogSparseRegister reg)
uint32_t HyperLogLogSparseRegister
Definition: hyperloglog.hpp:29
std::vector< uint8_t > sparseListBuffer_
Definition: hyperloglog.hpp:75
void insert(const ValueType &value)
Definition: hyperloglog.hpp:47
std::vector< uint32_t > decodeSparseList(const std::vector< uint8_t > &sparseList)
list x
Definition: gen_data.py:39
int value
Definition: gen_data.py:41
static uint64_t siphash(const uint8_t key[16], const uint8_t *msg, size_t size)
Definition: siphash.hpp:240
HyperLogLogRegisterFormat format_
Definition: hyperloglog.hpp:71
std::vector< HyperLogLogSparseRegister > deltaSet_
Definition: hyperloglog.hpp:76
std::vector< uint8_t > entries_
Definition: hyperloglog.hpp:77
uint32_t encodeHash(uint64_t hash)
std::vector< uint8_t > encodeSparseList(const std::vector< uint32_t > &sparseList)
Perform a varint and a difference encoding.
HashCrc32< T > hash
Select a hashing method.
Definition: hash.hpp:262