Thrill  0.1
hash.hpp
Go to the documentation of this file.
1 /*******************************************************************************
2  * thrill/common/hash.hpp
3  *
4  * Part of Project Thrill - http://project-thrill.org
5  *
6  * Copyright (C) 2016 Lorenz Hübschle-Schneider <[email protected]>
7  *
8  * All rights reserved. Published under the BSD-2 license in the LICENSE file.
9  ******************************************************************************/
10 
11 #pragma once
12 #ifndef THRILL_COMMON_HASH_HEADER
13 #define THRILL_COMMON_HASH_HEADER
14 
15 #include <thrill/common/config.hpp>
17 
18 #include <array>
19 #include <cassert>
20 #include <cstdint>
21 #include <cstdlib>
22 #include <random>
23 #include <string>
24 #include <type_traits>
25 
26 #ifdef THRILL_HAVE_SSE4_2
27 #include <smmintrin.h> // crc32 instructions
28 #endif
29 
30 namespace thrill {
31 namespace common {
32 
33 /*
34  * A reinterpret_cast that doesn't violate the strict aliasing rule. Zero
35  * overhead with any reasonable compiler (g++ -O1 or higher, clang++ -O2 or
36  * higher)
37  */
38 template <typename To, typename From>
40  static_assert(sizeof(To) == sizeof(From),
41  "Cannot cast types of different sizes");
42  union {
43  From* in;
44  To * out;
45  };
46 };
47 
48 template <typename To, typename From>
49 To& alias_cast(From& raw_data) {
51  ac.in = &raw_data;
52  return *ac.out;
53 }
54 
55 template <typename To, typename From>
56 const To& alias_cast(const From& raw_data) {
58  ac.in = &raw_data;
59  return *ac.out;
60 }
61 
62 //! This is the Hash128to64 function from Google's cityhash (available under the
63 //! MIT License).
64 static inline uint64_t Hash128to64(const uint64_t upper, const uint64_t lower) {
65  // Murmur-inspired hashing.
66  const uint64_t k = 0x9DDFEA08EB382D69ull;
67  uint64_t a = (lower ^ upper) * k;
68  a ^= (a >> 47);
69  uint64_t b = (upper ^ a) * k;
70  b ^= (b >> 47);
71  b *= k;
72  return b;
73 }
74 
75 /*!
76  * Returns a uint32_t hash of a uint64_t.
77  *
78  * This comes from http://www.concentric.net/~ttwang/tech/inthash.htm
79  *
80  * This hash gives no guarantees on the cryptographic suitability nor the
81  * quality of randomness produced, and the mapping may change in the future.
82  */
83 static inline uint32_t Hash64to32(uint64_t key) {
84  key = (~key) + (key << 18);
85  key = key ^ (key >> 31);
86  key = key * 21;
87  key = key ^ (key >> 11);
88  key = key + (key << 6);
89  key = key ^ (key >> 22);
90  return (uint32_t)key;
91 }
92 
93 /*!
94  * Hashing helper that decides what is hashed
95  *
96  * Defaults to pointer to the object and sizeof(its type). Override these values
97  * for heap-allocated types. Some default overrides are provided.
98  */
99 template <typename T>
101  static const char * ptr(const T& x)
102  { return reinterpret_cast<const char*>(&x); }
103  static size_t size(const T&) { return sizeof(T); }
104 };
105 
106 template <>
108  static const char * ptr(const std::string& s) { return s.c_str(); }
109  static size_t size(const std::string& s) { return s.length(); }
110 };
111 
112 #ifdef THRILL_HAVE_SSE4_2
113 /**
114  * A CRC32C hasher using SSE4.2 intrinsics.
115  *
116  * Note that you need to provide specializations of HashDataSwitch if you want
117  * to hash types with heap storage.
118  */
119 template <typename ValueType>
120 struct HashCrc32Sse42 {
121  // Hash data with Intel's CRC32C instructions
122  // Copyright 2008,2009,2010 Massachusetts Institute of Technology.
123  // For constant sizes, this is neatly optimized away at higher optimization
124  // levels - only a mov (for initialization) and crc32 instructions remain
125  uint32_t hash_bytes(const void* data, size_t length, uint32_t crc = 0xffffffff) const {
126  const char* p_buf = (const char*)data;
127  // The 64-bit crc32 instruction returns a 64-bit value (even though a
128  // CRC32 hash has - well - 32 bits. Whatever.
129  uint64_t crc_carry = crc;
130  for (size_t i = 0; i < length / sizeof(uint64_t); i++) {
131  crc_carry = _mm_crc32_u64(crc_carry, *(const uint64_t*)p_buf);
132  p_buf += sizeof(uint64_t);
133  }
134  crc = (uint32_t)crc_carry; // discard the rest
135  length &= 7; // remaining length
136 
137  // ugly switch statement, faster than a loop-based version
138  switch (length) {
139  case 7:
140  crc = _mm_crc32_u8(crc, *p_buf++);
142  case 6:
143  crc = _mm_crc32_u16(crc, *(const uint16_t*)p_buf);
144  p_buf += 2;
146  // case 5 is below: 4 + 1
147  case 4:
148  crc = _mm_crc32_u32(crc, *(const uint32_t*)p_buf);
149  break;
150  case 3:
151  crc = _mm_crc32_u8(crc, *p_buf++);
153  case 2:
154  crc = _mm_crc32_u16(crc, *(const uint16_t*)p_buf);
155  break;
156  case 5:
157  crc = _mm_crc32_u32(crc, *(const uint32_t*)p_buf);
158  p_buf += 4;
160  case 1:
161  crc = _mm_crc32_u8(crc, *p_buf);
162  break;
163  case 0:
164  break;
165  default: // wat
166  assert(false);
167  }
168  return crc;
169  }
170 
171  uint32_t operator () (const ValueType& val, uint32_t crc = 0xffffffff) const {
172  const char* ptr = HashDataSwitch<ValueType>::ptr(val);
173  size_t size = HashDataSwitch<ValueType>::size(val);
174  return hash_bytes(ptr, size, crc);
175  }
176 };
177 #endif
178 
179 // CRC32C, adapted from Evan Jones' BSD-licensed implementation at
180 // http://www.evanjones.ca/crc32c.html
181 uint32_t crc32_slicing_by_8(uint32_t crc, const void* data, size_t length);
182 
183 /**
184  * Fallback CRC32C implementation in software.
185  *
186  * Note that you need to provide specializations of HashDataSwitch if you want to
187  * hash types with heap storage.
188  */
189 template <typename ValueType>
191  uint32_t operator () (const ValueType& val, uint32_t crc = 0xffffffff) const {
192  const char* ptr = HashDataSwitch<ValueType>::ptr(val);
193  size_t size = HashDataSwitch<ValueType>::size(val);
194  return crc32_slicing_by_8(crc, ptr, size);
195  }
196 };
197 
198 // If SSE4.2 is available, use the hardware implementation, which is roughly
199 // four to five times faster than the software fallback (less for small sizes).
200 #ifdef THRILL_HAVE_SSE4_2
201 template <typename T>
202 using HashCrc32 = HashCrc32Sse42<T>;
203 #else
204 template <typename T>
206 #endif
207 
208 /*!
209  * Tabulation Hashing, see https://en.wikipedia.org/wiki/Tabulation_hashing
210  *
211  * Keeps a table with size * 256 entries of type hash_t, filled with random
212  * values. Elements are hashed by treating them as a vector of 'size' bytes,
213  * and XOR'ing the values in the data[i]-th position of the i-th table, with i
214  * ranging from 0 to size - 1.
215  */
216 
217 template <size_t size, typename hash_t = uint32_t,
218  typename prng_t = std::mt19937>
220 {
221 public:
222  using hash_type = hash_t; // make public
223  using prng_type = prng_t;
224  using Subtable = std::array<hash_type, 256>;
225  using Table = std::array<Subtable, size>;
226 
227  explicit TabulationHashing(size_t seed = 0) { init(seed); }
228 
229  //! (re-)initialize the table by filling it with random values
230  void init(const size_t seed) {
231  prng_t rng { seed };
232  for (size_t i = 0; i < size; ++i) {
233  for (size_t j = 0; j < 256; ++j) {
234  table_[i][j] = rng();
235  }
236  }
237  }
238 
239  //! Hash an element
240  template <typename T>
241  hash_type operator () (const T& x) const {
242  static_assert(sizeof(T) == size, "Size mismatch with operand type");
243 
244  hash_t hash = 0;
245  const uint8_t* ptr = reinterpret_cast<const uint8_t*>(&x);
246  for (size_t i = 0; i < size; ++i) {
247  hash ^= table_[i][*(ptr + i)];
248  }
249  return hash;
250  }
251 
252 protected:
254 };
255 
256 //! Tabulation hashing
257 template <typename T>
259 
260 //! Select a hashing method.
261 template <typename T>
263 
264 } // namespace common
265 } // namespace thrill
266 
267 #endif // !THRILL_COMMON_HASH_HEADER
268 
269 /******************************************************************************/
Tabulation Hashing, see https://en.wikipedia.org/wiki/Tabulation_hashing.
Definition: hash.hpp:219
void init(const size_t seed)
(re-)initialize the table by filling it with random values
Definition: hash.hpp:230
double T
TabulationHashing(size_t seed=0)
Definition: hash.hpp:227
static size_t size(const T &)
Definition: hash.hpp:103
#define TLX_ATTRIBUTE_FALLTHROUGH
STL namespace.
std::array< hash_type, 256 > Subtable
Definition: hash.hpp:224
static const char * ptr(const T &x)
Definition: hash.hpp:101
std::array< Subtable, size > Table
Definition: hash.hpp:225
static size_t size(const std::string &s)
Definition: hash.hpp:109
Hashing helper that decides what is hashed.
Definition: hash.hpp:100
static uint32_t Hash64to32(uint64_t key)
Returns a uint32_t hash of a uint64_t.
Definition: hash.hpp:83
list x
Definition: gen_data.py:39
uint64_t ull() const
return the number as an uint64_t (unsigned long long)
Definition: uint_types.hpp:128
static uint64_t Hash128to64(const uint64_t upper, const uint64_t lower)
Definition: hash.hpp:64
std::basic_string< char, std::char_traits< char >, Allocator< char > > string
string with Manager tracking
Definition: allocator.hpp:220
unsigned seed
HashCrc32< T > hash
Select a hashing method.
Definition: hash.hpp:262
To & alias_cast(From &raw_data)
Definition: hash.hpp:49
static const char * ptr(const std::string &s)
Definition: hash.hpp:108
uint32_t crc32_slicing_by_8(uint32_t crc, const void *data, size_t length)
Definition: hash.cpp:489