Thrill  0.1
dc7.cpp
Go to the documentation of this file.
1 /*******************************************************************************
2  * examples/suffix_sorting/dc7.cpp
3  *
4  * Part of Project Thrill - http://project-thrill.org
5  *
6  * Copyright (C) 2015-2016 Timo Bingmann <[email protected]>
7  *
8  * All rights reserved. Published under the BSD-2 license in the LICENSE file.
9  ******************************************************************************/
10 
13 
15 #include <thrill/api/cache.hpp>
16 #include <thrill/api/collapse.hpp>
17 #include <thrill/api/dia.hpp>
18 #include <thrill/api/gather.hpp>
19 #include <thrill/api/max.hpp>
20 #include <thrill/api/merge.hpp>
22 #include <thrill/api/print.hpp>
23 #include <thrill/api/size.hpp>
24 #include <thrill/api/sort.hpp>
25 #include <thrill/api/sum.hpp>
26 #include <thrill/api/union.hpp>
27 #include <thrill/api/window.hpp>
32 #include <tlx/cmdline_parser.hpp>
33 
34 #include <algorithm>
35 #include <iomanip>
36 #include <limits>
37 #include <random>
38 #include <stdexcept>
39 #include <string>
40 #include <tuple>
41 #include <utility>
42 #include <vector>
43 
44 namespace examples {
45 namespace suffix_sorting {
46 namespace dc7_local {
47 
48 //! A tuple with index (i,t_i,t_{i+1},t_{i+2},...,t_{i+6}).
49 template <typename AlphabetType>
50 struct Chars {
51  AlphabetType ch[7];
52 
53  bool operator == (const Chars& b) const {
54  return std::tie(ch[0], ch[1], ch[2], ch[3], ch[4], ch[5], ch[6])
55  == std::tie(b.ch[0], b.ch[1], b.ch[2], b.ch[3],
56  b.ch[4], b.ch[5], b.ch[6]);
57  }
58 
59  bool operator < (const Chars& b) const {
60  return std::tie(ch[0], ch[1], ch[2], ch[3], ch[4], ch[5], ch[6])
61  < std::tie(b.ch[0], b.ch[1], b.ch[2], b.ch[3],
62  b.ch[4], b.ch[5], b.ch[6]);
63  }
64 
65  friend std::ostream& operator << (std::ostream& os, const Chars& chars) {
66  return os << '[' << chars.ch[0] << ',' << chars.ch[1]
67  << ',' << chars.ch[2] << ',' << chars.ch[3]
68  << ',' << chars.ch[4] << ',' << chars.ch[5]
69  << ',' << chars.ch[6] << ']';
70  }
71 
72  static Chars Lowest() {
73  return Chars {
74  {
75  std::numeric_limits<AlphabetType>::lowest(),
76  std::numeric_limits<AlphabetType>::lowest(),
77  std::numeric_limits<AlphabetType>::lowest(),
78  std::numeric_limits<AlphabetType>::lowest(),
79  std::numeric_limits<AlphabetType>::lowest(),
80  std::numeric_limits<AlphabetType>::lowest(),
81  std::numeric_limits<AlphabetType>::lowest()
82  }
83  };
84  }
86 
87 //! A tuple with index (i,t_i,t_{i+1},t_{i+2}).
88 template <typename Index, typename AlphabetType>
89 struct IndexChars {
90  Index index;
91  Chars<AlphabetType> chars;
92 
93  AlphabetType at_radix(size_t depth) const { return chars.ch[depth]; }
94 
95  friend std::ostream& operator << (std::ostream& os, const IndexChars& tc) {
96  return os << '[' << tc.index << '|' << tc.chars << ']';
97  }
99 
100 //! A pair (index, rank)
101 template <typename Index>
102 struct IndexRank {
103  Index index;
104  Index rank;
105 
106  friend std::ostream& operator << (std::ostream& os, const IndexRank& tr) {
107  return os << '(' << tr.index << '|' << tr.rank << ')';
108  }
110 
111 //! fragments at string positions i = 0 mod 7.
112 template <typename Index, typename AlphabetType>
113 struct StringFragmentMod0 {
114  Index index;
115  Index r0, r1, r3;
116  AlphabetType t[3];
117 
118  friend std::ostream& operator << (std::ostream& os,
119  const StringFragmentMod0& sf) {
120  return os << "i=" << sf.index
121  << " t0=" << sf.t[0] << " t1=" << sf.t[1] << " t2=" << sf.t[2]
122  << " r0=" << sf.r0 << " r1=" << sf.r1 << " r3=" << sf.r3;
123  }
125 
126 //! fragments at string positions i = 1 mod 7.
127 template <typename Index, typename AlphabetType>
128 struct StringFragmentMod1 {
129  Index index;
130  Index r0, r2, r6;
131  AlphabetType t[6];
132 
133  friend std::ostream& operator << (std::ostream& os,
134  const StringFragmentMod1& sf) {
135  return os << "i=" << sf.index
136  << " t0=" << sf.t[0] << " t1=" << sf.t[1] << " t2=" << sf.t[2]
137  << " t3=" << sf.t[3] << " t4=" << sf.t[4] << " t5=" << sf.t[5]
138  << " r0=" << sf.r0 << " r2=" << sf.r2 << " r6=" << sf.r6;
139  }
141 
142 //! fragments at string positions i = 2 mod 7.
143 template <typename Index, typename AlphabetType>
144 struct StringFragmentMod2 {
145  Index index;
146  Index r1, r5, r6;
147  AlphabetType t[6];
148 
149  friend std::ostream& operator << (std::ostream& os,
150  const StringFragmentMod2& sf) {
151  return os << "i=" << sf.index
152  << " t0=" << sf.t[0] << " t1=" << sf.t[1] << " t2=" << sf.t[2]
153  << " t3=" << sf.t[3] << " t4=" << sf.t[4] << " t5=" << sf.t[5]
154  << " r1=" << sf.r1 << " r5=" << sf.r5 << " r6=" << sf.r6;
155  }
157 
158 //! fragments at string positions i = 3 mod 7.
159 template <typename Index, typename AlphabetType>
160 struct StringFragmentMod3 {
161  Index index;
162  Index r0, r4, r5;
163  AlphabetType t[5];
164 
165  friend std::ostream& operator << (std::ostream& os,
166  const StringFragmentMod3& sf) {
167  return os << "i=" << sf.index
168  << " t0=" << sf.t[0] << " t1=" << sf.t[1] << " t2=" << sf.t[2]
169  << " t3=" << sf.t[3] << " t4=" << sf.t[4]
170  << " r0=" << sf.r0 << " r4=" << sf.r4 << " r5=" << sf.r5;
171  }
173 
174 //! fragments at string positions i = 4 mod 7.
175 template <typename Index, typename AlphabetType>
176 struct StringFragmentMod4 {
177  Index index;
178  Index r3, r4, r6;
179  AlphabetType t[6];
180 
181  friend std::ostream& operator << (std::ostream& os,
182  const StringFragmentMod4& sf) {
183  return os << "i=" << sf.index
184  << " t0=" << sf.t[0] << " t1=" << sf.t[1] << " t2=" << sf.t[2]
185  << " t3=" << sf.t[3] << " t4=" << sf.t[4] << " t5=" << sf.t[5]
186  << " r3=" << sf.r3 << " r4=" << sf.r4 << " r6=" << sf.r6;
187  }
189 
190 //! fragments at string positions i = 5 mod 7.
191 template <typename Index, typename AlphabetType>
192 struct StringFragmentMod5 {
193  Index index;
194  Index r2, r3, r5;
195  AlphabetType t[5];
196 
197  friend std::ostream& operator << (std::ostream& os,
198  const StringFragmentMod5& sf) {
199  return os << "i=" << sf.index
200  << " t0=" << sf.t[0] << " t1=" << sf.t[1] << " t2=" << sf.t[2]
201  << " t3=" << sf.t[3] << " t4=" << sf.t[4]
202  << " r2=" << sf.r2 << " r3=" << sf.r3 << " r5=" << sf.r5;
203  }
205 
206 //! fragments at string positions i = 6 mod 7.
207 template <typename Index, typename AlphabetType>
208 struct StringFragmentMod6 {
209  Index index;
210  Index r1, r2, r4;
211  AlphabetType t[4];
212 
213  friend std::ostream& operator << (std::ostream& os,
214  const StringFragmentMod6& sf) {
215  return os << "i=" << sf.index
216  << " t0=" << sf.t[0] << " t1=" << sf.t[1] << " t2=" << sf.t[2]
217  << " t3=" << sf.t[3]
218  << " r1=" << sf.r1 << " r2=" << sf.r2 << " r4=" << sf.r4;
219  }
221 
222 //! Union of String Fragments with Index
223 template <typename Index, typename AlphabetType>
224 struct StringFragment {
225 
226  struct Common {
227  Index index;
228  Index ranks[3];
229  AlphabetType text[6];
231 
232  union {
233  Index index;
234  Common common;
235  StringFragmentMod0<Index, AlphabetType> mod0;
236  StringFragmentMod1<Index, AlphabetType> mod1;
237  StringFragmentMod2<Index, AlphabetType> mod2;
238  StringFragmentMod3<Index, AlphabetType> mod3;
239  StringFragmentMod4<Index, AlphabetType> mod4;
240  StringFragmentMod5<Index, AlphabetType> mod5;
241  StringFragmentMod6<Index, AlphabetType> mod6;
243 
244  StringFragment() = default;
245 
246  // conversion from StringFragmentMod0
247  explicit StringFragment(
248  const StringFragmentMod0<Index, AlphabetType>& _mod0) : mod0(_mod0) { }
249 
250  // conversion from StringFragmentMod1
251  explicit StringFragment(
252  const StringFragmentMod1<Index, AlphabetType>& _mod1) : mod1(_mod1) { }
253 
254  // conversion from StringFragmentMod2
255  explicit StringFragment(
256  const StringFragmentMod2<Index, AlphabetType>& _mod2) : mod2(_mod2) { }
257 
258  // conversion from StringFragmentMod3
259  explicit StringFragment(
260  const StringFragmentMod3<Index, AlphabetType>& _mod3) : mod3(_mod3) { }
261 
262  // conversion from StringFragmentMod4
263  explicit StringFragment(
264  const StringFragmentMod4<Index, AlphabetType>& _mod4) : mod4(_mod4) { }
265 
266  // conversion from StringFragmentMod5
267  explicit StringFragment(
268  const StringFragmentMod5<Index, AlphabetType>& _mod5) : mod5(_mod5) { }
269 
270  // conversion from StringFragmentMod6
271  explicit StringFragment(
272  const StringFragmentMod6<Index, AlphabetType>& _mod6) : mod6(_mod6) { }
273 
274  friend std::ostream& operator << (std::ostream& os,
275  const StringFragment& tc) {
276  os << '[' << std::to_string(tc.index) << '|';
277  if (tc.index % 7 == 0)
278  return os << "0|" << tc.mod0 << ']';
279  else if (tc.index % 7 == 1)
280  return os << "1|" << tc.mod1 << ']';
281  else if (tc.index % 7 == 2)
282  return os << "2|" << tc.mod2 << ']';
283  else if (tc.index % 7 == 3)
284  return os << "3|" << tc.mod3 << ']';
285  else if (tc.index % 7 == 4)
286  return os << "4|" << tc.mod4 << ']';
287  else if (tc.index % 7 == 5)
288  return os << "5|" << tc.mod5 << ']';
289  else if (tc.index % 7 == 6)
290  return os << "6|" << tc.mod6 << ']';
291  abort();
292  }
293 
294  AlphabetType at_radix(size_t depth) const { return common.text[depth]; }
295  Index sort_rank() const { return common.ranks[0]; }
297 
298 static constexpr size_t fragment_comparator_params[7][7][3] =
299 {
300  {
301  { 0, 0, 0 }, { 0, 0, 0 }, { 1, 1, 0 }, { 0, 0, 0 },
302  { 3, 2, 0 }, { 3, 2, 1 }, { 1, 1, 0 }
303  },
304  {
305  { 0, 0, 0 }, { 0, 0, 0 }, { 6, 2, 2 }, { 0, 0, 0 },
306  { 6, 2, 2 }, { 2, 1, 0 }, { 2, 1, 1 }
307  },
308  {
309  { 1, 0, 1 }, { 6, 2, 2 }, { 1, 0, 0 }, { 5, 1, 2 },
310  { 6, 2, 2 }, { 5, 1, 2 }, { 1, 0, 0 }
311  },
312  {
313  { 0, 0, 0 }, { 0, 0, 0 }, { 5, 2, 1 }, { 0, 0, 0 },
314  { 4, 1, 1 }, { 5, 2, 2 }, { 4, 1, 2 }
315  },
316  {
317  { 3, 0, 2 }, { 6, 2, 2 }, { 6, 2, 2 }, { 4, 1, 1 },
318  { 3, 0, 0 }, { 3, 0, 1 }, { 4, 1, 2 }
319  },
320  {
321  { 3, 1, 2 }, { 2, 0, 1 }, { 5, 2, 1 }, { 5, 2, 2 },
322  { 3, 1, 0 }, { 2, 0, 0 }, { 2, 0, 1 }
323  },
324  {
325  { 1, 0, 1 }, { 2, 1, 1 }, { 1, 0, 0 }, { 4, 2, 1 },
326  { 4, 2, 1 }, { 2, 1, 0 }, { 1, 0, 0 }
327  },
328 };
329 
330 template <typename StringFragment>
331 struct FragmentComparator {
332 
333  bool operator () (const StringFragment& a, const StringFragment& b) const {
334 
335  unsigned ai = a.index % 7, bi = b.index % 7;
336 
337  const size_t* params = fragment_comparator_params[ai][bi];
338 
339  for (size_t d = 0; d < params[0]; ++d)
340  {
341  if (a.common.text[d] == b.common.text[d]) continue;
342  return (a.common.text[d] < b.common.text[d]);
343  }
344 
345  return a.common.ranks[params[1]] < b.common.ranks[params[2]];
346  }
347 };
348 
349 template <typename Index, typename Char>
350 struct CharsRanks013 {
351  Chars<Char> chars;
352  Index rank0;
353  Index rank1;
354  Index rank3;
355 
356  friend std::ostream& operator << (std::ostream& os,
357  const CharsRanks013& c) {
358  return os << "(ch=" << c.chars
359  << " r0=" << c.rank0 << " r1=" << c.rank1
360  << " r3=" << c.rank3 << ")";
361  }
363 
364 template <typename Index, typename Char>
365 struct IndexCR013Pair {
366  Index index;
367  CharsRanks013<Index, Char> cr0;
368  CharsRanks013<Index, Char> cr1;
370 
371 template <typename Type, size_t MaxDepth>
372 class RadixSortFragment
373 {
374 public:
375  explicit RadixSortFragment(size_t K) : K_(K) { }
376  template <typename CompareFunction>
377  void operator () (
378  typename std::vector<Type>::iterator begin,
379  typename std::vector<Type>::iterator end,
380  const CompareFunction& cmp) const {
381  if (K_ <= 4096) {
382  thrill::common::radix_sort_CI<MaxDepth>(
383  begin, end, K_, cmp, [](auto begin, auto end, auto) {
384  // sub sorter: sort StringFragments by rank
385  std::sort(begin, end, [](const Type& a, const Type& b) {
386  return a.sort_rank() < b.sort_rank();
387  });
388  });
389  }
390  else {
391  std::sort(begin, end, cmp);
392  }
393  }
394 
395 private:
396  const size_t K_;
397 };
398 
399 } // namespace dc7_local
400 
401 using namespace thrill; // NOLINT
403 
404 static inline bool IsDiffCover7(size_t i) {
405  size_t m = i % 7;
406  return m == 0 || m == 1 || m == 3;
407 }
408 
409 template <typename Index, typename InputDIA>
411 DC7Recursive(const InputDIA& input_dia, size_t input_size, size_t K) {
412 
413  using Char = typename InputDIA::ValueType;
414  using IndexChars = dc7_local::IndexChars<Index, Char>;
415  using IndexRank = dc7_local::IndexRank<Index>;
416  using Chars = dc7_local::Chars<Char>;
417 
418  Context& ctx = input_dia.context();
419 
420  auto tuple_sorted =
421  input_dia.Keep()
422  // map (t_i) -> (i,t_i,t_{i+1},...,t_{i+6}) where i = 0,1,3 mod 7
423  .template FlatWindow<IndexChars>(
424  7,
425  [](size_t index, const RingBuffer<Char>& r, auto emit) {
426  if (IsDiffCover7(index))
427  emit(IndexChars {
428  Index(index), {
429  { r[0], r[1], r[2], r[3], r[4], r[5], r[6] }
430  }
431  });
432  },
433  [input_size](size_t index, const RingBuffer<Char>& r, auto emit) {
434  // emit last sentinel items.
435  if (IsDiffCover7(index)) {
436  emit(IndexChars {
437  Index(index), {
438  { r.size() >= 1 ? r[0] : Char(),
439  r.size() >= 2 ? r[1] : Char(),
440  r.size() >= 3 ? r[2] : Char(),
441  r.size() >= 4 ? r[3] : Char(),
442  r.size() >= 5 ? r[4] : Char(),
443  r.size() >= 6 ? r[5] : Char(),
444  Char() }
445  }
446  });
447  }
448 
449  if (index + 1 == input_size && input_size % 7 == 0) {
450  // emit a sentinel tuple for inputs n % 7 == 0 to
451  // separate mod0 and mod1 strings in recursive
452  // subproblem. example which needs this: aaaaaaaaaa.
453  emit(IndexChars { Index(input_size), Chars::Lowest() });
454  }
455  if (index + 1 == input_size && input_size % 7 == 1) {
456  // emit a sentinel tuple for inputs n % 7 == 1 to
457  // separate mod1 and mod3 strings in recursive
458  // subproblem. example which needs this: aaaaaaaaaa.
459  emit(IndexChars { Index(input_size), Chars::Lowest() });
460  }
461  })
462  // sort tuples by contained letters
463  .Sort([](const IndexChars& a, const IndexChars& b) {
464  return a.chars < b.chars;
466 
467  if (debug_print)
468  tuple_sorted.Keep().Print("tuple_sorted");
469 
470  // save tuple's indexes (sorted by tuple content) -> less storage
471  auto tuple_index_sorted =
472  tuple_sorted
473  .Map([](const IndexChars& tc) { return tc.index; })
474  .Cache();
475 
476  auto tuple_prerank_sums =
477  tuple_sorted
478  .template FlatWindow<Index>(
479  2, [](size_t index, const RingBuffer<IndexChars>& rb, auto emit) {
480  assert(rb.size() == 2);
481 
482  // emit one sentinel for index 0.
483  if (index == 0) emit(0);
484 
485  // emit 0 or 1 depending on whether previous tuple is equal
486  emit(rb[0].chars == rb[1].chars ? 0 : 1);
487  })
488  .PrefixSum();
489 
490  if (debug_print)
491  tuple_prerank_sums.Keep().Print("tuple_prerank_sums");
492 
493  // get the last element via an associative reduce.
494  const Index max_lexname = tuple_prerank_sums.Keep().Max();
495 
496  // size of the mod0 part of the recursive subproblem
497  const Index size_mod0 = Index(input_size / 7 + 1);
498 
499  // size of the mod1 part of the recursive subproblem
500  const Index size_mod1 = Index(input_size / 7 + (input_size % 7 >= 1));
501 
502  // size of the mod3 part of the recursive subproblem
503  const Index size_mod3 = Index(input_size / 7 + (input_size % 7 >= 4));
504 
505  // size of both the mod0 and mod1 parts
506  const Index size_mod01 = size_mod0 + size_mod1;
507 
508  // compute the size of the 3/7 subproblem.
509  const Index size_subp = size_mod01 + size_mod3;
510 
511  if (debug_print) {
512  sLOG1 << "max_lexname=" << max_lexname
513  << " size_subp=" << size_subp
514  << " size_mod0=" << size_mod0
515  << " size_mod1=" << size_mod1
516  << " size_mod3=" << size_mod3;
517  }
518 
519  if (debug_print) {
520  assert_equal(
521  tuple_sorted.Keep().Filter([](const IndexChars& a) {
522  return a.index % 7 == 0;
523  }).Size(), size_mod0);
524 
525  assert_equal(
526  tuple_sorted.Keep().Filter([](const IndexChars& a) {
527  return a.index % 7 == 1;
528  }).Size(), size_mod1);
529 
530  assert_equal(
531  tuple_sorted.Keep().Filter([](const IndexChars& a) {
532  return a.index % 7 == 3;
533  }).Size(), size_mod3);
534  }
535 
536  assert_equal(tuple_index_sorted.Keep().Size(), size_subp);
537 
538  DIA<IndexRank> ranks_mod013;
539 
540  if (max_lexname + Index(1) != size_subp) {
541  // some lexical name is not unique -> perform recursion on three
542  // substrings (mod 0, mod 1, and mod 3)
543 
544  // zip tuples and ranks.
545  auto tuple_ranks =
546  tuple_index_sorted
548  tuple_prerank_sums,
549  [](const Index& tuple_index, const Index& rank) {
550  return IndexRank { tuple_index, rank };
551  });
552 
553  if (debug_print)
554  tuple_ranks.Keep().Print("tuple_ranks");
555 
556  // construct recursion string with all ranks at mod 0 indices followed
557  // by all ranks at mod 1 indices followed by all ranks at mod 3 indices.
558  DIA<Index> string_mod013 =
559  tuple_ranks
560  .Sort([](const IndexRank& a, const IndexRank& b) {
561  if (a.index % 7 == b.index % 7)
562  return a.index < b.index;
563  else
564  return a.index % 7 < b.index % 7;
565  })
566  .Map([](const IndexRank& tr) {
567  return tr.rank;
568  })
569  .Cache();
570 
571  if (debug_print)
572  string_mod013.Keep().Print("string_mod013");
573 
574  assert_equal(string_mod013.Keep().Size(), size_subp);
575 
576  using RecStringFragment = dc7_local::StringFragment<Index, Index>;
577 
578  DIA<RecStringFragment> suffix_array_rec = DC7Recursive<Index>(
579  string_mod013, size_subp, max_lexname + Index(1));
580 
581  // reverse suffix array of recursion strings to find ranks for mod 0,
582  // mod 1, and mod 3 positions.
583 
584  if (debug_print)
585  suffix_array_rec.Keep().Print("suffix_array_rec");
586 
587  ranks_mod013 =
588  suffix_array_rec
589  .ZipWithIndex([](const RecStringFragment& sa, const size_t& i) {
590  // add one to ranks such that zero can be used as sentinel
591  // for suffixes beyond the end of the string.
592  return IndexRank { sa.index, Index(i + 1) };
593  })
594  .Sort([size_mod0, size_mod01](
595  const IndexRank& a, const IndexRank& b) {
596 
597  Index ai = (a.index < size_mod0 ? a.index :
598  a.index < size_mod01 ? a.index - size_mod0 :
599  a.index - size_mod01);
600 
601  Index bi = (b.index < size_mod0 ? b.index :
602  b.index < size_mod01 ? b.index - size_mod0 :
603  b.index - size_mod01);
604 
605  // use sort order to interleave ranks mod 0/1/3
606  return ai < bi || (ai == bi && a.index < b.index);
607  });
608 
609  if (debug_print) {
610  // check that ranks are correctly interleaved
611  ranks_mod013.Keep()
612  .Window(
613  DisjointTag, 3,
614  [size_mod0, size_mod01](size_t, const std::vector<IndexRank>& ir) {
615  die_unless(ir[0].index < size_mod0);
616  die_unless(ir[1].index >= size_mod0 || ir[1].rank < size_mod01);
617  die_unless(ir[2].index >= size_mod01);
618  return true;
619  })
620  .Execute();
621  ranks_mod013.Keep().Print("ranks_rec");
622  }
623  }
624  else {
625  if (ctx.my_rank() == 0)
626  sLOG1 << "*** recursion finished ***";
627 
628  if (debug_print)
629  tuple_index_sorted.Keep().Print("tuple_index_sorted");
630 
631  ranks_mod013 =
632  tuple_index_sorted
633  .ZipWithIndex(
634  [](const Index& sa, const size_t& i) {
635  // add one to ranks such that zero can be used as sentinel
636  // for suffixes beyond the end of the string.
637  return IndexRank { sa, Index(i + 1) };
638  })
639  .Sort([](const IndexRank& a, const IndexRank& b) {
640  // use sort order for better locality later.
641  return a.index / 7 < b.index / 7 || (
642  a.index / 7 == b.index / 7 &&
643  a.index < b.index);
644  });
645 
646  if (debug_print) {
647  // check that ranks are correctly interleaved
648  ranks_mod013.Keep()
649  .Window(
650  DisjointTag, 3,
651  [](size_t, const std::vector<IndexRank>& ir) {
652  die_unless(ir[0].index % 7 == 0);
653  die_unless(ir[1].index % 7 == 1);
654  die_unless(ir[2].index % 7 == 3);
655  return true;
656  })
657  .Execute();
658  ranks_mod013.Keep().Print("ranks_rec");
659  }
660  }
661 
662  // *** construct StringFragments ***
663 
664  // Zip together the three arrays, create pairs, and extract needed
665  // tuples into string fragments.
666 
667  using StringFragmentMod0 = dc7_local::StringFragmentMod0<Index, Char>;
668  using StringFragmentMod1 = dc7_local::StringFragmentMod1<Index, Char>;
669  using StringFragmentMod2 = dc7_local::StringFragmentMod2<Index, Char>;
670  using StringFragmentMod3 = dc7_local::StringFragmentMod3<Index, Char>;
671  using StringFragmentMod4 = dc7_local::StringFragmentMod4<Index, Char>;
672  using StringFragmentMod5 = dc7_local::StringFragmentMod5<Index, Char>;
673  using StringFragmentMod6 = dc7_local::StringFragmentMod6<Index, Char>;
674 
675  using CharsRanks013 = dc7_local::CharsRanks013<Index, Char>;
676  using IndexCR013Pair = dc7_local::IndexCR013Pair<Index, Char>;
677 
678  auto zip_tuple_pairs1 =
679  ZipWindow(
680  ArrayTag, PadTag, /* window_size */ {
681  { 7, 3 }
682  },
683  [](const std::array<Char, 7>& ch, const std::array<IndexRank, 3>& mod013) {
684  return CharsRanks013 {
685  {
686  { ch[0], ch[1], ch[2], ch[3], ch[4], ch[5], ch[6] }
687  },
688  mod013[0].rank, mod013[1].rank, mod013[2].rank
689  };
690  },
691  std::make_tuple(std::numeric_limits<Char>::lowest(), IndexRank { 0, 0 }),
692  input_dia, ranks_mod013);
693 
694  if (debug_print)
695  zip_tuple_pairs1.Keep().Print("zip_tuple_pairs1");
696 
697  auto zip_tuple_pairs =
698  zip_tuple_pairs1
699  .template FlatWindow<IndexCR013Pair>(
700  2, [size_mod0](
701  size_t index, const RingBuffer<CharsRanks013>& rb, auto emit) {
702  emit(IndexCR013Pair { Index(7 * index), rb[0], rb[1] });
703  if (index + 2 == size_mod0) {
704  // emit last sentinel
705  emit(IndexCR013Pair {
706  Index(7 * (index + 1)), rb[1],
707  CharsRanks013 { Chars::Lowest(), 0, 0, 0 }
708  });
709  }
710  });
711 
712  auto fragments_mod0 =
713  zip_tuple_pairs
714  .Map([](const IndexCR013Pair& ip) {
715  return StringFragmentMod0 {
716  ip.index,
717  ip.cr0.rank0, ip.cr0.rank1, ip.cr0.rank3,
718  { ip.cr0.chars.ch[0], ip.cr0.chars.ch[1],
719  ip.cr0.chars.ch[2] }
720  };
721  })
722  .Filter([input_size](const StringFragmentMod0& mod0) {
723  return mod0.index < Index(input_size);
724  });
725 
726  auto fragments_mod1 =
727  zip_tuple_pairs
728  .Map([](const IndexCR013Pair& ip) {
729  return StringFragmentMod1 {
730  ip.index + Index(1),
731  ip.cr0.rank1, ip.cr0.rank3, ip.cr1.rank0,
732  { ip.cr0.chars.ch[1], ip.cr0.chars.ch[2],
733  ip.cr0.chars.ch[3], ip.cr0.chars.ch[4],
734  ip.cr0.chars.ch[5], ip.cr0.chars.ch[6] }
735  };
736  })
737  .Filter([input_size](const StringFragmentMod1& mod1) {
738  return mod1.index < Index(input_size);
739  });
740 
741  auto fragments_mod2 =
742  zip_tuple_pairs
743  .Map([](const IndexCR013Pair& ip) {
744  return StringFragmentMod2 {
745  ip.index + Index(2),
746  ip.cr0.rank3, ip.cr1.rank0, ip.cr1.rank1,
747  { ip.cr0.chars.ch[2], ip.cr0.chars.ch[3],
748  ip.cr0.chars.ch[4], ip.cr0.chars.ch[5],
749  ip.cr0.chars.ch[6], ip.cr1.chars.ch[0] }
750  };
751  })
752  .Filter([input_size](const StringFragmentMod2& mod2) {
753  return mod2.index < Index(input_size);
754  });
755 
756  auto fragments_mod3 =
757  zip_tuple_pairs
758  .Map([](const IndexCR013Pair& ip) {
759  return StringFragmentMod3 {
760  ip.index + Index(3),
761  ip.cr0.rank3, ip.cr1.rank0, ip.cr1.rank1,
762  { ip.cr0.chars.ch[3], ip.cr0.chars.ch[4],
763  ip.cr0.chars.ch[5], ip.cr0.chars.ch[6],
764  ip.cr1.chars.ch[0] }
765  };
766  })
767  .Filter([input_size](const StringFragmentMod3& mod3) {
768  return mod3.index < Index(input_size);
769  });
770 
771  auto fragments_mod4 =
772  zip_tuple_pairs
773  .Map([](const IndexCR013Pair& ip) {
774  return StringFragmentMod4 {
775  ip.index + Index(4),
776  ip.cr1.rank0, ip.cr1.rank1, ip.cr1.rank3,
777  { ip.cr0.chars.ch[4], ip.cr0.chars.ch[5],
778  ip.cr0.chars.ch[6], ip.cr1.chars.ch[0],
779  ip.cr1.chars.ch[1], ip.cr1.chars.ch[2] }
780  };
781  })
782  .Filter([input_size](const StringFragmentMod4& mod4) {
783  return mod4.index < Index(input_size);
784  });
785 
786  auto fragments_mod5 =
787  zip_tuple_pairs
788  .Map([](const IndexCR013Pair& ip) {
789  return StringFragmentMod5 {
790  ip.index + Index(5),
791  ip.cr1.rank0, ip.cr1.rank1, ip.cr1.rank3,
792  { ip.cr0.chars.ch[5], ip.cr0.chars.ch[6],
793  ip.cr1.chars.ch[0], ip.cr1.chars.ch[1],
794  ip.cr1.chars.ch[2] }
795  };
796  })
797  .Filter([input_size](const StringFragmentMod5& mod5) {
798  return mod5.index < Index(input_size);
799  });
800 
801  auto fragments_mod6 =
802  zip_tuple_pairs
803  .Map([](const IndexCR013Pair& ip) {
804  return StringFragmentMod6 {
805  ip.index + Index(6),
806  ip.cr1.rank0, ip.cr1.rank1, ip.cr1.rank3,
807  { ip.cr0.chars.ch[6], ip.cr1.chars.ch[0],
808  ip.cr1.chars.ch[1], ip.cr1.chars.ch[2] }
809  };
810  })
811  .Filter([input_size](const StringFragmentMod6& mod6) {
812  return mod6.index < Index(input_size);
813  });
814 
815  if (debug_print) {
816  fragments_mod0.Keep().Print("fragments_mod0");
817  fragments_mod1.Keep().Print("fragments_mod1");
818  fragments_mod2.Keep().Print("fragments_mod2");
819  fragments_mod3.Keep().Print("fragments_mod3");
820  fragments_mod4.Keep().Print("fragments_mod4");
821  fragments_mod5.Keep().Print("fragments_mod5");
822  fragments_mod6.Keep().Print("fragments_mod6");
823  }
824 
825  // Sort/Merge and map to only suffix array
826 
827  using StringFragment = dc7_local::StringFragment<Index, Char>;
828 
829  auto string_fragments_mod0 =
830  fragments_mod0
831  .Map([](const StringFragmentMod0& mod0)
832  { return StringFragment(mod0); });
833 
834  auto string_fragments_mod1 =
835  fragments_mod1
836  .Map([](const StringFragmentMod1& mod1)
837  { return StringFragment(mod1); });
838 
839  auto string_fragments_mod2 =
840  fragments_mod2
841  .Map([](const StringFragmentMod2& mod2)
842  { return StringFragment(mod2); });
843 
844  auto string_fragments_mod3 =
845  fragments_mod3
846  .Map([](const StringFragmentMod3& mod3)
847  { return StringFragment(mod3); });
848 
849  auto string_fragments_mod4 =
850  fragments_mod4
851  .Map([](const StringFragmentMod4& mod4)
852  { return StringFragment(mod4); });
853 
854  auto string_fragments_mod5 =
855  fragments_mod5
856  .Map([](const StringFragmentMod5& mod5)
857  { return StringFragment(mod5); });
858 
859  auto string_fragments_mod6 =
860  fragments_mod6
861  .Map([](const StringFragmentMod6& mod6)
862  { return StringFragment(mod6); });
863 
864  auto suffix_array =
865  Union(string_fragments_mod0,
866  string_fragments_mod1,
867  string_fragments_mod2,
868  string_fragments_mod3,
869  string_fragments_mod4,
870  string_fragments_mod5,
871  string_fragments_mod6)
872  .Sort(dc7_local::FragmentComparator<StringFragment>())
873  .Execute();
874 
875  // debug output
876 
877  if (debug_print) {
878  std::vector<Char> input_vec = input_dia.Keep().Gather();
879  std::vector<Index> vec =
880  suffix_array.Keep()
881  .Map([](const StringFragment& a) { return a.index; })
882  .Gather();
883 
884  if (ctx.my_rank() == 0) {
885  size_t p = 0;
886  for (const Index& index : vec) {
887  std::cout << std::setw(5) << p << std::setw(5) << index << " =";
888  for (Index i = index;
889  i < index + Index(64) && i < Index(input_size); ++i) {
890  std::cout << ' ' << uint64_t(input_vec[i]);
891  }
892  std::cout << '\n';
893  ++p;
894  }
895  }
896  }
897 
898  // check intermediate result, requires an input_dia.Keep() above!
899  // die_unless(CheckSA(input_dia, suffix_array.Keep()));
900 
901  return suffix_array.Collapse();
902 }
903 
904 template <typename Index, typename InputDIA>
905 DIA<Index> DC7(const InputDIA& input_dia, size_t input_size, size_t K) {
906 
907  using Char = typename InputDIA::ValueType;
908  using StringFragment = dc7_local::StringFragment<Index, Char>;
909 
910  return DC7Recursive<Index>(input_dia, input_size, K)
911  .Map([](const StringFragment& a) { return a.index; })
912  .Collapse();
913 }
914 
916  const DIA<uint8_t>& input_dia, size_t input_size, size_t K);
917 
918 #if !THRILL_ON_TRAVIS
919 
920 template DIA<common::uint40> DC7<common::uint40>(
921  const DIA<uint8_t>& input_dia, size_t input_size, size_t K);
922 
924  const DIA<uint8_t>& input_dia, size_t input_size, size_t K);
925 
926 #endif
927 
928 } // namespace suffix_sorting
929 } // namespace examples
930 
931 /******************************************************************************/
tlx::RingBuffer< Type, Allocator > RingBuffer
Definition: ring_buffer.hpp:21
tag structure for ZipWindow()
Definition: zip_window.hpp:43
DIA is the interface between the user and the Thrill framework.
Definition: dia.hpp:141
auto Union(const FirstDIA &first_dia, const DIAs &... dias)
Union is a LOp, which creates the union of all items from any number of DIAs as a single DIA...
Definition: union.hpp:319
#define die_unless(X)
Definition: die.hpp:27
DIA< Index > DC7(const InputDIA &input_dia, size_t input_size, size_t K)
Definition: dc7.cpp:905
A ring (circular) buffer of static (non-growing) size.
Definition: ring_buffer.hpp:36
Type
VFS object type.
Definition: file_io.hpp:52
Definition: bfs.hpp:21
auto ZipWindow(const std::array< size_t, 1+sizeof ...(DIAs)> &window_size, const ZipFunction &zip_function, const DIA< FirstDIAType, FirstDIAStack > &first_dia, const DIAs &... dias)
Zips two DIAs of equal size in style of functional programming by applying zip_function to the i-th f...
Definition: zip_window.hpp:621
#define sLOG1
Definition: logger.hpp:38
The Context of a job is a unique instance per worker which holds references to all underlying parts o...
Definition: context.hpp:221
template DIA< uint32_t > DC7< uint32_t >(const DIA< uint8_t > &input_dia, size_t input_size, size_t K)
static by_string to_string(int val)
convert to string
tag structure for Zip()
Definition: dia.hpp:78
static constexpr size_t fragment_comparator_params[7][7][3]
Definition: dc7.cpp:298
#define assert_equal(X, Y)
Definition: die.hpp:55
template DIA< uint64_t > DC7< uint64_t >(const DIA< uint8_t > &input_dia, size_t input_size, size_t K)
auto Sort(const CompareFunction &compare_function=CompareFunction()) const
Sort is a DOp, which sorts a given DIA according to the given compare_function.
Definition: sort.hpp:800
auto ZipWithIndex(const ZipFunction &zip_function) const
Zips each item of a DIA with its zero-based array index.
tag structure for Zip()
Definition: dia.hpp:86
bool operator==(const uint_pair &b) const
equality checking operator
Definition: uint_types.hpp:175
size_t my_rank() const
Global rank of this worker among all other workers in the system.
Definition: context.hpp:243
std::ostream & operator<<(std::ostream &os, const Status &s)
struct examples::suffix_sorting::dc7_local::Chars TLX_ATTRIBUTE_PACKED
static const uint64_t K[80]
Definition: sha512.cpp:32
DIA< dc7_local::StringFragment< Index, typename InputDIA::ValueType > > DC7Recursive(const InputDIA &input_dia, size_t input_size, size_t K)
Definition: dc7.cpp:411
auto Zip(const SecondDIA &second_dia, const ZipFunction &zip_function) const
Zips two DIAs of equal size in style of functional programming by applying zip_function to the i-th e...
Definition: zip.hpp:686
list params
Definition: gen_data.py:30
tag structure for Window() and FlatWindow()
Definition: dia.hpp:62
const DIA & Keep(size_t increase=1) const
Mark the referenced DIANode for keeping, which makes children not consume the data when executing...
Definition: dia.hpp:310
bool operator<(const uint_pair &b) const
less-than comparison operator
Definition: uint_types.hpp:187
static bool IsDiffCover7(size_t i)
Definition: dc7.cpp:404
SortAlgorithm class for use with api::Sort() which calls radix_sort_CI() if K is small enough...
Definition: radix_sort.hpp:147