Thrill  0.1
prefix_doubling.cpp
Go to the documentation of this file.
1 /*******************************************************************************
2  * examples/suffix_sorting/prefix_doubling.cpp
3  *
4  * Part of Project Thrill - http://project-thrill.org
5  *
6  * Copyright (C) 2016 Florian Kurpicz <[email protected]>
7  *
8  * All rights reserved. Published under the BSD-2 license in the LICENSE file.
9  ******************************************************************************/
10 
13 
14 #include <thrill/api/cache.hpp>
15 #include <thrill/api/collapse.hpp>
16 #include <thrill/api/dia.hpp>
17 #include <thrill/api/generate.hpp>
19 #include <thrill/api/print.hpp>
21 #include <thrill/api/size.hpp>
22 #include <thrill/api/sort.hpp>
23 #include <thrill/api/union.hpp>
24 #include <thrill/api/window.hpp>
25 #include <thrill/common/logger.hpp>
27 
28 #include <algorithm>
29 #include <limits>
30 #include <random>
31 #include <stdexcept>
32 #include <string>
33 #include <tuple>
34 #include <utility>
35 #include <vector>
36 
37 namespace examples {
38 namespace suffix_sorting {
39 
40 using namespace thrill; // NOLINT
42 
43 template <typename CharsType, typename Index>
44 struct IndexKMer {
45  Index index;
46  CharsType chars;
47 
48  bool operator == (const IndexKMer& b) const {
49  return chars == b.chars;
50  }
51 
52  bool operator < (const IndexKMer& b) const {
53  return chars < b.chars;
54  }
55 
56  friend std::ostream& operator << (std::ostream& os, const IndexKMer& iom) {
57  return os << "[i=" << iom.index << ",c=" << iom.chars << ']';
58  }
60 
61 //! A pair (index, rank)
62 template <typename Index>
63 struct IndexRank {
64  Index index;
65  Index rank;
66 
67  friend std::ostream& operator << (std::ostream& os, const IndexRank& ri) {
68  return os << "(i=" << ri.index << ",r=" << ri.rank << ')';
69  }
71 
72 //! A triple (index, rank_1, rank_2)
73 template <typename Index>
74 struct IndexRankRank {
75  Index index;
76  Index rank1;
77  Index rank2;
78 
79  //! Two IndexRankRanks are equal iff their ranks are equal.
80  bool operator == (const IndexRankRank& b) const {
81  return rank1 == b.rank1 && rank2 == b.rank2;
82  }
83 
84  //! A IndexRankRank is smaller than another iff either its fist rank is
85  //! smaller or if the first ranks are equal if its second rank is smaller,
86  //! or if both ranks are equal and the first index is _larger_ than the
87  //! second. The _larger_ is due to suffixes with larger index being smaller.
88  bool operator < (const IndexRankRank& b) const {
89  return std::tie(rank1, rank2, b.index) < std::tie(b.rank1, b.rank2, index);
90  }
91 
92  friend std::ostream& operator << (std::ostream& os, const IndexRankRank& rri) {
93  return os << "(i=" << rri.index << ",r1=" << rri.rank1 << ",r2=" << rri.rank2 << ")";
94  }
96 
97 //! A triple with index (index, rank_1, rank_2, rank_3)
98 template <typename Index>
99 struct Index3Rank {
100  Index index;
101  Index rank1;
102  Index rank2;
103  Index rank3;
104 
105  friend std::ostream& operator << (std::ostream& os, const Index3Rank& irrr) {
106  return os << "(i=" << irrr.index << ",r1=" << irrr.rank1
107  << ",r2=" << irrr.rank2 << ",r3=" << irrr.rank3 << ")";
108  }
110 
111 template <typename Char, typename Index>
112 struct CharCharIndex {
113  Char ch[2];
114  Index index;
115 
116  bool operator == (const CharCharIndex& b) const {
117  return std::tie(ch[0], ch[1]) == std::tie(b.ch[0], b.ch[1]);
118  }
119 
120  bool operator < (const CharCharIndex& b) const {
121  return std::tie(ch[0], ch[1]) < std::tie(b.ch[0], b.ch[1]);
122  }
123 
124  friend std::ostream& operator << (std::ostream& os, const CharCharIndex& cci) {
125  return os << "[ch0=" << cci.ch[0] << ",ch1=" << cci.ch[1]
126  << ",index=" << cci.index << ']';
127  }
129 
130 enum class Status : uint8_t {
131  UNDECIDED = 0,
132  UNIQUE = 1,
133  FULLY_DISCARDED = 2
134 };
135 
136 std::ostream& operator << (std::ostream& os, const Status& s) {
137  switch (s) {
138  case Status::UNDECIDED:
139  return os << 'N';
140  case Status::UNIQUE:
141  return os << 'U';
143  return os << 'D';
144  }
145  return os << '?';
146 }
147 
148 //! A triple (index, rank, status)
149 template <typename Index>
150 struct IndexRankStatus {
151  Index index;
152  Index rank;
153  Status status;
154 
155  //! Two IndexRandStatuses are equal iff their ranks are equal.
156  bool operator == (const IndexRankStatus& b) const {
157  return rank == b.rank;
158  }
159 
160  //! A IndexRankStatus is smaller than another iff either its fist rank is
161  //! smaller or if both ranks are equal and the first index is _larger_ than
162  //! the second. The _larger_ is due to suffixes with larger index being
163  //! smaller.
164  bool operator < (const IndexRankStatus& b) const {
165  return rank < b.rank || (rank == b.rank && index > b.index);
166  }
167 
168  friend std::ostream& operator << (std::ostream& os, const IndexRankStatus& irs) {
169  return os << "(index=" << irs.index << ",rank=" << irs.rank << ",status="
170  << irs.status << ")";
171  }
173 
174 //! A triple (index, rank, status)
175 template <typename Index>
176 struct IndexRankRankStatus {
177  Index index;
178  Index rank1;
179  Index rank2;
180  Status status;
181 
182  friend std::ostream& operator << (std::ostream& os, const IndexRankRankStatus& irrs) {
183  return os << "(index=" << irrs.index
184  << ",rank1=" << irrs.rank1 << ",rank2=" << irrs.rank2
185  << ",status=" << irrs.status << ")";
186  }
188 
189 //! take input and pack it into an array of Index characters
190 template <typename Index, typename InputDIA>
192  const InputDIA& input_dia, size_t input_size, bool packed, size_t& iteration) {
193 
194  using Char = typename InputDIA::ValueType;
196  using CharCharIndex = suffix_sorting::CharCharIndex<Char, Index>;
197 
198  if (packed && sizeof(Char) == 1) {
199 
200  // make histogram of characters
201  std::vector<size_t> alpha_map(256);
202 
203  input_dia.Keep()
204  .Map([&alpha_map](const Char& c) { alpha_map[c]++; return c; })
205  .Size();
206 
207  alpha_map = input_dia.ctx().net.AllReduce(
208  alpha_map, common::ComponentSum<std::vector<size_t> >());
209 
210  // determine alphabet size and map to names, keeping zero reserved
211  size_t alphabet_size = 1;
212  for (size_t i = 0; i < 256; ++i) {
213  if (alpha_map[i] != 0) {
214  alpha_map[i] = alphabet_size;
215  alphabet_size++;
216  }
217  }
218 
219  // calculate number of characters fit into the bits of an Index, and the
220  // next iteration
221  size_t input_bit_size = tlx::integer_log2_ceil(alphabet_size);
222  size_t k_fitting = 8 * sizeof(Index) / input_bit_size;
223  iteration = tlx::integer_log2_floor(k_fitting);
224 
225  if (input_dia.ctx().my_rank() == 0) {
226  LOG1 << "Packing:"
227  << " alphabet_size=" << alphabet_size - 1
228  << " input_bit_size=" << input_bit_size
229  << " k_fitting=" << k_fitting
230  << " next_iteration=" << iteration;
231  }
232 
233  // pack and sort character groups
234  auto chars_sorted =
235  input_dia
236  .template FlatWindow<IndexRank>(
237  k_fitting,
238  [=, alpha_map = std::move(alpha_map)](
239  size_t index, const RingBuffer<Char>& rb, auto emit) {
240 
241  size_t result = alpha_map[rb[0]];
242  for (size_t i = 1; i < k_fitting; ++i)
243  result = (result << input_bit_size) | alpha_map[rb[i]];
244  emit(IndexRank { Index(index), Index(result) });
245 
246  if (index + k_fitting == input_size) {
247  for (size_t i = 1; i < k_fitting; ++i) {
248  result = alpha_map[rb[i]];
249  for (size_t j = i + 1; j < k_fitting; ++j)
250  result = (result << input_bit_size) | alpha_map[rb[j]];
251  result <<= i * input_bit_size;
252  emit(IndexRank { Index(index + i), Index(result) });
253  }
254  }
255  })
256  .Sort([](const IndexRank& a, const IndexRank& b) {
257  return a.rank < b.rank;
258  });
259 
260  if (debug_print)
261  chars_sorted.Keep().Print("chars_sorted packed");
262 
263  return chars_sorted.template FlatWindow<IndexRank>(
264  2,
265  [](size_t index, const RingBuffer<IndexRank>& rb, auto emit) {
266  if (index == 0)
267  emit(IndexRank { rb[0].index, Index(1) });
268  emit(IndexRank {
269  rb[1].index, Index(rb[0].rank == rb[1].rank ? 0 : index + 2)
270  });
271  });
272  }
273  else {
274  iteration = 1;
275 
276  // sorts pairs of characters to generate first iteration of lexnames
277 
278  auto chars_sorted =
279  input_dia
280  .template FlatWindow<CharCharIndex>(
281  2,
282  [](size_t index, const RingBuffer<Char>& rb, auto emit) {
283  // emit CharCharIndex for each character pair
284  emit(CharCharIndex {
285  { rb[0], rb[1] }, Index(index)
286  });
287  },
288  [=](size_t index, const RingBuffer<Char>& rb, auto emit) {
289  if (index + 1 == input_size) {
290  // emit CharCharIndex for last suffix position
291  emit(CharCharIndex {
292  { rb[0], std::numeric_limits<Char>::lowest() },
293  Index(index)
294  });
295  }
296  })
297  // sort character pairs
298  .Sort();
299 
300  if (debug_print)
301  chars_sorted.Keep().Print("chars_sorted");
302 
303  return chars_sorted.template FlatWindow<IndexRank>(
304  2,
305  [](size_t index, const RingBuffer<CharCharIndex>& rb, auto emit) {
306  if (index == 0) {
307  // emit rank 1 for smallest character pair
308  emit(IndexRank { rb[0].index, Index(1) });
309  }
310  // emit next rank if character pair is unequal, else 0 which
311  // will become the previous rank in the subsequent max().
312  emit(IndexRank {
313  rb[1].index, Index(rb[0] == rb[1] ? 0 : index + 2)
314  });
315  });
316  }
317 }
318 
319 template <typename Index, typename InputDIA>
321  const InputDIA& input_dia, size_t input_size, bool packed) {
322 
323  if (input_dia.ctx().my_rank() == 0)
324  LOG1 << "Running PrefixDoublingSorting";
325 
327  using IndexRankRank = suffix_sorting::IndexRankRank<Index>;
328 
329  size_t iteration;
330 
331  DIA<IndexRank> names = PrefixDoublingPack<Index>(
332  input_dia, input_size, packed, iteration);
333 
334  if (debug_print)
335  names.Keep().Print("names");
336 
337  // count number of duplicate character pairs, these are 0 indicators
338  auto number_duplicates =
339  names
340  .Filter([](const IndexRank& ir) {
341  return ir.rank == Index(0);
342  })
343  .SizeFuture();
344 
345  // construct lexnames array by maxing ranks = filling in zeros with names.
346  names =
347  names
348  .PrefixSum(
349  [](const IndexRank& a, const IndexRank& b) {
350  return IndexRank { b.index, std::max(a.rank, b.rank) };
351  });
352 
353  if (number_duplicates.get() == 0) {
354  if (input_dia.context().my_rank() == 0)
355  sLOG1 << "Finished before doubling in loop";
356 
357  // suffix array already known, as character pairs are unique
358  auto sa =
359  names
360  .Map([](const IndexRank& ir) {
361  return ir.index;
362  });
363 
364  return sa.Collapse();
365  }
366 
367  if (debug_print)
368  names.Keep().Print("names before loop");
369 
370  auto last_number_duplicates = number_duplicates.get();
371 
372  while (true) {
373  // reorder names such that 2^k+i and 2^(k+1)+i are adjacent
374  auto names_sorted =
375  names
376  .Sort([iteration](const IndexRank& a, const IndexRank& b) {
377  Index mod_mask = (Index(1) << iteration) - 1;
378  Index div_mask = ~mod_mask;
379 
380  if ((a.index & mod_mask) == (b.index & mod_mask))
381  return (a.index & div_mask) < (b.index & div_mask);
382  else
383  return (a.index & mod_mask) < (b.index & mod_mask);
384  });
385 
386  if (debug_print)
387  names_sorted.Keep().Print("names_sorted");
388 
389  size_t next_index = size_t(1) << iteration++;
390 
391  if (input_dia.context().my_rank() == 0) {
392  sLOG1 << "next_index" << next_index;
393  }
394 
395  auto triple =
396  names_sorted
397  .template FlatWindow<IndexRankRank>(
398  2,
399  [=](size_t /* index */, const RingBuffer<IndexRank>& rb, auto emit) {
400  emit(IndexRankRank {
401  rb[0].index, rb[0].rank,
402  // check if at boundary between 2^k+i range, emit 0
403  // if crossing boundary
404  (rb[0].index + Index(next_index) == rb[1].index)
405  ? rb[1].rank : Index(0)
406  });
407  },
408  [=](size_t index, const RingBuffer<IndexRank>& rb, auto emit) {
409  if (index + 1 == input_size)
410  emit(IndexRankRank { rb[0].index, rb[0].rank, Index(0) });
411  });
412 
413  if (debug_print)
414  triple.Keep().Print("triple");
415 
416  auto triple_sorted = triple.Sort();
417 
418  if (debug_print)
419  triple_sorted.Keep().Print("triple_sorted");
420 
421  names =
422  triple_sorted
423  .template FlatWindow<IndexRank>(
424  2,
425  [](size_t index, const RingBuffer<IndexRankRank>& rb, auto emit) {
426  if (index == 0)
427  emit(IndexRank { rb[0].index, Index(1) });
428 
429  emit(IndexRank {
430  rb[1].index,
431  (rb[0] == rb[1] && rb[0].rank2 != Index(0))
432  ? Index(0) : Index(index + 2)
433  });
434  });
435 
436  if (debug_print)
437  names.Keep().Print("names indicator");
438 
439  number_duplicates =
440  names
441  .Filter([](const IndexRank& ir) {
442  return ir.rank == Index(0);
443  })
444  .SizeFuture();
445 
446  names =
447  names
448  .PrefixSum([](const IndexRank& a, const IndexRank& b) {
449  return IndexRank { b.index, std::max(a.rank, b.rank) };
450  });
451 
452  if (input_dia.context().my_rank() == 0) {
453  sLOG1 << "iteration" << iteration - 1
454  << "duplicates" << number_duplicates.get();
455  }
456 
457  if (number_duplicates.get() > last_number_duplicates) {
458  sLOG1 << "number_duplicates" << number_duplicates.get()
459  << "last_number_duplicates" << last_number_duplicates;
460 
461  auto sa =
462  names
463  .Map([](const IndexRank& ir) {
464  return ir.index;
465  });
466 
467  return sa.Collapse();
468  }
469 
470  last_number_duplicates = number_duplicates.get();
471 
472  if (number_duplicates.get() == 0) {
473  auto sa =
474  names
475  .Map([](const IndexRank& ir) {
476  return ir.index;
477  });
478 
479  return sa.Collapse();
480  }
481 
482  if (debug_print)
483  names.Keep().Print("names");
484  }
485 }
486 
487 template <typename Index, typename InputDIA>
489  const InputDIA& input_dia, size_t input_size, bool packed) {
490 
491  if (input_dia.ctx().my_rank() == 0)
492  LOG1 << "Running PrefixDoublingWindow";
493 
495  using IndexRankRank = suffix_sorting::IndexRankRank<Index>;
496 
497  size_t iteration;
498 
499  DIA<IndexRank> names = PrefixDoublingPack<Index>(
500  input_dia, input_size, packed, iteration);
501 
502  auto number_duplicates =
503  names
504  .Filter([](const IndexRank& ir) {
505  return ir.rank == Index(0);
506  })
507  .SizeFuture();
508 
509  names =
510  names
511  .PrefixSum([](const IndexRank& a, const IndexRank& b) {
512  return IndexRank { b.index, std::max(a.rank, b.rank) };
513  });
514 
515  // The first rank is always 0 and all other duplicates have "rank" 0
516  // before we compute the correct new rank.
517  if (number_duplicates.get() == 0) {
518  if (input_dia.context().my_rank() == 0)
519  sLOG1 << "Finished before doubling in loop.";
520 
521  auto sa =
522  names
523  .Map([](const IndexRank& ir) {
524  return ir.index;
525  });
526  return sa.Collapse();
527  }
528 
529  if (debug_print)
530  names.Keep().Print("names");
531 
532  while (true) {
533  auto isa =
534  names
535  // .Sort([](const IndexRank& a, const IndexRank& b) {
536  // return a.index < b.index;
537  // });
538  .ReduceToIndex(
540  [](const IndexRank& a) { return static_cast<size_t>(a.index); },
541  [](const IndexRank&, const IndexRank& b) { return b; },
542  input_size);
543 
544  if (debug_print)
545  isa.Keep().Print("isa");
546 
547  size_t shift_by = (size_t(1) << (iteration - 1)) + 1;
548 
549  auto triple_sorted =
550  isa
551  .template FlatWindow<IndexRankRank>(
552  shift_by,
553  [](size_t /*index*/, const RingBuffer<IndexRank>& rb, auto emit) {
554  emit(IndexRankRank { rb[0].index, rb.front().rank, rb.back().rank });
555  },
556  [](size_t /*index*/, const RingBuffer<IndexRank>& rb, auto emit) {
557  emit(IndexRankRank { rb[0].index, rb[0].rank, Index(0) });
558  })
559  .Sort();
560 
561  if (debug_print)
562  triple_sorted.Keep().Print("triple_sorted");
563 
564  names =
565  triple_sorted
566  .template FlatWindow<IndexRank>(
567  2,
568  [](size_t index, const RingBuffer<IndexRankRank>& rb, auto emit) {
569  if (index == 0) emit(IndexRank { rb[0].index, Index(1) });
570  emit(IndexRank { rb[1].index,
571  Index(rb[0] == rb[1] ? 0 : index + 2) });
572  });
573 
574  if (debug_print)
575  names.Keep().Print("names");
576 
577  number_duplicates =
578  names
579  .Filter([](const IndexRank& ir) {
580  return ir.rank == Index(0);
581  })
582  .SizeFuture();
583 
584  names =
585  names
586  .PrefixSum([](const IndexRank& a, const IndexRank& b) {
587  return IndexRank { b.index, std::max(a.rank, b.rank) };
588  });
589 
590  if (input_dia.context().my_rank() == 0) {
591  sLOG1 << "iteration" << iteration
592  << "shift_by" << shift_by
593  << "duplicates" << number_duplicates.get();
594  }
595  ++iteration;
596 
597  if (number_duplicates.get() == 0) {
598  auto sa =
599  names
600  .Map([](const IndexRank& ir) {
601  return ir.index;
602  });
603  return sa.Collapse();
604  }
605 
606  if (debug_print)
607  names.Keep().Print("names");
608  }
609 }
610 
611 template <typename Index, typename InputDIA>
613  const InputDIA& input_dia, size_t input_size, bool packed) {
614 
615  if (input_dia.ctx().my_rank() == 0)
616  LOG1 << "Running PrefixDoublingDiscarding";
617 
619  using IndexRankStatus = suffix_sorting::IndexRankStatus<Index>;
620  using IndexRankRank = suffix_sorting::IndexRankRank<Index>;
621  using Index3Rank = suffix_sorting::Index3Rank<Index>;
622  using IndexRankRankStatus = suffix_sorting::IndexRankRankStatus<Index>;
623 
624  size_t iteration;
625 
626  DIA<IndexRank> names = PrefixDoublingPack<Index>(
627  input_dia, input_size, packed, iteration);
628 
629  names =
630  names.PrefixSum([](const IndexRank a, const IndexRank b) {
631  return IndexRank { b.index, std::max(a.rank, b.rank) };
632  });
633 
634  if (debug_print)
635  names.Keep().Print("names");
636 
637  auto names_unique =
638  names
639  .template FlatWindow<IndexRankStatus>(
640  3,
641  [=](size_t index, const RingBuffer<IndexRank>& rb, auto emit) {
642  if (index == 0) {
643  Status status = rb[0].rank != rb[1].rank ? Status::UNIQUE : Status::UNDECIDED;
644  emit(IndexRankStatus { rb[0].index, rb[0].rank, status });
645  }
646  {
647  Status status = rb[0].rank != rb[1].rank && rb[1].rank != rb[2].rank
649 
650  emit(IndexRankStatus { rb[1].index, rb[1].rank, status });
651  }
652  if (index == input_size - 3) {
653  Status status = rb[1].rank != rb[2].rank ? Status::UNIQUE : Status::UNDECIDED;
654  emit(IndexRankStatus { rb[2].index, rb[2].rank, status });
655  }
656  });
657 
658  if (debug_print)
659  names_unique.Keep().Print("names_unique");
660 
661  auto names_unique_sorted =
662  names_unique
663  .Sort([iteration](const IndexRankStatus& a, const IndexRankStatus& b) {
664  Index mod_mask = (Index(1) << iteration) - 1;
665  Index div_mask = ~mod_mask;
666 
667  if ((a.index & mod_mask) == (b.index & mod_mask))
668  return (a.index & div_mask) < (b.index & div_mask);
669  else
670  return (a.index & mod_mask) < (b.index & mod_mask);
671  });
672 
673  std::vector<DIA<IndexRank> > fully_discarded;
674 
675  while (true) {
676  ++iteration;
677 
678  size_t names_size = names_unique_sorted.Keep().Size();
679 
680  if (debug_print)
681  names_unique_sorted.Keep().Print("names_unique_sorted");
682 
683  auto discarded_names =
684  names_unique_sorted.Keep()
685  .template FlatWindow<IndexRankRankStatus>(
686  3,
687  [=](size_t index, const RingBuffer<IndexRankStatus>& rb, auto emit) {
688  // Discarded names (we need to change the status since we remove it one step later)
689  if (index == 0) {
690  if (rb[0].status == Status::UNIQUE)
691  emit(IndexRankRankStatus { rb[0].index, rb[0].rank, Index(0), Status::FULLY_DISCARDED });
692  if (rb[1].status == Status::UNIQUE)
693  // Since there is just one preceding entry it's either undiscarded or unique
694  emit(IndexRankRankStatus { rb[1].index, rb[1].rank, Index(0), Status::FULLY_DISCARDED });
695  }
696  if (rb[2].status == Status::UNIQUE) {
697  if (rb[0].status == Status::UNIQUE || rb[1].status == Status::UNIQUE)
698  emit(IndexRankRankStatus { rb[2].index, rb[2].rank, Index(0), Status::FULLY_DISCARDED });
699  else
700  emit(IndexRankRankStatus { rb[2].index, rb[2].rank, Index(0), Status::UNIQUE });
701  }
702  if (rb[0].status == Status::UNDECIDED) {
703  if (rb[0].index + Index(1llu << (iteration - 1)) == rb[1].index)
704  emit(IndexRankRankStatus { rb[0].index, rb[0].rank, rb[1].rank, Status::UNDECIDED });
705  else
706  emit(IndexRankRankStatus { rb[0].index, rb[0].rank, Index(0), Status::UNDECIDED });
707  }
708  },
709  [=](size_t index, const RingBuffer<IndexRankStatus>& rb, auto emit) {
710  if (index == 0) {
711  if (rb[0].status == Status::UNIQUE)
712  emit(IndexRankRankStatus { rb[0].index, rb[0].rank, Index(0), Status::FULLY_DISCARDED });
713  else // (rb[0].status == Status::UNDECIDED)
714  emit(IndexRankRankStatus { rb[0].index, rb[0].rank, Index(0), Status::UNDECIDED });
715  if (rb[1].status == Status::UNIQUE)
716  emit(IndexRankRankStatus { rb[1].index, rb[1].rank, Index(0), Status::FULLY_DISCARDED });
717  else // (rb[1].status == Status::UNDECIDED)
718  emit(IndexRankRankStatus { rb[1].index, rb[1].rank, Index(0), Status::UNDECIDED });
719  }
720  if (index == names_size - 2) {
721  if (rb[0].status == Status::UNDECIDED)
722  emit(IndexRankRankStatus { rb[0].index, rb[0].rank, rb[1].rank, Status::UNDECIDED });
723  if (rb[1].status == Status::UNDECIDED)
724  emit(IndexRankRankStatus { rb[1].index, rb[1].rank, Index(0), Status::UNDECIDED });
725  }
726  });
727 
728  if (debug_print)
729  discarded_names.Keep().Print("discarded_names");
730 
731  auto new_decided =
732  discarded_names.Keep()
733  .Filter([](const IndexRankRankStatus& irs) {
734  return irs.status == Status::FULLY_DISCARDED;
735  })
736  .Map([](const IndexRankRankStatus& irs) {
737  return IndexRank { irs.index, irs.rank1 };
738  });
739 
740  auto partial_discarded =
741  discarded_names.Keep()
742  .Filter([](const IndexRankRankStatus& irs) {
743  return irs.status == Status::UNIQUE;
744  })
745  .Map([](const IndexRankRankStatus& irs) {
746  return IndexRankStatus { irs.index, irs.rank1, Status::UNIQUE };
747  });
748 
749  auto undiscarded =
750  discarded_names
751  .Filter([](const IndexRankRankStatus& irs) {
752  return irs.status == Status::UNDECIDED;
753  })
754  .Map([](const IndexRankRankStatus& irs) {
755  return IndexRankRank { irs.index, irs.rank1, irs.rank2 };
756  })
757  .Sort();
758 
759  fully_discarded.emplace_back(new_decided.Cache());
760 
761  size_t duplicates = undiscarded.Keep().Size();
762 
763  if (input_dia.context().my_rank() == 0) {
764  sLOG1 << "iteration" << iteration - 1
765  << "duplicates" << duplicates;
766  }
767 
768  if (duplicates == 0) {
769  auto sa =
770  Union(fully_discarded)
771  .Sort([](const IndexRank& a, const IndexRank& b) {
772  return a.rank < b.rank;
773  })
774  .Map([](const IndexRank& ir) {
775  return ir.index;
776  });
777  return sa.Collapse();
778  }
779 
780  auto new_ranks =
781  undiscarded
782  .template FlatWindow<Index3Rank>(
783  2,
784  [](size_t index, const RingBuffer<IndexRankRank>& rb, auto emit) {
785  if (index == 0) {
786  emit(Index3Rank { rb[0].index, Index(0), Index(0), rb[0].rank1 });
787  }
788  Index i = rb[0].rank1 == rb[1].rank1 ? Index(0) : Index(index + 1);
789  Index r;
790  if (rb[0].rank1 != rb[1].rank1)
791  r = Index(index + 1);
792  else
793  r = (rb[0].rank2 == rb[1].rank2) ? Index(0) : Index(index + 1);
794  emit(Index3Rank { rb[1].index, i, r, rb[1].rank1 });
795  },
796  [](size_t index, const RingBuffer<IndexRankRank>& rb, auto emit) {
797  if (index == 0)
798  emit(Index3Rank { rb[0].index, Index(0), Index(0), rb[0].rank1 });
799  })
800  .PrefixSum([](const Index3Rank& a, const Index3Rank& b) {
801  return Index3Rank {
802  b.index,
803  std::max<Index>(a.rank1, b.rank1),
804  std::max<Index>(a.rank2, b.rank2),
805  b.rank3
806  };
807  })
808  .Map([](const Index3Rank& ir) {
809  return IndexRank { ir.index, ir.rank3 + (ir.rank2 - ir.rank1) };
810  });
811 
812  if (debug_print)
813  new_ranks.Keep().Print("new_ranks");
814 
815  names_unique =
816  new_ranks
817  .template FlatWindow<IndexRankStatus>(
818  3,
819  [=](size_t index, const RingBuffer<IndexRank>& rb, auto emit) {
820  if (index == 0) {
821  Status status = rb[0].rank != rb[1].rank ? Status::UNIQUE : Status::UNDECIDED;
822  emit(IndexRankStatus { rb[0].index, rb[0].rank, status });
823  }
824  if (rb[0].rank != rb[1].rank && rb[1].rank != rb[2].rank)
825  emit(IndexRankStatus { rb[1].index, rb[1].rank, Status::UNIQUE });
826  else
827  emit(IndexRankStatus { rb[1].index, rb[1].rank, Status::UNDECIDED });
828  if (index + 3 == duplicates) {
829  Status status = rb[1].rank != rb[2].rank ? Status::UNIQUE : Status::UNDECIDED;
830  emit(IndexRankStatus { rb[2].index, rb[2].rank, status });
831  }
832  },
833  [](size_t index, const RingBuffer<IndexRank>& rb, auto emit) {
834  if (index == 0) { // We know that there are exactly 2 names
835  emit(IndexRankStatus { rb[0].index, rb[0].rank, Status::UNIQUE });
836  emit(IndexRankStatus { rb[1].index, rb[1].rank, Status::UNIQUE });
837  }
838  });
839 
840  if (debug_print)
841  names_unique.Keep().Print("names_unique");
842 
843  names_unique_sorted =
844  names_unique
845  .Union(partial_discarded)
846  .Sort([iteration](const IndexRankStatus& a, const IndexRankStatus& b) {
847  Index mod_mask = (Index(1) << iteration) - 1;
848  Index div_mask = ~mod_mask;
849 
850  if ((a.index & mod_mask) == (b.index & mod_mask))
851  return (a.index & div_mask) < (b.index & div_mask);
852  else
853  return (a.index & mod_mask) < (b.index & mod_mask);
854  });
855  }
856 }
857 
859  const DIA<uint8_t>& input_dia, size_t input_size, bool packed);
860 
862  const DIA<uint8_t>& input_dia, size_t input_size, bool packed);
863 
865  const DIA<uint8_t>& input_dia, size_t input_size, bool packed);
866 
867 #if !THRILL_ON_TRAVIS
868 
869 template DIA<common::uint40> PrefixDoublingWindow<common::uint40>(
870  const DIA<uint8_t>& input_dia, size_t input_size, bool packed);
871 
872 template DIA<common::uint40> PrefixDoublingSorting<common::uint40>(
873  const DIA<uint8_t>& input_dia, size_t input_size, bool packed);
874 
875 template DIA<common::uint40> PrefixDoublingDiscarding<common::uint40>(
876  const DIA<uint8_t>& input_dia, size_t input_size, bool packed);
877 
879  const DIA<uint8_t>& input_dia, size_t input_size, bool packed);
880 
882  const DIA<uint8_t>& input_dia, size_t input_size, bool packed);
883 
885  const DIA<uint8_t>& input_dia, size_t input_size, bool packed);
886 
887 #endif
888 
889 } // namespace suffix_sorting
890 } // namespace examples
891 
892 /******************************************************************************/
tlx::RingBuffer< Type, Allocator > RingBuffer
Definition: ring_buffer.hpp:21
DIA is the interface between the user and the Thrill framework.
Definition: dia.hpp:141
static uint_pair max()
return an uint_pair instance containing the largest value possible
Definition: uint_types.hpp:226
auto Union(const FirstDIA &first_dia, const DIAs &... dias)
Union is a LOp, which creates the union of all items from any number of DIAs as a single DIA...
Definition: union.hpp:319
template DIA< uint32_t > PrefixDoublingDiscarding< uint32_t >(const DIA< uint8_t > &input_dia, size_t input_size, bool packed)
template DIA< uint32_t > PrefixDoublingSorting< uint32_t >(const DIA< uint8_t > &input_dia, size_t input_size, bool packed)
A ring (circular) buffer of static (non-growing) size.
Definition: ring_buffer.hpp:36
#define LOG1
Definition: logger.hpp:28
auto ReduceToIndex(const KeyExtractor &key_extractor, const ReduceFunction &reduce_function, size_t size, const ValueType &neutral_element=ValueType(), const ReduceConfig &reduce_config=ReduceConfig()) const
ReduceToIndex is a DOp, which groups elements of the DIA with the key_extractor returning an unsigned...
Definition: bfs.hpp:21
tag structure for ReduceToIndex()
Definition: dia.hpp:54
template DIA< uint64_t > PrefixDoublingWindow< uint64_t >(const DIA< uint8_t > &input_dia, size_t input_size, bool packed)
#define sLOG1
Definition: logger.hpp:38
auto PrefixSum(const SumFunction &sum_function=SumFunction(), const ValueType &initial_element=ValueType()) const
PrefixSum is a DOp, which computes the (inclusive) prefix sum of all elements.
Definition: prefix_sum.hpp:134
template DIA< uint32_t > PrefixDoublingWindow< uint32_t >(const DIA< uint8_t > &input_dia, size_t input_size, bool packed)
template for computing the component-wise sum of std::array or std::vector.
Definition: functional.hpp:94
DIA< Index > PrefixDoublingDiscarding(const InputDIA &input_dia, size_t input_size, bool packed)
DIA< Index > PrefixDoublingSorting(const InputDIA &input_dia, size_t input_size, bool packed)
template DIA< uint64_t > PrefixDoublingDiscarding< uint64_t >(const DIA< uint8_t > &input_dia, size_t input_size, bool packed)
auto Filter(const FilterFunction &filter_function) const
Each item of a DIA is tested using filter_function : to determine whether it is copied into the outp...
Definition: dia.hpp:405
auto Sort(const CompareFunction &compare_function=CompareFunction()) const
Sort is a DOp, which sorts a given DIA according to the given compare_function.
Definition: sort.hpp:800
static unsigned integer_log2_floor(int i)
calculate the log2 floor of an integer type
auto Map(const MapFunction &map_function) const
Map applies map_function : to each item of a DIA and delivers a new DIA contains the returned values...
Definition: dia.hpp:358
bool operator==(const uint_pair &b) const
equality checking operator
Definition: uint_types.hpp:175
std::ostream & operator<<(std::ostream &os, const Status &s)
struct examples::suffix_sorting::IndexRank TLX_ATTRIBUTE_PACKED
static unsigned integer_log2_ceil(int i)
calculate the log2 floor of an integer type
template DIA< uint64_t > PrefixDoublingSorting< uint64_t >(const DIA< uint8_t > &input_dia, size_t input_size, bool packed)
const DIA & Keep(size_t increase=1) const
Mark the referenced DIANode for keeping, which makes children not consume the data when executing...
Definition: dia.hpp:310
bool operator<(const uint_pair &b) const
less-than comparison operator
Definition: uint_types.hpp:187
DIA< Index > PrefixDoublingWindow(const InputDIA &input_dia, size_t input_size, bool packed)
DIA< IndexRank< Index > > PrefixDoublingPack(const InputDIA &input_dia, size_t input_size, bool packed, size_t &iteration)
take input and pack it into an array of Index characters