Thrill  0.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
prefix_doubling.cpp
Go to the documentation of this file.
1 /*******************************************************************************
2  * examples/suffix_sorting/prefix_doubling.cpp
3  *
4  * Part of Project Thrill - http://project-thrill.org
5  *
6  * Copyright (C) 2016 Florian Kurpicz <[email protected]>
7  *
8  * All rights reserved. Published under the BSD-2 license in the LICENSE file.
9  ******************************************************************************/
10 
13 
14 #include <thrill/api/cache.hpp>
15 #include <thrill/api/collapse.hpp>
16 #include <thrill/api/dia.hpp>
17 #include <thrill/api/generate.hpp>
18 #include <thrill/api/prefixsum.hpp>
19 #include <thrill/api/print.hpp>
21 #include <thrill/api/size.hpp>
22 #include <thrill/api/sort.hpp>
23 #include <thrill/api/union.hpp>
24 #include <thrill/api/window.hpp>
25 #include <thrill/common/logger.hpp>
27 
28 #include <algorithm>
29 #include <limits>
30 #include <random>
31 #include <stdexcept>
32 #include <string>
33 #include <tuple>
34 #include <utility>
35 #include <vector>
36 
37 namespace examples {
38 namespace suffix_sorting {
39 
40 using namespace thrill; // NOLINT
42 
43 template <typename CharsType, typename Index>
44 struct IndexKMer {
45  Index index;
46  CharsType chars;
47 
48  bool operator == (const IndexKMer& b) const {
49  return chars == b.chars;
50  }
51 
52  bool operator < (const IndexKMer& b) const {
53  return chars < b.chars;
54  }
55 
56  friend std::ostream& operator << (std::ostream& os, const IndexKMer& iom) {
57  return os << "[i=" << iom.index << ",c=" << iom.chars << ']';
58  }
60 
61 //! A pair (index, rank)
62 template <typename Index>
63 struct IndexRank {
64  Index index;
65  Index rank;
66 
67  friend std::ostream& operator << (std::ostream& os, const IndexRank& ri) {
68  return os << "(i=" << ri.index << ",r=" << ri.rank << ')';
69  }
71 
72 //! A triple (index, rank_1, rank_2)
73 template <typename Index>
74 struct IndexRankRank {
75  Index index;
76  Index rank1;
77  Index rank2;
78 
79  //! Two IndexRankRanks are equal iff their ranks are equal.
80  bool operator == (const IndexRankRank& b) const {
81  return rank1 == b.rank1 && rank2 == b.rank2;
82  }
83 
84  //! A IndexRankRank is smaller than another iff either its fist rank is
85  //! smaller or if the first ranks are equal if its second rank is smaller,
86  //! or if both ranks are equal and the first index is _larger_ than the
87  //! second. The _larger_ is due to suffixes with larger index being smaller.
88  bool operator < (const IndexRankRank& b) const {
89  return std::tie(rank1, rank2, b.index) < std::tie(b.rank1, b.rank2, index);
90  }
91 
92  friend std::ostream& operator << (std::ostream& os, const IndexRankRank& rri) {
93  return os << "(i=" << rri.index << ",r1=" << rri.rank1 << ",r2=" << rri.rank2 << ")";
94  }
96 
97 //! A triple with index (index, rank_1, rank_2, rank_3)
98 template <typename Index>
99 struct Index3Rank {
100  Index index;
101  Index rank1;
102  Index rank2;
103  Index rank3;
104 
105  friend std::ostream& operator << (std::ostream& os, const Index3Rank& irrr) {
106  return os << "(i=" << irrr.index << ",r1=" << irrr.rank1
107  << ",r2=" << irrr.rank2 << ",r3=" << irrr.rank3 << ")";
108  }
110 
111 template <typename Char, typename Index>
112 struct CharCharIndex {
113  Char ch[2];
114  Index index;
115 
116  bool operator == (const CharCharIndex& b) const {
117  return std::tie(ch[0], ch[1]) == std::tie(b.ch[0], b.ch[1]);
118  }
119 
120  bool operator < (const CharCharIndex& b) const {
121  return std::tie(ch[0], ch[1]) < std::tie(b.ch[0], b.ch[1]);
122  }
123 
124  friend std::ostream& operator << (std::ostream& os, const CharCharIndex& cci) {
125  return os << "[ch0=" << cci.ch[0] << ",ch1=" << cci.ch[1]
126  << ",index=" << cci.index << ']';
127  }
129 
130 enum class Status : uint8_t {
131  UNDECIDED = 0,
132  UNIQUE = 1,
133  FULLY_DISCARDED = 2
134 };
135 
136 //! A triple (index, rank, status)
137 template <typename Index>
138 struct IndexRankStatus {
139  Index index;
140  Index rank;
141  Status status;
142 
143  //! Two IndexRandStatuses are equal iff their ranks are equal.
144  bool operator == (const IndexRankStatus& b) const {
145  return rank == b.rank;
146  }
147 
148  //! A IndexRankStatus is smaller than another iff either its fist rank is
149  //! smaller or if both ranks are equal and the first index is _larger_ than
150  //! the second. The _larger_ is due to suffixes with larger index being
151  //! smaller.
152  bool operator < (const IndexRankStatus& b) const {
153  return rank < b.rank || (rank == b.rank && index > b.index);
154  }
155 
156  friend std::ostream& operator << (std::ostream& os, const IndexRankStatus& irs) {
157  return os << "(index=" << irs.index << ",rank=" << irs.rank << ",status="
158  << static_cast<uint8_t>(irs.status) << ")";
159  }
161 
162 //! A triple (index, rank, status)
163 template <typename Index>
164 struct IndexRankRankStatus {
165  Index index;
166  Index rank1;
167  Index rank2;
168  Status status;
169 
170  friend std::ostream& operator << (std::ostream& os, const IndexRankRankStatus& irrs) {
171  return os << "(index=" << irrs.index
172  << ",rank1=" << irrs.rank1 << ",rank2=" << irrs.rank2
173  << ",status=" << static_cast<uint8_t>(irrs.status) << ")";
174  }
176 
177 //! take input and pack it into an array of Index characters
178 template <typename Index, typename InputDIA>
179 DIA<IndexRank<Index> > PrefixDoublingPack(
180  const InputDIA& input_dia, size_t input_size, bool packed, size_t& iteration) {
181 
182  using Char = typename InputDIA::ValueType;
184  using CharCharIndex = suffix_sorting::CharCharIndex<Char, Index>;
185 
186  if (packed && sizeof(Char) == 1) {
187 
188  // make histogram of characters
189  std::vector<size_t> alpha_map(256);
190 
191  input_dia.Keep()
192  .Map([&alpha_map](const Char& c) { alpha_map[c]++; return c; })
193  .Size();
194 
195  alpha_map = input_dia.ctx().net.AllReduce(
196  alpha_map, common::ComponentSum<std::vector<size_t> >());
197 
198  // determine alphabet size and map to names, keeping zero reserved
199  size_t alphabet_size = 1;
200  for (size_t i = 0; i < 256; ++i) {
201  if (alpha_map[i] != 0) {
202  alpha_map[i] = alphabet_size;
203  alphabet_size++;
204  }
205  }
206 
207  // calculate number of characters fit into the bits of an Index, and the
208  // next iteration
209  size_t input_bit_size = tlx::integer_log2_ceil(alphabet_size);
210  size_t k_fitting = 8 * sizeof(Index) / input_bit_size;
211  iteration = tlx::integer_log2_floor(k_fitting);
212 
213  if (input_dia.ctx().my_rank() == 0) {
214  LOG1 << "Packing:"
215  << " alphabet_size=" << alphabet_size - 1
216  << " input_bit_size=" << input_bit_size
217  << " k_fitting=" << k_fitting
218  << " next_iteration=" << iteration;
219  }
220 
221  // pack and sort character groups
222  auto chars_sorted =
223  input_dia
224  .template FlatWindow<IndexRank>(
225  k_fitting,
226  [=, alpha_map = std::move(alpha_map)](
227  size_t index, const RingBuffer<Char>& rb, auto emit) {
228 
229  size_t result = alpha_map[rb[0]];
230  for (size_t i = 1; i < k_fitting; ++i)
231  result = (result << input_bit_size) | alpha_map[rb[i]];
232  emit(IndexRank { Index(index), Index(result) });
233 
234  if (index + k_fitting == input_size) {
235  for (size_t i = 1; i < k_fitting; ++i) {
236  result = alpha_map[rb[i]];
237  for (size_t j = i + 1; j < k_fitting; ++j)
238  result = (result << input_bit_size) | alpha_map[rb[j]];
239  result <<= i * input_bit_size;
240  emit(IndexRank { Index(index + i), Index(result) });
241  }
242  }
243  })
244  .Sort([](const IndexRank& a, const IndexRank& b) {
245  return a.rank < b.rank;
246  });
247 
248  if (debug_print)
249  chars_sorted.Keep().Print("chars_sorted packed");
250 
251  return chars_sorted.template FlatWindow<IndexRank>(
252  2,
253  [](size_t index, const RingBuffer<IndexRank>& rb, auto emit) {
254  if (index == 0)
255  emit(IndexRank { rb[0].index, Index(1) });
256  emit(IndexRank {
257  rb[1].index, Index(rb[0].rank == rb[1].rank ? 0 : index + 2)
258  });
259  });
260  }
261  else {
262  iteration = 1;
263 
264  // sorts pairs of characters to generate first iteration of lexnames
265 
266  auto chars_sorted =
267  input_dia
268  .template FlatWindow<CharCharIndex>(
269  2,
270  [](size_t index, const RingBuffer<Char>& rb, auto emit) {
271  // emit CharCharIndex for each character pair
272  emit(CharCharIndex {
273  { rb[0], rb[1] }, Index(index)
274  });
275  },
276  [=](size_t index, const RingBuffer<Char>& rb, auto emit) {
277  if (index + 1 == input_size) {
278  // emit CharCharIndex for last suffix position
279  emit(CharCharIndex {
280  { rb[0], std::numeric_limits<Char>::lowest() },
281  Index(index)
282  });
283  }
284  })
285  // sort character pairs
286  .Sort();
287 
288  if (debug_print)
289  chars_sorted.Keep().Print("chars_sorted");
290 
291  return chars_sorted.template FlatWindow<IndexRank>(
292  2,
293  [](size_t index, const RingBuffer<CharCharIndex>& rb, auto emit) {
294  if (index == 0) {
295  // emit rank 1 for smallest character pair
296  emit(IndexRank { rb[0].index, Index(1) });
297  }
298  // emit next rank if character pair is unequal, else 0 which
299  // will become the previous rank in the subsequent max().
300  emit(IndexRank {
301  rb[1].index, Index(rb[0] == rb[1] ? 0 : index + 2)
302  });
303  });
304  }
305 }
306 
307 template <typename Index, typename InputDIA>
309  const InputDIA& input_dia, size_t input_size, bool packed) {
310 
311  if (input_dia.ctx().my_rank() == 0)
312  LOG1 << "Running PrefixDoublingSorting";
313 
315  using IndexRankRank = suffix_sorting::IndexRankRank<Index>;
316 
317  size_t iteration;
318 
319  DIA<IndexRank> names = PrefixDoublingPack<Index>(
320  input_dia, input_size, packed, iteration);
321 
322  if (debug_print)
323  names.Keep().Print("names");
324 
325  // count number of duplicate character pairs, these are 0 indicators
326  auto number_duplicates =
327  names
328  .Filter([](const IndexRank& ir) {
329  return ir.rank == Index(0);
330  })
331  .SizeFuture();
332 
333  // construct lexnames array by maxing ranks = filling in zeros with names.
334  names =
335  names
336  .PrefixSum(
337  [](const IndexRank& a, const IndexRank& b) {
338  return IndexRank { b.index, std::max(a.rank, b.rank) };
339  });
340 
341  if (number_duplicates.get() == 0) {
342  if (input_dia.context().my_rank() == 0)
343  sLOG1 << "Finished before doubling in loop";
344 
345  // suffix array already known, as character pairs are unique
346  auto sa =
347  names
348  .Map([](const IndexRank& ir) {
349  return ir.index;
350  });
351 
352  return sa.Collapse();
353  }
354 
355  if (debug_print)
356  names.Keep().Print("names before loop");
357 
358  auto last_number_duplicates = number_duplicates.get();
359 
360  while (true) {
361  // reorder names such that 2^k+i and 2^(k+1)+i are adjacent
362  auto names_sorted =
363  names
364  .Sort([iteration](const IndexRank& a, const IndexRank& b) {
365  Index mod_mask = (Index(1) << iteration) - 1;
366  Index div_mask = ~mod_mask;
367 
368  if ((a.index & mod_mask) == (b.index & mod_mask))
369  return (a.index & div_mask) < (b.index & div_mask);
370  else
371  return (a.index & mod_mask) < (b.index & mod_mask);
372  });
373 
374  if (debug_print)
375  names_sorted.Keep().Print("names_sorted");
376 
377  size_t next_index = size_t(1) << iteration++;
378 
379  if (input_dia.context().my_rank() == 0) {
380  sLOG1 << "next_index" << next_index;
381  }
382 
383  auto triple =
384  names_sorted
385  .template FlatWindow<IndexRankRank>(
386  2,
387  [=](size_t /* index */, const RingBuffer<IndexRank>& rb, auto emit) {
388  emit(IndexRankRank {
389  rb[0].index, rb[0].rank,
390  // check if at boundary between 2^k+i range, emit 0
391  // if crossing boundary
392  (rb[0].index + Index(next_index) == rb[1].index)
393  ? rb[1].rank : Index(0)
394  });
395  },
396  [=](size_t index, const RingBuffer<IndexRank>& rb, auto emit) {
397  if (index + 1 == input_size)
398  emit(IndexRankRank { rb[0].index, rb[0].rank, Index(0) });
399  });
400 
401  if (debug_print)
402  triple.Keep().Print("triple");
403 
404  auto triple_sorted = triple.Sort();
405 
406  if (debug_print)
407  triple_sorted.Keep().Print("triple_sorted");
408 
409  names =
410  triple_sorted
411  .template FlatWindow<IndexRank>(
412  2,
413  [](size_t index, const RingBuffer<IndexRankRank>& rb, auto emit) {
414  if (index == 0)
415  emit(IndexRank { rb[0].index, Index(1) });
416 
417  emit(IndexRank {
418  rb[1].index,
419  (rb[0] == rb[1] && rb[0].rank2 != Index(0))
420  ? Index(0) : Index(index + 2)
421  });
422  });
423 
424  if (debug_print)
425  names.Keep().Print("names indicator");
426 
427  number_duplicates =
428  names
429  .Filter([](const IndexRank& ir) {
430  return ir.rank == Index(0);
431  })
432  .SizeFuture();
433 
434  names =
435  names
436  .PrefixSum([](const IndexRank& a, const IndexRank& b) {
437  return IndexRank { b.index, std::max(a.rank, b.rank) };
438  });
439 
440  if (input_dia.context().my_rank() == 0) {
441  sLOG1 << "iteration" << iteration - 1
442  << "duplicates" << number_duplicates.get();
443  }
444 
445  if (number_duplicates.get() > last_number_duplicates) {
446  sLOG1 << "number_duplicates" << number_duplicates.get()
447  << "last_number_duplicates" << last_number_duplicates;
448 
449  auto sa =
450  names
451  .Map([](const IndexRank& ir) {
452  return ir.index;
453  });
454 
455  return sa.Collapse();
456  }
457 
458  last_number_duplicates = number_duplicates.get();
459 
460  if (number_duplicates.get() == 0) {
461  auto sa =
462  names
463  .Map([](const IndexRank& ir) {
464  return ir.index;
465  });
466 
467  return sa.Collapse();
468  }
469 
470  if (debug_print)
471  names.Keep().Print("names");
472  }
473 }
474 
475 template <typename Index, typename InputDIA>
477  const InputDIA& input_dia, size_t input_size, bool packed) {
478 
479  if (input_dia.ctx().my_rank() == 0)
480  LOG1 << "Running PrefixDoublingWindow";
481 
483  using IndexRankRank = suffix_sorting::IndexRankRank<Index>;
484 
485  size_t iteration;
486 
487  DIA<IndexRank> names = PrefixDoublingPack<Index>(
488  input_dia, input_size, packed, iteration);
489 
490  auto number_duplicates =
491  names
492  .Filter([](const IndexRank& ir) {
493  return ir.rank == Index(0);
494  })
495  .SizeFuture();
496 
497  names =
498  names
499  .PrefixSum([](const IndexRank& a, const IndexRank& b) {
500  return IndexRank { b.index, std::max(a.rank, b.rank) };
501  });
502 
503  // The first rank is always 0 and all other duplicates have "rank" 0
504  // before we compute the correct new rank.
505  if (number_duplicates.get() == 0) {
506  if (input_dia.context().my_rank() == 0)
507  sLOG1 << "Finished before doubling in loop.";
508 
509  auto sa =
510  names
511  .Map([](const IndexRank& ir) {
512  return ir.index;
513  });
514  return sa.Collapse();
515  }
516 
517  if (debug_print)
518  names.Keep().Print("names");
519 
520  while (true) {
521  auto isa =
522  names
523  // .Sort([](const IndexRank& a, const IndexRank& b) {
524  // return a.index < b.index;
525  // });
526  .ReduceToIndex(
528  [](const IndexRank& a) { return static_cast<size_t>(a.index); },
529  [](const IndexRank&, const IndexRank& b) { return b; },
530  input_size);
531 
532  if (debug_print)
533  isa.Keep().Print("isa");
534 
535  size_t shift_by = (size_t(1) << (iteration - 1)) + 1;
536 
537  auto triple_sorted =
538  isa
539  .template FlatWindow<IndexRankRank>(
540  shift_by,
541  [](size_t /*index*/, const RingBuffer<IndexRank>& rb, auto emit) {
542  emit(IndexRankRank { rb[0].index, rb.front().rank, rb.back().rank });
543  },
544  [](size_t /*index*/, const RingBuffer<IndexRank>& rb, auto emit) {
545  emit(IndexRankRank { rb[0].index, rb[0].rank, Index(0) });
546  })
547  .Sort();
548 
549  if (debug_print)
550  triple_sorted.Keep().Print("triple_sorted");
551 
552  names =
553  triple_sorted
554  .template FlatWindow<IndexRank>(
555  2,
556  [](size_t index, const RingBuffer<IndexRankRank>& rb, auto emit) {
557  if (index == 0) emit(IndexRank { rb[0].index, Index(1) });
558  emit(IndexRank { rb[1].index,
559  Index(rb[0] == rb[1] ? 0 : index + 2) });
560  });
561 
562  if (debug_print)
563  names.Keep().Print("names");
564 
565  number_duplicates =
566  names
567  .Filter([](const IndexRank& ir) {
568  return ir.rank == Index(0);
569  })
570  .SizeFuture();
571 
572  names =
573  names
574  .PrefixSum([](const IndexRank& a, const IndexRank& b) {
575  return IndexRank { b.index, std::max(a.rank, b.rank) };
576  });
577 
578  if (input_dia.context().my_rank() == 0) {
579  sLOG1 << "iteration" << iteration
580  << "shift_by" << shift_by
581  << "duplicates" << number_duplicates.get();
582  }
583  ++iteration;
584 
585  if (number_duplicates.get() == 0) {
586  auto sa =
587  names
588  .Map([](const IndexRank& ir) {
589  return ir.index;
590  });
591  return sa.Collapse();
592  }
593 
594  if (debug_print)
595  names.Keep().Print("names");
596  }
597 }
598 
599 template <typename Index, typename InputDIA>
601  const InputDIA& input_dia, size_t input_size, bool packed) {
602 
603  if (input_dia.ctx().my_rank() == 0)
604  LOG1 << "Running PrefixDoublingDiscarding";
605 
607  using IndexRankStatus = suffix_sorting::IndexRankStatus<Index>;
608  using IndexRankRank = suffix_sorting::IndexRankRank<Index>;
609  using Index3Rank = suffix_sorting::Index3Rank<Index>;
610  using IndexRankRankStatus = suffix_sorting::IndexRankRankStatus<Index>;
611 
612  size_t iteration;
613 
614  DIA<IndexRank> names = PrefixDoublingPack<Index>(
615  input_dia, input_size, packed, iteration);
616 
617  names =
618  names.PrefixSum([](const IndexRank a, const IndexRank b) {
619  return IndexRank { b.index, std::max(a.rank, b.rank) };
620  });
621 
622  auto names_unique =
623  names
624  .template FlatWindow<IndexRankStatus>(
625  3,
626  [=](size_t index, const RingBuffer<IndexRank>& rb, auto emit) {
627  if (index == 0) {
628  Status status = rb[0].rank != rb[1].rank ? Status::UNIQUE : Status::UNDECIDED;
629  emit(IndexRankStatus { rb[0].index, rb[0].rank, status });
630  }
631  {
632  Status status = rb[0].rank != rb[1].rank && rb[1].rank != rb[2].rank
634 
635  emit(IndexRankStatus { rb[1].index, rb[1].rank, status });
636  }
637  if (index == input_size - 3) {
638  Status status = rb[1].rank != rb[2].rank ? Status::UNIQUE : Status::UNDECIDED;
639  emit(IndexRankStatus { rb[2].index, rb[2].rank, status });
640  }
641  });
642 
643  auto names_unique_sorted =
644  names_unique
645  .Sort([iteration](const IndexRankStatus& a, const IndexRankStatus& b) {
646  Index mod_mask = (Index(1) << iteration) - 1;
647  Index div_mask = ~mod_mask;
648 
649  if ((a.index & mod_mask) == (b.index & mod_mask))
650  return (a.index & div_mask) < (b.index & div_mask);
651  else
652  return (a.index & mod_mask) < (b.index & mod_mask);
653  });
654 
655  if (debug_print)
656  names_unique_sorted.Keep().Print("names_unique_sorted");
657 
658  std::vector<DIA<IndexRank> > fully_discarded;
659 
660  while (true) {
661  ++iteration;
662 
663  size_t names_size = names_unique_sorted.Keep().Size();
664 
665  if (debug_print)
666  names_unique_sorted.Keep().Print("names_unique_sorted begin of loop");
667 
668  auto discarded_names =
669  names_unique_sorted.Keep()
670  .template FlatWindow<IndexRankRankStatus>(
671  3,
672  [=](size_t index, const RingBuffer<IndexRankStatus>& rb, auto emit) {
673  // Discarded names (we need to change the status since we remove it one step later)
674  if (index == 0) {
675  if (rb[0].status == Status::UNIQUE)
676  emit(IndexRankRankStatus { rb[0].index, rb[0].rank, Index(0), Status::FULLY_DISCARDED });
677  if (rb[1].status == Status::UNIQUE)
678  // Since there is just one preceding entry it's either undiscarded or unique
679  emit(IndexRankRankStatus { rb[1].index, rb[1].rank, Index(0), Status::FULLY_DISCARDED });
680  }
681  if (rb[2].status == Status::UNIQUE) {
682  if (rb[0].status == Status::UNIQUE || rb[1].status == Status::UNIQUE)
683  emit(IndexRankRankStatus { rb[2].index, rb[2].rank, Index(0), Status::FULLY_DISCARDED });
684  else
685  emit(IndexRankRankStatus { rb[2].index, rb[2].rank, Index(0), Status::UNIQUE });
686  }
687  if (rb[0].status == Status::UNDECIDED) {
688  if (rb[0].index + Index(1llu << (iteration - 1)) == rb[1].index)
689  emit(IndexRankRankStatus { rb[0].index, rb[0].rank, rb[1].rank, Status::UNDECIDED });
690  else
691  emit(IndexRankRankStatus { rb[0].index, rb[0].rank, Index(0), Status::UNDECIDED });
692  }
693  },
694  [=](size_t index, const RingBuffer<IndexRankStatus>& rb, auto emit) {
695  if (index == 0) {
696  if (rb[0].status == Status::UNIQUE)
697  emit(IndexRankRankStatus { rb[0].index, rb[0].rank, Index(0), Status::FULLY_DISCARDED });
698  else // (rb[0].status == Status::UNDECIDED)
699  emit(IndexRankRankStatus { rb[0].index, rb[0].rank, Index(0), Status::UNDECIDED });
700  if (rb[1].status == Status::UNIQUE)
701  emit(IndexRankRankStatus { rb[1].index, rb[1].rank, Index(0), Status::FULLY_DISCARDED });
702  else // (rb[1].status == Status::UNDECIDED)
703  emit(IndexRankRankStatus { rb[1].index, rb[1].rank, Index(0), Status::UNDECIDED });
704  }
705  if (index == names_size - 2) {
706  if (rb[0].status == Status::UNDECIDED)
707  emit(IndexRankRankStatus { rb[0].index, rb[0].rank, rb[1].rank, Status::UNDECIDED });
708  if (rb[1].status == Status::UNDECIDED)
709  emit(IndexRankRankStatus { rb[1].index, rb[1].rank, Index(0), Status::UNDECIDED });
710  }
711  });
712 
713  auto new_decided =
714  discarded_names.Keep()
715  .Filter([](const IndexRankRankStatus& irs) {
716  return irs.status == Status::FULLY_DISCARDED;
717  })
718  .Map([](const IndexRankRankStatus& irs) {
719  return IndexRank { irs.index, irs.rank1 };
720  });
721 
722  auto partial_discarded =
723  discarded_names.Keep()
724  .Filter([](const IndexRankRankStatus& irs) {
725  return irs.status == Status::UNIQUE;
726  })
727  .Map([](const IndexRankRankStatus& irs) {
728  return IndexRankStatus { irs.index, irs.rank1, Status::UNIQUE };
729  });
730 
731  auto undiscarded =
732  discarded_names
733  .Filter([](const IndexRankRankStatus& irs) {
734  return irs.status == Status::UNDECIDED;
735  })
736  .Map([](const IndexRankRankStatus& irs) {
737  return IndexRankRank { irs.index, irs.rank1, irs.rank2 };
738  })
739  .Sort();
740 
741  fully_discarded.emplace_back(new_decided.Cache());
742 
743  size_t duplicates = undiscarded.Keep().Size();
744 
745  if (input_dia.context().my_rank() == 0) {
746  sLOG1 << "iteration" << iteration - 1
747  << "duplicates" << duplicates;
748  }
749 
750  if (duplicates == 0) {
751  auto sa =
752  Union(fully_discarded)
753  .Sort([](const IndexRank& a, const IndexRank& b) {
754  return a.rank < b.rank;
755  })
756  .Map([](const IndexRank& ir) {
757  return ir.index;
758  });
759  return sa.Collapse();
760  }
761 
762  auto new_ranks =
763  undiscarded
764  .template FlatWindow<Index3Rank>(
765  2,
766  [](size_t index, const RingBuffer<IndexRankRank>& rb, auto emit) {
767  if (index == 0) {
768  emit(Index3Rank { rb[0].index, Index(0), Index(0), rb[0].rank1 });
769  }
770  Index i = rb[0].rank1 == rb[1].rank1 ? Index(0) : Index(index + 1);
771  Index r;
772  if (rb[0].rank1 != rb[1].rank1)
773  r = Index(index + 1);
774  else
775  r = (rb[0].rank2 == rb[1].rank2) ? Index(0) : Index(index + 1);
776  emit(Index3Rank { rb[1].index, i, r, rb[1].rank1 });
777  },
778  [](size_t index, const RingBuffer<IndexRankRank>& rb, auto emit) {
779  if (index == 0)
780  emit(Index3Rank { rb[0].index, Index(0), Index(0), rb[0].rank1 });
781  })
782  .PrefixSum([](const Index3Rank& a, const Index3Rank& b) {
783  return Index3Rank {
784  b.index,
785  std::max<Index>(a.rank1, b.rank1),
786  std::max<Index>(a.rank2, b.rank2),
787  b.rank3
788  };
789  })
790  .Map([](const Index3Rank& ir) {
791  return IndexRank { ir.index, ir.rank3 + (ir.rank2 - ir.rank1) };
792  });
793 
794  names_unique =
795  new_ranks
796  .template FlatWindow<IndexRankStatus>(
797  3,
798  [=](size_t index, const RingBuffer<IndexRank>& rb, auto emit) {
799  if (index == 0) {
800  Status status = rb[0].rank != rb[1].rank ? Status::UNIQUE : Status::UNDECIDED;
801  emit(IndexRankStatus { rb[0].index, rb[0].rank, status });
802  }
803  if (rb[0].rank != rb[1].rank && rb[1].rank != rb[2].rank)
804  emit(IndexRankStatus { rb[1].index, rb[1].rank, Status::UNIQUE });
805  else
806  emit(IndexRankStatus { rb[1].index, rb[1].rank, Status::UNDECIDED });
807  if (index + 3 == duplicates) {
808  Status status = rb[1].rank != rb[2].rank ? Status::UNIQUE : Status::UNDECIDED;
809  emit(IndexRankStatus { rb[2].index, rb[2].rank, status });
810  }
811  },
812  [](size_t index, const RingBuffer<IndexRank>& rb, auto emit) {
813  if (index == 0) { // We know that there are exactly 2 names
814  emit(IndexRankStatus { rb[0].index, rb[0].rank, Status::UNIQUE });
815  emit(IndexRankStatus { rb[1].index, rb[1].rank, Status::UNIQUE });
816  }
817  });
818 
819  names_unique_sorted =
820  names_unique
821  .Union(partial_discarded)
822  .Sort([iteration](const IndexRankStatus& a, const IndexRankStatus& b) {
823  Index mod_mask = (Index(1) << iteration) - 1;
824  Index div_mask = ~mod_mask;
825 
826  if ((a.index & mod_mask) == (b.index & mod_mask))
827  return (a.index & div_mask) < (b.index & div_mask);
828  else
829  return (a.index & mod_mask) < (b.index & mod_mask);
830  });
831  }
832 }
833 
834 template DIA<uint32_t> PrefixDoublingWindow<uint32_t>(
835  const DIA<uint8_t>& input_dia, size_t input_size, bool packed);
836 
837 template DIA<uint32_t> PrefixDoublingSorting<uint32_t>(
838  const DIA<uint8_t>& input_dia, size_t input_size, bool packed);
839 
840 template DIA<uint32_t> PrefixDoublingDiscarding<uint32_t>(
841  const DIA<uint8_t>& input_dia, size_t input_size, bool packed);
842 
843 #if !THRILL_ON_TRAVIS
844 
845 template DIA<common::uint40> PrefixDoublingWindow<common::uint40>(
846  const DIA<uint8_t>& input_dia, size_t input_size, bool packed);
847 
848 template DIA<common::uint40> PrefixDoublingSorting<common::uint40>(
849  const DIA<uint8_t>& input_dia, size_t input_size, bool packed);
850 
851 template DIA<common::uint40> PrefixDoublingDiscarding<common::uint40>(
852  const DIA<uint8_t>& input_dia, size_t input_size, bool packed);
853 
854 template DIA<uint64_t> PrefixDoublingWindow<uint64_t>(
855  const DIA<uint8_t>& input_dia, size_t input_size, bool packed);
856 
857 template DIA<uint64_t> PrefixDoublingSorting<uint64_t>(
858  const DIA<uint8_t>& input_dia, size_t input_size, bool packed);
859 
860 template DIA<uint64_t> PrefixDoublingDiscarding<uint64_t>(
861  const DIA<uint8_t>& input_dia, size_t input_size, bool packed);
862 
863 #endif
864 
865 } // namespace suffix_sorting
866 } // namespace examples
867 
868 /******************************************************************************/
tlx::RingBuffer< Type, Allocator > RingBuffer
Definition: ring_buffer.hpp:21
template DIA< uint32_t > PrefixDoublingDiscarding< uint32_t >(const DIA< uint8_t > &input_dia, size_t input_size, bool packed)
#define sLOG1
Definition: logger.hpp:188
template DIA< uint32_t > PrefixDoublingSorting< uint32_t >(const DIA< uint8_t > &input_dia, size_t input_size, bool packed)
#define LOG1
Definition: logger.hpp:176
auto Union(const FirstDIA &first_dia, const DIAs &...dias)
Union is a LOp, which creates the union of all items from any number of DIAs as a single DIA...
Definition: union.hpp:320
const struct SkipPreReducePhaseTag SkipPreReducePhaseTag
global const SkipPreReducePhaseTag instance
Definition: dia.hpp:59
template DIA< uint64_t > PrefixDoublingWindow< uint64_t >(const DIA< uint8_t > &input_dia, size_t input_size, bool packed)
template DIA< uint32_t > PrefixDoublingWindow< uint32_t >(const DIA< uint8_t > &input_dia, size_t input_size, bool packed)
template for computing the component-wise sum of std::array or std::vector.
Definition: functional.hpp:114
DIA< Index > PrefixDoublingDiscarding(const InputDIA &input_dia, size_t input_size, bool packed)
DIA< Index > PrefixDoublingSorting(const InputDIA &input_dia, size_t input_size, bool packed)
unsigned integer_log2_floor(int i)
calculate the log2 floor of an integer type (by repeated bit shifts)
template DIA< uint64_t > PrefixDoublingDiscarding< uint64_t >(const DIA< uint8_t > &input_dia, size_t input_size, bool packed)
static bool operator==(const std::string &a, const StringView &b) noexcept
struct examples::suffix_sorting::IndexRank TLX_ATTRIBUTE_PACKED
template DIA< uint64_t > PrefixDoublingSorting< uint64_t >(const DIA< uint8_t > &input_dia, size_t input_size, bool packed)
unsigned integer_log2_ceil(int i)
calculate the log2 ceiling of an integer type (by repeated bit shifts)
DIA< Index > PrefixDoublingWindow(const InputDIA &input_dia, size_t input_size, bool packed)
DIA< IndexRank< Index > > PrefixDoublingPack(const InputDIA &input_dia, size_t input_size, bool packed, size_t &iteration)
take input and pack it into an array of Index characters
std::ostream & operator<<(std::ostream &os, const DIABase &d)
make ostream-able.
Definition: dia_base.cpp:448
static constexpr const T & max(const T &a, const T &b)
template for constexpr max, because std::max is not good enough.
Definition: functional.hpp:65