38 namespace suffix_sorting {
43 template <
typename CharsType,
typename Index>
49 return chars == b.chars;
53 return chars < b.chars;
56 friend std::ostream&
operator << (std::ostream& os,
const IndexKMer& iom) {
57 return os <<
"[i=" << iom.index <<
",c=" << iom.chars <<
']';
62 template <
typename Index>
68 return os <<
"(i=" << ri.index <<
",r=" << ri.rank <<
')';
73 template <
typename Index>
74 struct IndexRankRank {
81 return rank1 == b.rank1 && rank2 == b.rank2;
88 bool operator < (
const IndexRankRank& b)
const {
89 return std::tie(rank1, rank2, b.index) < std::tie(b.rank1, b.rank2, index);
92 friend std::ostream&
operator << (std::ostream& os,
const IndexRankRank& rri) {
93 return os <<
"(i=" << rri.index <<
",r1=" << rri.rank1 <<
",r2=" << rri.rank2 <<
")";
98 template <
typename Index>
105 friend std::ostream&
operator << (std::ostream& os,
const Index3Rank& irrr) {
106 return os <<
"(i=" << irrr.index <<
",r1=" << irrr.rank1
107 <<
",r2=" << irrr.rank2 <<
",r3=" << irrr.rank3 <<
")";
111 template <
typename Char,
typename Index>
112 struct CharCharIndex {
117 return std::tie(ch[0], ch[1]) == std::tie(b.ch[0], b.ch[1]);
120 bool operator < (
const CharCharIndex& b)
const {
121 return std::tie(ch[0], ch[1]) < std::tie(b.ch[0], b.ch[1]);
124 friend std::ostream&
operator << (std::ostream& os,
const CharCharIndex& cci) {
125 return os <<
"[ch0=" << cci.ch[0] <<
",ch1=" << cci.ch[1]
126 <<
",index=" << cci.index <<
']';
149 template <
typename Index>
150 struct IndexRankStatus {
156 bool operator == (
const IndexRankStatus& b)
const {
157 return rank == b.rank;
164 bool operator < (
const IndexRankStatus& b)
const {
165 return rank < b.rank || (rank == b.rank && index > b.index);
168 friend std::ostream&
operator << (std::ostream& os,
const IndexRankStatus& irs) {
169 return os <<
"(index=" << irs.index <<
",rank=" << irs.rank <<
",status=" 170 << irs.status <<
")";
175 template <
typename Index>
176 struct IndexRankRankStatus {
182 friend std::ostream&
operator << (std::ostream& os,
const IndexRankRankStatus& irrs) {
183 return os <<
"(index=" << irrs.index
184 <<
",rank1=" << irrs.rank1 <<
",rank2=" << irrs.rank2
185 <<
",status=" << irrs.status <<
")";
190 template <
typename Index,
typename InputDIA>
192 const InputDIA& input_dia,
size_t input_size,
bool packed,
size_t& iteration) {
194 using Char =
typename InputDIA::ValueType;
196 using CharCharIndex = suffix_sorting::CharCharIndex<Char, Index>;
198 if (packed &&
sizeof(Char) == 1) {
201 std::vector<size_t> alpha_map(256);
204 .Map([&alpha_map](
const Char& c) { alpha_map[c]++;
return c; })
207 alpha_map = input_dia.ctx().net.AllReduce(
211 size_t alphabet_size = 1;
212 for (
size_t i = 0; i < 256; ++i) {
213 if (alpha_map[i] != 0) {
214 alpha_map[i] = alphabet_size;
222 size_t k_fitting = 8 *
sizeof(Index) / input_bit_size;
225 if (input_dia.ctx().my_rank() == 0) {
227 <<
" alphabet_size=" << alphabet_size - 1
228 <<
" input_bit_size=" << input_bit_size
229 <<
" k_fitting=" << k_fitting
230 <<
" next_iteration=" << iteration;
236 .template FlatWindow<IndexRank>(
238 [=, alpha_map = std::move(alpha_map)](
241 size_t result = alpha_map[rb[0]];
242 for (
size_t i = 1; i < k_fitting; ++i)
243 result = (result << input_bit_size) | alpha_map[rb[i]];
244 emit(
IndexRank { Index(index), Index(result) });
246 if (index + k_fitting == input_size) {
247 for (
size_t i = 1; i < k_fitting; ++i) {
248 result = alpha_map[rb[i]];
249 for (
size_t j = i + 1; j < k_fitting; ++j)
250 result = (result << input_bit_size) | alpha_map[rb[j]];
251 result <<= i * input_bit_size;
252 emit(
IndexRank { Index(index + i), Index(result) });
257 return a.rank < b.rank;
261 chars_sorted.Keep().Print(
"chars_sorted packed");
263 return chars_sorted.template FlatWindow<IndexRank>(
269 rb[1].
index, Index(rb[0].rank == rb[1].rank ? 0 : index + 2)
280 .template FlatWindow<CharCharIndex>(
285 { rb[0], rb[1] }, Index(index)
289 if (index + 1 == input_size) {
292 { rb[0], std::numeric_limits<Char>::lowest() },
301 chars_sorted.Keep().Print(
"chars_sorted");
303 return chars_sorted.template FlatWindow<IndexRank>(
313 rb[1].
index, Index(rb[0] == rb[1] ? 0 : index + 2)
319 template <
typename Index,
typename InputDIA>
321 const InputDIA& input_dia,
size_t input_size,
bool packed) {
323 if (input_dia.ctx().my_rank() == 0)
324 LOG1 <<
"Running PrefixDoublingSorting";
327 using IndexRankRank = suffix_sorting::IndexRankRank<Index>;
332 input_dia, input_size, packed, iteration);
335 names.
Keep().Print(
"names");
338 auto number_duplicates =
341 return ir.rank == Index(0);
353 if (number_duplicates.get() == 0) {
354 if (input_dia.context().my_rank() == 0)
355 sLOG1 <<
"Finished before doubling in loop";
364 return sa.Collapse();
368 names.
Keep().Print(
"names before loop");
370 auto last_number_duplicates = number_duplicates.get();
377 Index mod_mask = (Index(1) << iteration) - 1;
378 Index div_mask = ~mod_mask;
380 if ((a.index & mod_mask) == (b.index & mod_mask))
381 return (a.index & div_mask) < (b.index & div_mask);
383 return (a.index & mod_mask) < (b.index & mod_mask);
387 names_sorted.Keep().Print(
"names_sorted");
389 size_t next_index = size_t(1) << iteration++;
391 if (input_dia.context().my_rank() == 0) {
392 sLOG1 <<
"next_index" << next_index;
397 .template FlatWindow<IndexRankRank>(
401 rb[0].index, rb[0].rank,
404 (rb[0].index + Index(next_index) == rb[1].index)
405 ? rb[1].rank : Index(0)
409 if (index + 1 == input_size)
410 emit(IndexRankRank { rb[0].index, rb[0].rank, Index(0) });
414 triple.Keep().Print(
"triple");
416 auto triple_sorted = triple.Sort();
419 triple_sorted.Keep().Print(
"triple_sorted");
423 .template FlatWindow<IndexRank>(
431 (rb[0] == rb[1] && rb[0].rank2 != Index(0))
432 ? Index(0) : Index(index + 2)
437 names.
Keep().Print(
"names indicator");
442 return ir.rank == Index(0);
452 if (input_dia.context().my_rank() == 0) {
453 sLOG1 <<
"iteration" << iteration - 1
454 <<
"duplicates" << number_duplicates.get();
457 if (number_duplicates.get() > last_number_duplicates) {
458 sLOG1 <<
"number_duplicates" << number_duplicates.get()
459 <<
"last_number_duplicates" << last_number_duplicates;
467 return sa.Collapse();
470 last_number_duplicates = number_duplicates.get();
472 if (number_duplicates.get() == 0) {
479 return sa.Collapse();
483 names.
Keep().Print(
"names");
487 template <
typename Index,
typename InputDIA>
489 const InputDIA& input_dia,
size_t input_size,
bool packed) {
491 if (input_dia.ctx().my_rank() == 0)
492 LOG1 <<
"Running PrefixDoublingWindow";
495 using IndexRankRank = suffix_sorting::IndexRankRank<Index>;
500 input_dia, input_size, packed, iteration);
502 auto number_duplicates =
505 return ir.rank == Index(0);
517 if (number_duplicates.get() == 0) {
518 if (input_dia.context().my_rank() == 0)
519 sLOG1 <<
"Finished before doubling in loop.";
526 return sa.Collapse();
530 names.
Keep().Print(
"names");
540 [](
const IndexRank& a) {
return static_cast<size_t>(a.index); },
545 isa.Keep().Print(
"isa");
547 size_t shift_by = (size_t(1) << (iteration - 1)) + 1;
551 .template FlatWindow<IndexRankRank>(
554 emit(IndexRankRank { rb[0].index, rb.front().rank, rb.back().rank });
557 emit(IndexRankRank { rb[0].index, rb[0].rank, Index(0) });
562 triple_sorted.Keep().Print(
"triple_sorted");
566 .template FlatWindow<IndexRank>(
571 Index(rb[0] == rb[1] ? 0 : index + 2) });
575 names.
Keep().Print(
"names");
580 return ir.rank == Index(0);
590 if (input_dia.context().my_rank() == 0) {
591 sLOG1 <<
"iteration" << iteration
592 <<
"shift_by" << shift_by
593 <<
"duplicates" << number_duplicates.get();
597 if (number_duplicates.get() == 0) {
603 return sa.Collapse();
607 names.
Keep().Print(
"names");
611 template <
typename Index,
typename InputDIA>
613 const InputDIA& input_dia,
size_t input_size,
bool packed) {
615 if (input_dia.ctx().my_rank() == 0)
616 LOG1 <<
"Running PrefixDoublingDiscarding";
619 using IndexRankStatus = suffix_sorting::IndexRankStatus<Index>;
620 using IndexRankRank = suffix_sorting::IndexRankRank<Index>;
621 using Index3Rank = suffix_sorting::Index3Rank<Index>;
622 using IndexRankRankStatus = suffix_sorting::IndexRankRankStatus<Index>;
627 input_dia, input_size, packed, iteration);
635 names.Keep().Print(
"names");
639 .template FlatWindow<IndexRankStatus>(
644 emit(IndexRankStatus { rb[0].index, rb[0].rank, status });
647 Status status = rb[0].rank != rb[1].rank && rb[1].rank != rb[2].rank
650 emit(IndexRankStatus { rb[1].index, rb[1].rank, status });
652 if (index == input_size - 3) {
654 emit(IndexRankStatus { rb[2].index, rb[2].rank, status });
659 names_unique.Keep().Print(
"names_unique");
661 auto names_unique_sorted =
663 .Sort([iteration](
const IndexRankStatus& a,
const IndexRankStatus& b) {
664 Index mod_mask = (Index(1) << iteration) - 1;
665 Index div_mask = ~mod_mask;
667 if ((a.index & mod_mask) == (b.index & mod_mask))
668 return (a.index & div_mask) < (b.index & div_mask);
670 return (a.index & mod_mask) < (b.index & mod_mask);
673 std::vector<DIA<IndexRank> > fully_discarded;
678 size_t names_size = names_unique_sorted.Keep().Size();
681 names_unique_sorted.Keep().Print(
"names_unique_sorted");
683 auto discarded_names =
684 names_unique_sorted.Keep()
685 .template FlatWindow<IndexRankRankStatus>(
700 emit(IndexRankRankStatus { rb[2].index, rb[2].rank, Index(0),
Status::UNIQUE });
703 if (rb[0].index + Index(1llu << (iteration - 1)) == rb[1].index)
704 emit(IndexRankRankStatus { rb[0].index, rb[0].rank, rb[1].rank,
Status::UNDECIDED });
706 emit(IndexRankRankStatus { rb[0].index, rb[0].rank, Index(0),
Status::UNDECIDED });
714 emit(IndexRankRankStatus { rb[0].index, rb[0].rank, Index(0),
Status::UNDECIDED });
718 emit(IndexRankRankStatus { rb[1].index, rb[1].rank, Index(0),
Status::UNDECIDED });
720 if (index == names_size - 2) {
722 emit(IndexRankRankStatus { rb[0].index, rb[0].rank, rb[1].rank,
Status::UNDECIDED });
724 emit(IndexRankRankStatus { rb[1].index, rb[1].rank, Index(0),
Status::UNDECIDED });
729 discarded_names.Keep().Print(
"discarded_names");
732 discarded_names.Keep()
733 .Filter([](
const IndexRankRankStatus& irs) {
736 .Map([](
const IndexRankRankStatus& irs) {
737 return IndexRank { irs.index, irs.rank1 };
740 auto partial_discarded =
741 discarded_names.Keep()
742 .Filter([](
const IndexRankRankStatus& irs) {
745 .Map([](
const IndexRankRankStatus& irs) {
751 .Filter([](
const IndexRankRankStatus& irs) {
754 .Map([](
const IndexRankRankStatus& irs) {
755 return IndexRankRank { irs.index, irs.rank1, irs.rank2 };
759 fully_discarded.emplace_back(new_decided.Cache());
761 size_t duplicates = undiscarded.Keep().Size();
763 if (input_dia.context().my_rank() == 0) {
764 sLOG1 <<
"iteration" << iteration - 1
765 <<
"duplicates" << duplicates;
768 if (duplicates == 0) {
770 Union(fully_discarded)
772 return a.rank < b.rank;
777 return sa.Collapse();
782 .template FlatWindow<Index3Rank>(
786 emit(Index3Rank { rb[0].index, Index(0), Index(0), rb[0].rank1 });
788 Index i = rb[0].rank1 == rb[1].rank1 ? Index(0) : Index(index + 1);
790 if (rb[0].rank1 != rb[1].rank1)
791 r = Index(index + 1);
793 r = (rb[0].rank2 == rb[1].rank2) ? Index(0) : Index(index + 1);
794 emit(Index3Rank { rb[1].index, i, r, rb[1].rank1 });
798 emit(Index3Rank { rb[0].index, Index(0), Index(0), rb[0].rank1 });
800 .PrefixSum([](
const Index3Rank& a,
const Index3Rank& b) {
803 std::max<Index>(a.rank1, b.rank1),
804 std::max<Index>(a.rank2, b.rank2),
808 .Map([](
const Index3Rank& ir) {
809 return IndexRank { ir.index, ir.rank3 + (ir.rank2 - ir.rank1) };
813 new_ranks.Keep().Print(
"new_ranks");
817 .template FlatWindow<IndexRankStatus>(
822 emit(IndexRankStatus { rb[0].index, rb[0].rank, status });
824 if (rb[0].rank != rb[1].rank && rb[1].rank != rb[2].rank)
825 emit(IndexRankStatus { rb[1].index, rb[1].rank,
Status::UNIQUE });
828 if (index + 3 == duplicates) {
830 emit(IndexRankStatus { rb[2].index, rb[2].rank, status });
835 emit(IndexRankStatus { rb[0].index, rb[0].rank,
Status::UNIQUE });
836 emit(IndexRankStatus { rb[1].index, rb[1].rank,
Status::UNIQUE });
841 names_unique.Keep().Print(
"names_unique");
843 names_unique_sorted =
845 .Union(partial_discarded)
846 .Sort([iteration](
const IndexRankStatus& a,
const IndexRankStatus& b) {
847 Index mod_mask = (Index(1) << iteration) - 1;
848 Index div_mask = ~mod_mask;
850 if ((a.index & mod_mask) == (b.index & mod_mask))
851 return (a.index & div_mask) < (b.index & div_mask);
853 return (a.index & mod_mask) < (b.index & mod_mask);
859 const DIA<uint8_t>& input_dia,
size_t input_size,
bool packed);
862 const DIA<uint8_t>& input_dia,
size_t input_size,
bool packed);
865 const DIA<uint8_t>& input_dia,
size_t input_size,
bool packed);
867 #if !THRILL_ON_TRAVIS 870 const DIA<uint8_t>& input_dia,
size_t input_size,
bool packed);
873 const DIA<uint8_t>& input_dia,
size_t input_size,
bool packed);
876 const DIA<uint8_t>& input_dia,
size_t input_size,
bool packed);
879 const DIA<uint8_t>& input_dia,
size_t input_size,
bool packed);
882 const DIA<uint8_t>& input_dia,
size_t input_size,
bool packed);
885 const DIA<uint8_t>& input_dia,
size_t input_size,
bool packed);
tlx::RingBuffer< Type, Allocator > RingBuffer
DIA is the interface between the user and the Thrill framework.
static uint_pair max()
return an uint_pair instance containing the largest value possible
auto Union(const FirstDIA &first_dia, const DIAs &... dias)
Union is a LOp, which creates the union of all items from any number of DIAs as a single DIA...
template DIA< uint32_t > PrefixDoublingDiscarding< uint32_t >(const DIA< uint8_t > &input_dia, size_t input_size, bool packed)
template DIA< uint32_t > PrefixDoublingSorting< uint32_t >(const DIA< uint8_t > &input_dia, size_t input_size, bool packed)
A ring (circular) buffer of static (non-growing) size.
auto ReduceToIndex(const KeyExtractor &key_extractor, const ReduceFunction &reduce_function, size_t size, const ValueType &neutral_element=ValueType(), const ReduceConfig &reduce_config=ReduceConfig()) const
ReduceToIndex is a DOp, which groups elements of the DIA with the key_extractor returning an unsigned...
tag structure for ReduceToIndex()
template DIA< uint64_t > PrefixDoublingWindow< uint64_t >(const DIA< uint8_t > &input_dia, size_t input_size, bool packed)
auto PrefixSum(const SumFunction &sum_function=SumFunction(), const ValueType &initial_element=ValueType()) const
PrefixSum is a DOp, which computes the (inclusive) prefix sum of all elements.
template DIA< uint32_t > PrefixDoublingWindow< uint32_t >(const DIA< uint8_t > &input_dia, size_t input_size, bool packed)
template for computing the component-wise sum of std::array or std::vector.
DIA< Index > PrefixDoublingDiscarding(const InputDIA &input_dia, size_t input_size, bool packed)
DIA< Index > PrefixDoublingSorting(const InputDIA &input_dia, size_t input_size, bool packed)
template DIA< uint64_t > PrefixDoublingDiscarding< uint64_t >(const DIA< uint8_t > &input_dia, size_t input_size, bool packed)
auto Filter(const FilterFunction &filter_function) const
Each item of a DIA is tested using filter_function : to determine whether it is copied into the outp...
auto Sort(const CompareFunction &compare_function=CompareFunction()) const
Sort is a DOp, which sorts a given DIA according to the given compare_function.
static unsigned integer_log2_floor(int i)
calculate the log2 floor of an integer type
auto Map(const MapFunction &map_function) const
Map applies map_function : to each item of a DIA and delivers a new DIA contains the returned values...
bool operator==(const uint_pair &b) const
equality checking operator
std::ostream & operator<<(std::ostream &os, const Status &s)
struct examples::suffix_sorting::IndexRank TLX_ATTRIBUTE_PACKED
static unsigned integer_log2_ceil(int i)
calculate the log2 floor of an integer type
template DIA< uint64_t > PrefixDoublingSorting< uint64_t >(const DIA< uint8_t > &input_dia, size_t input_size, bool packed)
const DIA & Keep(size_t increase=1) const
Mark the referenced DIANode for keeping, which makes children not consume the data when executing...
bool operator<(const uint_pair &b) const
less-than comparison operator
DIA< Index > PrefixDoublingWindow(const InputDIA &input_dia, size_t input_size, bool packed)
DIA< IndexRank< Index > > PrefixDoublingPack(const InputDIA &input_dia, size_t input_size, bool packed, size_t &iteration)
take input and pack it into an array of Index characters