Thrill  0.1
construct_lcp.hpp
Go to the documentation of this file.
1 /*******************************************************************************
2  * examples/suffix_sorting/construct_lcp.hpp
3  *
4  * Part of Project Thrill - http://project-thrill.org
5  *
6  * Copyright (C) 2016 Florian Kurpicz <[email protected]>
7  *
8  * All rights reserved. Published under the BSD-2 license in the LICENSE file.
9  ******************************************************************************/
10 
11 #pragma once
12 #ifndef THRILL_EXAMPLES_SUFFIX_SORTING_CONSTRUCT_LCP_HEADER
13 #define THRILL_EXAMPLES_SUFFIX_SORTING_CONSTRUCT_LCP_HEADER
14 
16 
17 #include <thrill/api/cache.hpp>
18 #include <thrill/api/collapse.hpp>
19 #include <thrill/api/dia.hpp>
20 #include <thrill/api/generate.hpp>
21 #include <thrill/api/max.hpp>
23 #include <thrill/api/print.hpp>
24 #include <thrill/api/sort.hpp>
25 #include <thrill/api/window.hpp>
26 #include <thrill/api/zip.hpp>
28 
29 namespace examples {
30 namespace suffix_sorting {
31 
32 using namespace thrill; // NOLINT
34 
35 template <typename Index>
36 struct IndexRank {
37  Index index;
38  Index rank;
39 
40  friend std::ostream& operator << (std::ostream& os, const IndexRank& ir) {
41  return os << '(' << ir.index << '|' << ir.rank << ')';
42  }
44 
45 template <typename Char, typename Index>
46 struct IndexChar {
47  Index index;
48  Char ch;
49 
50  friend std::ostream& operator << (std::ostream& os, const IndexChar& ic) {
51  return os << '(' << ic.index << '|' << ic.ch << ')';
52  }
54 
55 template <typename Index>
56 struct IndexFlag {
57  Index index;
58  bool flag;
59 
60  friend std::ostream& operator << (std::ostream& os, const IndexFlag& idx_flag) {
61  return os << '(' << idx_flag.index << '|' << (idx_flag.flag ? 't' : 'f') << ')';
62  }
64 
65 template <typename Index>
66 struct IndexRankFlag {
67  Index index;
68  Index rank;
69  bool flag;
70 
71  friend std::ostream& operator << (std::ostream& os, const IndexRankFlag& irf) {
72  return os << '(' << irf.index << '|' << irf.rank << '|' << (irf.flag ? 't' : 'f') << ')';
73  }
75 
76 template <typename InputDIA, typename IndexDIA>
77 IndexDIA ConstructLCP(const InputDIA& input, const IndexDIA& /*suffix_array*/,
78  const InputDIA& bwt, uint64_t input_size) {
79 
80  thrill::Context& ctx = input.ctx();
81 
82  using Char = typename InputDIA::ValueType;
83  using Index = typename IndexDIA::ValueType;
84 
89 
90  auto lcp =
91  Generate(
92  ctx, input_size,
93  [](size_t /*index*/) {
94  return IndexFlag { Index(0), false };
95  });
96 
97  auto tmp_inverse_bwt =
98  bwt
99  .ZipWithIndex([](const Char& c, const size_t& i) {
100  return IndexChar { Index(i), c };
101  })
102  .Sort([](const IndexChar& a, const IndexChar& b) {
103  if (a.ch == b.ch)
104  return a.index < b.index;
105  return a.ch < b.ch;
106  });
107 
108  auto intervals =
109  tmp_inverse_bwt
110  .Keep()
111  .template FlatWindow<Index>(
112  2,
113  [](const size_t index, const RingBuffer<IndexChar>& rb, auto emit) {
114  if (index == 0)
115  emit(Index(0));
116  emit(rb[0].ch == rb[1].ch ? Index(0) : Index(1));
117  })
118  .PrefixSum();
119 
120  if (debug_print)
121  intervals.Keep().Print("intervals");
122 
123  size_t number_intervals = intervals.Keep().Max();
124 
125  auto inverse_bwt =
126  tmp_inverse_bwt
127  .ZipWithIndex([](const IndexChar& ic, const size_t& i) {
128  return IndexRank { ic.index, Index(i) };
129  })
130  .Sort([](const IndexRank& a, const IndexRank& b) {
131  return a.index < b.index;
132  })
133  .Map([](const IndexRank& ir) {
134  return ir.rank;
135  })
136  .Cache();
137 
138  if (debug_print)
139  inverse_bwt.Keep().Print("inverse_bwt");
140 
141  size_t lcp_value = 0;
142  while (number_intervals + 1 < input_size) {
143  lcp =
144  intervals
145  .Keep()
146  .Zip(
147  lcp,
148  [](const Index& idx, const IndexFlag& idx_flag) {
149  return IndexRankFlag { idx_flag.index, idx, idx_flag.flag };
150  })
151  .template FlatWindow<IndexFlag>(
152  2,
153  [lcp_value](const size_t index, const RingBuffer<IndexRankFlag>& rb, auto emit) {
154  if (index == 0)
155  emit(IndexFlag { 0, true });
156  if (rb[0].rank != rb[1].rank && !rb[1].flag)
157  emit(IndexFlag { Index(lcp_value), true });
158  else
159  emit(IndexFlag { rb[1].index, rb[1].flag });
160  });
161 
162  intervals =
163  inverse_bwt
164  .Keep()
165  .Zip(
166  intervals,
167  [](const Index& pbwt, const Index& i) {
168  return IndexRank { pbwt, i };
169  })
170  .Sort([](const IndexRank& a, const IndexRank& b) {
171  return a.index < b.index;
172  })
173  .Map([](const IndexRank& i) {
174  return i.rank;
175  })
176  .template FlatWindow<Index>(
177  2,
178  [](const size_t index, const RingBuffer<Index>& rb, auto emit) {
179  if (index == 0)
180  emit(Index(0));
181  emit(rb[0] == rb[1] ? Index(0) : Index(1));
182  })
183  .PrefixSum();
184 
185  number_intervals = intervals.Keep().Max();
186  ++lcp_value;
187  }
188 
189  lcp =
190  intervals
191  .Keep()
192  .Zip(
193  lcp,
194  [](const Index& idx, const IndexFlag& idx_flag) {
195  return IndexRankFlag { idx_flag.index, idx, idx_flag.flag };
196  })
197  .template FlatWindow<IndexFlag>(
198  2,
199  [lcp_value](const size_t index, const RingBuffer<IndexRankFlag>& rb, auto emit) {
200  if (index == 0)
201  emit(IndexFlag { 0, true });
202  if (rb[0].rank != rb[1].rank && !rb[1].flag)
203  emit(IndexFlag { Index(lcp_value), true });
204  else
205  emit(IndexFlag { rb[1].index, rb[1].flag });
206  });
207 
208  if (debug_print)
209  lcp.Keep().Print("lcp");
210 
211  return lcp
212  .Map([](const IndexFlag& idx_flag) {
213  return idx_flag.index;
214  })
215  .Collapse();
216 }
217 
218 } // namespace suffix_sorting
219 } // namespace examples
220 
221 #endif // !THRILL_EXAMPLES_SUFFIX_SORTING_CONSTRUCT_LCP_HEADER
222 
223 /******************************************************************************/
tlx::RingBuffer< Type, Allocator > RingBuffer
Definition: ring_buffer.hpp:21
auto Generate(Context &ctx, size_t size, const GenerateFunction &generate_function)
Generate is a Source-DOp, which creates a DIA of given size using a generator function.
Definition: generate.hpp:87
A ring (circular) buffer of static (non-growing) size.
Definition: ring_buffer.hpp:36
Definition: bfs.hpp:21
The Context of a job is a unique instance per worker which holds references to all underlying parts o...
Definition: context.hpp:221
IndexDIA ConstructLCP(const InputDIA &input, const IndexDIA &, const InputDIA &bwt, uint64_t input_size)
std::ostream & operator<<(std::ostream &os, const Status &s)
struct examples::suffix_sorting::IndexRank TLX_ATTRIBUTE_PACKED