Thrill  0.1
suffix_sorting.cpp
Go to the documentation of this file.
1 /*******************************************************************************
2  * examples/suffix_sorting/suffix_sorting.cpp
3  *
4  * Part of Project Thrill - http://project-thrill.org
5  *
6  * Copyright (C) 2016-2017 Timo Bingmann <[email protected]>
7  * Copyright (C) 2016 Florian Kurpicz <[email protected]>
8  *
9  * All rights reserved. Published under the BSD-2 license in the LICENSE file.
10  ******************************************************************************/
11 
20 
21 #include <thrill/api/cache.hpp>
22 #include <thrill/api/collapse.hpp>
24 #include <thrill/api/generate.hpp>
25 #include <thrill/api/print.hpp>
28 #include <thrill/common/logger.hpp>
30 #include <tlx/cmdline_parser.hpp>
31 
32 #include <algorithm>
33 #include <limits>
34 #include <random>
35 #include <stdexcept>
36 #include <string>
37 #include <tuple>
38 #include <utility>
39 #include <vector>
40 
41 using namespace thrill; // NOLINT
42 using namespace examples::suffix_sorting; // NOLINT
43 
44 namespace examples {
45 namespace suffix_sorting {
46 
47 bool debug_print = false;
48 
49 } // namespace suffix_sorting
50 } // namespace examples
51 
52 /*!
53  * Class to encapsulate all suffix sorting algorithms
54  */
55 class SuffixSorting
56 {
57 public:
58  std::string input_path_;
59  std::string input_copy_path_;
60  std::string output_path_;
61  uint64_t sizelimit_ = std::numeric_limits<uint64_t>::max();
62 
63  std::string algorithm_;
64 
65  bool text_output_flag_ = false;
66  bool check_flag_ = false;
67  bool input_verbatim_ = false;
68  bool pack_input_ = false;
69  bool lcp_computation_ = false;
70 
71  std::string output_bwt_;
72  std::string output_wavelet_tree_;
73 
74  size_t sa_index_bytes_ = 4;
75 
76  void Run(Context& ctx) const {
77  ctx.enable_consume();
78 
79  if (input_verbatim_) {
80  // take path as verbatim text
81  std::vector<uint8_t> input_vec(input_path_.begin(), input_path_.end());
82  auto input_dia = EqualToDIA(ctx, input_vec).Collapse();
83  SwitchIndexType(input_dia, input_vec.size());
84  }
85  else if (input_path_ == "unary") {
86  if (sizelimit_ == std::numeric_limits<size_t>::max()) {
87  LOG1 << "You must provide -s <size> for generated inputs.";
88  return;
89  }
90 
91  DIA<uint8_t> input_dia = Generate(
92  ctx, sizelimit_, [](size_t /* i */) { return uint8_t('a'); });
93  SwitchIndexType(input_dia, sizelimit_);
94  }
95  else if (input_path_ == "random") {
96  if (sizelimit_ == std::numeric_limits<size_t>::max()) {
97  LOG1 << "You must provide -s <size> for generated inputs.";
98  return;
99  }
100 
101  // share prng in Generate (just random numbers anyway)
102  std::mt19937 prng(
103  std::random_device { } () + 4096 * ctx.my_rank());
104 
105  DIA<uint8_t> input_dia =
106  Generate(
107  ctx, sizelimit_,
108  [&prng](size_t /* index */) {
109  return static_cast<uint8_t>(prng());
110  })
111  // the random input _must_ be cached, otherwise it will be
112  // regenerated ... and contain new numbers.
113  .Cache();
114  SwitchIndexType(input_dia, sizelimit_);
115  }
116  else if (input_path_ == "random10") {
117  if (sizelimit_ == std::numeric_limits<size_t>::max()) {
118  LOG1 << "You must provide -s <size> for generated inputs.";
119  return;
120  }
121 
122  // share prng in Generate (just random digits anyway)
123  std::mt19937 prng(
124  std::random_device { } () + 4096 * ctx.my_rank());
125 
126  DIA<uint8_t> input_dia =
127  Generate(
128  ctx, sizelimit_,
129  [&prng](size_t /* index */) {
130  return static_cast<uint8_t>(
131  '0' + ((prng() >> 6) % 10));
132  })
133  // the random input _must_ be cached, otherwise it will be
134  // regenerated ... and contain new numbers.
135  .Cache();
136  SwitchIndexType(input_dia, sizelimit_);
137  }
138  else if (input_path_ == "random2") {
139  if (sizelimit_ == std::numeric_limits<size_t>::max()) {
140  LOG1 << "You must provide -s <size> for generated inputs.";
141  return;
142  }
143 
144  // share prng in Generate (just random digits anyway)
145  std::mt19937 prng(
146  std::random_device { } () + 4096 * ctx.my_rank());
147 
148  DIA<uint8_t> input_dia =
149  Generate(
150  ctx, sizelimit_,
151  [&prng](size_t /* index */) {
152  return static_cast<uint8_t>(
153  '0' + ((prng() >> 6) % 2));
154  })
155  // the random input _must_ be cached, otherwise it will be
156  // regenerated ... and contain new numbers.
157  .Cache();
158  SwitchIndexType(input_dia, sizelimit_);
159  }
160  else {
161  auto input_dia =
162  ReadBinary<uint8_t>(ctx, input_path_, sizelimit_).Cache();
163  size_t input_size = input_dia.Keep().Size();
164  SwitchIndexType(input_dia, input_size);
165  }
166  }
167 
168  template <typename InputDIA>
169  void SwitchIndexType(const InputDIA& input_dia, uint64_t input_size) const {
170 
171  if (input_copy_path_.size())
172  input_dia.Keep().WriteBinary(input_copy_path_);
173 
174  if (sa_index_bytes_ == 4)
175  return StartInput<uint32_t>(input_dia, input_size);
176 #if !THRILL_ON_TRAVIS
177  else if (sa_index_bytes_ == 5)
178  return StartInput<common::uint40>(input_dia, input_size);
179  else if (sa_index_bytes_ == 8)
180  return StartInput<uint64_t>(input_dia, input_size);
181 #endif
182  else
183  die("Unsupported index byte size: " << sa_index_bytes_ <<
184  ". Byte size has to be 4, 5, or 8");
185  }
186 
187  template <typename Index, typename InputDIA>
188  void StartInput(const InputDIA& input_dia, uint64_t input_size) const {
189 
190  // generate or load input prior to starting timer
191  input_dia.Execute();
192 
194 
195  DIA<Index> suffix_array;
196  if (algorithm_ == "none") {
197  suffix_array = Generate(
198  input_dia.ctx(), 0, [](size_t index) { return Index(index); });
199  }
200  else if (algorithm_ == "dc3") {
201  suffix_array = DC3<Index>(input_dia.Keep(), input_size, 256);
202  }
203  else if (algorithm_ == "dc7") {
204  suffix_array = DC7<Index>(input_dia.Keep(), input_size, 256);
205  }
206  else if (algorithm_ == "pdw") {
207  suffix_array = PrefixDoublingWindow<Index>(
208  input_dia.Keep(), input_size, pack_input_);
209  }
210  else if (algorithm_ == "pds") {
211  suffix_array = PrefixDoublingSorting<Index>(
212  input_dia.Keep(), input_size, pack_input_);
213  }
214  else if (algorithm_ == "dis") {
215  suffix_array = PrefixDoublingDiscarding<Index>(
216  input_dia.Keep(), input_size, pack_input_);
217  }
218  else if (algorithm_ == "q") {
219  suffix_array = PrefixQuadrupling<Index>(
220  input_dia.Keep(), input_size, pack_input_);
221  }
222  else if (algorithm_ == "qd") {
223  suffix_array = PrefixQuadruplingDiscarding<Index>(
224  input_dia.Keep(), input_size, pack_input_);
225  }
226  else {
227  die("Unknown algorithm \"" << algorithm_ << "\"");
228  }
229 
230  suffix_array.Execute();
231  timer.Stop();
232 
233  bool check_result = false;
234  if (check_flag_) {
235  if (input_dia.context().my_rank() == 0)
236  LOG1 << "checking suffix array...";
237  die_unless(CheckSA(input_dia.Keep(), suffix_array.Keep()));
238  check_result = true;
239  }
240 
241  if (input_dia.context().my_rank() == 0) {
242  std::cerr << "RESULT"
243  << " algo=" << algorithm_
244  << " hosts=" << input_dia.context().num_hosts()
245  << " check_result=" << check_result
246  << " time=" << timer
247  << (getenv("RESULT") ? getenv("RESULT") : "")
248  << std::endl;
249  }
250 
251  if (text_output_flag_) {
252  suffix_array.Keep().Print("suffix_array");
253  }
254 
255  if (output_path_.size()) {
256  if (input_dia.context().my_rank() == 0)
257  LOG1 << "writing suffix array to " << output_path_;
258  suffix_array.Keep().WriteBinary(output_path_);
259  }
260 
261  if (output_bwt_.size()) {
262  InputDIA bw_transform =
263  ConstructBWT(input_dia, suffix_array, input_size);
264 
265  if (text_output_flag_) {
266  bw_transform.Keep().Print("bw_transform");
267  }
268  if (input_dia.context().my_rank() == 0) {
269  LOG1 << "writing Burrows–Wheeler transform to "
270  << output_bwt_;
271  }
272  if (output_wavelet_tree_.size()) bw_transform.Keep();
273  bw_transform.WriteBinary(output_bwt_);
274 
275  if (output_wavelet_tree_.size())
276  ConstructWaveletTree(bw_transform, output_wavelet_tree_);
277  }
278  else if (output_wavelet_tree_.size()) {
280  ConstructBWT(input_dia, suffix_array, input_size),
281  output_wavelet_tree_);
282  }
283 
284  if (lcp_computation_) {
285  auto bwt = ConstructBWT(input_dia, suffix_array, input_size);
286  ConstructLCP(input_dia, suffix_array, bwt, input_size);
287  }
288  }
289 };
290 
291 int main(int argc, char* argv[]) {
292 
293  using namespace thrill; // NOLINT
294 
296 
297  cp.set_description("A collection of suffix array construction algorithms.");
298  cp.set_author("Florian Kurpicz <[email protected]>");
299  cp.set_author("Timo Bingmann <[email protected]>");
300 
301  SuffixSorting ss;
302 
303  cp.add_param_string("input", ss.input_path_,
304  "Path to input file (or verbatim text).\n"
305  "The special inputs "
306  "'random', 'random10', 'random2' and 'unary' "
307  "generate such text on-the-fly.");
308 
309  cp.add_string('a', "algorithm", ss.algorithm_,
310  "The algorithm which is used to construct the suffix array. "
311  "Available are: "
312  "[pdw]indow (default), [pds]orting, "
313  "prefix doubling with [dis]carding, "
314  "[q]uadrupling, [qd] quadrupling with carding, "
315  "[dc3], and [dc7], or [none] for skipping.");
316 
317  cp.add_size_t('b', "bytes", ss.sa_index_bytes_,
318  "Suffix array bytes per index: "
319  "4 (32-bit) (default), 5 (40-bit), 8 (64-bit)");
320 
321  cp.add_string('B', "bwt", ss.output_bwt_,
322  "Compute the Burrows–Wheeler transform in addition to the "
323  "suffix array, and write to file.");
324 
325  cp.add_bool('c', "check", ss.check_flag_,
326  "Check suffix array for correctness.");
327 
329  "Print debug info.");
330 
331  cp.add_string('i', "input-copy", ss.input_copy_path_,
332  "Write input text to given path.");
333 
334  cp.add_string('o', "output", ss.output_path_,
335  "Output suffix array [and if constructed Burrows–Wheeler "
336  "transform] to given path.");
337 
338  cp.add_bytes('s', "size", ss.sizelimit_,
339  "Cut input text to given size, e.g. 2 GiB.");
340 
341  cp.add_bool('t', "text", ss.text_output_flag_,
342  "Print out suffix array [and if constructed Burrows-Wheeler "
343  "transform] in readable text.");
344 
345  cp.add_bool('v', "verbatim", ss.input_verbatim_,
346  "Consider \"input\" as verbatim text to construct "
347  "suffix array on.");
348 
349  cp.add_string('w', "wavelet", ss.output_wavelet_tree_,
350  "Compute the Wavelet Tree of the Burrows-Wheeler transform, "
351  "and write to file.");
352 
353  cp.add_bool('p', "packed", ss.pack_input_,
354  "Fit as many characters of the input in the bytes used per index"
355  " in the suffix array.");
356 
357  cp.add_bool('l', "lcp", ss.lcp_computation_,
358  "Compute the LCP array in addition to the SA. Currently this "
359  "requires the construction of the BWT.");
360 
361  if (!cp.process(argc, argv))
362  return -1;
363 
364  return Run([&](Context& ctx) { return ss.Run(ctx); });
365 }
366 
367 /******************************************************************************/
DIA is the interface between the user and the Thrill framework.
Definition: dia.hpp:141
static uint_pair max()
return an uint_pair instance containing the largest value possible
Definition: uint_types.hpp:226
auto Generate(Context &ctx, size_t size, const GenerateFunction &generate_function)
Generate is a Source-DOp, which creates a DIA of given size using a generator function.
Definition: generate.hpp:87
#define die_unless(X)
Definition: die.hpp:27
void set_description(const std::string &description)
Set description of program, text will be wrapped.
#define LOG1
Definition: logger.hpp:28
auto EqualToDIA(Context &ctx, const std::vector< ValueType > &in_vector)
EqualToDIA is a Source-DOp, which takes a vector of data EQUAL on all workers, and returns the data i...
int Run(const std::function< void(Context &)> &job_startpoint)
Runs the given job startpoint with a Context instance.
Definition: context.cpp:947
Definition: bfs.hpp:21
void add_size_t(char key, const std::string &longkey, size_t &dest, const std::string &desc)
add size_t option -key, –longkey with description and store to dest
#define die(msg)
Instead of std::terminate(), throw the output the message via an exception.
Definition: die.hpp:22
The Context of a job is a unique instance per worker which holds references to all underlying parts o...
Definition: context.hpp:221
void enable_consume(bool consume=true)
Sets consume-mode flag such that DIA contents may be consumed during PushData().
Definition: context.hpp:388
void add_bytes(char key, const std::string &longkey, uint32_t &dest, const std::string &desc)
IndexDIA ConstructLCP(const InputDIA &input, const IndexDIA &, const InputDIA &bwt, uint64_t input_size)
void Print(const std::string &name=std::string()) const
Print is an Action, which collects all data of the DIA at the worker 0 and prints using ostream seria...
Definition: print.hpp:50
const DIA & Execute() const
Execute DIA&#39;s scope and parents such that this (Action)Node is Executed.
Definition: dia.hpp:335
std::basic_string< char, std::char_traits< char >, Allocator< char > > string
string with Manager tracking
Definition: allocator.hpp:220
bool CheckSA(const InputDIA &input, const SuffixArrayDIA &suffix_array)
Definition: check_sa.hpp:28
Command line parser which automatically fills variables and prints nice usage messages.
void add_param_string(const std::string &name, std::string &dest, const std::string &desc)
add string parameter [name] with description and store to dest
void add_string(char key, const std::string &longkey, std::string &dest, const std::string &desc)
add string option -key, –longkey and store to dest
size_t my_rank() const
Global rank of this worker among all other workers in the system.
Definition: context.hpp:243
InputDIA ConstructBWT(const InputDIA &input, const SuffixArrayDIA &suffix_array, uint64_t input_size)
int main(int argc, char *argv[])
void add_bool(char key, const std::string &longkey, bool &dest, const std::string &desc)
void WriteBinary(const std::string &filepath, size_t max_file_size=128 *1024 *1024) const
WriteBinary is a function, which writes a DIA to many files per worker.
const DIA & Keep(size_t increase=1) const
Mark the referenced DIANode for keeping, which makes children not consume the data when executing...
Definition: dia.hpp:310
void set_author(const std::string &author)
Set author of program, will be wrapped.
bool process(int argc, const char *const *argv, std::ostream &os)
auto ConstructWaveletTree(const InputDIA &input_dia)