Thrill  0.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
rl_bwt.cpp
Go to the documentation of this file.
1 /*******************************************************************************
2  * examples/suffix_sorting/rl_bwt.cpp
3  *
4  * Part of Project Thrill - http://project-thrill.org
5  *
6  * Copyright (C) 2016 Simon Gog <[email protected]>
7  *
8  * All rights reserved. Published under the BSD-2 license in the LICENSE file.
9  ******************************************************************************/
10 
11 #include <thrill/api/collapse.hpp>
12 #include <thrill/api/dia.hpp>
14 #include <thrill/api/generate.hpp>
15 #include <thrill/api/print.hpp>
17 #include <thrill/api/size.hpp>
18 #include <thrill/api/sort.hpp>
19 #include <thrill/api/window.hpp>
21 #include <thrill/api/zip.hpp>
22 #include <thrill/common/logger.hpp>
24 #include <tlx/cmdline_parser.hpp>
25 
26 #include <algorithm>
27 #include <limits>
28 #include <random>
29 #include <stdexcept>
30 #include <string>
31 #include <tuple>
32 #include <utility>
33 #include <vector>
34 
35 using namespace thrill; // NOLINT
36 
37 //! A pair (rank, index)
38 template <typename IndexType, typename CharType>
39 struct IndexChar {
40  IndexType index;
41  CharType c;
42 
43  friend std ::ostream& operator << (
44  std::ostream& os, const IndexChar& ri) {
45  return os << '(' << ri.index << '|' << ri.c << ')';
46  }
48 
49 template <typename InputDIA>
50 auto ConstructRLBWT(const InputDIA& input_dia) {
51 
52  Context& ctx = input_dia.ctx();
53 
54  using ValueType = typename InputDIA::ValueType;
55  using PairIC = IndexChar<uint8_t, ValueType>;
56 
57  size_t input_size = input_dia.Size();
58 
59  if (input_size < 2) { // handle special case
60  std::vector<uint8_t> length(input_size, 1);
61  DIA<uint8_t> rl = EqualToDIA(ctx, length);
62  return input_dia.Zip(rl,
63  [](const ValueType& c, const uint8_t& i) {
64  return PairIC { i, c };
65  });
66  }
67 
68  auto rl_bwt = input_dia
69  .template FlatWindow<PairIC>(
70  DisjointTag, 256,
71  [](size_t, const std::vector<ValueType>& v, auto emit) {
72  size_t i = 0;
73  size_t run_start = 0;
74  while (++i < v.size()) {
75  if (v[i - 1] != v[i]) {
76  emit(PairIC { static_cast<uint8_t>(i - run_start - 1), v[i - 1] });
77  run_start = i;
78  }
79  }
80  emit(PairIC { static_cast<uint8_t>(i - run_start - 1), v[i - 1] });
81  });
82  return rl_bwt;
83 }
84 
85 int main(int argc, char* argv[]) {
87 
88  cp.set_author("Simon Gog <[email protected]>");
89 
90  std::string input_path;
91  size_t output_result = 0;
92 
93  cp.add_opt_param_string("input", input_path,
94  "Path to input file.");
95  cp.add_opt_param_size_t("output_result", output_result,
96  "Output result.");
97 
98  if (!cp.process(argc, argv))
99  return -1;
100 
101  return Run(
102  [&](Context& ctx) {
103  if (input_path.size()) {
104  auto input_dia = ReadBinary<uint8_t>(ctx, input_path);
105  auto output_dia = ConstructRLBWT(input_dia);
106  if (output_result)
107  output_dia.Print("rl_bwt");
108  sLOG1 << "RLE size = " << output_dia.Size();
109  }
110  else {
111  std::string bwt = "aaaaaaaaaaabbbbaaaaaaaccccdddaacacaffatttttttttttyyyyaaaaa";
112  auto input_dia =
113  Generate(ctx, bwt.size(),
114  [&](size_t i) { return (uint8_t)bwt[i]; });
115  auto output_dia = ConstructRLBWT(input_dia);
116  if (output_result)
117  output_dia.Print("rl_bwt");
118  }
119  });
120 }
121 
122 /******************************************************************************/
auto Generate(Context &ctx, size_t size, const GenerateFunction &generate_function)
Generate is a Source-DOp, which creates a DIA of given size using a generator function.
Definition: generate.hpp:85
auto EqualToDIA(Context &ctx, const std::vector< ValueType > &in_vector)
EqualToDIA is a Source-DOp, which takes a vector of data EQUAL on all workers, and returns the data i...
int Run(const std::function< void(Context &)> &job_startpoint)
Runs the given job startpoint with a Context instance.
Definition: context.cpp:863
void add_opt_param_string(const std::string &name, std::string &dest, const std::string &desc)
add optional string parameter [name] with description and store to dest
int main(int argc, char *argv[])
Definition: rl_bwt.cpp:85
#define sLOG1
Definition: logger.hpp:156
struct IndexChar TLX_ATTRIBUTE_PACKED
std::basic_string< char, std::char_traits< char >, Allocator< char > > string
string with Manager tracking
Definition: allocator.hpp:220
Command line parser which automatically fills variables and prints nice usage messages.
void add_opt_param_size_t(const std::string &name, size_t &dest, const std::string &desc)
add optional size_t parameter [name] with description and store to dest
void set_author(const std::string &author)
Set author of program, will be wrapped.
const struct DisjointTag DisjointTag
global const DisjointTag instance
Definition: dia.hpp:67
bool process(int argc, const char *const *argv, std::ostream &os)
std::ostream & operator<<(std::ostream &os, const DIABase &d)
make ostream-able.
Definition: dia_base.cpp:449
auto ConstructRLBWT(const InputDIA &input_dia)
Definition: rl_bwt.cpp:50