45 namespace suffix_sorting {
65 bool text_output_flag_ =
false;
66 bool check_flag_ =
false;
67 bool input_verbatim_ =
false;
68 bool pack_input_ =
false;
69 bool lcp_computation_ =
false;
74 size_t sa_index_bytes_ = 4;
79 if (input_verbatim_) {
81 std::vector<uint8_t> input_vec(input_path_.begin(), input_path_.end());
82 auto input_dia =
EqualToDIA(ctx, input_vec).Collapse();
83 SwitchIndexType(input_dia, input_vec.size());
85 else if (input_path_ ==
"unary") {
87 LOG1 <<
"You must provide -s <size> for generated inputs.";
92 ctx, sizelimit_, [](
size_t ) {
return uint8_t(
'a'); });
93 SwitchIndexType(input_dia, sizelimit_);
95 else if (input_path_ ==
"random") {
97 LOG1 <<
"You must provide -s <size> for generated inputs.";
103 std::random_device { } () + 4096 * ctx.
my_rank());
109 return static_cast<uint8_t
>(prng());
114 SwitchIndexType(input_dia, sizelimit_);
116 else if (input_path_ ==
"random10") {
118 LOG1 <<
"You must provide -s <size> for generated inputs.";
124 std::random_device { } () + 4096 * ctx.
my_rank());
130 return static_cast<uint8_t
>(
131 '0' + ((prng() >> 6) % 10));
136 SwitchIndexType(input_dia, sizelimit_);
138 else if (input_path_ ==
"random2") {
140 LOG1 <<
"You must provide -s <size> for generated inputs.";
146 std::random_device { } () + 4096 * ctx.
my_rank());
152 return static_cast<uint8_t
>(
153 '0' + ((prng() >> 6) % 2));
158 SwitchIndexType(input_dia, sizelimit_);
162 ReadBinary<uint8_t>(ctx, input_path_, sizelimit_).Cache();
163 size_t input_size = input_dia.Keep().Size();
164 SwitchIndexType(input_dia, input_size);
168 template <
typename InputDIA>
169 void SwitchIndexType(
const InputDIA& input_dia, uint64_t input_size)
const {
171 if (input_copy_path_.size())
172 input_dia.Keep().WriteBinary(input_copy_path_);
174 if (sa_index_bytes_ == 4)
175 return StartInput<uint32_t>(input_dia, input_size);
176 #if !THRILL_ON_TRAVIS 177 else if (sa_index_bytes_ == 5)
178 return StartInput<common::uint40>(input_dia, input_size);
179 else if (sa_index_bytes_ == 8)
180 return StartInput<uint64_t>(input_dia, input_size);
183 die(
"Unsupported index byte size: " << sa_index_bytes_ <<
184 ". Byte size has to be 4, 5, or 8");
187 template <
typename Index,
typename InputDIA>
188 void StartInput(
const InputDIA& input_dia, uint64_t input_size)
const {
196 if (algorithm_ ==
"none") {
198 input_dia.ctx(), 0, [](
size_t index) {
return Index(index); });
200 else if (algorithm_ ==
"dc3") {
201 suffix_array = DC3<Index>(input_dia.Keep(), input_size, 256);
203 else if (algorithm_ ==
"dc7") {
204 suffix_array = DC7<Index>(input_dia.Keep(), input_size, 256);
206 else if (algorithm_ ==
"pdw") {
207 suffix_array = PrefixDoublingWindow<Index>(
208 input_dia.Keep(), input_size, pack_input_);
210 else if (algorithm_ ==
"pds") {
211 suffix_array = PrefixDoublingSorting<Index>(
212 input_dia.Keep(), input_size, pack_input_);
214 else if (algorithm_ ==
"dis") {
215 suffix_array = PrefixDoublingDiscarding<Index>(
216 input_dia.Keep(), input_size, pack_input_);
218 else if (algorithm_ ==
"q") {
219 suffix_array = PrefixQuadrupling<Index>(
220 input_dia.Keep(), input_size, pack_input_);
222 else if (algorithm_ ==
"qd") {
223 suffix_array = PrefixQuadruplingDiscarding<Index>(
224 input_dia.Keep(), input_size, pack_input_);
227 die(
"Unknown algorithm \"" << algorithm_ <<
"\"");
233 bool check_result =
false;
235 if (input_dia.context().my_rank() == 0)
236 LOG1 <<
"checking suffix array...";
241 if (input_dia.context().my_rank() == 0) {
242 std::cerr <<
"RESULT" 243 <<
" algo=" << algorithm_
244 <<
" hosts=" << input_dia.context().num_hosts()
245 <<
" check_result=" << check_result
247 << (getenv(
"RESULT") ? getenv(
"RESULT") :
"")
251 if (text_output_flag_) {
252 suffix_array.
Keep().
Print(
"suffix_array");
255 if (output_path_.size()) {
256 if (input_dia.context().my_rank() == 0)
257 LOG1 <<
"writing suffix array to " << output_path_;
261 if (output_bwt_.size()) {
262 InputDIA bw_transform =
265 if (text_output_flag_) {
266 bw_transform.Keep().Print(
"bw_transform");
268 if (input_dia.context().my_rank() == 0) {
269 LOG1 <<
"writing Burrows–Wheeler transform to " 272 if (output_wavelet_tree_.size()) bw_transform.Keep();
273 bw_transform.WriteBinary(output_bwt_);
275 if (output_wavelet_tree_.size())
278 else if (output_wavelet_tree_.size()) {
281 output_wavelet_tree_);
284 if (lcp_computation_) {
285 auto bwt =
ConstructBWT(input_dia, suffix_array, input_size);
291 int main(
int argc,
char* argv[]) {
297 cp.
set_description(
"A collection of suffix array construction algorithms.");
304 "Path to input file (or verbatim text).\n" 305 "The special inputs " 306 "'random', 'random10', 'random2' and 'unary' " 307 "generate such text on-the-fly.");
309 cp.
add_string(
'a',
"algorithm", ss.algorithm_,
310 "The algorithm which is used to construct the suffix array. " 312 "[pdw]indow (default), [pds]orting, " 313 "prefix doubling with [dis]carding, " 314 "[q]uadrupling, [qd] quadrupling with carding, " 315 "[dc3], and [dc7], or [none] for skipping.");
317 cp.
add_size_t(
'b',
"bytes", ss.sa_index_bytes_,
318 "Suffix array bytes per index: " 319 "4 (32-bit) (default), 5 (40-bit), 8 (64-bit)");
322 "Compute the Burrows–Wheeler transform in addition to the " 323 "suffix array, and write to file.");
325 cp.
add_bool(
'c',
"check", ss.check_flag_,
326 "Check suffix array for correctness.");
329 "Print debug info.");
331 cp.
add_string(
'i',
"input-copy", ss.input_copy_path_,
332 "Write input text to given path.");
335 "Output suffix array [and if constructed Burrows–Wheeler " 336 "transform] to given path.");
339 "Cut input text to given size, e.g. 2 GiB.");
341 cp.
add_bool(
't',
"text", ss.text_output_flag_,
342 "Print out suffix array [and if constructed Burrows-Wheeler " 343 "transform] in readable text.");
345 cp.
add_bool(
'v',
"verbatim", ss.input_verbatim_,
346 "Consider \"input\" as verbatim text to construct " 349 cp.
add_string(
'w',
"wavelet", ss.output_wavelet_tree_,
350 "Compute the Wavelet Tree of the Burrows-Wheeler transform, " 351 "and write to file.");
353 cp.
add_bool(
'p',
"packed", ss.pack_input_,
354 "Fit as many characters of the input in the bytes used per index" 355 " in the suffix array.");
357 cp.
add_bool(
'l',
"lcp", ss.lcp_computation_,
358 "Compute the LCP array in addition to the SA. Currently this " 359 "requires the construction of the BWT.");
364 return Run([&](
Context& ctx) {
return ss.Run(ctx); });
DIA is the interface between the user and the Thrill framework.
static uint_pair max()
return an uint_pair instance containing the largest value possible
auto Generate(Context &ctx, size_t size, const GenerateFunction &generate_function)
Generate is a Source-DOp, which creates a DIA of given size using a generator function.
void set_description(const std::string &description)
Set description of program, text will be wrapped.
auto EqualToDIA(Context &ctx, const std::vector< ValueType > &in_vector)
EqualToDIA is a Source-DOp, which takes a vector of data EQUAL on all workers, and returns the data i...
int Run(const std::function< void(Context &)> &job_startpoint)
Runs the given job startpoint with a Context instance.
void add_size_t(char key, const std::string &longkey, size_t &dest, const std::string &desc)
add size_t option -key, –longkey with description and store to dest
#define die(msg)
Instead of std::terminate(), throw the output the message via an exception.
The Context of a job is a unique instance per worker which holds references to all underlying parts o...
void enable_consume(bool consume=true)
Sets consume-mode flag such that DIA contents may be consumed during PushData().
void add_bytes(char key, const std::string &longkey, uint32_t &dest, const std::string &desc)
IndexDIA ConstructLCP(const InputDIA &input, const IndexDIA &, const InputDIA &bwt, uint64_t input_size)
void Print(const std::string &name=std::string()) const
Print is an Action, which collects all data of the DIA at the worker 0 and prints using ostream seria...
const DIA & Execute() const
Execute DIA's scope and parents such that this (Action)Node is Executed.
std::basic_string< char, std::char_traits< char >, Allocator< char > > string
string with Manager tracking
bool CheckSA(const InputDIA &input, const SuffixArrayDIA &suffix_array)
Command line parser which automatically fills variables and prints nice usage messages.
void add_param_string(const std::string &name, std::string &dest, const std::string &desc)
add string parameter [name] with description and store to dest
void add_string(char key, const std::string &longkey, std::string &dest, const std::string &desc)
add string option -key, –longkey and store to dest
size_t my_rank() const
Global rank of this worker among all other workers in the system.
InputDIA ConstructBWT(const InputDIA &input, const SuffixArrayDIA &suffix_array, uint64_t input_size)
int main(int argc, char *argv[])
void add_bool(char key, const std::string &longkey, bool &dest, const std::string &desc)
void WriteBinary(const std::string &filepath, size_t max_file_size=128 *1024 *1024) const
WriteBinary is a function, which writes a DIA to many files per worker.
const DIA & Keep(size_t increase=1) const
Mark the referenced DIANode for keeping, which makes children not consume the data when executing...
void set_author(const std::string &author)
Set author of program, will be wrapped.
bool process(int argc, const char *const *argv, std::ostream &os)
auto ConstructWaveletTree(const InputDIA &input_dia)