Thrill  0.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
multisequence_partition.hpp
Go to the documentation of this file.
1 /*******************************************************************************
2  * tlx/algorithm/multisequence_partition.hpp
3  *
4  * Implementation of multisequence partition and selection.
5  *
6  * Copied and modified from STXXL, see http://stxxl.org, which itself extracted
7  * it from MCSTL http://algo2.iti.uni-karlsruhe.de/singler/mcstl/. Both are
8  * distributed under the Boost Software License, Version 1.0.
9  *
10  * Part of tlx - http://panthema.net/tlx
11  *
12  * Copyright (C) 2007 Johannes Singler <[email protected]>
13  * Copyright (C) 2014-2018 Timo Bingmann <[email protected]>
14  *
15  * All rights reserved. Published under the Boost Software License, Version 1.0
16  ******************************************************************************/
17 
18 #ifndef TLX_ALGORITHM_MULTISEQUENCE_PARTITION_HEADER
19 #define TLX_ALGORITHM_MULTISEQUENCE_PARTITION_HEADER
20 
21 #include <algorithm>
22 #include <cassert>
23 #include <functional>
24 #include <queue>
25 #include <utility>
26 #include <vector>
27 
30 
31 namespace tlx {
32 
33 //! \addtogroup tlx_algorithm
34 //! \{
35 
36 namespace multisequence_partition_detail {
37 
38 //! Compare a pair of types lexicographically, ascending.
39 template <typename T1, typename T2, typename Comparator>
41  : public std::binary_function<std::pair<T1, T2>, std::pair<T1, T2>, bool>
42 {
43 protected:
44  Comparator& comp_;
45 
46 public:
47  explicit lexicographic(Comparator& comp) : comp_(comp) { }
48 
49  inline bool operator () (const std::pair<T1, T2>& p1,
50  const std::pair<T1, T2>& p2) {
51  if (comp_(p1.first, p2.first))
52  return true;
53 
54  if (comp_(p2.first, p1.first))
55  return false;
56 
57  // firsts are equal
58  return p1.second < p2.second;
59  }
60 };
61 
62 //! Compare a pair of types lexicographically, descending.
63 template <typename T1, typename T2, typename Comparator>
65  : public std::binary_function<std::pair<T1, T2>, std::pair<T1, T2>, bool>
66 {
67 protected:
68  Comparator& comp_;
69 
70 public:
71  explicit lexicographic_rev(Comparator& comp) : comp_(comp) { }
72 
73  inline bool operator () (const std::pair<T1, T2>& p1,
74  const std::pair<T1, T2>& p2) {
75  if (comp_(p2.first, p1.first))
76  return true;
77 
78  if (comp_(p1.first, p2.first))
79  return false;
80 
81  // firsts are equal
82  return p2.second < p1.second;
83  }
84 };
85 
86 } // namespace multisequence_partition_detail
87 
88 /*!
89  * Splits several sorted sequences at a certain global rank, resulting in a
90  * splitting point for each sequence. The sequences are passed via a sequence
91  * of random-access iterator pairs, none of the sequences may be empty. If
92  * there are several equal elements across the split, the ones on the left side
93  * will be chosen from sequences with smaller number.
94  *
95  * \param begin_seqs Begin of the sequence of iterator pairs.
96  * \param end_seqs End of the sequence of iterator pairs.
97  * \param rank The global rank to partition at.
98  * \param begin_offsets A random-access sequence begin where the result will be
99  * stored in. Each element of the sequence is an iterator that points to the
100  * first element on the greater part of the respective sequence.
101  * \param comp The ordering functor, defaults to std::less<T>.
102  */
103 template <typename RanSeqs, typename RankType, typename RankIterator,
104  typename Comparator = std::less<
105  typename std::iterator_traits<
106  typename std::iterator_traits<RanSeqs>
107  ::value_type::first_type>::value_type>
108  >
110  const RanSeqs& begin_seqs, const RanSeqs& end_seqs,
111  const RankType& rank,
112  RankIterator begin_offsets,
113  Comparator comp = Comparator()) {
114 
115  using iterator = typename std::iterator_traits<RanSeqs>
116  ::value_type::first_type;
117  using diff_type = typename std::iterator_traits<iterator>
118  ::difference_type;
119  using value_type = typename std::iterator_traits<iterator>
120  ::value_type;
121 
122  using SamplePair = std::pair<value_type, diff_type>;
123 
124  using namespace multisequence_partition_detail;
125 
126  // comparators for SamplePair
127  lexicographic<value_type, diff_type, Comparator> lcomp(comp);
128  lexicographic_rev<value_type, diff_type, Comparator> lrcomp(comp);
129 
130  // number of sequences, number of elements in total (possibly including
131  // padding)
132  const diff_type m = std::distance(begin_seqs, end_seqs);
133  diff_type nmax, n;
134  RankType N = 0;
135 
136  for (diff_type i = 0; i < m; ++i)
137  {
138  N += std::distance(begin_seqs[i].first, begin_seqs[i].second);
139  assert(std::distance(begin_seqs[i].first, begin_seqs[i].second) > 0);
140  }
141 
142  if (rank == N)
143  {
144  for (diff_type i = 0; i < m; ++i)
145  begin_offsets[i] = begin_seqs[i].second; // very end
146  return;
147  }
148 
149  assert(m != 0 && N != 0 && rank < N);
150 
151  simple_vector<diff_type> seqlen(m);
152 
153  seqlen[0] = std::distance(begin_seqs[0].first, begin_seqs[0].second);
154  nmax = seqlen[0];
155  for (diff_type i = 1; i < m; ++i)
156  {
157  seqlen[i] = std::distance(begin_seqs[i].first, begin_seqs[i].second);
158  nmax = std::max(nmax, seqlen[i]);
159  }
160 
161  // pad all lists to this length, at least as long as any ns[i], equality
162  // iff nmax = 2^k - 1
163  diff_type l = round_up_to_power_of_two(nmax + 1) - 1;
164 
165  simple_vector<diff_type> a(m), b(m);
166 
167  for (diff_type i = 0; i < m; ++i)
168  a[i] = 0, b[i] = l;
169 
170  n = l / 2;
171 
172  // invariants:
173  // 0 <= a[i] <= seqlen[i], 0 <= b[i] <= l
174 
175  // initial partition
176 
177  std::vector<SamplePair> sample;
178 
179  for (diff_type i = 0; i < m; ++i) {
180  if (n < seqlen[i]) {
181  // sequence long enough
182  sample.push_back(SamplePair(begin_seqs[i].first[n], i));
183  }
184  }
185 
186  std::sort(sample.begin(), sample.end(), lcomp);
187 
188  for (diff_type i = 0; i < m; ++i) {
189  // conceptual infinity
190  if (n >= seqlen[i]) {
191  // sequence too short, conceptual infinity
192  sample.push_back(
193  SamplePair(begin_seqs[i].first[0] /* dummy element */, i));
194  }
195  }
196 
197  size_t localrank = static_cast<size_t>(rank / l);
198 
199  size_t j;
200  for (j = 0; j < localrank && ((n + 1) <= seqlen[sample[j].second]); ++j)
201  a[sample[j].second] += n + 1;
202  for ( ; j < static_cast<size_t>(m); ++j)
203  b[sample[j].second] -= n + 1;
204 
205  // further refinement
206 
207  while (n > 0)
208  {
209  n /= 2;
210 
211  const value_type* lmax = nullptr; // impossible to avoid the warning?
212  for (diff_type i = 0; i < m; ++i)
213  {
214  if (a[i] > 0)
215  {
216  if (!lmax)
217  lmax = &(begin_seqs[i].first[a[i] - 1]);
218  else
219  {
220  // max, favor rear sequences
221  if (!comp(begin_seqs[i].first[a[i] - 1], *lmax))
222  lmax = &(begin_seqs[i].first[a[i] - 1]);
223  }
224  }
225  }
226 
227  for (diff_type i = 0; i < m; ++i)
228  {
229  diff_type middle = (b[i] + a[i]) / 2;
230  if (lmax && middle < seqlen[i] &&
231  comp(begin_seqs[i].first[middle], *lmax))
232  a[i] = std::min(a[i] + n + 1, seqlen[i]);
233  else
234  b[i] -= n + 1;
235  }
236 
237  diff_type leftsize = 0;
238  for (diff_type i = 0; i < m; ++i)
239  leftsize += a[i] / (n + 1);
240 
241  diff_type skew = rank / (n + 1) - leftsize;
242 
243  if (skew > 0)
244  {
245  // move to the left, find smallest
246  std::priority_queue<
247  SamplePair, std::vector<SamplePair>,
248  lexicographic_rev<value_type, diff_type, Comparator> >
249  pq(lrcomp);
250 
251  for (diff_type i = 0; i < m; ++i) {
252  if (b[i] < seqlen[i])
253  pq.push(SamplePair(begin_seqs[i].first[b[i]], i));
254  }
255 
256  for ( ; skew != 0 && !pq.empty(); --skew)
257  {
258  diff_type src = pq.top().second;
259  pq.pop();
260 
261  a[src] = std::min(a[src] + n + 1, seqlen[src]);
262  b[src] += n + 1;
263 
264  if (b[src] < seqlen[src])
265  pq.push(SamplePair(begin_seqs[src].first[b[src]], src));
266  }
267  }
268  else if (skew < 0)
269  {
270  // move to the right, find greatest
271  std::priority_queue<
272  SamplePair, std::vector<SamplePair>,
273  lexicographic<value_type, diff_type, Comparator> >
274  pq(lcomp);
275 
276  for (diff_type i = 0; i < m; ++i) {
277  if (a[i] > 0)
278  pq.push(SamplePair(begin_seqs[i].first[a[i] - 1], i));
279  }
280 
281  for ( ; skew != 0; ++skew)
282  {
283  diff_type src = pq.top().second;
284  pq.pop();
285 
286  a[src] -= n + 1;
287  b[src] -= n + 1;
288 
289  if (a[src] > 0)
290  pq.push(SamplePair(begin_seqs[src].first[a[src] - 1], src));
291  }
292  }
293  }
294 
295  // postconditions: a[i] == b[i] in most cases, except when a[i] has been
296  // clamped because of having reached the boundary
297 
298  // now return the result, calculate the offset, compare the keys on both
299  // edges of the border
300 
301  // maximum of left edge, minimum of right edge
302  value_type* maxleft = nullptr, * minright = nullptr;
303  for (diff_type i = 0; i < m; ++i)
304  {
305  if (a[i] > 0)
306  {
307  if (!maxleft)
308  {
309  maxleft = &(begin_seqs[i].first[a[i] - 1]);
310  }
311  else
312  {
313  // max, favor rear sequences
314  if (!comp(begin_seqs[i].first[a[i] - 1], *maxleft))
315  maxleft = &(begin_seqs[i].first[a[i] - 1]);
316  }
317  }
318  if (b[i] < seqlen[i])
319  {
320  if (!minright)
321  {
322  minright = &(begin_seqs[i].first[b[i]]);
323  }
324  else
325  {
326  // min, favor fore sequences
327  if (comp(begin_seqs[i].first[b[i]], *minright))
328  minright = &(begin_seqs[i].first[b[i]]);
329  }
330  }
331  }
332 
333  for (diff_type i = 0; i < m; ++i)
334  begin_offsets[i] = begin_seqs[i].first + a[i];
335 }
336 
337 //! \}
338 
339 } // namespace tlx
340 
341 #endif // !TLX_ALGORITHM_MULTISEQUENCE_PARTITION_HEADER
342 
343 /******************************************************************************/
static uint_pair max()
return an uint_pair instance containing the largest value possible
Definition: uint_types.hpp:226
bool operator()(const std::pair< T1, T2 > &p1, const std::pair< T1, T2 > &p2)
Compare a pair of types lexicographically, descending.
Simpler non-growing vector without initialization.
int round_up_to_power_of_two(int i)
does what it says: round up to next power of two
void multisequence_partition(const RanSeqs &begin_seqs, const RanSeqs &end_seqs, const RankType &rank, RankIterator begin_offsets, Comparator comp=Comparator())
Splits several sorted sequences at a certain global rank, resulting in a splitting point for each seq...
bool operator()(const std::pair< T1, T2 > &p1, const std::pair< T1, T2 > &p2)
int N
Definition: gen_data.py:15
Compare a pair of types lexicographically, ascending.
static uint_pair min()
return an uint_pair instance containing the smallest value possible
Definition: uint_types.hpp:217