Thrill  0.1
multisequence_partition.hpp
Go to the documentation of this file.
1 /*******************************************************************************
2  * tlx/algorithm/multisequence_partition.hpp
3  *
4  * Implementation of multisequence partition and selection.
5  *
6  * Copied and modified from STXXL, see http://stxxl.org, which itself extracted
7  * it from MCSTL http://algo2.iti.uni-karlsruhe.de/singler/mcstl/. Both are
8  * distributed under the Boost Software License, Version 1.0.
9  *
10  * Part of tlx - http://panthema.net/tlx
11  *
12  * Copyright (C) 2007 Johannes Singler <[email protected]>
13  * Copyright (C) 2014-2018 Timo Bingmann <[email protected]>
14  *
15  * All rights reserved. Published under the Boost Software License, Version 1.0
16  ******************************************************************************/
17 
18 #ifndef TLX_ALGORITHM_MULTISEQUENCE_PARTITION_HEADER
19 #define TLX_ALGORITHM_MULTISEQUENCE_PARTITION_HEADER
20 
21 #include <algorithm>
22 #include <cassert>
23 #include <queue>
24 #include <utility>
25 #include <vector>
26 
29 
30 namespace tlx {
31 
32 //! \addtogroup tlx_algorithm
33 //! \{
34 
35 namespace multisequence_partition_detail {
36 
37 //! Compare a pair of types lexicographically, ascending.
38 template <typename T1, typename T2, typename Comparator>
40 {
41 protected:
42  Comparator& comp_;
43 
44 public:
45  explicit lexicographic(Comparator& comp) : comp_(comp) { }
46 
47  inline bool operator () (const std::pair<T1, T2>& p1,
48  const std::pair<T1, T2>& p2) {
49  if (comp_(p1.first, p2.first))
50  return true;
51 
52  if (comp_(p2.first, p1.first))
53  return false;
54 
55  // firsts are equal
56  return p1.second < p2.second;
57  }
58 };
59 
60 //! Compare a pair of types lexicographically, descending.
61 template <typename T1, typename T2, typename Comparator>
63 {
64 protected:
65  Comparator& comp_;
66 
67 public:
68  explicit lexicographic_rev(Comparator& comp) : comp_(comp) { }
69 
70  inline bool operator () (const std::pair<T1, T2>& p1,
71  const std::pair<T1, T2>& p2) {
72  if (comp_(p2.first, p1.first))
73  return true;
74 
75  if (comp_(p1.first, p2.first))
76  return false;
77 
78  // firsts are equal
79  return p2.second < p1.second;
80  }
81 };
82 
83 } // namespace multisequence_partition_detail
84 
85 /*!
86  * Splits several sorted sequences at a certain global rank, resulting in a
87  * splitting point for each sequence. The sequences are passed via a sequence
88  * of random-access iterator pairs, none of the sequences may be empty. If
89  * there are several equal elements across the split, the ones on the left side
90  * will be chosen from sequences with smaller number.
91  *
92  * \param begin_seqs Begin of the sequence of iterator pairs.
93  * \param end_seqs End of the sequence of iterator pairs.
94  * \param rank The global rank to partition at.
95  * \param begin_offsets A random-access sequence begin where the result will be
96  * stored in. Each element of the sequence is an iterator that points to the
97  * first element on the greater part of the respective sequence.
98  * \param comp The ordering functor, defaults to std::less<T>.
99  */
100 template <typename RanSeqs, typename RankType, typename RankIterator,
101  typename Comparator = std::less<
102  typename std::iterator_traits<
103  typename std::iterator_traits<RanSeqs>
104  ::value_type::first_type>::value_type>
105  >
107  const RanSeqs& begin_seqs, const RanSeqs& end_seqs,
108  const RankType& rank,
109  RankIterator begin_offsets,
110  Comparator comp = Comparator()) {
111 
112  using iterator = typename std::iterator_traits<RanSeqs>
113  ::value_type::first_type;
114  using diff_type = typename std::iterator_traits<iterator>
115  ::difference_type;
116  using value_type = typename std::iterator_traits<iterator>
117  ::value_type;
118 
119  using SamplePair = std::pair<value_type, diff_type>;
120 
121  using namespace multisequence_partition_detail;
122 
123  // comparators for SamplePair
124  lexicographic<value_type, diff_type, Comparator> lcomp(comp);
125  lexicographic_rev<value_type, diff_type, Comparator> lrcomp(comp);
126 
127  // number of sequences, number of elements in total (possibly including
128  // padding)
129  const diff_type m = std::distance(begin_seqs, end_seqs);
130  diff_type nmax, n;
131  RankType N = 0;
132 
133  for (diff_type i = 0; i < m; ++i)
134  {
135  N += std::distance(begin_seqs[i].first, begin_seqs[i].second);
136  assert(std::distance(begin_seqs[i].first, begin_seqs[i].second) > 0);
137  }
138 
139  if (rank == N)
140  {
141  for (diff_type i = 0; i < m; ++i)
142  begin_offsets[i] = begin_seqs[i].second; // very end
143  return;
144  }
145 
146  assert(m != 0 && N != 0 && rank < N);
147 
148  simple_vector<diff_type> seqlen(m);
149 
150  seqlen[0] = std::distance(begin_seqs[0].first, begin_seqs[0].second);
151  nmax = seqlen[0];
152  for (diff_type i = 1; i < m; ++i)
153  {
154  seqlen[i] = std::distance(begin_seqs[i].first, begin_seqs[i].second);
155  nmax = std::max(nmax, seqlen[i]);
156  }
157 
158  // pad all lists to this length, at least as long as any ns[i], equality
159  // iff nmax = 2^k - 1
160  diff_type l = round_up_to_power_of_two(nmax + 1) - 1;
161 
162  simple_vector<diff_type> a(m), b(m);
163 
164  for (diff_type i = 0; i < m; ++i)
165  a[i] = 0, b[i] = l;
166 
167  n = l / 2;
168 
169  // invariants:
170  // 0 <= a[i] <= seqlen[i], 0 <= b[i] <= l
171 
172  // initial partition
173 
174  std::vector<SamplePair> sample;
175 
176  for (diff_type i = 0; i < m; ++i) {
177  if (n < seqlen[i]) {
178  // sequence long enough
179  sample.push_back(SamplePair(begin_seqs[i].first[n], i));
180  }
181  }
182 
183  std::sort(sample.begin(), sample.end(), lcomp);
184 
185  for (diff_type i = 0; i < m; ++i) {
186  // conceptual infinity
187  if (n >= seqlen[i]) {
188  // sequence too short, conceptual infinity
189  sample.push_back(
190  SamplePair(begin_seqs[i].first[0] /* dummy element */, i));
191  }
192  }
193 
194  size_t localrank = static_cast<size_t>(rank / l);
195 
196  size_t j;
197  for (j = 0; j < localrank && ((n + 1) <= seqlen[sample[j].second]); ++j)
198  a[sample[j].second] += n + 1;
199  for ( ; j < static_cast<size_t>(m); ++j)
200  b[sample[j].second] -= n + 1;
201 
202  // further refinement
203 
204  while (n > 0)
205  {
206  n /= 2;
207 
208  const value_type* lmax = nullptr; // impossible to avoid the warning?
209  for (diff_type i = 0; i < m; ++i)
210  {
211  if (a[i] > 0)
212  {
213  if (!lmax)
214  lmax = &(begin_seqs[i].first[a[i] - 1]);
215  else
216  {
217  // max, favor rear sequences
218  if (!comp(begin_seqs[i].first[a[i] - 1], *lmax))
219  lmax = &(begin_seqs[i].first[a[i] - 1]);
220  }
221  }
222  }
223 
224  for (diff_type i = 0; i < m; ++i)
225  {
226  diff_type middle = (b[i] + a[i]) / 2;
227  if (lmax && middle < seqlen[i] &&
228  comp(begin_seqs[i].first[middle], *lmax))
229  a[i] = std::min(a[i] + n + 1, seqlen[i]);
230  else
231  b[i] -= n + 1;
232  }
233 
234  diff_type leftsize = 0;
235  for (diff_type i = 0; i < m; ++i)
236  leftsize += a[i] / (n + 1);
237 
238  diff_type skew = rank / (n + 1) - leftsize;
239 
240  if (skew > 0)
241  {
242  // move to the left, find smallest
243  std::priority_queue<
244  SamplePair, std::vector<SamplePair>,
245  lexicographic_rev<value_type, diff_type, Comparator> >
246  pq(lrcomp);
247 
248  for (diff_type i = 0; i < m; ++i) {
249  if (b[i] < seqlen[i])
250  pq.push(SamplePair(begin_seqs[i].first[b[i]], i));
251  }
252 
253  for ( ; skew != 0 && !pq.empty(); --skew)
254  {
255  diff_type src = pq.top().second;
256  pq.pop();
257 
258  a[src] = std::min(a[src] + n + 1, seqlen[src]);
259  b[src] += n + 1;
260 
261  if (b[src] < seqlen[src])
262  pq.push(SamplePair(begin_seqs[src].first[b[src]], src));
263  }
264  }
265  else if (skew < 0)
266  {
267  // move to the right, find greatest
268  std::priority_queue<
269  SamplePair, std::vector<SamplePair>,
270  lexicographic<value_type, diff_type, Comparator> >
271  pq(lcomp);
272 
273  for (diff_type i = 0; i < m; ++i) {
274  if (a[i] > 0)
275  pq.push(SamplePair(begin_seqs[i].first[a[i] - 1], i));
276  }
277 
278  for ( ; skew != 0; ++skew)
279  {
280  diff_type src = pq.top().second;
281  pq.pop();
282 
283  a[src] -= n + 1;
284  b[src] -= n + 1;
285 
286  if (a[src] > 0)
287  pq.push(SamplePair(begin_seqs[src].first[a[src] - 1], src));
288  }
289  }
290  }
291 
292  // postconditions: a[i] == b[i] in most cases, except when a[i] has been
293  // clamped because of having reached the boundary
294 
295  // now return the result, calculate the offset, compare the keys on both
296  // edges of the border
297 
298  // maximum of left edge, minimum of right edge
299  value_type* maxleft = nullptr, * minright = nullptr;
300  for (diff_type i = 0; i < m; ++i)
301  {
302  if (a[i] > 0)
303  {
304  if (!maxleft)
305  {
306  maxleft = &(begin_seqs[i].first[a[i] - 1]);
307  }
308  else
309  {
310  // max, favor rear sequences
311  if (!comp(begin_seqs[i].first[a[i] - 1], *maxleft))
312  maxleft = &(begin_seqs[i].first[a[i] - 1]);
313  }
314  }
315  if (b[i] < seqlen[i])
316  {
317  if (!minright)
318  {
319  minright = &(begin_seqs[i].first[b[i]]);
320  }
321  else
322  {
323  // min, favor fore sequences
324  if (comp(begin_seqs[i].first[b[i]], *minright))
325  minright = &(begin_seqs[i].first[b[i]]);
326  }
327  }
328  }
329 
330  for (diff_type i = 0; i < m; ++i)
331  begin_offsets[i] = begin_seqs[i].first + a[i];
332 }
333 
334 //! \}
335 
336 } // namespace tlx
337 
338 #endif // !TLX_ALGORITHM_MULTISEQUENCE_PARTITION_HEADER
339 
340 /******************************************************************************/
static uint_pair max()
return an uint_pair instance containing the largest value possible
Definition: uint_types.hpp:226
Compare a pair of types lexicographically, descending.
Simpler non-growing vector without initialization.
void multisequence_partition(const RanSeqs &begin_seqs, const RanSeqs &end_seqs, const RankType &rank, RankIterator begin_offsets, Comparator comp=Comparator())
Splits several sorted sequences at a certain global rank, resulting in a splitting point for each seq...
bool operator()(const std::pair< T1, T2 > &p1, const std::pair< T1, T2 > &p2)
int N
Definition: gen_data.py:15
Compare a pair of types lexicographically, ascending.
static uint_pair min()
return an uint_pair instance containing the smallest value possible
Definition: uint_types.hpp:217
static int round_up_to_power_of_two(int i)
does what it says: round up to next power of two