Thrill  0.1
multisequence_selection.hpp
Go to the documentation of this file.
1 /*******************************************************************************
2  * tlx/algorithm/multisequence_selection.hpp
3  *
4  * Implementation of multisequence partition and selection.
5  *
6  * Copied and modified from STXXL, see http://stxxl.org, which itself extracted
7  * it from MCSTL http://algo2.iti.uni-karlsruhe.de/singler/mcstl/. Both are
8  * distributed under the Boost Software License, Version 1.0.
9  *
10  * Part of tlx - http://panthema.net/tlx
11  *
12  * Copyright (C) 2007 Johannes Singler <[email protected]>
13  * Copyright (C) 2014-2018 Timo Bingmann <[email protected]>
14  *
15  * All rights reserved. Published under the Boost Software License, Version 1.0
16  ******************************************************************************/
17 
18 #ifndef TLX_ALGORITHM_MULTISEQUENCE_SELECTION_HEADER
19 #define TLX_ALGORITHM_MULTISEQUENCE_SELECTION_HEADER
20 
21 #include <algorithm>
22 #include <cassert>
23 #include <queue>
24 #include <utility>
25 #include <vector>
26 
29 
30 namespace tlx {
31 
32 //! \addtogroup tlx_algorithm
33 //! \{
34 
35 namespace multisequence_selection_detail {
36 
37 //! Compare a pair of types lexicographically, ascending.
38 template <typename T1, typename T2, typename Comparator>
40 {
41 protected:
42  Comparator& comp_;
43 
44 public:
45  explicit lexicographic(Comparator& comp) : comp_(comp) { }
46 
47  inline bool operator () (const std::pair<T1, T2>& p1,
48  const std::pair<T1, T2>& p2) {
49  if (comp_(p1.first, p2.first))
50  return true;
51 
52  if (comp_(p2.first, p1.first))
53  return false;
54 
55  // firsts are equal
56  return p1.second < p2.second;
57  }
58 };
59 
60 //! Compare a pair of types lexicographically, descending.
61 template <typename T1, typename T2, typename Comparator>
63 {
64 protected:
65  Comparator& comp_;
66 
67 public:
68  explicit lexicographic_rev(Comparator& comp) : comp_(comp) { }
69 
70  inline bool operator () (const std::pair<T1, T2>& p1,
71  const std::pair<T1, T2>& p2) {
72  if (comp_(p2.first, p1.first))
73  return true;
74 
75  if (comp_(p1.first, p2.first))
76  return false;
77 
78  // firsts are equal
79  return p2.second < p1.second;
80  }
81 };
82 
83 } // namespace multisequence_selection_detail
84 
85 /*!
86  * Selects the element at a certain global rank from several sorted sequences.
87  *
88  * The sequences are passed via a sequence of random-access iterator pairs, none
89  * of the sequences may be empty.
90  *
91  * \param begin_seqs Begin of the sequence of iterator pairs.
92  * \param end_seqs End of the sequence of iterator pairs.
93  * \param rank The global rank to partition at.
94  * \param offset The rank of the selected element in the global subsequence of
95  * elements equal to the selected element. If the selected element is unique,
96  * this number is 0.
97  * \param comp The ordering functor, defaults to std::less. */
98 template <typename ValueType, typename RanSeqs, typename RankType,
99  typename Comparator = std::less<ValueType> >
101  const RanSeqs& begin_seqs, const RanSeqs& end_seqs,
102  const RankType& rank,
103  RankType& offset,
104  Comparator comp = Comparator()) {
105 
106  using iterator = typename std::iterator_traits<RanSeqs>
107  ::value_type::first_type;
108  using diff_type = typename std::iterator_traits<iterator>
109  ::difference_type;
110 
111  using SamplePair = std::pair<ValueType, diff_type>;
112 
113  using namespace multisequence_selection_detail;
114 
115  // comparators for SamplePair
116  lexicographic<ValueType, diff_type, Comparator> lcomp(comp);
117  lexicographic_rev<ValueType, diff_type, Comparator> lrcomp(comp);
118 
119  // number of sequences, number of elements in total (possibly including
120  // padding)
121  const diff_type m = std::distance(begin_seqs, end_seqs);
122  diff_type nmax, n;
123  RankType N = 0;
124 
125  for (diff_type i = 0; i < m; ++i)
126  N += std::distance(begin_seqs[i].first, begin_seqs[i].second);
127 
128  if (m == 0 || N == 0 || rank < 0 || rank >= N)
129  // result undefined when there is no data or rank is outside bounds
130  throw std::exception();
131 
132  simple_vector<diff_type> seqlen(m);
133 
134  seqlen[0] = std::distance(begin_seqs[0].first, begin_seqs[0].second);
135  nmax = seqlen[0];
136  for (diff_type i = 1; i < m; ++i)
137  {
138  seqlen[i] = std::distance(begin_seqs[i].first, begin_seqs[i].second);
139  nmax = std::max(nmax, seqlen[i]);
140  }
141 
142  // pad all lists to this length, at least as long as any ns[i], equliaty iff
143  // nmax = 2^k - 1
144  diff_type l = round_up_to_power_of_two(nmax + 1) - 1;
145 
146  simple_vector<diff_type> a(m), b(m);
147 
148  for (diff_type i = 0; i < m; ++i)
149  a[i] = 0, b[i] = l;
150 
151  n = l / 2;
152 
153  // invariants:
154  // 0 <= a[i] <= seqlen[i], 0 <= b[i] <= l
155 
156  // initial partition
157 
158  std::vector<SamplePair> sample;
159 
160  for (diff_type i = 0; i < m; ++i) {
161  if (n < seqlen[i])
162  sample.push_back(SamplePair(begin_seqs[i].first[n], i));
163  }
164 
165  std::sort(sample.begin(), sample.end(), lcomp);
166 
167  for (diff_type i = 0; i < m; ++i) {
168  // conceptual infinity
169  if (n >= seqlen[i])
170  sample.push_back(
171  SamplePair(begin_seqs[i].first[0] /* dummy element */, i));
172  }
173 
174  size_t localrank = static_cast<size_t>(rank / l);
175 
176  size_t j;
177  for (j = 0; j < localrank && ((n + 1) <= seqlen[sample[j].second]); ++j)
178  a[sample[j].second] += n + 1;
179  for ( ; j < static_cast<size_t>(m); ++j)
180  b[sample[j].second] -= n + 1;
181 
182  // further refinement
183 
184  while (n > 0)
185  {
186  n /= 2;
187 
188  const ValueType* lmax = nullptr;
189  for (diff_type i = 0; i < m; ++i)
190  {
191  if (a[i] > 0)
192  {
193  if (!lmax)
194  {
195  lmax = &(begin_seqs[i].first[a[i] - 1]);
196  }
197  else
198  {
199  // max
200  if (comp(*lmax, begin_seqs[i].first[a[i] - 1]))
201  lmax = &(begin_seqs[i].first[a[i] - 1]);
202  }
203  }
204  }
205 
206  for (diff_type i = 0; i < m; ++i)
207  {
208  diff_type middle = (b[i] + a[i]) / 2;
209  if (lmax && middle < seqlen[i] &&
210  comp(begin_seqs[i].first[middle], *lmax))
211  a[i] = std::min(a[i] + n + 1, seqlen[i]);
212  else
213  b[i] -= n + 1;
214  }
215 
216  diff_type leftsize = 0;
217  for (diff_type i = 0; i < m; ++i)
218  leftsize += a[i] / (n + 1);
219 
220  diff_type skew = rank / (n + 1) - leftsize;
221 
222  if (skew > 0)
223  {
224  // move to the left, find smallest
225  std::priority_queue<
226  SamplePair, std::vector<SamplePair>,
227  lexicographic_rev<ValueType, diff_type, Comparator> >
228  pq(lrcomp);
229 
230  for (diff_type i = 0; i < m; ++i) {
231  if (b[i] < seqlen[i])
232  pq.push(SamplePair(begin_seqs[i].first[b[i]], i));
233  }
234 
235  for ( ; skew != 0 && !pq.empty(); --skew)
236  {
237  diff_type src = pq.top().second;
238  pq.pop();
239 
240  a[src] = std::min(a[src] + n + 1, seqlen[src]);
241  b[src] += n + 1;
242 
243  if (b[src] < seqlen[src])
244  pq.push(SamplePair(begin_seqs[src].first[b[src]], src));
245  }
246  }
247  else if (skew < 0)
248  {
249  // move to the right, find greatest
250  std::priority_queue<
251  SamplePair, std::vector<SamplePair>,
252  lexicographic<ValueType, diff_type, Comparator> >
253  pq(lcomp);
254 
255  for (diff_type i = 0; i < m; ++i) {
256  if (a[i] > 0)
257  pq.push(SamplePair(begin_seqs[i].first[a[i] - 1], i));
258  }
259 
260  for ( ; skew != 0; ++skew)
261  {
262  diff_type src = pq.top().second;
263  pq.pop();
264 
265  a[src] -= n + 1;
266  b[src] -= n + 1;
267 
268  if (a[src] > 0)
269  pq.push(SamplePair(begin_seqs[src].first[a[src] - 1], src));
270  }
271  }
272  }
273 
274  // postconditions: a[i] == b[i] in most cases, except when a[i] has been
275  // clamped because of having reached the boundary
276 
277  // now return the result, calculate the offset, compare the keys on both
278  // edges of the border
279 
280  // maximum of left edge, minimum of right edge
281  ValueType* maxleft = nullptr, * minright = nullptr;
282  for (diff_type i = 0; i < m; ++i)
283  {
284  if (a[i] > 0)
285  {
286  if (!maxleft)
287  {
288  maxleft = &(begin_seqs[i].first[a[i] - 1]);
289  }
290  else
291  {
292  // max
293  if (comp(*maxleft, begin_seqs[i].first[a[i] - 1]))
294  maxleft = &(begin_seqs[i].first[a[i] - 1]);
295  }
296  }
297  if (b[i] < seqlen[i])
298  {
299  if (!minright)
300  {
301  minright = &(begin_seqs[i].first[b[i]]);
302  }
303  else
304  {
305  // min
306  if (comp(begin_seqs[i].first[b[i]], *minright))
307  minright = &(begin_seqs[i].first[b[i]]);
308  }
309  }
310  }
311 
312  // minright is the splitter, in any case
313 
314  if (!maxleft || comp(*minright, *maxleft))
315  {
316  // good luck, everything is split unambigiously
317  offset = 0;
318  }
319  else
320  {
321  // we have to calculate an offset
322  offset = 0;
323 
324  for (diff_type i = 0; i < m; ++i)
325  {
326  diff_type lb = std::lower_bound(
327  begin_seqs[i].first, begin_seqs[i].first + seqlen[i],
328  *minright, comp) - begin_seqs[i].first;
329  offset += a[i] - lb;
330  }
331  }
332 
333  return *minright;
334 }
335 
336 //! \}
337 
338 } // namespace tlx
339 
340 #endif // !TLX_ALGORITHM_MULTISEQUENCE_SELECTION_HEADER
341 
342 /******************************************************************************/
Compare a pair of types lexicographically, ascending.
static uint_pair max()
return an uint_pair instance containing the largest value possible
Definition: uint_types.hpp:226
Simpler non-growing vector without initialization.
ValueType multisequence_selection(const RanSeqs &begin_seqs, const RanSeqs &end_seqs, const RankType &rank, RankType &offset, Comparator comp=Comparator())
Selects the element at a certain global rank from several sorted sequences.
bool operator()(const std::pair< T1, T2 > &p1, const std::pair< T1, T2 > &p2)
int N
Definition: gen_data.py:15
Compare a pair of types lexicographically, descending.
static uint_pair min()
return an uint_pair instance containing the smallest value possible
Definition: uint_types.hpp:217
static int round_up_to_power_of_two(int i)
does what it says: round up to next power of two