Thrill  0.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
multisequence_selection.hpp
Go to the documentation of this file.
1 /*******************************************************************************
2  * tlx/algorithm/multisequence_selection.hpp
3  *
4  * Implementation of multisequence partition and selection.
5  *
6  * Copied and modified from STXXL, see http://stxxl.org, which itself extracted
7  * it from MCSTL http://algo2.iti.uni-karlsruhe.de/singler/mcstl/. Both are
8  * distributed under the Boost Software License, Version 1.0.
9  *
10  * Part of tlx - http://panthema.net/tlx
11  *
12  * Copyright (C) 2007 Johannes Singler <[email protected]>
13  * Copyright (C) 2014-2018 Timo Bingmann <[email protected]>
14  *
15  * All rights reserved. Published under the Boost Software License, Version 1.0
16  ******************************************************************************/
17 
18 #ifndef TLX_ALGORITHM_MULTISEQUENCE_SELECTION_HEADER
19 #define TLX_ALGORITHM_MULTISEQUENCE_SELECTION_HEADER
20 
21 #include <algorithm>
22 #include <cassert>
23 #include <functional>
24 #include <queue>
25 #include <utility>
26 #include <vector>
27 
30 
31 namespace tlx {
32 
33 //! \addtogroup tlx_algorithm
34 //! \{
35 
36 namespace multisequence_selection_detail {
37 
38 //! Compare a pair of types lexicographically, ascending.
39 template <typename T1, typename T2, typename Comparator>
41  : public std::binary_function<std::pair<T1, T2>, std::pair<T1, T2>, bool>
42 {
43 protected:
44  Comparator& comp_;
45 
46 public:
47  explicit lexicographic(Comparator& comp) : comp_(comp) { }
48 
49  inline bool operator () (const std::pair<T1, T2>& p1,
50  const std::pair<T1, T2>& p2) {
51  if (comp_(p1.first, p2.first))
52  return true;
53 
54  if (comp_(p2.first, p1.first))
55  return false;
56 
57  // firsts are equal
58  return p1.second < p2.second;
59  }
60 };
61 
62 //! Compare a pair of types lexicographically, descending.
63 template <typename T1, typename T2, typename Comparator>
65  : public std::binary_function<std::pair<T1, T2>, std::pair<T1, T2>, bool>
66 {
67 protected:
68  Comparator& comp_;
69 
70 public:
71  explicit lexicographic_rev(Comparator& comp) : comp_(comp) { }
72 
73  inline bool operator () (const std::pair<T1, T2>& p1,
74  const std::pair<T1, T2>& p2) {
75  if (comp_(p2.first, p1.first))
76  return true;
77 
78  if (comp_(p1.first, p2.first))
79  return false;
80 
81  // firsts are equal
82  return p2.second < p1.second;
83  }
84 };
85 
86 } // namespace multisequence_selection_detail
87 
88 /*!
89  * Selects the element at a certain global rank from several sorted sequences.
90  *
91  * The sequences are passed via a sequence of random-access iterator pairs, none
92  * of the sequences may be empty.
93  *
94  * \param begin_seqs Begin of the sequence of iterator pairs.
95  * \param end_seqs End of the sequence of iterator pairs.
96  * \param rank The global rank to partition at.
97  * \param offset The rank of the selected element in the global subsequence of
98  * elements equal to the selected element. If the selected element is unique,
99  * this number is 0.
100  * \param comp The ordering functor, defaults to std::less. */
101 template <typename ValueType, typename RanSeqs, typename RankType,
102  typename Comparator = std::less<ValueType> >
104  const RanSeqs& begin_seqs, const RanSeqs& end_seqs,
105  const RankType& rank,
106  RankType& offset,
107  Comparator comp = Comparator()) {
108 
109  using iterator = typename std::iterator_traits<RanSeqs>
110  ::value_type::first_type;
111  using diff_type = typename std::iterator_traits<iterator>
112  ::difference_type;
113 
114  using SamplePair = std::pair<ValueType, diff_type>;
115 
116  using namespace multisequence_selection_detail;
117 
118  // comparators for SamplePair
119  lexicographic<ValueType, diff_type, Comparator> lcomp(comp);
120  lexicographic_rev<ValueType, diff_type, Comparator> lrcomp(comp);
121 
122  // number of sequences, number of elements in total (possibly including
123  // padding)
124  const diff_type m = std::distance(begin_seqs, end_seqs);
125  diff_type nmax, n;
126  RankType N = 0;
127 
128  for (diff_type i = 0; i < m; ++i)
129  N += std::distance(begin_seqs[i].first, begin_seqs[i].second);
130 
131  if (m == 0 || N == 0 || rank < 0 || rank >= N)
132  // result undefined when there is no data or rank is outside bounds
133  throw std::exception();
134 
135  simple_vector<diff_type> seqlen(m);
136 
137  seqlen[0] = std::distance(begin_seqs[0].first, begin_seqs[0].second);
138  nmax = seqlen[0];
139  for (diff_type i = 1; i < m; ++i)
140  {
141  seqlen[i] = std::distance(begin_seqs[i].first, begin_seqs[i].second);
142  nmax = std::max(nmax, seqlen[i]);
143  }
144 
145  // pad all lists to this length, at least as long as any ns[i], equliaty iff
146  // nmax = 2^k - 1
147  diff_type l = round_up_to_power_of_two(nmax + 1) - 1;
148 
149  simple_vector<diff_type> a(m), b(m);
150 
151  for (diff_type i = 0; i < m; ++i)
152  a[i] = 0, b[i] = l;
153 
154  n = l / 2;
155 
156  // invariants:
157  // 0 <= a[i] <= seqlen[i], 0 <= b[i] <= l
158 
159  // initial partition
160 
161  std::vector<SamplePair> sample;
162 
163  for (diff_type i = 0; i < m; ++i) {
164  if (n < seqlen[i])
165  sample.push_back(SamplePair(begin_seqs[i].first[n], i));
166  }
167 
168  std::sort(sample.begin(), sample.end(), lcomp);
169 
170  for (diff_type i = 0; i < m; ++i) {
171  // conceptual infinity
172  if (n >= seqlen[i])
173  sample.push_back(
174  SamplePair(begin_seqs[i].first[0] /* dummy element */, i));
175  }
176 
177  size_t localrank = static_cast<size_t>(rank / l);
178 
179  size_t j;
180  for (j = 0; j < localrank && ((n + 1) <= seqlen[sample[j].second]); ++j)
181  a[sample[j].second] += n + 1;
182  for ( ; j < static_cast<size_t>(m); ++j)
183  b[sample[j].second] -= n + 1;
184 
185  // further refinement
186 
187  while (n > 0)
188  {
189  n /= 2;
190 
191  const ValueType* lmax = nullptr;
192  for (diff_type i = 0; i < m; ++i)
193  {
194  if (a[i] > 0)
195  {
196  if (!lmax)
197  {
198  lmax = &(begin_seqs[i].first[a[i] - 1]);
199  }
200  else
201  {
202  // max
203  if (comp(*lmax, begin_seqs[i].first[a[i] - 1]))
204  lmax = &(begin_seqs[i].first[a[i] - 1]);
205  }
206  }
207  }
208 
209  for (diff_type i = 0; i < m; ++i)
210  {
211  diff_type middle = (b[i] + a[i]) / 2;
212  if (lmax && middle < seqlen[i] &&
213  comp(begin_seqs[i].first[middle], *lmax))
214  a[i] = std::min(a[i] + n + 1, seqlen[i]);
215  else
216  b[i] -= n + 1;
217  }
218 
219  diff_type leftsize = 0;
220  for (diff_type i = 0; i < m; ++i)
221  leftsize += a[i] / (n + 1);
222 
223  diff_type skew = rank / (n + 1) - leftsize;
224 
225  if (skew > 0)
226  {
227  // move to the left, find smallest
228  std::priority_queue<
229  SamplePair, std::vector<SamplePair>,
230  lexicographic_rev<ValueType, diff_type, Comparator> >
231  pq(lrcomp);
232 
233  for (diff_type i = 0; i < m; ++i) {
234  if (b[i] < seqlen[i])
235  pq.push(SamplePair(begin_seqs[i].first[b[i]], i));
236  }
237 
238  for ( ; skew != 0 && !pq.empty(); --skew)
239  {
240  diff_type src = pq.top().second;
241  pq.pop();
242 
243  a[src] = std::min(a[src] + n + 1, seqlen[src]);
244  b[src] += n + 1;
245 
246  if (b[src] < seqlen[src])
247  pq.push(SamplePair(begin_seqs[src].first[b[src]], src));
248  }
249  }
250  else if (skew < 0)
251  {
252  // move to the right, find greatest
253  std::priority_queue<
254  SamplePair, std::vector<SamplePair>,
255  lexicographic<ValueType, diff_type, Comparator> >
256  pq(lcomp);
257 
258  for (diff_type i = 0; i < m; ++i) {
259  if (a[i] > 0)
260  pq.push(SamplePair(begin_seqs[i].first[a[i] - 1], i));
261  }
262 
263  for ( ; skew != 0; ++skew)
264  {
265  diff_type src = pq.top().second;
266  pq.pop();
267 
268  a[src] -= n + 1;
269  b[src] -= n + 1;
270 
271  if (a[src] > 0)
272  pq.push(SamplePair(begin_seqs[src].first[a[src] - 1], src));
273  }
274  }
275  }
276 
277  // postconditions: a[i] == b[i] in most cases, except when a[i] has been
278  // clamped because of having reached the boundary
279 
280  // now return the result, calculate the offset, compare the keys on both
281  // edges of the border
282 
283  // maximum of left edge, minimum of right edge
284  ValueType* maxleft = nullptr, * minright = nullptr;
285  for (diff_type i = 0; i < m; ++i)
286  {
287  if (a[i] > 0)
288  {
289  if (!maxleft)
290  {
291  maxleft = &(begin_seqs[i].first[a[i] - 1]);
292  }
293  else
294  {
295  // max
296  if (comp(*maxleft, begin_seqs[i].first[a[i] - 1]))
297  maxleft = &(begin_seqs[i].first[a[i] - 1]);
298  }
299  }
300  if (b[i] < seqlen[i])
301  {
302  if (!minright)
303  {
304  minright = &(begin_seqs[i].first[b[i]]);
305  }
306  else
307  {
308  // min
309  if (comp(begin_seqs[i].first[b[i]], *minright))
310  minright = &(begin_seqs[i].first[b[i]]);
311  }
312  }
313  }
314 
315  // minright is the splitter, in any case
316 
317  if (!maxleft || comp(*minright, *maxleft))
318  {
319  // good luck, everything is split unambigiously
320  offset = 0;
321  }
322  else
323  {
324  // we have to calculate an offset
325  offset = 0;
326 
327  for (diff_type i = 0; i < m; ++i)
328  {
329  diff_type lb = std::lower_bound(
330  begin_seqs[i].first, begin_seqs[i].first + seqlen[i],
331  *minright, comp) - begin_seqs[i].first;
332  offset += a[i] - lb;
333  }
334  }
335 
336  return *minright;
337 }
338 
339 //! \}
340 
341 } // namespace tlx
342 
343 #endif // !TLX_ALGORITHM_MULTISEQUENCE_SELECTION_HEADER
344 
345 /******************************************************************************/
Compare a pair of types lexicographically, ascending.
static uint_pair max()
return an uint_pair instance containing the largest value possible
Definition: uint_types.hpp:226
Simpler non-growing vector without initialization.
int round_up_to_power_of_two(int i)
does what it says: round up to next power of two
ValueType multisequence_selection(const RanSeqs &begin_seqs, const RanSeqs &end_seqs, const RankType &rank, RankType &offset, Comparator comp=Comparator())
Selects the element at a certain global rank from several sorted sequences.
bool operator()(const std::pair< T1, T2 > &p1, const std::pair< T1, T2 > &p2)
bool operator()(const std::pair< T1, T2 > &p1, const std::pair< T1, T2 > &p2)
int N
Definition: gen_data.py:15
Compare a pair of types lexicographically, descending.
static uint_pair min()
return an uint_pair instance containing the smallest value possible
Definition: uint_types.hpp:217