collective.hpp

/*******************************************************************************
 * thrill/net/collective.hpp
 *
 * net::Group is a collection of net::Connections providing simple MPI-like
 * collectives and point-to-point communication.
 *
 * Part of Project Thrill - http://project-thrill.org
 *
 * Copyright (C) 2015 Robert Hangu <[email protected]>
 * Copyright (C) 2015-2016 Timo Bingmann <[email protected]>
 * Copyright (C) 2015 Lorenz Hübschle-Schneider <[email protected]>
 * Copyright (C) 2017 Nejmeddine Douma <[email protected]>
 *
 * All rights reserved. Published under the BSD-2 license in the LICENSE file.
 ******************************************************************************/

#pragma once
#ifndef THRILL_NET_COLLECTIVE_HEADER
#define THRILL_NET_COLLECTIVE_HEADER

#include <thrill/net/group.hpp>

#include <tlx/math/ffs.hpp>
#include <tlx/math/integer_log2.hpp>
#include <tlx/math/round_up_to_power_of_two.hpp>

#include <algorithm>
#include <functional>
#include <vector>

namespace thrill {
namespace net {

//! \addtogroup net_layer
//! \{

/******************************************************************************/
// Prefixsum Algorithms

/*!
 * Calculate the prefix sum for every worker.
 *
 * The prefix sum is the aggregation of the values of all workers with smaller
 * index, including the worker itself, according to a summation operator. The
 * run-time is in O(log n).
 *
 * \param value The value to be summed up
 * \param sum_op A custom summation operator
 * \param initial Initial value prepended to the sum (returned as worker 0's
 * result for exclusive prefix sums)
 * \param inclusive Inclusive prefix sum if true (default)
 */
template <typename T, typename BinarySumOp>
void Group::PrefixSumDoubling(T& value, BinarySumOp sum_op,
                              const T& initial, bool inclusive) {
    static constexpr bool debug = false;

    bool first = true;

    if (my_host_rank() == 0)
        value = sum_op(initial, value);

    // Use a copy: in the exclusive case we have to forward something that is
    // not our own result.
    T to_forward = value;

    // This is based on the pointer-doubling algorithm presented in the ParAlg
    // script, which is used for list ranking.
    for (size_t d = 1; d < num_hosts(); d <<= 1) {

        if (my_host_rank() + d < num_hosts()) {
            sLOG << "Host" << my_host_rank()
                 << ": sending to" << my_host_rank() + d;
            SendTo(my_host_rank() + d, to_forward);
        }

        if (my_host_rank() >= d) {
            T recv_value;
            ReceiveFrom(my_host_rank() - d, &recv_value);
            sLOG << "Host" << my_host_rank()
                 << ": receiving from" << my_host_rank() - d;

            // Take care of the order, so we do not break associativity.
            to_forward = sum_op(recv_value, to_forward);

            if (!first || inclusive) {
                value = sum_op(recv_value, value);
            }
            else {
                value = recv_value;
                first = false;
            }
        }
    }

    // set worker 0's value for exclusive prefix sums
    if (!inclusive && my_host_rank() == 0)
        value = initial;
}
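
// Illustration (not part of the original header; the values are an assumed
// example): an inclusive PrefixSumDoubling over four hosts, each contributing
// value = 1, with initial = 0 and an additive sum_op, proceeds as follows:
//
//   host:              0   1   2   3
//   after round d=1:   1   2   2   2
//   after round d=2:   1   2   3   4   <- final inclusive prefix sums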

/*!
 * \brief Calculate the prefix sum for every worker. Works only for worker
 * numbers which are powers of two.
 *
 * \details The prefix sum is an aggregation of the values of all workers with
 * smaller index, including the worker itself, according to an associative
 * summation operator. This function currently only supports worker numbers
 * which are powers of two.
 *
 * \param value The value to be summed up
 *
 * \param sum_op A custom summation operator
 */
template <typename T, typename BinarySumOp>
void Group::PrefixSumHypercube(T& value, BinarySumOp sum_op) {
    T total_sum = value;

    static constexpr bool debug = false;

    for (size_t d = 1; d < num_hosts(); d <<= 1)
    {
        // communication peer for this round (hypercube dimension)
        size_t peer = my_host_rank() ^ d;

        // Send the total sum of this hypercube to the worker with id = id XOR d
        if (peer < num_hosts()) {
            SendTo(peer, total_sum);
            sLOG << "PREFIX_SUM: host" << my_host_rank()
                 << ": sending to peer" << peer;
        }

        // Receive the total sum of the smaller hypercube from the worker with
        // id = id XOR d
        T recv_data;
        if (peer < num_hosts()) {
            ReceiveFrom(peer, &recv_data);
            // The order of addition is important. The total sum of the smaller
            // hypercube always comes first.
            if (my_host_rank() & d)
                total_sum = sum_op(recv_data, total_sum);
            else
                total_sum = sum_op(total_sum, recv_data);
            // Variable 'value' represents the prefix sum of this worker.
            if (my_host_rank() & d)
                // The order of addition is respected the same way as above.
                value = sum_op(recv_data, value);
            sLOG << "PREFIX_SUM: host" << my_host_rank()
                 << ": received from peer" << peer;
        }
    }

    sLOG << "PREFIX_SUM: host" << my_host_rank() << ": done";
}
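
// Illustration (assumed example, not from the original source): with four
// hosts holding value = 1 each and an additive sum_op, round d=1 pairs hosts
// (0,1) and (2,3), round d=2 pairs (0,2) and (1,3). Every host keeps the
// running total of its current sub-hypercube in total_sum and only adds the
// received partial sum to 'value' when the partner has the smaller rank, so
// the final inclusive prefix sums are 1, 2, 3, 4.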

//! select prefixsum implementation (often due to total number of processors)
template <typename T, typename BinarySumOp>
void Group::PrefixSumSelect(T& value, BinarySumOp sum_op,
                            const T& initial, bool inclusive) {
    return PrefixSumDoubling(value, sum_op, initial, inclusive);
}

template <typename T, typename BinarySumOp>
void Group::PrefixSum(T& value, BinarySumOp sum_op, const T& initial) {
    return PrefixSumSelect(value, sum_op, initial, true);
}

template <typename T, typename BinarySumOp>
void Group::ExPrefixSum(T& value, BinarySumOp sum_op, const T& initial) {
    return PrefixSumSelect(value, sum_op, initial, false);
}
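
// Usage sketch (assumptions: 'group' is a reference to this net::Group, the
// snippet runs on every host, and the default BinarySumOp declared in
// group.hpp adds the values):
//
//   size_t inc = 1, exc = 1;
//   group.PrefixSum(inc);     // on host i: inc == i + 1
//   group.ExPrefixSum(exc);   // on host i: exc == i (host 0 receives T() == 0)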

/******************************************************************************/
// Broadcast Algorithms

/*!
 * Broadcasts the value of the peer with index "origin" to all the others. This
 * is a trivial direct broadcast from the origin peer.
 *
 * \param value The value to be broadcast / received into.
 *
 * \param origin The PE to broadcast the value from.
 */
template <typename T>
void Group::BroadcastTrivial(T& value, size_t origin) {

    if (my_host_rank() == origin) {
        // send value to all peers
        for (size_t p = 0; p < num_hosts(); ++p) {
            if (p != origin)
                SendTo(p, value);
        }
    }
    else {
        // receive from origin
        ReceiveFrom(origin, &value);
    }
}

/*!
 * Broadcasts the value of the worker with index "origin" to all the others.
 * This is a binomial tree broadcast method.
 *
 * \param value The value to be broadcast / received into.
 *
 * \param origin The PE to broadcast the value from.
 */
template <typename T>
void Group::BroadcastBinomialTree(T& value, size_t origin) {
    static constexpr bool debug = false;

    size_t num_hosts = this->num_hosts();
    // calculate rank in the cyclically shifted binomial tree
    size_t my_rank = (my_host_rank() + num_hosts - origin) % num_hosts;
    size_t r = 0, d = 1;
    // receive from predecessor
    if (my_rank > 0) {
        // our predecessor is my_rank with the lowest set one-bit flipped to
        // zero; this bit index also counts the number of rounds in which we
        // may have to send out messages later.
        r = tlx::ffs(my_rank) - 1;
        d <<= r;
        size_t from = ((my_rank ^ d) + origin) % num_hosts;
        sLOG << "Broadcast: rank" << my_rank << "receiving from" << from
             << "in round" << r;
        ReceiveFrom(from, &value);
    }
    else {
        d = tlx::round_up_to_power_of_two(num_hosts);
    }
    // send to successors
    for (d >>= 1; d > 0; d >>= 1, ++r) {
        if (my_rank + d < num_hosts) {
            size_t to = (my_rank + d + origin) % num_hosts;
            sLOG << "Broadcast: rank" << my_rank << "round" << r << "sending to"
                 << to;
            SendTo(to, value);
        }
    }
}
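
// Illustration (assumed example, not from the original source): broadcasting
// from origin = 0 among eight hosts, host 0 starts with d = 8 and sends to
// hosts 4, 2 and 1 (d = 4, 2, 1); host 4 then serves hosts 6 and 5, host 2
// serves host 3, and host 6 serves host 7, so all hosts are reached after
// ceil(log2(8)) = 3 rounds.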

//! select broadcast implementation (often due to total number of processors)
template <typename T>
void Group::BroadcastSelect(T& value, size_t origin) {
    return BroadcastBinomialTree(value, origin);
}

/*!
 * Broadcasts the value of the worker with index "origin" to all the others.
 * This currently selects the binomial tree broadcast method.
 *
 * \param value The value to be broadcast / received into.
 *
 * \param origin The PE to broadcast the value from.
 */
template <typename T>
void Group::Broadcast(T& value, size_t origin) {
    return BroadcastSelect(value, origin);
}
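
// Usage sketch (assumption: 'group' refers to this net::Group and the snippet
// runs on every host):
//
//   size_t seed = 0;
//   if (group.my_host_rank() == 0)
//       seed = 12345;                  // only the origin holds the value
//   group.Broadcast(seed, /* origin */ 0);
//   // afterwards every host holds seed == 12345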

/******************************************************************************/
// AllGather Algorithms

template <typename T>
void Group::AllGatherRecursiveDoublingPowerOfTwo(T* values, size_t n) {
    size_t num_hosts = this->num_hosts();
    size_t my_rank = my_host_rank();
    size_t d = tlx::integer_log2_ceil(num_hosts);

    for (size_t j = 0; j < d; ++j) {
        size_t peer = my_rank ^ (0x1 << j);
        // index of the first element to be sent
        size_t snd_pos = (~((0x1 << j) - 1) & my_rank) * n;
        // index of the first element to be received
        size_t rcv_pos = (~((0x1 << j) - 1) & peer) * n;
        // number of elements to be sent/received
        size_t ins_n = (0x1 << j) * n;

        connection(peer).SendReceive(values + snd_pos, values + rcv_pos, ins_n);
    }
}
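
// Usage sketch (assumptions: 'group' refers to this net::Group, the number of
// hosts is a power of two, and each host contributes n elements): the caller
// provides a buffer of num_hosts() * n elements with its own n elements placed
// at offset my_host_rank() * n; the exchange then fills the remaining blocks.
//
//   size_t n = 1;
//   std::vector<size_t> buf(group.num_hosts() * n);
//   buf[group.my_host_rank() * n] = group.my_host_rank();   // local block
//   group.AllGatherRecursiveDoublingPowerOfTwo(buf.data(), n);
//   // afterwards buf[i] == i on every host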

template <typename T>
void Group::AllGatherBruck(T* values, size_t n) {
    size_t num_hosts = this->num_hosts();
    size_t my_rank = my_host_rank();
    size_t size = num_hosts * n;
    std::vector<T> temp(size);

    for (size_t i = 0; i < n; ++i) {
        temp[i] = values[i];
    }

    for (size_t j = 0; (0x1U << j) < num_hosts; ++j) {
        size_t snd_peer = (my_rank + num_hosts - (0x1 << j)) % num_hosts;
        size_t rcv_peer = (my_rank + (0x1 << j)) % num_hosts;
        // position for received data
        size_t ins_pos = (0x1 << j) * n;
        // number of elements to be sent/received
        size_t ins_n = std::min((0x1 << j) * n, size - ins_pos);

        if ((0x1 << j) & my_rank) {
            connection(rcv_peer).ReceiveN(temp.data() + ins_pos, ins_n);
            connection(snd_peer).SendN(temp.data(), ins_n);
        }
        else {
            connection(snd_peer).SendN(temp.data(), ins_n);
            connection(rcv_peer).ReceiveN(temp.data() + ins_pos, ins_n);
        }
    }

    // local reorder: shift the whole array by my_rank * n to the right
    for (size_t i = 0; i < size; ++i) {
        values[i] = temp[(i + size - my_rank * n) % size];
    }
}
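
// Usage sketch (assumption: 'group' refers to this net::Group; unlike the
// recursive-doubling variant above, Bruck's algorithm also handles host counts
// that are not powers of two): each host passes a buffer of num_hosts() * n
// elements whose first n entries hold its local contribution; after the final
// reorder, block i of the buffer holds host i's elements on every host.
//
//   size_t n = 1;
//   std::vector<size_t> buf(group.num_hosts() * n);
//   buf[0] = group.my_host_rank();     // local contribution goes first
//   group.AllGatherBruck(buf.data(), n);
//   // afterwards buf[i] == i on every host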

/******************************************************************************/
// Reduce Algorithms

/*!
 * \brief Perform a reduction on all workers in a group.
 *
 * \details This function aggregates the values of all workers in the group
 * according to a specified reduction operator. The result is returned in the
 * input variable at the root node.
 *
 * \param value The input value to be used in the reduction. Will be
 * overwritten with the result (on the root) or arbitrary data (on other
 * ranks).
 *
 * \param root The rank of the root
 *
 * \param sum_op A custom reduction operator (optional)
 */
template <typename T, typename BinarySumOp>
void Group::Reduce(T& value, size_t root, BinarySumOp sum_op) {
    static constexpr bool debug = false;
    const size_t num_hosts = this->num_hosts();
    // offset the rank by num_hosts so the modulo arithmetic below cannot
    // underflow
    const size_t my_rank = my_host_rank() + num_hosts;
    const size_t shifted_rank = (my_rank - root) % num_hosts;
    sLOG << my_host_rank() << "shifted_rank" << shifted_rank;

    for (size_t d = 1; d < num_hosts; d <<= 1) {
        if (shifted_rank & d) {
            sLOG << "Reduce" << my_host_rank()
                 << "->" << (my_rank - d) % num_hosts << "/"
                 << shifted_rank << "->" << shifted_rank - d;
            SendTo((my_rank - d) % num_hosts, value);
            break;
        }
        else if (shifted_rank + d < num_hosts) {
            sLOG << "Reduce" << my_host_rank()
                 << "<-" << (my_rank + d) % num_hosts << "/"
                 << shifted_rank << "<-" << shifted_rank + d;
            T recv_data;
            ReceiveFrom((my_rank + d) % num_hosts, &recv_data);
            value = sum_op(value, recv_data);
        }
    }
}
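
// Usage sketch (assumptions: 'group' refers to this net::Group, the snippet
// runs on every host, and the default BinarySumOp adds the values):
//
//   size_t local = group.my_host_rank() + 1;
//   group.Reduce(local, /* root */ 0);
//   // on host 0: local == num_hosts * (num_hosts + 1) / 2
//   // on all other hosts: local holds unspecified intermediate data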

/******************************************************************************/
// AllReduce Algorithms

/*!
 * Perform an All-Reduce on the workers. This is done by aggregating all values
 * according to a summation operator and sending the result back to all
 * workers.
 *
 * \param value The value to be added to the aggregation
 * \param sum_op A custom summation operator
 */
template <typename T, typename BinarySumOp>
void Group::AllReduceSimple(T& value, BinarySumOp sum_op) {
    Reduce(value, 0, sum_op);
    Broadcast(value, 0);
}

/*!
 * Perform an All-Reduce by reducing all values at the root (host 0) and then
 * sending the result back to all other peers.
 *
 * \param value The value to be added to the aggregation
 *
 * \param sum_op A custom summation operator
 */
template <typename T, typename BinarySumOp>
void Group::AllReduceAtRoot(T& value, BinarySumOp sum_op) {

    if (my_host_rank() == 0) {
        // receive a value from every peer and reduce
        for (size_t p = 1; p < num_hosts(); ++p) {
            T recv_value;
            ReceiveFrom(p, &recv_value);
            value = sum_op(value, recv_value);
        }
        // send the reduced value back to all peers
        for (size_t p = 1; p < num_hosts(); ++p) {
            SendTo(p, value);
        }
    }
    else {
        // send to the root host
        SendTo(0, value);
        // receive the reduced value back from the root
        ReceiveFrom(0, &value);
    }
}

/*!
 * Perform an All-Reduce for powers of two. This is done with the hypercube
 * algorithm from the ParAlg script.
 *
 * \note This method is no longer used, but it is kept here for reference.
 *
 * \param value The value to be added to the aggregation
 * \param sum_op A custom summation operator
 */
template <typename T, typename BinarySumOp>
void Group::AllReduceHypercube(T& value, BinarySumOp sum_op) {
    // For each dimension of the hypercube, exchange data between workers with
    // different bits at position d.

    // static constexpr bool debug = false;

    for (size_t d = 1; d < num_hosts(); d <<= 1) {
        // communication peer for this round (hypercube dimension)
        size_t peer = my_host_rank() ^ d;

        // SendReceive the value with the worker whose id is id XOR d
        if (peer < num_hosts()) {
            // LOG << "ALL_REDUCE_HYPERCUBE: Host" << my_host_rank()
            //     << ": Sending" << value << " to worker" << peer;

            // The order of addition is important. The total sum of the smaller
            // hypercube always comes first.
            T recv_data;
            if (my_host_rank() & d) {
                connection(peer).SendReceive(&value, &recv_data);
                value = sum_op(recv_data, value);
            }
            else {
                connection(peer).ReceiveSend(value, &recv_data);
                value = sum_op(value, recv_data);
            }

            // LOG << "ALL_REDUCE_HYPERCUBE: Host " << my_host_rank()
            //     << ": Received " << recv_data
            //     << " from worker " << peer << " value = " << value;
        }
    }
}
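
// Illustration (assumed example, not from the original source): with four
// hosts holding value = 1 each and an additive sum_op, round d=1 pairs hosts
// (0,1) and (2,3) and leaves every host with value = 2; round d=2 pairs
// (0,2) and (1,3) and leaves every host with value = 4, the global sum.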

/*!
 * Perform an All-Reduce using the elimination protocol described in
 * R. Rabenseifner and J. L. Traeff. "More Efficient Reduction Algorithms for
 * Non-Power-of-Two Number of Processors in Message-Passing Parallel Systems."
 * In Recent Advances in Parallel Virtual Machine and Message Passing Interface,
 * 36–46. LNCS 3241. Springer, 2004.
 *
 * \param value The value to be added to the aggregation
 * \param sum_op A custom summation operator
 */
template <typename T, typename BinarySumOp>
void Group::AllReduceElimination(T& value, BinarySumOp sum_op) {
    AllReduceEliminationProcess(
        my_host_rank(), 1, num_hosts(), 0, value, sum_op);
}

//! exchange values with a peer and reduce them in a rank-consistent order;
//! helper for the elimination protocol
template <typename T, typename BinarySumOp>
T Group::SendReceiveReduce(size_t peer, const T& value, BinarySumOp sum_op) {
    T recv_data;
    if (my_host_rank() > peer) {
        connection(peer).SendReceive(&value, &recv_data);
        return sum_op(recv_data, value);
    }
    else {
        connection(peer).ReceiveSend(value, &recv_data);
        return sum_op(value, recv_data);
    }
}

//! used for the recursive implementation of the elimination protocol
template <typename T, typename BinarySumOp>
void Group::AllReduceEliminationProcess(
    size_t host_id, size_t group_size, size_t remaining_hosts,
    size_t send_to, T& value, BinarySumOp sum_op) {

    // static const bool debug = false;

    // send_to == 0 => no eliminated host is waiting to receive from the
    // current host; host 0 is never eliminated.

    size_t group_count = remaining_hosts / group_size;
    if (group_count % 2 == 0) {
        // only hypercube
        size_t peer = host_id ^ group_size;
        if (peer < remaining_hosts) {
            value = SendReceiveReduce(peer, value, sum_op);
        }
    }
    else {
        // check if my rank is in the 3-2 elimination zone
        size_t host_group = host_id / group_size;
        if (host_group >= group_count - 3) {
            // take part in the 3-2 elimination
            if (host_group == group_count - 1) {
                size_t peer = (host_id ^ group_size) - 2 * group_size;
                SendTo(peer, value);
                ReceiveFrom(peer, &value);
            }
            else if (host_group == group_count - 2) {
                size_t peer = (host_id ^ group_size) + 2 * group_size;

                T recv_data;
                ReceiveFrom(peer, &recv_data);
                if (my_host_rank() > peer)
                    value = sum_op(recv_data, value);
                else
                    value = sum_op(value, recv_data);

                // important for gathering
                send_to = peer;

                peer = host_id ^ group_size;
                value = SendReceiveReduce(peer, value, sum_op);
            }
            else if (host_group == group_count - 3) {
                size_t peer = host_id ^ group_size;
                value = SendReceiveReduce(peer, value, sum_op);
            }
        }
        else {
            // no elimination, execute hypercube
            size_t peer = host_id ^ group_size;
            if (peer < remaining_hosts) {
                value = SendReceiveReduce(peer, value, sum_op);
            }
        }
        remaining_hosts -= group_size;
    }
    group_size <<= 1;

    // recursion
    if (group_size < remaining_hosts) {
        AllReduceEliminationProcess(
            host_id, group_size, remaining_hosts, send_to,
            value, sum_op);
    }
    else if (send_to != 0) {
        SendTo(send_to, value);
    }
}
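
// Illustration (assumed example, not from the original source): with three
// hosts, group_count == 3 triggers a 3-2 elimination. Host 2 sends its value
// to host 1 and waits; host 1 folds that value in and remembers send_to = 2;
// hosts 0 and 1 then perform an ordinary pairwise exchange via
// SendReceiveReduce; finally host 1 forwards the finished sum to the
// eliminated host 2, so all three hosts end up with the global result.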

//! select allreduce implementation (often due to total number of processors)
template <typename T, typename BinarySumOp>
void Group::AllReduceSelect(T& value, BinarySumOp sum_op) {
    // always use the 3-2 elimination reduction method.
    AllReduceElimination(value, sum_op);
    /*
    if (tlx::is_power_of_two(num_hosts()))
        AllReduceHypercube(value, sum_op);
    else
        AllReduceAtRoot(value, sum_op);
    */
}

/*!
 * Perform an All-Reduce on the workers. This is done by aggregating all values
 * according to a summation operator and sending the result back to all
 * workers.
 *
 * \param value The value to be added to the aggregation
 * \param sum_op A custom summation operator
 */
template <typename T, typename BinarySumOp>
void Group::AllReduce(T& value, BinarySumOp sum_op) {
    return AllReduceSelect(value, sum_op);
}
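
// Usage sketch (assumptions: 'group' refers to this net::Group, the snippet
// runs on every host, the default BinarySumOp adds the values, and
// my_local_measurement() is a hypothetical helper):
//
//   size_t ones = 1;
//   group.AllReduce(ones);
//   // afterwards ones == group.num_hosts() on every host
//
//   double largest = my_local_measurement();
//   group.AllReduce(largest, [](double a, double b) { return std::max(a, b); });
//   // afterwards largest is the maximum over all hosts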

//! \}

} // namespace net
} // namespace thrill

#endif // !THRILL_NET_COLLECTIVE_HEADER

/******************************************************************************/