Distributed Ranges
Loading...
Searching...
No Matches
reduce.hpp
1// SPDX-FileCopyrightText: Intel Corporation
2//
3// SPDX-License-Identifier: BSD-3-Clause
4
5#pragma once
6
7namespace dr::mp::__detail {
8
9inline auto std_reduce(rng::forward_range auto &&r, auto &&binary_op) {
10 using value_type = rng::range_value_t<decltype(r)>;
11 if (rng::empty(r)) {
12 return value_type{};
13 } else {
14 auto skip1 = rng::begin(r);
15 skip1++;
16 // Explicit cast from distributed_vector reference to value_type
17 return std::reduce(std::execution::par_unseq, skip1, rng::end(r),
18 value_type(*rng::begin(r)), binary_op);
19 }
20}
21
22inline auto dpl_reduce(rng::forward_range auto &&r, auto &&binary_op) {
23 rng::range_value_t<decltype(r)> none{};
24#ifdef SYCL_LANGUAGE_VERSION
25 if (rng::empty(r)) {
26 return none;
27 } else {
28 using T = rng::range_value_t<decltype(r)>;
29 using Fn = decltype(binary_op);
30 if constexpr (sycl::has_known_identity_v<Fn, T>) {
31 dr::drlog.debug(" known identity\n");
32 return std::reduce(dpl_policy(),
33 dr::__detail::direct_iterator(rng::begin(r)),
35 sycl::known_identity_v<Fn, T>, binary_op);
36 } else {
37 dr::drlog.debug(" peel 1st value\n");
38 return std::reduce(dpl_policy(),
39 dr::__detail::direct_iterator(rng::begin(r) + 1),
41 sycl_get_deref(rng::begin(r)), binary_op);
42 // We are not using below code, because we don't want to dereference
43 // rng::begin(r) beyond SYCL environment - the * operator may require
44 // complex operation that relies on GPU memory access (for example
45 // transform view iterator) return std::reduce(dpl_policy(),
46 // dr::__detail::direct_iterator(rng::begin(r) + 1),
47 // dr::__detail::direct_iterator(rng::end(r)),
48 // sycl_get(*rng::begin(r)), binary_op);
49 }
50 }
51#else
52 assert(false);
53 return none;
54#endif
55}
56
58template <dr::distributed_range DR>
59auto reduce(std::size_t root, bool root_provided, DR &&dr, auto &&binary_op) {
60 using value_type = rng::range_value_t<DR>;
61 auto comm = default_comm();
62
63 if (rng::empty(dr)) {
64 return rng::range_value_t<DR>{};
65 }
66
67 if (aligned(dr)) {
68 dr::drlog.debug("Parallel reduce\n");
69
70 // Reduce the local segments
71 auto reduce = [=](auto &&r) {
72 assert(rng::size(r) > 0);
73 if (mp::use_sycl()) {
74 dr::drlog.debug(" with DPL\n");
75 return dpl_reduce(r, binary_op);
76 } else {
77 dr::drlog.debug(" with CPU\n");
78 return std_reduce(r, binary_op);
79 }
80 };
81 auto locals = rng::views::transform(local_segments(dr), reduce);
82 auto local = std_reduce(locals, binary_op);
83
84 std::vector<value_type> all(comm.size());
85 if (root_provided) {
86 // Everyone gathers to root, only root reduces
87 comm.gather(local, std::span{all}, root);
88 if (root == comm.rank()) {
89 return std_reduce(all, binary_op);
90 } else {
91 return value_type{};
92 }
93 } else {
94 // Everyone gathers and everyone reduces
95 comm.all_gather(local, all);
96 return std_reduce(all, binary_op);
97 }
98 } else {
99 dr::drlog.debug("Serial reduce\n");
100 value_type result{};
101 if (!root_provided || root == comm.rank()) {
102 result = std_reduce(dr, binary_op);
103 }
104 barrier();
105 return result;
106 }
107}
108
109// handles init
110template <typename T, dr::distributed_range DR>
111T reduce(std::size_t root, bool root_provided, DR &&dr, T init,
112 auto &&binary_op = std::plus<>{}) {
113
114 if (rng::empty(dr)) {
115 return init;
116 }
117 return binary_op(init, reduce(root, root_provided, dr, binary_op));
118}
119
// Empty sink that accepts any value and does nothing with it. Under GCC
// (not clang) it is compiled with optimization disabled; callers use it as a
// workaround to keep the compiler from optimizing around the call site.
inline void
#if defined(__GNUC__) && !defined(__clang__)
    __attribute__((optimize(0)))
#endif
    no_optimize(auto /*value*/) {
  // Intentionally empty.
}
126
127}; // namespace dr::mp::__detail
128
129namespace dr::mp {
130
131//
132// Ranges
133//
134
135// range, init, and binary op, w/wo root
136
138template <typename T, dr::distributed_range DR>
139auto reduce(std::size_t root, DR &&dr, T init, auto &&binary_op) {
140 return __detail::reduce(root, true, std::forward<DR>(dr), init, binary_op);
141}
143template <typename T, dr::distributed_range DR>
144auto reduce(DR &&dr, T init, auto &&binary_op) {
145 return __detail::reduce(0, false, std::forward<DR>(dr), init, binary_op);
146}
147
148// range, init, w/wo root
149
151template <typename T, dr::distributed_range DR>
152auto reduce(std::size_t root, DR &&dr, T init) {
153 return __detail::reduce(root, true, std::forward<DR>(dr), init,
154 std::plus<>{});
155}
157template <typename T, dr::distributed_range DR> auto reduce(DR &&dr, T init) {
158 return __detail::reduce(0, false, std::forward<DR>(dr), init, std::plus<>{});
159}
160
161// range, w/wo root
162
164template <dr::distributed_range DR> auto reduce(std::size_t root, DR &&dr) {
165 return __detail::reduce(root, true, std::forward<DR>(dr), std::plus<>{});
166}
167
169template <dr::distributed_range DR> auto reduce(DR &&dr) {
170 auto x = __detail::reduce(0, false, std::forward<DR>(dr), std::plus<>{});
171
172 // The code below avoids an issue where DotProduct_ZipReduce_DR
173 // fails with gcc11. From debugging, I can see that the call to
174 // __detail::reduce above computes the correct value, but this
175 // function returns a bad value. My theory is that the problem is
176 // related to tail call optimization and the function below disables
177 // the optimization.
178 __detail::no_optimize(x);
179
180 return x;
181}
182
183//
184// Iterators
185//
186
187// range, init, and binary op, w/wo root
188
190template <typename T, dr::distributed_iterator DI>
191auto reduce(std::size_t root, DI first, DI last, T init, auto &&binary_op) {
192 return __detail::reduce(root, true, rng::subrange(first, last), init,
193 binary_op);
194}
196template <typename T, dr::distributed_iterator DI>
197auto reduce(DI first, DI last, T init, auto &&binary_op) {
198 return __detail::reduce(0, false, rng::subrange(first, last), init,
199 binary_op);
200}
201
202// range, init, w/wo root
203
205template <typename T, dr::distributed_iterator DI>
206auto reduce(std::size_t root, DI first, DI last, T init) {
207 return __detail::reduce(root, true, rng::subrange(first, last), init,
208 std::plus<>{});
209}
211template <typename T, dr::distributed_iterator DI>
212auto reduce(DI first, DI last, T init) {
213 return __detail::reduce(0, false, rng::subrange(first, last), init,
214 std::plus<>{});
215}
216
217// range, w/wo root
218
220template <dr::distributed_iterator DI>
221auto reduce(std::size_t root, DI first, DI last) {
222 return __detail::reduce(root, true, rng::subrange(first, last),
223 std::plus<>{});
224}
226template <dr::distributed_iterator DI> auto reduce(DI first, DI last) {
227 return __detail::reduce(0, false, rng::subrange(first, last), std::plus<>{});
228}
229
230} // namespace dr::mp
Definition: onedpl_direct_iterator.hpp:15