Distributed Ranges
gemm.hpp
// SPDX-FileCopyrightText: Intel Corporation
//
// SPDX-License-Identifier: BSD-3-Clause

#pragma once

// Standard and SYCL headers used directly in this file.
#include <algorithm>
#include <atomic>
#include <cassert>
#include <chrono>
#include <cstddef>
#include <iostream>
#include <sycl/sycl.hpp>
#include <thread>
#include <vector>

#include <dr/sp/algorithms/matrix/local_gemm.hpp>
#include <dr/sp/containers/distributed_dense_matrix.hpp>

namespace dr::sp {

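// gemm: public entry point for distributed dense matrix multiplication.
// Forwards to gemm_buffered, the default implementation below (the call
// resolves at template instantiation, so the later definition is fine).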
template <typename T>
void gemm(distributed_dense_matrix<T> &a, distributed_dense_matrix<T> &b,
          distributed_dense_matrix<T> &c) {
  gemm_buffered(a, b, c);
}

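// gemm_inplace: computes every C tile on the device that owns it, reading
// the A and B tiles from wherever they live (no staging copies). The k loop
// starts at a tile-dependent offset (i + j) so different C tiles touch
// different A/B tiles at the same time; the per-tile local GEMMs along k
// accumulate into the same C tile and are chained through the returned SYCL
// events, which are all waited on at the end.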
template <typename T>
void gemm_inplace(distributed_dense_matrix<T> &a,
                  distributed_dense_matrix<T> &b,
                  distributed_dense_matrix<T> &c) {
  // Matrix dimensions must match (algorithm requirement)
  assert(c.shape()[0] == a.shape()[0]);
  assert(c.shape()[1] == b.shape()[1]);
  assert(a.shape()[1] == b.shape()[0]);

  // Tile grid dimensions must match (implementation limitation)
  assert(c.grid_shape()[0] == a.grid_shape()[0]);
  assert(c.grid_shape()[1] == b.grid_shape()[1]);
  assert(a.grid_shape()[1] == b.grid_shape()[0]);

  std::vector<sycl::event> events;
  events.reserve(c.grid_shape()[0] * c.grid_shape()[1] * a.grid_shape()[1]);

  for (std::size_t i = 0; i < c.grid_shape()[0]; i++) {
    for (std::size_t j = 0; j < c.grid_shape()[1]; j++) {
      // For each tile of the output C matrix
      auto &&c_tile = c.tile({i, j});

      std::vector<sycl::event> local_events;
      local_events.reserve(a.grid_shape()[1]);

      std::size_t k_offset = i + j;
      for (std::size_t k_ = 0; k_ < a.grid_shape()[1]; k_++) {
        std::size_t k = (k_ + k_offset) % a.grid_shape()[1];

        auto &&a_tile = a.tile({i, k});
        auto &&b_tile = b.tile({k, j});

        auto &&q = __detail::queue(dr::ranges::rank(c_tile));

        auto e = __detail::local_gemm(q, __detail::local(a_tile),
                                      __detail::local(b_tile),
                                      __detail::local(c_tile), local_events);

        local_events.push_back(e);
      }

      for (auto &&e : local_events) {
        events.push_back(e);
      }
    }
  }

  __detail::wait(events);
}

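// gemm_buffered: launches one host thread per C tile. Each thread copies the
// A and B tiles it needs into a two-slot buffered allocation on the device
// that owns its C tile, then runs the local GEMM on device-resident data.
// The staggered k start (i + j) spreads those copies across source devices,
// and copy vs. compute time is accumulated in atomics for optional debug
// output.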
template <typename T>
void gemm_buffered(distributed_dense_matrix<T> &a,
                   distributed_dense_matrix<T> &b,
                   distributed_dense_matrix<T> &c) {
  // Matrix dimensions must match (algorithm requirement)
  assert(c.shape()[0] == a.shape()[0]);
  assert(c.shape()[1] == b.shape()[1]);
  assert(a.shape()[1] == b.shape()[0]);

  // Tile grid dimensions must match (implementation limitation)
  assert(c.grid_shape()[0] == a.grid_shape()[0]);
  assert(c.grid_shape()[1] == b.grid_shape()[1]);
  assert(a.grid_shape()[1] == b.grid_shape()[0]);

  std::vector<std::thread> threads;

  std::atomic<double> communication = 0;
  std::atomic<double> compute = 0;

  for (std::size_t i = 0; i < c.grid_shape()[0]; i++) {
    for (std::size_t j = 0; j < c.grid_shape()[1]; j++) {
      auto c_local = c.tile({i, j});

      threads.emplace_back([c_local, i, j, &a, &b, &communication, &compute] {
        auto &&q = __detail::queue(dr::ranges::rank(c_local));

        // Double-buffered staging space on the device that owns this C tile,
        // sized to hold the larger of an A tile and a B tile.
        std::size_t a_elem = a.tile_shape()[0] * a.tile_shape()[1];
        std::size_t b_elem = b.tile_shape()[0] * b.tile_shape()[1];
        std::size_t buffer_size = std::max(a_elem, b_elem);

        dr::sp::device_allocator<T> gpu_allocator(q);
        dr::sp::buffered_allocator buffered_allocator(gpu_allocator,
                                                      buffer_size, 2);
        auto &&allocator = buffered_allocator;

        std::size_t k_offset = i + j;

        for (std::size_t k_ = 0; k_ < a.grid_shape()[1]; k_++) {
          std::size_t k = (k_ + k_offset) % a.grid_shape()[1];

          // Copy the A and B tiles for this step into the staging buffers.
          auto begin = std::chrono::high_resolution_clock::now();
          auto a_tile = a.get_tile({i, k}, allocator);
          auto b_tile = b.get_tile({k, j}, allocator);
          auto end = std::chrono::high_resolution_clock::now();
          double duration = std::chrono::duration<double>(end - begin).count();
          communication += duration;

          dr::sp::dense_matrix_view a_local(a_tile);
          dr::sp::dense_matrix_view b_local(b_tile);

          // Multiply the staged tiles into the local C tile.
          begin = std::chrono::high_resolution_clock::now();
          __detail::local_gemm(q, __detail::local(a_local),
                               __detail::local(b_local),
                               __detail::local(c_local))
              .wait();
          end = std::chrono::high_resolution_clock::now();
          duration = std::chrono::duration<double>(end - begin).count();
          compute += duration;
        }
      });
    }
  }

  for (auto &&t : threads) {
    t.join();
  }

  bool debug_print = false;

  if (debug_print) {
    std::cout << "communication total: " << (double)communication << std::endl;
    std::cout << "compute total: " << (double)compute << std::endl;
  }
}

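// gemm_buffered_async: same per-C-tile threads as gemm_buffered, but tile
// transfers are issued with get_tile_async into a four-slot buffer so the
// next A and B tiles are already in flight while the current local GEMM
// runs, overlapping communication with compute. Separate atomics track
// issue, sync (waiting on the futures), and compute time.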
template <typename T>
void gemm_buffered_async(distributed_dense_matrix<T> &a,
                         distributed_dense_matrix<T> &b,
                         distributed_dense_matrix<T> &c) {
  // Matrix dimensions must match (algorithm requirement)
  assert(c.shape()[0] == a.shape()[0]);
  assert(c.shape()[1] == b.shape()[1]);
  assert(a.shape()[1] == b.shape()[0]);

  // Tile grid dimensions must match (implementation limitation)
  assert(c.grid_shape()[0] == a.grid_shape()[0]);
  assert(c.grid_shape()[1] == b.grid_shape()[1]);
  assert(a.grid_shape()[1] == b.grid_shape()[0]);

  std::vector<std::thread> threads;

  std::atomic<double> issue = 0;
  std::atomic<double> sync = 0;
  std::atomic<double> compute = 0;

  for (std::size_t i = 0; i < c.grid_shape()[0]; i++) {
    for (std::size_t j = 0; j < c.grid_shape()[1]; j++) {
      auto c_local = c.tile({i, j});

      threads.emplace_back([c_local, i, j, &a, &b, &issue, &sync, &compute] {
        auto &&q = __detail::queue(dr::ranges::rank(c_local));

        std::size_t a_elem = a.tile_shape()[0] * a.tile_shape()[1];
        std::size_t b_elem = b.tile_shape()[0] * b.tile_shape()[1];
        std::size_t buffer_size = std::max(a_elem, b_elem);

        dr::sp::device_allocator<T> gpu_allocator(q);
        dr::sp::buffered_allocator buffered_allocator(gpu_allocator,
                                                      buffer_size, 4);
        auto &&allocator = buffered_allocator;

        std::size_t k_offset = i + j;

        // Issue the first pair of tile requests before entering the k loop.
        auto begin = std::chrono::high_resolution_clock::now();
        auto a_f =
            a.get_tile_async({i, k_offset % a.grid_shape()[1]}, allocator);
        // a_f.wait();
        auto b_f =
            b.get_tile_async({k_offset % a.grid_shape()[1], j}, allocator);
        // b_f.wait();
        auto end = std::chrono::high_resolution_clock::now();
        double duration = std::chrono::duration<double>(end - begin).count();
        issue += duration;

        for (std::size_t k_ = 0; k_ < a.grid_shape()[1]; k_++) {
          std::size_t k = (k_ + k_offset) % a.grid_shape()[1];

          // Wait for the tiles requested for this iteration.
          auto begin = std::chrono::high_resolution_clock::now();
          auto a_tile = a_f.get();
          auto b_tile = b_f.get();
          auto end = std::chrono::high_resolution_clock::now();
          double duration = std::chrono::duration<double>(end - begin).count();
          sync += duration;

          dr::sp::dense_matrix_view a_local(a_tile);
          dr::sp::dense_matrix_view b_local(b_tile);

          // Prefetch the next A and B tiles while this iteration computes.
          if (k_ + 1 < a.grid_shape()[1]) {
            begin = std::chrono::high_resolution_clock::now();
            a_f = a.get_tile_async({i, (k + 1) % a.grid_shape()[1]}, allocator);
            // a_f.wait();
            b_f = b.get_tile_async({(k + 1) % a.grid_shape()[1], j}, allocator);
            // b_f.wait();
            end = std::chrono::high_resolution_clock::now();
            duration = std::chrono::duration<double>(end - begin).count();
            issue += duration;
          }

          begin = std::chrono::high_resolution_clock::now();
          __detail::local_gemm(q, __detail::local(a_local),
                               __detail::local(b_local),
                               __detail::local(c_local))
              .wait();
          end = std::chrono::high_resolution_clock::now();
          duration = std::chrono::duration<double>(end - begin).count();
          compute += duration;
        }
      });
    }
  }

  for (auto &&t : threads) {
    t.join();
  }

  bool debug_print = false;

  if (debug_print) {
    std::cout << "sync total: " << (double)sync << std::endl;
    std::cout << "issue total: " << (double)issue << std::endl;
    std::cout << "compute total: " << (double)compute << std::endl;
  }
}

} // namespace dr::sp
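A minimal call-site sketch follows. The setup shown here (dr::sp::get_numa_devices, dr::sp::init, the brace-initialized matrix shapes, and the dr/sp.hpp umbrella header) reflects assumptions about the surrounding dr::sp API rather than anything defined in this header; only the final dr::sp::gemm call comes from the code above.

// Hypothetical usage sketch -- setup calls are assumed, not defined in gemm.hpp.
#include <dr/sp.hpp>

int main() {
  // Bind one SYCL queue per device before constructing distributed matrices
  // (assumed initialization pattern).
  auto devices = dr::sp::get_numa_devices(sycl::default_selector_v);
  dr::sp::init(devices);

  std::size_t m = 4096, k = 4096, n = 4096;
  dr::sp::distributed_dense_matrix<float> a({m, k});
  dr::sp::distributed_dense_matrix<float> b({k, n});
  dr::sp::distributed_dense_matrix<float> c({m, n});

  // ... fill a and b, zero c ...

  dr::sp::gemm(a, b, c); // dispatches to gemm_buffered
}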