/*******************************************************************************
* Copyright 2019-2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include <algorithm>
#include <iomanip>
#include <sstream>

#include "common/utils.hpp"
#include "gpu/compute/compute_engine.hpp"
#include "gpu/compute/dispatch.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace compute {

// Compute optimal local work size for the given global work size.
void get_optimal_lws(const size_t *gws, size_t *lws, const size_t n,
        const int mapped_vec_dim_idx, const gpu_arch_t gpu_arch) {
    const size_t lws_max = 256;
    // Factors in descending order, prefer bigger sizes for local work size.
    const size_t optimal_lws_values[]
            = {256, 224, 192, 160, 128, 96, 64, 32, 16, 8, 7, 6, 5, 4, 3, 2, 1};
    size_t total_lws = 1;

    size_t gws_copy[3];
    for (size_t i = 0; i < n; ++i) {
        lws[i] = 1;
        gws_copy[i] = gws[i];
    }

    // Iterate through global work size and calculate max divisor from
    // the array optimal_lws_values.
    for (size_t i = 0; i < n; ++i) {
        auto rest_lws = lws_max / total_lws;
        size_t lws_idx = 0;
        while (rest_lws < optimal_lws_values[lws_idx])
            lws_idx++;

        while (gws_copy[i] % optimal_lws_values[lws_idx])
            lws_idx++;

        lws[i] *= optimal_lws_values[lws_idx];
        total_lws *= optimal_lws_values[lws_idx];
        gws_copy[i] /= optimal_lws_values[lws_idx];
    }

    // Temporary WA for a HW/compiler walk order issue:
    // starting from XE_HP, if the vectorized LWS dim is not a power of 2,
    // sub_groups with non-consecutive SIMD elements may be generated.
    // TODO: remove this when the original issue is fixed.
    if (mapped_vec_dim_idx != -1 && gpu_arch >= gpu_arch_t::xe_hp) {
        if (!math::is_pow2(lws[mapped_vec_dim_idx])) {
            for (size_t i = 0; i < n; i++) {
                if (i != (size_t)mapped_vec_dim_idx) lws[i] = 1;
            }
        }
    }
}
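// Worked example (illustrative only): for gws = {1024, 7, 1} the first
// dimension takes the largest factor from optimal_lws_values that divides it
// and fits the lws_max budget (256), which uses up the whole budget, so the
// remaining dimensions get 1 and lws = {256, 1, 1}.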

dispatch_t::dispatch_t(const compute_engine_t *engine, const memory_desc_t *md)
    : engine_(engine) {

    if (md && md->format_kind == dnnl_blocked) {
        md_ndims_ = md->ndims;
        auto &blocking = md->format_desc.blocking;
        auto *strides = blocking.strides;
        std::pair<int, dim_t> sorted_strides[DNNL_MAX_NDIMS];
        for (int i = 0; i < md->ndims; ++i) {
            sorted_strides[i] = {i, strides[i]};
            for (int j = 0; j < blocking.inner_nblks; j++) {
                if (blocking.inner_idxs[j] == i) {
                    int str = 1;
                    for (int k = blocking.inner_nblks - 1; k > j; k--)
                        str *= blocking.inner_blks[k];
                    sorted_strides[i] = {i, str};
                    break;
                }
            }
        }
        std::sort(sorted_strides, sorted_strides + md->ndims,
                [](const std::pair<int, dim_t> &a,
                        const std::pair<int, dim_t> &b) {
                    return a.second < b.second;
                });
        for (int i = 0; i < md->ndims; i++) {
            md_nesting_levels_[sorted_strides[i].first] = md->ndims - i - 1;
        }
    }
}
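// Nesting-level sketch (illustrative example): for a 4D blocked layout with a
// 16-element inner block over the channel dimension (e.g. an nChw16c-like
// layout), the effective strides sorted above are C(inner) < W < H < N, so
// the nesting levels become C -> 3, W -> 2, H -> 1, N -> 0; the
// fastest-varying dimension gets the highest nesting level.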

std::string dispatch_t::str() const {
    std::ostringstream oss;
    for (int i = 0; i < ndims_; ++i) {
        auto &d = dims_[i];
        oss << " "
            << "dim #" << i << " name: " << std::setw(10) << d.name
            << " size: " << std::setw(6) << d.size << " block: " << std::setw(4)
            << d.block << " nesting_level: " << std::setw(4) << d.nesting_level
            << " vsize: " << std::setw(4) << d.vector_size
            << " gws_idx: " << d.gws_index << std::endl;
    }
    return oss.str();
}

void dispatch_t::define_dim_with_nesting_level(
        const std::string &name, int nesting_level, dim_t size, dim_t block) {
#ifndef NDEBUG
    for (int i = 0; i < ndims_; ++i)
        assert(dims_[i].name != name && "Name is not unique.");
#endif

    dim_info_t di;
    di.name = name;
    di.size = size;
    di.block = block;
    di.nesting_level = nesting_level;
    di.vector_size = 1;
    di.gws_index = -1;
    dims_[ndims_] = di;

    ++ndims_;
}
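// Note (descriptive, based on generate() below): dimensions are later sorted
// by nesting level in descending order, so a dimension defined with a higher
// nesting level is treated as more deeply nested (faster-varying) when the
// GWS indices are assigned.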

status_t dispatch_t::vectorize_dim(const std::string &name, int vector_size) {
    if (!engine_->mayiuse_sub_group(vector_size)) return status::unimplemented;
    assert(vector_size > 1);
    for (int i = 0; i < ndims_; ++i) {
        if (dims_[i].name == name) {
            assert(dims_[i].size % vector_size == 0);
            assert(dims_[i].size % (vector_size * dims_[i].block) == 0);
            dims_[i].vector_size = vector_size;
            return status::success;
        }
    }
    assert(!"not found");
    return status::invalid_arguments;
}
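// Usage sketch (illustrative only; the dimension name and sizes are
// hypothetical): a dimension defined with size 64 and block 1 can be
// vectorized with vector_size 16, provided the engine supports subgroups of
// that size:
//
//     CHECK(dispatch.vectorize_dim("IC", 16)); // 64 % (16 * 1) == 0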

void dispatch_t::def_kernel_macros(kernel_ctx_t &kernel_ctx) const {
    assert(generate_called && "generate() must be called.");

    // Find a unique prefix (in case there are many kernels in a file).
    std::string gws_prefix;
    for (int i = 0; i < 4; i++) {
        if (!kernel_ctx.has_macro(utils::format("GWS%d_DEF", i))) {
            gws_prefix = "GWS" + std::to_string(i);
            break;
        }
    }

    assert(!gws_prefix.empty());

    kernel_ctx.define_int(utils::format("%s_DEF", gws_prefix.c_str()), 1);

    for (int i = 0; i < ndims_; ++i) {
        auto get_dim_str = utils::format("-DGWS_GET_%s=%s_GET_ID%d",
                dims_[i].name.c_str(), gws_prefix.c_str(), i);
        kernel_ctx.add_option(get_dim_str);

        auto get_block_str = utils::format("-DGWS_GET_%s_BLOCK=%s_GET_BLOCK%d",
                dims_[i].name.c_str(), gws_prefix.c_str(), i);
        kernel_ctx.add_option(get_block_str);
        kernel_ctx.define_int(utils::format("%s_IDX%d", gws_prefix.c_str(), i),
                dims_[i].gws_index);
        kernel_ctx.define_int(
                utils::format("%s_STRIDE%d", gws_prefix.c_str(), i),
                get_gws_stride(i));

        bool is_zero = (dims_[i].size == 1);
        bool is_outermost = (i == ndims_ - 1)
                || dims_[i + 1].gws_index != dims_[i].gws_index;
        const char *op_name = is_zero
                ? "GWS_OP_ZERO"
                : is_outermost ? "GWS_OP_FIRST" : "GWS_OP_MOD";
        kernel_ctx.add_option(
                utils::format("-D%s_OP%d=%s", gws_prefix.c_str(), i, op_name));
        kernel_ctx.define_int(utils::format("%s_DIM%d", gws_prefix.c_str(), i),
                dims_[i].size);
        kernel_ctx.define_int(
                utils::format("%s_VEC_SIZE%d", gws_prefix.c_str(), i),
                dims_[i].vector_size);
        kernel_ctx.define_int(
                utils::format("%s_BLOCK%d", gws_prefix.c_str(), i),
                dims_[i].block);
    }

    // Local work size and subgroup sizes.
    int vec_dim_idx = find_vectorized_dim();
    kernel_ctx.define_int(
            utils::format("GWS_WITH_SG_%s", attr_suffix_), vec_dim_idx != -1);

    if (vec_dim_idx != -1)
        kernel_ctx.define_int(utils::format("GWS_SGS_%s", attr_suffix_),
                dims_[vec_dim_idx].vector_size);

    auto r = nd_range();
    for (int i = 0; i < 3; i++) {
        auto *lws = r.local_range();
        // lws may be NULL only when dispatch_info is default-initialized.
        kernel_ctx.define_int(utils::format("GWS_LWS%d_%s", i, attr_suffix_),
                lws ? lws[i] : 1);
    }
}
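// Example of the generated definitions (illustrative; the dimension name "MB"
// and the prefix "GWS0" are hypothetical): a dimension "MB" handled by the
// first free prefix yields GWS0_DEF, GWS_GET_MB (mapped to GWS0_GET_ID<i>),
// GWS_GET_MB_BLOCK, GWS0_IDX<i>, GWS0_STRIDE<i>, GWS0_OP<i>, GWS0_DIM<i>,
// GWS0_VEC_SIZE<i> and GWS0_BLOCK<i>, plus GWS_WITH_SG_*, GWS_SGS_* and
// GWS_LWS<i>_* definitions for the subgroup and local work size.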

void dispatch_t::generate(bool generate_lws) {
    // Keep order of elements with the same nesting level unchanged.
    std::stable_sort(dims_, dims_ + ndims_,
            [](const dim_info_t &a, const dim_info_t &b) {
                return a.nesting_level > b.nesting_level;
            });

    // XXX: Move dimensions with size = 1 to the end.
    for (int i = ndims_ - 2; i >= 0; --i) {
        if (dims_[i].size == 1) {
            for (int j = i; j < ndims_ - 1; ++j) {
                if (dims_[j + 1].size == 1) break;
                std::swap(dims_[j], dims_[j + 1]);
            }
        }
    }

    // Find vectorized dimension (if any).
    int vec_dim_idx = find_vectorized_dim();

    // Compute GWS indices.
    for (int i = 0; i < ndims_; ++i) {
        if (vec_dim_idx == -1) {
            // Keep up to 4 dims in gws[0] to have a bigger choice of work
            // group sizes.
            dims_[i].gws_index = std::min(2, std::max(0, i - 3));
        } else {
            // With a vectorized dimension, work group size choices are more
            // limited, so there is no need to group dimensions together.
            dims_[i].gws_index = std::min(2, i);
        }
    }

    size_t gws[3] = {1, 1, 1};
    for (int i = ndims_ - 1; i >= 0; --i) {
        dim_t block = std::max(dims_[i].block, (dim_t)1);
        int gws_index = dims_[i].gws_index;
        gws[gws_index] *= utils::div_up(dims_[i].size, block);
    }

    size_t gws_size = gws[0] * gws[1] * gws[2];

    auto *dev_info = engine_->device_info();
    size_t hw_threads = dev_info->hw_threads();

    // Calculate block sizes for the dimensions with flexible blocking.
    for (int i = 0; i < ndims_; ++i) {
        if (dims_[i].block == 0) {
            int gws_index = dims_[i].gws_index;
            // Heuristic: use the maximum blocking but keep at least
            // hw_threads work items.
            size_t max_block = std::max((size_t)1, gws_size / hw_threads);
            size_t block = utils::max_div(dims_[i].size, max_block);
            gws[gws_index] /= block;
            gws_size /= block;
            dims_[i].block = block;
        }
    }

    // Handle the vectorized dimension (if present).
    size_t lws[3] = {1, 1, 1};
    bool with_lws = false;
    if (vec_dim_idx != -1) {
        int gws_index = dims_[vec_dim_idx].gws_index;
        int vec_size = dims_[vec_dim_idx].vector_size;
        int nblocks = dims_[vec_dim_idx].size / dims_[vec_dim_idx].block;
        // XXX: max 256 work items per group
        lws[gws_index]
                = utils::max_div(gws[gws_index] / vec_size, 256 / vec_size)
                * vec_size;
        lws[gws_index] = utils::max_div(nblocks / vec_size,
                                 (int)lws[gws_index] / vec_size)
                * vec_size;
        with_lws = true;

        // Move the vectorized dimension to the first place in the group.
        int group_beg = ndims_ - 1;
        int group_end = 0;
        for (int i = 0; i < ndims_; ++i) {
            if (dims_[i].gws_index == gws_index) {
                group_beg = std::min(group_beg, i);
                group_end = std::max(group_end, i);
            }
        }

        if (vec_dim_idx != group_beg) {
            auto vec_dim_info = dims_[vec_dim_idx];
            for (int i = vec_dim_idx - 1; i >= group_beg; --i) {
                dims_[i + 1] = dims_[i];
            }
            dims_[group_beg] = vec_dim_info;
        }
    }

    // Use a work-group size = 1 if the number of work items < HW threads.
    if (!with_lws && gws_size < hw_threads) { with_lws = true; }

    if (!with_lws) {
        // Compute the best lws.
        get_optimal_lws(gws, lws, 3,
                vec_dim_idx != -1 ? dims_[vec_dim_idx].gws_index : -1,
                dev_info->gpu_arch());
        with_lws = true;
    }

    nd_range_ = nd_range_t(gws, with_lws && generate_lws ? lws : nullptr);
    generate_called = true;
}
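// Typical usage sketch (illustrative only; dimension names and sizes are
// hypothetical, and only member functions defined in this file are used):
//
//     dispatch_t dispatch(engine, md);
//     dispatch.define_dim_with_md_hint("MB", 0, mb, 1);
//     dispatch.define_dim_with_md_hint("IC", 1, ic, 1);
//     CHECK(dispatch.vectorize_dim("IC", 16));
//     dispatch.generate(/*generate_lws=*/true);
//     dispatch.def_kernel_macros(kernel_ctx);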

// Allows manual setting of global and local work sizes.
void dispatch_t::generate_override(
        const size_t *grange, const size_t *lrange) {
    dims_[1].gws_index = 2;
    dims_[2].gws_index = 1;
    dims_[3].gws_index = 0;

    nd_range_ = nd_range_t(grange, lrange);
    generate_called = true;
}

// Allows manual setting of local work sizes.
void dispatch_t::set_lws(const size_t *lrange) {
    assert(generate_called);
    auto *grange = nd_range_.global_range();
    nd_range_ = nd_range_t(grange, lrange);
}

void dispatch_t::define_dim_with_md_hint(
        const std::string &name, int md_hint_index, dim_t size, dim_t block) {
    int nesting_level = min_nesting_level;
    if (md_ndims_ > 0) {
        assert(md_hint_index >= 0 && md_hint_index < md_ndims_);
        nesting_level = md_nesting_levels_[md_hint_index];
    }

    define_dim_with_nesting_level(name, nesting_level, size, block);
}

} // namespace compute
} // namespace gpu
} // namespace impl
} // namespace dnnl