1 | /******************************************************************************* |
2 | * Copyright 2019-2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
#include <algorithm>
#include <cassert>
#include <iomanip>
#include <sstream>

#include "common/utils.hpp"
#include "gpu/compute/compute_engine.hpp"
#include "gpu/compute/dispatch.hpp"
23 | |
24 | namespace dnnl { |
25 | namespace impl { |
26 | namespace gpu { |
27 | namespace compute { |
28 | |
29 | // Compute optimal local work size for the given global work size. |
30 | void get_optimal_lws(const size_t *gws, size_t *lws, const size_t n, |
31 | const int mapped_vec_dim_idx, const gpu_arch_t gpu_arch) { |
32 | const size_t lws_max = 256; |
33 | // Factors in descending order, prefer bigger sizes for local work size. |
34 | const size_t optimal_lws_values[] |
35 | = {256, 224, 192, 160, 128, 96, 64, 32, 16, 8, 7, 6, 5, 4, 3, 2, 1}; |
36 | size_t total_lws = 1; |
37 | |
38 | size_t gws_copy[3]; |
39 | for (size_t i = 0; i < n; ++i) { |
40 | lws[i] = 1; |
41 | gws_copy[i] = gws[i]; |
42 | } |
43 | |
44 | // Iterate through global work size and calculate max divisor from |
45 | // the array optimal_lws_values. |
46 | for (size_t i = 0; i < n; ++i) { |
47 | auto rest_lws = lws_max / total_lws; |
48 | size_t lws_idx = 0; |
49 | while (rest_lws < optimal_lws_values[lws_idx]) |
50 | lws_idx++; |
51 | |
52 | while (gws_copy[i] % optimal_lws_values[lws_idx]) |
53 | lws_idx++; |
54 | |
55 | lws[i] *= optimal_lws_values[lws_idx]; |
56 | total_lws *= optimal_lws_values[lws_idx]; |
57 | gws_copy[i] /= optimal_lws_values[lws_idx]; |
58 | } |
59 | |
60 | // Temporary WA for HW/Compiler walk order issue: |
61 | // starting from XE_HP, if LWS vectorized dim is not power of 2 |
62 | // it may generate sub_groups with inconsecutive SIMD elements. |
63 | // TODO: remove it when the original issue fixed |
64 | if (mapped_vec_dim_idx != -1 && gpu_arch >= gpu_arch_t::xe_hp) { |
65 | if (!math::is_pow2(lws[mapped_vec_dim_idx])) { |
66 | for (size_t i = 0; i < n; i++) { |
67 | if (i != (size_t)mapped_vec_dim_idx) lws[i] = 1; |
68 | } |
69 | } |
70 | } |
71 | } |
72 | |
dispatch_t::dispatch_t(const compute_engine_t *engine, const memory_desc_t *md)
    : engine_(engine) {

    // When a blocked memory descriptor is given, derive a "nesting level" per
    // logical dimension from its stride order: the dimension with the largest
    // stride gets the highest level (outermost), the smallest stride gets
    // level 0 (innermost). These levels later serve as layout hints in
    // define_dim_with_md_hint().
    if (md && md->format_kind == dnnl_blocked) {
        md_ndims_ = md->ndims;
        auto &blocking = md->format_desc.blocking;
        auto *strides = blocking.strides;
        std::pair<int, dim_t> sorted_strides[DNNL_MAX_NDIMS];
        for (int i = 0; i < md->ndims; ++i) {
            sorted_strides[i] = {i, strides[i]};
            // For a dimension that participates in inner blocking, replace its
            // outer stride with the stride of its innermost occurrence (the
            // product of the inner blocks nested deeper than it), so blocked
            // dimensions sort as more "inner".
            for (int j = 0; j < blocking.inner_nblks; j++) {
                if (blocking.inner_idxs[j] == i) {
                    int str = 1;
                    for (int k = blocking.inner_nblks - 1; k > j; k--)
                        str *= blocking.inner_blks[k];
                    sorted_strides[i] = {i, str};
                    break;
                }
            }
        }
        // Sort ascending by effective stride. NOTE(review): std::sort is not
        // stable, so dimensions with equal strides may end up in arbitrary
        // relative order - presumably acceptable for a layout hint; confirm.
        std::sort(sorted_strides, sorted_strides + md->ndims,
                [](const std::pair<int, dim_t> &a,
                        const std::pair<int, dim_t> &b) {
                    return a.second < b.second;
                });
        // Invert the permutation: i-th smallest stride -> level ndims - i - 1.
        for (int i = 0; i < md->ndims; i++) {
            md_nesting_levels_[sorted_strides[i].first] = md->ndims - i - 1;
        }
    }
}
103 | |
104 | std::string dispatch_t::str() const { |
105 | std::ostringstream oss; |
106 | for (int i = 0; i < ndims_; ++i) { |
107 | auto &d = dims_[i]; |
108 | oss << " " |
109 | << "dim #" << i << " name: " << std::setw(10) << d.name |
110 | << " size: " << std::setw(6) << d.size << " block: " << std::setw(4) |
111 | << d.block << " nesting_level: " << std::setw(4) << d.nesting_level |
112 | << " vsize: " << std::setw(4) << d.vector_size |
113 | << " gws_idx: " << d.gws_index << std::endl; |
114 | } |
115 | return oss.str(); |
116 | } |
117 | |
118 | void dispatch_t::define_dim_with_nesting_level( |
119 | const std::string &name, int nesting_level, dim_t size, dim_t block) { |
120 | #ifndef NDEBUG |
121 | for (int i = 0; i < ndims_; ++i) |
122 | assert(dims_[i].name != name && "Name is not unique." ); |
123 | #endif |
124 | |
125 | dim_info_t di; |
126 | di.name = name; |
127 | di.size = size; |
128 | di.block = block; |
129 | di.nesting_level = nesting_level; |
130 | di.vector_size = 1; |
131 | di.gws_index = -1; |
132 | dims_[ndims_] = di; |
133 | |
134 | ++ndims_; |
135 | } |
136 | |
137 | status_t dispatch_t::vectorize_dim(const std::string &name, int vector_size) { |
138 | if (!engine_->mayiuse_sub_group(vector_size)) return status::unimplemented; |
139 | assert(vector_size > 1); |
140 | for (int i = 0; i < ndims_; ++i) { |
141 | if (dims_[i].name == name) { |
142 | assert(dims_[i].size % vector_size == 0); |
143 | assert(dims_[i].size % (vector_size * dims_[i].block) == 0); |
144 | dims_[i].vector_size = vector_size; |
145 | return status::success; |
146 | } |
147 | } |
148 | assert(!"not found" ); |
149 | return status::invalid_arguments; |
150 | } |
151 | |
// Emits the GWS* preprocessor definitions the OpenCL kernel headers consume
// to map a flat nd-range index back to the logical dimensions defined here.
void dispatch_t::def_kernel_macros(kernel_ctx_t &kernel_ctx) const {
    assert(generate_called && "generate() must be called.");

    // Find a unique prefix (in case there are many kernels in a file).
    std::string gws_prefix;
    for (int i = 0; i < 4; i++) {
        if (!kernel_ctx.has_macro(utils::format("GWS%d_DEF", i))) {
            gws_prefix = "GWS" + std::to_string(i);
            break;
        }
    }

    // Fails if four dispatches already registered macros in this context.
    assert(!gws_prefix.empty());

    kernel_ctx.define_int(utils::format("%s_DEF", gws_prefix.c_str()), 1);

    // Per-dimension macros: ID/BLOCK getter aliases plus the integer
    // constants (index, stride, op, size, vector size, block) that describe
    // how the dimension is packed into its GWS slot.
    for (int i = 0; i < ndims_; ++i) {
        auto get_dim_str = utils::format("-DGWS_GET_%s=%s_GET_ID%d",
                dims_[i].name.c_str(), gws_prefix.c_str(), i);
        kernel_ctx.add_option(get_dim_str);

        auto get_block_str = utils::format("-DGWS_GET_%s_BLOCK=%s_GET_BLOCK%d",
                dims_[i].name.c_str(), gws_prefix.c_str(), i);
        kernel_ctx.add_option(get_block_str);
        kernel_ctx.define_int(utils::format("%s_IDX%d", gws_prefix.c_str(), i),
                dims_[i].gws_index);
        kernel_ctx.define_int(
                utils::format("%s_STRIDE%d", gws_prefix.c_str(), i),
                get_gws_stride(i));

        // Pick the decode op: ZERO for unit dimensions; FIRST when this is the
        // outermost dimension mapped to its GWS slot (the next dimension, if
        // any, lives in a different slot); MOD otherwise.
        bool is_zero = (dims_[i].size == 1);
        bool is_outermost = (i == ndims_ - 1)
                || dims_[i + 1].gws_index != dims_[i].gws_index;
        const char *op_name = is_zero
                ? "GWS_OP_ZERO"
                : is_outermost ? "GWS_OP_FIRST" : "GWS_OP_MOD";
        kernel_ctx.add_option(
                utils::format("-D%s_OP%d=%s", gws_prefix.c_str(), i, op_name));
        kernel_ctx.define_int(utils::format("%s_DIM%d", gws_prefix.c_str(), i),
                dims_[i].size);
        kernel_ctx.define_int(
                utils::format("%s_VEC_SIZE%d", gws_prefix.c_str(), i),
                dims_[i].vector_size);
        kernel_ctx.define_int(
                utils::format("%s_BLOCK%d", gws_prefix.c_str(), i),
                dims_[i].block);
    }

    // Local work size and subgroup sizes.
    int vec_dim_idx = find_vectorized_dim();
    kernel_ctx.define_int(
            utils::format("GWS_WITH_SG_%s", attr_suffix_), vec_dim_idx != -1);

    if (vec_dim_idx != -1)
        kernel_ctx.define_int(utils::format("GWS_SGS_%s", attr_suffix_),
                dims_[vec_dim_idx].vector_size);

    auto r = nd_range();
    for (int i = 0; i < 3; i++) {
        auto *lws = r.local_range();
        // lws may be NULL only when dispatch_info is default-initialized.
        kernel_ctx.define_int(utils::format("GWS_LWS%d_%s", i, attr_suffix_),
                lws ? lws[i] : 1);
    }
}
217 | |
// Finalizes the dispatch: orders the defined dimensions, maps them to the
// 3 GWS slots, resolves flexible blocks, and computes the nd-range
// (optionally with a local work size).
void dispatch_t::generate(bool generate_lws) {
    // Sort outermost (highest nesting level) first.
    // Keep order of elements with the same nesting level unchanged.
    std::stable_sort(dims_, dims_ + ndims_,
            [](const dim_info_t &a, const dim_info_t &b) {
                return a.nesting_level > b.nesting_level;
            });

    // XXX: Move dimensions with size = 1 to the end.
    for (int i = ndims_ - 2; i >= 0; --i) {
        if (dims_[i].size == 1) {
            // Bubble the unit dimension to the right, stopping at the run of
            // unit dimensions already collected at the tail.
            for (int j = i; j < ndims_ - 1; ++j) {
                if (dims_[j + 1].size == 1) break;
                std::swap(dims_[j], dims_[j + 1]);
            }
        }
    }

    // Find vectorized dimension (if any).
    int vec_dim_idx = find_vectorized_dim();

    // Compute GWS indices. Dimensions are outermost-first, so the innermost
    // ones end up in gws[0..2] per the mapping below.
    for (int i = 0; i < ndims_; ++i) {
        if (vec_dim_idx == -1) {
            // Keep up to 4 dims in gws[0] to have bigger choice for work group
            // size.
            dims_[i].gws_index = std::min(2, std::max(0, i - 3));
        } else {
            // With vectorized dimension, work group size choices are more
            // limited so no need to group dimensions together.
            dims_[i].gws_index = std::min(2, i);
        }
    }

    // Accumulate each dimension's (size / block) into its GWS slot.
    size_t gws[3] = {1, 1, 1};
    for (int i = ndims_ - 1; i >= 0; --i) {
        dim_t block = std::max(dims_[i].block, (dim_t)1);
        int gws_index = dims_[i].gws_index;
        gws[gws_index] *= utils::div_up(dims_[i].size, block);
    }

    size_t gws_size = gws[0] * gws[1] * gws[2];

    auto *dev_info = engine_->device_info();
    size_t hw_threads = dev_info->hw_threads();

    // Calculate block sizes for the dimensions with flexible blocking
    // (block == 0 means "choose for me").
    for (int i = 0; i < ndims_; ++i) {
        if (dims_[i].block == 0) {
            int gws_index = dims_[i].gws_index;
            // Heuristic: use max blocking but keep at least eu_count work items.
            size_t max_block = std::max((size_t)1, gws_size / hw_threads);
            size_t block = utils::max_div(dims_[i].size, max_block);
            gws[gws_index] /= block;
            gws_size /= block;
            dims_[i].block = block;
        }
    }

    // Handle a vectorized dimension (if presented).
    size_t lws[3] = {1, 1, 1};
    bool with_lws = false;
    if (vec_dim_idx != -1) {
        int gws_index = dims_[vec_dim_idx].gws_index;
        int vec_size = dims_[vec_dim_idx].vector_size;
        int nblocks = dims_[vec_dim_idx].size / dims_[vec_dim_idx].block;
        // XXX: max 256 work items per group
        // LWS in this slot is the largest multiple of vec_size that divides
        // both the slot's GWS and the vectorized dimension's block count.
        lws[gws_index]
                = utils::max_div(gws[gws_index] / vec_size, 256 / vec_size)
                * vec_size;
        lws[gws_index] = utils::max_div(nblocks / vec_size,
                                (int)lws[gws_index] / vec_size)
                * vec_size;
        with_lws = true;

        // Move the vectorized dimension to the first place in the group.
        int group_beg = ndims_ - 1;
        int group_end = 0;
        for (int i = 0; i < ndims_; ++i) {
            if (dims_[i].gws_index == gws_index) {
                group_beg = std::min(group_beg, i);
                group_end = std::max(group_end, i);
            }
        }

        if (vec_dim_idx != group_beg) {
            // Rotate: shift the preceding group members one slot right and
            // place the vectorized dimension at the head of the group.
            auto vec_dim_info = dims_[vec_dim_idx];
            for (int i = vec_dim_idx - 1; i >= group_beg; --i) {
                dims_[i + 1] = dims_[i];
            }
            dims_[group_beg] = vec_dim_info;
        }
    }

    // Use a work-group size = 1 if the number of work items < HW threads.
    if (!with_lws && gws_size < hw_threads) { with_lws = true; }

    if (!with_lws) {
        // Compute the best lws.
        get_optimal_lws(gws, lws, 3,
                vec_dim_idx != -1 ? dims_[vec_dim_idx].gws_index : -1,
                dev_info->gpu_arch());
        with_lws = true;
    }

    nd_range_ = nd_range_t(gws, with_lws && generate_lws ? lws : nullptr);
    generate_called = true;
}
325 | |
// Allows manual setting of global and local work sizes.
// NOTE(review): the hard-coded GWS indices below touch dims_[1..3] only,
// which presumably assumes a caller that defined (at least) four dimensions
// in a fixed order - dims_[0] keeps whatever gws_index it had; confirm
// against the callers of this function.
void dispatch_t::generate_override(const size_t *grange, const size_t *lrange) {
    dims_[1].gws_index = 2;
    dims_[2].gws_index = 1;
    dims_[3].gws_index = 0;

    nd_range_ = nd_range_t(grange, lrange);
    generate_called = true;
}
335 | |
336 | // Allows manual setting of local work sizes. |
337 | void dispatch_t::set_lws(const size_t *lrange) { |
338 | assert(generate_called); |
339 | auto *grange = nd_range_.global_range(); |
340 | nd_range_ = nd_range_t(grange, lrange); |
341 | } |
342 | |
343 | void dispatch_t::define_dim_with_md_hint( |
344 | const std::string &name, int md_hint_index, dim_t size, dim_t block) { |
345 | int nesting_level = min_nesting_level; |
346 | if (md_ndims_ > 0) { |
347 | assert(md_hint_index >= 0 && md_hint_index < md_ndims_); |
348 | nesting_level = md_nesting_levels_[md_hint_index]; |
349 | } |
350 | |
351 | define_dim_with_nesting_level(name, nesting_level, size, block); |
352 | } |
353 | |
354 | } // namespace compute |
355 | } // namespace gpu |
356 | } // namespace impl |
357 | } // namespace dnnl |
358 | |