1 | /******************************************************************************* |
2 | * Copyright 2021-2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
17 | /* |
18 | Fast, generic reorder kernel. |
19 | |
The reorder kernel is strictly memory-bound; performance is determined only by
memory access efficiency.
22 | |
There's no performance difference between single-word reads and block read
functions (intel_sub_group_block_read8 etc.). The only thing that matters is
that code reading/writing global memory utilizes whole cache lines. Using
smaller chunks causes cache eviction and requires the same data to be accessed
again later, wasting bandwidth.
28 | |
This kernel tries to load/store data in packets that are at least as large as
a cache line.
31 | |
32 | Example: abc -> bca, 32x32x32, data type f32 |
33 | Assume SIMD16 and cache line size = 64B |
To fill a cache line the kernel must load 16 consecutive items (16c) and
store 16 consecutive items (16a), so it needs to operate on a matrix of
16a x 16c.
It will load 16 non-adjacent (strided by A) sets of 16 adjacent items
(strided by C, src's innermost dimension), perform an internal transposition,
then store 16 non-adjacent (strided by C) sets of 16 adjacent items (strided
by A, dst's innermost dimension).

The difficulty is in determining how to achieve the above for any combination
of tensor sizes and format tags.
44 | */ |
45 | |
46 | #include <algorithm> |
47 | #include "gpu/ocl/generic_reorder.hpp" |
48 | |
49 | #include "common/utils.hpp" |
50 | #include "gpu/ocl/ocl_stream.hpp" |
51 | #include "gpu/ocl/ocl_utils.hpp" |
52 | namespace dnnl { |
53 | namespace impl { |
54 | namespace gpu { |
55 | namespace ocl { |
56 | |
57 | using namespace dnnl::impl::memory_tracking::names; |
58 | |
59 | struct dimension_t { |
60 | dim_t size; |
61 | dim_t step; |
62 | int idx; |
63 | }; |
64 | |
65 | using dimensions_t = std::vector<dimension_t>; |
66 | |
67 | // Return a description of dimensions sorted by stride, i.e., nesting order. |
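// For illustration: a 32x16 tensor with tag "ba" (strides {1, 32}) is returned
// as 32a:16b, i.e., the dim with the smallest stride comes first.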
68 | dimensions_t dims_by_stride(const memory_desc_wrapper &mdw) { |
69 | const auto &desc = mdw.blocking_desc(); |
70 | const auto &strides = desc.strides; |
71 | |
72 | // Sort blocks by stride. |
73 | const auto cmp = [&](const dimension_t &a, const dimension_t &b) { |
74 | // Order by stride. Ties mean that we have at least one dim of size 1. |
        // We don't care about the order of those dims, just that the dim
        // with size > 1 is sorted last.
77 | const auto a_stride = strides[a.idx]; |
78 | const auto b_stride = strides[b.idx]; |
79 | return a_stride < b_stride || (a_stride == b_stride && a.size < b.size); |
80 | }; |
81 | |
82 | const int ndims = mdw.ndims(); |
83 | dimensions_t dims(ndims); |
84 | for (int d = 0; d < ndims; ++d) { |
85 | auto &blk = dims[d]; |
86 | blk.idx = d; |
87 | blk.size = mdw.padded_dims()[d]; |
88 | } |
89 | std::sort(dims.begin(), dims.end(), cmp); |
90 | return dims; |
91 | } |
92 | |
// Returns a description of the blocks and dimensions that constitute the
// format tag of a tensor, starting from the innermost. Blocks, if present,
// take precedence over dimensions. The order of dimensions is determined by
// sorting strides; the smallest stride is the innermost dimension. Dimensions
// of size 1 are ignored. This may lead to illegal tensor tags, where the
// innermost dim is the same as the outermost block:
// 8x8x1x1 aBcd8b becomes cdaB8b (note: "B8b").
// In such cases, this function combines the last dim with the first block(s),
// so xB8b becomes xb. Dimensions are treated like blocks; that is, they don't
// report the whole tensor size along a given axis but rather the number of
// underlying blocks for that dimension.
// Example: ABcd8b8a2b 32x48x1x7 will return a description that amounts to...
// outermost-> 1c:4a:3b:7d:8b:8a:2b <-innermost
106 | dimensions_t query_dims_and_blocks(const memory_desc_wrapper &mdw) { |
107 | auto blocks = dims_by_stride(mdw); |
108 | const int ndims = mdw.ndims(); |
109 | const auto &desc = mdw.blocking_desc(); |
110 | const int nblks = desc.inner_nblks; |
111 | |
112 | // Calculate info for inner blocks |
113 | dimensions_t inner_blks(nblks); |
114 | std::vector<int> steps(ndims, 1); |
115 | dim_t blks_size = 1; |
116 | for (int i = nblks - 1; i >= 0; --i) { |
117 | auto &blk = inner_blks[i]; |
118 | blk.idx = desc.inner_idxs[i]; |
119 | blk.size = desc.inner_blks[i]; |
120 | blk.step = steps[blk.idx]; |
121 | // steps increase in reverse order of how blocks are listed |
122 | steps[blk.idx] *= blk.size; |
123 | blks_size *= blk.size; |
124 | } |
125 | |
126 | // Divide dim by its step to get block size |
127 | for (auto &blk : blocks) { |
128 | blk.step = steps[blk.idx]; |
129 | blk.size = utils::div_up(blk.size, blk.step); |
130 | } |
131 | |
132 | // If we have any dims with block size 1, we ignore them. |
133 | const auto size_1 = [](const dimension_t &b) { return b.size == 1; }; |
134 | const auto end = blocks.end(); |
135 | blocks.erase(std::remove_if(blocks.begin(), end, size_1), end); |
136 | |
137 | dim_t stride = blocks.empty() ? 1 : desc.strides[blocks[0].idx]; |
138 | for (auto &blk : inner_blks) { |
139 | if (blk.size == 1) continue; // Can safely ignore blocks of size 1 |
140 | if (blocks.empty() || blocks[0].idx != blk.idx || blks_size != stride) { |
141 | blocks.insert(blocks.begin(), blk); |
142 | } else { |
143 | // Combine blocks with repeated index if there is no extra padding |
144 | blk.size *= blocks[0].size; |
145 | blocks[0] = blk; |
146 | } |
147 | blks_size /= blk.size; |
148 | stride = blks_size; |
149 | } |
150 | |
151 | if (blocks.empty() && ndims > 0) { |
152 | dimension_t blk; |
153 | blk.idx = 0; |
154 | blk.size = 1; |
155 | blk.step = 1; |
156 | blocks.push_back(blk); |
157 | } |
158 | return blocks; |
159 | } |
160 | |
161 | dimensions_t query_dims_and_blocks(const memory_desc_t &md) { |
162 | const memory_desc_wrapper mdw(md); |
163 | return query_dims_and_blocks(mdw); |
164 | } |
165 | |
166 | bool is_generic_faster_than_ref( |
167 | const memory_desc_t &src_md, const memory_desc_t &dst_md) { |
168 | const dim_t max_1d_ref_nelems = 512; |
169 | const dim_t max_nd_ref_nelems = 512 * 512; |
170 | auto nelems |
171 | = std::max(utils::array_product(src_md.padded_dims, src_md.ndims), |
172 | utils::array_product(dst_md.padded_dims, dst_md.ndims)); |
173 | if (src_md.ndims == 1 && dst_md.ndims == 1) |
174 | return nelems > max_1d_ref_nelems; |
175 | auto src_blks = query_dims_and_blocks(src_md); |
176 | auto dst_blks = query_dims_and_blocks(dst_md); |
177 | if (src_blks.empty() || dst_blks.empty()) return false; |
178 | auto src_inner_idx = src_blks[0].idx; |
179 | auto dst_inner_idx = dst_blks[0].idx; |
180 | auto scale = (src_inner_idx != dst_inner_idx) ? 2 : 1; |
181 | return nelems > scale * max_nd_ref_nelems; |
182 | } |
183 | |
184 | using dim_pair_t = std::array<dimension_t, 2>; |
185 | |
186 | // Return whether the two blocks represent an equal part of the same dimension. |
187 | bool equal_blocks(const dim_pair_t &a, const dim_pair_t &b) { |
188 | return (a[0].size == b[0].size && a[1].size == b[1].size); |
189 | } |
190 | |
191 | // Combine dimension j into dimension i. |
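// For illustration, with a plain layout: combining j = 2 into i = 1 on a 2x3x4
// tensor with tag abc (strides {12, 4, 1}) yields a 2x12 tensor with strides
// {12, 1}.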
192 | void combine(memory_desc_t &md, int i, int j) { |
193 | const int new_ndims = md.ndims - 1; |
194 | if (new_ndims == 0) return; // Don't delete the only dimension. |
195 | auto &desc = md.format_desc.blocking; |
196 | auto &strides = desc.strides; |
197 | const int outer = strides[i] < strides[j] ? j : i; |
198 | const int inner = strides[i] < strides[j] ? i : j; |
199 | |
200 | const auto outer_stride = strides[outer]; |
201 | const auto outer_size = md.padded_dims[outer]; |
202 | md.offset0 += strides[outer] * md.padded_offsets[outer]; |
203 | md.dims[i] = md.dims[outer] * md.padded_dims[inner]; |
204 | md.padded_dims[i] = md.padded_dims[i] * md.padded_dims[j]; |
205 | md.padded_offsets[i] = md.padded_offsets[inner]; |
206 | strides[i] = strides[inner]; |
207 | for (int k = j; k < new_ndims; ++k) { |
208 | md.dims[k] = md.dims[k + 1]; |
209 | md.padded_dims[k] = md.padded_dims[k + 1]; |
210 | md.padded_offsets[k] = md.padded_offsets[k + 1]; |
211 | strides[k] = strides[k + 1]; |
212 | } |
213 | md.dims[new_ndims] = 0; |
214 | md.padded_dims[new_ndims] = 0; |
215 | md.padded_offsets[new_ndims] = 0; |
216 | strides[new_ndims] = 0; |
217 | |
218 | auto &idxs = desc.inner_idxs; |
219 | auto &blks = desc.inner_blks; |
220 | int nblks = desc.inner_nblks; |
221 | auto blks_size = utils::array_product(blks, nblks); |
222 | int count = 0; |
223 | bool last_is_combined = false; |
224 | dim_t blocks = 1; |
225 | for (int k = 0; k < nblks; ++k) { |
226 | if (idxs[k] == i || idxs[k] == j) { |
227 | blocks *= blks[k]; |
228 | // Combine the innermost dim and outermost block when they have the |
229 | // same index and no extra padding, e.g., ...A8a... -> ...a... |
230 | if (count == 0 && strides[i] == blks_size) { |
231 | md.dims[i] = md.padded_dims[i]; |
232 | strides[i] /= blks[k]; |
233 | blks_size /= blks[k]; |
234 | } else if (last_is_combined) { |
235 | blks[count - 1] *= blks[k]; |
236 | } else { |
237 | last_is_combined = true; |
238 | blks[count] = blks[k]; |
239 | idxs[count] = i; |
240 | count++; |
241 | } |
242 | continue; |
243 | } |
244 | last_is_combined = false; |
245 | blks[count] = blks[k]; |
246 | idxs[count] = (idxs[k] > j ? idxs[k] - 1 : idxs[k]); |
247 | count++; |
248 | } |
249 | // We've changed Nx1x...x1xM to 1x...x1xNM by combining dims, now fix the |
250 | // strides of the size-1 dims by multiplying by the step of the size-N dim. |
251 | auto outer_step = utils::div_up(outer_size, blocks); |
252 | for (int k = 0; k < new_ndims; ++k) { |
253 | if (strides[k] == outer_stride) strides[k] *= outer_step; |
254 | } |
255 | desc.inner_nblks = count; |
256 | md.ndims = new_ndims; |
257 | } |
258 | |
259 | void remove_bit(int &mask, int bit) { |
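    // For illustration: mask = 0b10110, bit = 2 -> mask becomes 0b1010
    // (bit 2 is dropped and the higher bits shift down by one).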
260 | const int lower_bits = (1 << bit) - 1; |
261 | mask = (mask & lower_bits) | ((mask >> 1) & ~lower_bits); |
262 | } |
263 | |
// For each dimension, determine whether the inner dimensions fail to account
// for its stride (e.g., due to padding). We cannot combine a dimension that
// does not align with the stride of the next outer dimension.
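// For illustration: dims 2x3x4 with strides {16, 4, 1} and no blocking: the
// stride of dim a (16) is not fully accounted for by the inner dims (3*4 = 12),
// so bit 0 is set in the returned mask.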
267 | int extended_dims(const memory_desc_t &md) { |
268 | int mask = 0; |
269 | const int ndims = md.ndims; |
270 | const auto &blkg = md.format_desc.blocking; |
271 | const int nblks = blkg.inner_nblks; |
272 | |
273 | auto dims = dims_by_stride(md); |
274 | std::vector<dim_t> blocks(ndims, 1); |
275 | dim_t expected_stride = 1; |
276 | for (int i = 0; i < nblks; ++i) { |
277 | auto idx = blkg.inner_idxs[i]; |
278 | auto blks = blkg.inner_blks[i]; |
279 | blocks[idx] *= blks; |
280 | expected_stride *= blks; |
281 | } |
282 | |
283 | for (int i = 0; i < ndims; ++i) { |
284 | const auto &dim = dims[i]; |
285 | auto stride = blkg.strides[dim.idx]; |
286 | auto step = utils::div_up(dim.size, blocks[dim.idx]); |
287 | if (stride != expected_stride) { |
288 | mask |= (1 << dim.idx); |
289 | expected_stride = stride; |
290 | } |
291 | expected_stride *= step; |
292 | } |
293 | return mask; |
294 | } |
295 | |
296 | struct pair_filter_t { |
297 | public: |
298 | using value_type = dim_pair_t; |
299 | |
300 | private: |
301 | using const_dim_iterator_t = typename dimensions_t::const_iterator; |
302 | using predicate_t = std::function<bool(const value_type &)>; |
303 | |
304 | public: |
305 | struct iterator_t { |
306 | bool operator==(const iterator_t &o) const { return it == o.it; } |
307 | bool operator!=(const iterator_t &o) const { return it != o.it; } |
308 | value_type operator*() const { return {*it, *(it + 1)}; } |
309 | iterator_t &operator++() { |
310 | advance(); |
311 | return *this; |
312 | } |
313 | iterator_t operator++(int) { |
314 | auto cpy = *this; |
315 | advance(); |
316 | return cpy; |
317 | } |
318 | iterator_t(const_dim_iterator_t it, const_dim_iterator_t end, |
319 | predicate_t pred) |
320 | : it(it), end(end), pred(std::move(pred)) { |
321 | advance(true); |
322 | } |
323 | |
324 | private: |
325 | void advance(bool check_first = false) { |
326 | if (it == end || (check_first && pred(operator*()))) return; |
327 | while (++it != end && !pred(operator*())) {} |
328 | } |
329 | |
330 | const_dim_iterator_t it, end; |
331 | predicate_t pred; |
332 | }; |
333 | |
334 | iterator_t begin() const { return {begin_, end_ - 1, pred}; } |
335 | iterator_t end() const { return {end_ - 1, end_ - 1, pred}; } |
336 | bool empty() const { return begin() == end(); } |
337 | |
338 | pair_filter_t(const dimensions_t &iter, const predicate_t &pred) |
339 | : begin_(iter.begin()), end_(iter.end()), pred(pred) {} |
340 | |
341 | private: |
342 | const_dim_iterator_t begin_, end_; |
343 | predicate_t pred; |
344 | }; |
345 | |
346 | #define NO_IDX (-1) |
347 | // Find the index of the dimension that always and only follows the dimension |
348 | // with index idx. If none exists, return NO_IDX. If no dimension with index idx |
349 | // is present in the given block representation, return idx to delete the |
// dimension.
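// For illustration, take the representation 8b:8a:4b:2a (listed innermost
// first): dim a always and only follows dim b, so calling successor() with
// b's index returns a's index.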
351 | int successor(const dimensions_t &a, int idx) { |
352 | int succ; |
353 | auto match_idx = [&](const dim_pair_t &p) { return p[0].idx == idx; }; |
354 | auto match_xor = [&](const dim_pair_t &p) { |
355 | return match_idx(p) ^ (p[1].idx == succ); |
356 | }; |
357 | // idx is the index of outermost dim; it has no successor |
358 | if (a.back().idx == idx) return NO_IDX; |
359 | auto filtered = pair_filter_t(a, match_idx); |
360 | // no dim with index idx appears in block representation; delete it |
361 | if (filtered.empty()) return idx; |
362 | succ = (*filtered.begin())[1].idx; |
363 | // succ is the index of the innermost dim; it has no predecessor |
364 | if (a.front().idx == succ) return NO_IDX; |
365 | if (!pair_filter_t(a, match_xor).empty()) return NO_IDX; |
366 | return succ; |
367 | } |
368 | |
369 | // Find the index of the dimension that ALWAYS follows dimension `idx` in the |
370 | // given block representations. The successor dimension will be combined with |
371 | // the given dimension, or, in the case that the given dimension does not appear |
372 | // in the block representation, it will be deleted. |
373 | int successor(const dimensions_t &a, const dimensions_t &b, int idx) { |
374 | auto succ = successor(a, idx); |
375 | if (succ == NO_IDX || succ != successor(b, idx)) return NO_IDX; |
376 | |
377 | auto pred = [&](const dim_pair_t &p) { return p[0].idx == idx; }; |
378 | pair_filter_t iter_a(a, pred); |
379 | pair_filter_t iter_b(b, pred); |
380 | |
381 | auto it_a = iter_a.begin(); |
382 | auto it_b = iter_b.begin(); |
383 | const auto end_a = iter_a.end(); |
384 | const auto end_b = iter_b.end(); |
385 | |
386 | for (; it_a != end_a && it_b != end_b; ++it_a, ++it_b) { |
387 | if (!equal_blocks(*it_a, *it_b)) return NO_IDX; |
388 | } |
389 | return (it_a != end_a || it_b != end_b) ? NO_IDX : succ; |
390 | } |
391 | |
392 | bool can_be_combined(int idx, int mask) { |
393 | return !(idx == NO_IDX || (mask & (1 << idx))); |
394 | } |
395 | |
396 | void compress(memory_desc_t &a, memory_desc_t &b, int &a_mask, int &b_mask) { |
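    // Combine pairs of dimensions that always appear together, in the same
    // order and with matching block sizes, in both layouts; this reduces the
    // number of dimensions the kernel has to handle. Dims covered by the scale
    // masks or flagged by extended_dims() are left untouched.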
397 | const auto blks_a = query_dims_and_blocks(a); |
398 | const auto blks_b = query_dims_and_blocks(b); |
399 | const int skip_mask = a_mask | b_mask | extended_dims(a) | extended_dims(b); |
400 | |
401 | const int ndims = a.ndims; |
402 | std::vector<int> successors(ndims, NO_IDX); |
403 | std::vector<int> aliases(ndims); |
404 | for (int i = 0; i < ndims; ++i) { |
405 | aliases[i] = i; |
406 | if ((a_mask | b_mask) & (1 << i)) continue; |
407 | auto succ = successor(blks_a, blks_b, i); |
408 | if (!can_be_combined(succ, skip_mask)) continue; |
409 | successors[i] = succ; |
410 | } |
411 | |
412 | for (int i = ndims - 1; i >= 0; --i) { |
413 | int succ = successors[i]; |
414 | if (succ == NO_IDX) continue; |
415 | while (succ != aliases[succ]) |
416 | succ = aliases[succ]; |
417 | int from = std::max(i, succ); |
418 | int into = std::min(i, succ); |
419 | combine(a, into, from); |
420 | combine(b, into, from); |
421 | remove_bit(a_mask, from); |
422 | remove_bit(b_mask, from); |
423 | aliases[from] = into; |
424 | } |
425 | } |
426 | #undef NO_IDX |
427 | |
428 | void fix_steps(dimensions_t &blk, dimensions_t pkt) { |
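    // Recompute each block's step: the number of elements of its dim already
    // covered by the packet and by the preceding (inner) blocks.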
429 | int steps[MAX_NDIMS] = {1, 1, 1, 1, 1, 1}; |
430 | for (size_t i = 0; i < pkt.size(); i++) { |
431 | steps[pkt[i].idx] *= pkt[i].size; |
432 | } |
433 | for (size_t i = 0; i < blk.size(); i++) { |
434 | blk[i].step = steps[blk[i].idx]; |
435 | steps[blk[i].idx] *= blk[i].size; |
436 | } |
437 | } |
438 | |
// Returns the blocks that are present in `all` but missing from `subset`.
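// For illustration: all = 8a:2b, subset = 16a -> returns 2b; the 8a part of
// `all` is already covered by the subset while dim b is not covered at all.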
440 | dimensions_t find_missing_blocks( |
441 | dimensions_t all, dimensions_t subset, bool round_up) { |
442 | dimensions_t ret; |
443 | for (size_t ia = 0; ia < all.size(); ia++) { |
444 | dimension_t from_a = all[ia]; |
445 | for (size_t ib = 0; ib < subset.size(); ib++) { |
446 | if (subset[ib].idx == from_a.idx) { |
447 | auto smaller = std::min(from_a.size, subset[ib].size); |
448 | if (round_up) { |
449 | from_a.size = utils::div_up(from_a.size, smaller); |
450 | subset[ib].size = utils::div_up(subset[ib].size, smaller); |
451 | } else { |
452 | from_a.size /= smaller; |
453 | subset[ib].size /= smaller; |
454 | } |
455 | } |
456 | } |
457 | if (from_a.size > 1) { ret.push_back(from_a); } |
458 | } |
459 | return ret; |
460 | } |
461 | |
462 | enum order_t { none, a_then_b, b_then_a }; |
463 | |
464 | dimensions_t remainder(dimensions_t all, dimensions_t subset) { |
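    // For illustration: all = 1024b:1024a (innermost first) and subset = 16b
    // leave a remainder of 64b (with step 16) followed by 1024a.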
465 | dimensions_t ret; |
466 | for (size_t i = 0; i < all.size(); i++) { |
467 | if (i < subset.size()) { |
468 | if (all[i].size == subset[i].size) { |
469 | continue; |
470 | } else { |
471 | dimension_t item; |
472 | item.idx = all[i].idx; |
473 | item.size = all[i].size / subset[i].size; |
474 | item.step = all[i].step * subset[i].size; |
475 | ret.push_back(item); |
476 | } |
477 | } else { |
478 | ret.push_back(all[i]); |
479 | } |
480 | } |
481 | return ret; |
482 | } |
483 | |
// Given a format description, try to find a formula for 16 adjacent items to
// vectorize across.
486 | // Examples: |
487 | // For 1024x1024 ab, it will be (16b) |
488 | // for 16x16x16 ABc2a2b, it will be (4c2a2b) |
489 | bool fill_to_vect( |
490 | int simd_size, const dimensions_t &all, dimensions_t &subset) { |
491 | const int min_full_vecs = 5; // TODO: tune me |
492 | dim_t current_size = 1; |
493 | subset.clear(); |
494 | for (auto &dim : all) { |
495 | dim_t next_size = current_size * dim.size; |
496 | int next_full_vecs = next_size / simd_size; |
497 | if (next_full_vecs >= min_full_vecs || next_size % simd_size == 0) { |
            // Vectorize the innermost dim(s). If they are not divisible by the
            // simd size they will need to be padded, and for that the
            // vectorized dim(s) should be large enough; otherwise the padding
            // would be too significant a fraction of the tensor and would hurt
            // performance.
502 | dimension_t tmp = dim; |
503 | tmp.size = simd_size / current_size; |
504 | subset.push_back(tmp); |
505 | return true; |
506 | } |
507 | // No hope of properly filling the vector. |
508 | if (simd_size % next_size != 0) return false; |
509 | current_size = next_size; |
510 | subset.push_back(dim); |
511 | } |
    // There was not enough data in the tensor to fill even a single packet.
513 | return false; |
514 | } |
515 | |
516 | bool add_to_vector(dimensions_t &v, dimension_t item) { |
517 | if (v.empty() || item.idx != v.back().idx) { |
518 | if (v.size() >= LOOP_NEST_LEVEL) { return false; } |
519 | v.push_back(item); |
520 | v.back().size = item.size; |
521 | } else { |
522 | v.back().size *= item.size; |
523 | } |
524 | return true; |
525 | } |
526 | |
527 | bool no_more_such_idx(dimensions_t &vect, size_t iter) { |
528 | const int idx_to_search_for = vect[iter].idx; |
529 | for (size_t i = iter + 1; i < vect.size(); i++) { |
530 | if (vect[i].idx == idx_to_search_for) { return false; } |
531 | } |
532 | return true; |
533 | } |
534 | |
// Given a full description of a tensor and a subset of that description,
// sort the subset in such a way that it describes the longest possible
// sequence of contiguous memory addresses.
// Example: full 32a32b4c4a, subset 12a2b4c,
// result = 3a2b4c4a, which gives 3 distant sets of 2*4*4 adjacent items
540 | dimensions_t fix_order_to(dimensions_t input, dimensions_t ref) { |
541 | dimensions_t ret; |
542 | for (size_t i = 0; i < ref.size(); i++) { |
543 | for (size_t j = 0; j < input.size(); j++) { |
544 | if (ref[i].size != 1 && input[j].size != 1 |
545 | && ref[i].idx == input[j].idx) { |
546 | int smaller = std::min(ref[i].size, input[j].size); |
547 | if (no_more_such_idx(ref, i) || j == input.size() - 1) { |
548 | smaller = input[j].size; |
549 | } |
550 | dimension_t item = ref[i]; |
551 | item.size = smaller; |
552 | ref[i].size = utils::div_up(ref[i].size, smaller); |
553 | input[j].size = utils::div_up(input[j].size, smaller); |
554 | add_to_vector(ret, item); |
555 | } |
556 | } |
557 | } |
    // It is possible that the requested block on a dimension of src is bigger
    // than the whole dimension in src. That happens when there's large padding
    // in dst. Add this block at the end; it will be handled by padding in the
    // OpenCL code.
561 | for (size_t i = 0; i < input.size(); i++) { |
562 | if (input[i].size > 1) { add_to_vector(ret, input[i]); } |
563 | } |
564 | return ret; |
565 | } |
566 | |
567 | int check_size(dimensions_t block) { |
568 | int length = 1; |
569 | for (size_t i = 0; i < block.size(); i++) { |
570 | length *= block[i].size; |
571 | } |
572 | return length; |
573 | } |
574 | |
// Given a full tensor description and a subset of that description, find
// how many items are adjacent in memory.
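// For illustration: all = 4a:4c:2b (innermost first), subset = 4c:4a -> 16
// adjacent items; dim b is not covered by the subset, so the burst ends there.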
577 | size_t check_burst_length(dimensions_t all, dimensions_t subset) { |
578 | size_t length = 1; |
579 | for (size_t i = 0; i < all.size(); i++) { |
580 | for (size_t j = 0; j < subset.size(); j++) { |
581 | if (all[i].idx == subset[j].idx) { |
582 | auto smaller = std::min(all[i].size, subset[j].size); |
583 | length *= (int)smaller; |
584 | all[i].size /= smaller; |
585 | subset[j].size /= smaller; |
586 | } |
587 | } |
588 | if (all[i].size != 1) { |
589 | return length; |
590 | } // dim not covered in block, so burst ends |
591 | } |
592 | return length; |
593 | } |
594 | |
// Given a full tensor description and a subset of that description which
// determines how many items will be read in a burst, try to enlarge the subset
// to increase the burst size and achieve better cache line utilization.
// Example: full 32a32b4c4a, subset 12a2b2c,
// current burst size = 8 (2c*4a); enlarge subset to 12a2b4c to achieve
// burst size = 32 (2b*4c*4a)
601 | bool increase_burst(dimensions_t all, dimensions_t &subset, dimensions_t &other, |
602 | size_t itemlimit, size_t current_size, size_t optimal_size) { |
603 | const dim_t space_coeff = itemlimit / check_size(subset); |
604 | const dim_t request_coeff = utils::div_up(optimal_size, current_size); |
605 | dimensions_t subset_copy = subset; |
606 | if (space_coeff < 2) { return false; } |
607 | for (size_t i = 0; i < all.size(); i++) { |
608 | for (size_t j = 0; j < subset_copy.size(); j++) { |
609 | if (all[i].idx == subset_copy[j].idx) { |
610 | auto smaller = std::min(all[i].size, subset_copy[j].size); |
611 | all[i].size /= smaller; |
612 | subset_copy[j].size /= smaller; |
613 | } |
614 | } |
615 | if (all[i].size != 1) { |
            // Add a new item to the subset, or enlarge its last item if it
            // has the same dim index.
617 | auto incr = std::min(space_coeff, all[i].size); |
618 | incr = std::min(incr, request_coeff); |
619 | all[i].size = incr; |
620 | bool success = add_to_vector(subset, all[i]); |
621 | if (!success) { return false; } |
622 | add_to_vector(other, all[i]); |
623 | return true; |
624 | } |
625 | } |
626 | return false; |
627 | } |
628 | |
629 | // "packet" - set of 16 adjacent data to be read in one go by a subgroup |
630 | // "block" - how many iterations of packet read should a subgroup do |
631 | // This function splits tensor description into blocks and packets in such way |
632 | // that optimizes burst length. |
633 | bool split_into_blocks_and_packets(size_t vect, size_t optimal_burst_bytes, |
634 | size_t memlimit_bytes, size_t sizeof_src, size_t sizeof_dst, |
635 | const dimensions_t &src, const dimensions_t &dst, |
636 | dimensions_t &src_packet, dimensions_t &src_block, |
637 | dimensions_t &dst_packet, dimensions_t &dst_block) { |
638 | |
639 | // 1. determine composition of src and dst packet |
640 | if (!fill_to_vect((int)vect, src, src_packet)) { return false; } |
641 | if (!fill_to_vect((int)vect, dst, dst_packet)) { return false; } |
    // 2. determine which parts of the tensor format tag are left after taking
    // away the packet
643 | dimensions_t sremainder = remainder(src, src_packet); |
644 | dimensions_t dremainder = remainder(dst, dst_packet); |
    // 3. The same amount of data will be read and written, so every dimension
    // that's in the src packet and not in the dst packet must be in the dst
    // block, and vice versa.
647 | src_block = find_missing_blocks(dst_packet, src_packet, true); |
648 | dst_block = find_missing_blocks(src_packet, dst_packet, false); |
649 | // 4a. Check how much continuous data will be read/written... |
650 | size_t burst_size_src |
651 | = vect * sizeof_src * check_burst_length(sremainder, src_block); |
652 | size_t burst_size_dst |
653 | = vect * sizeof_dst * check_burst_length(dremainder, dst_block); |
654 | bool success = true; |
655 | // TODO: use smaller of SRC_T, DST_T type to conserve local mem |
656 | size_t itemlimit = memlimit_bytes / (vect * sizeof_src); |
657 | // 4b. ... and determine if that's long enough to achieve good performance |
658 | while (success |
659 | && (burst_size_src < optimal_burst_bytes |
660 | || burst_size_dst < optimal_burst_bytes)) { |
661 | // 5. If burst needs to be longer, attempt to increase block size (but |
662 | // don't exceed local memory limits as that would hurt performance) |
663 | if (burst_size_src < burst_size_dst) { |
664 | success = increase_burst(sremainder, src_block, dst_block, |
665 | itemlimit, burst_size_src, optimal_burst_bytes); |
666 | } else { |
667 | success = increase_burst(dremainder, dst_block, src_block, |
668 | itemlimit, burst_size_dst, optimal_burst_bytes); |
669 | } |
670 | burst_size_src |
671 | = vect * sizeof_src * check_burst_length(sremainder, src_block); |
672 | burst_size_dst |
673 | = vect * sizeof_dst * check_burst_length(dremainder, dst_block); |
674 | } |
    // 6. At this point the contents of the src and dst blocks are not sorted.
    // Sort each of them according to the tensor format tag to create the
    // longest possible contiguous memory accesses.
678 | src_block = fix_order_to(src_block, sremainder); |
679 | dst_block = fix_order_to(dst_block, dremainder); |
680 | fix_steps(src_block, src_packet); |
681 | fix_steps(dst_block, dst_packet); |
682 | return true; |
683 | } |
684 | |
685 | bool fill_conf_vld(const memory_desc_wrapper &src, |
686 | const memory_desc_wrapper &dst, int scale_mask, size_t memlimit_bytes, |
687 | size_t optimal_burst_bytes, vectorize_last_dim_t &cfg, int &vect_dim, |
688 | int &vect_size, dim_t *blocks) { |
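    // Translate the packet/block split into the plain-array config consumed
    // by the OpenCL kernel: vectorized (packet) dims go to src_vct/dst_vct,
    // loop (block) dims go to src_blk/dst_blk.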
689 | |
690 | const dimensions_t src_dims = query_dims_and_blocks(src); |
691 | const dimensions_t dst_dims = query_dims_and_blocks(dst); |
692 | dimensions_t src_packet, src_block, dst_packet, dst_block; |
    bool success = split_into_blocks_and_packets(16, optimal_burst_bytes,
            memlimit_bytes, src.data_type_size(), dst.data_type_size(),
695 | src_dims, dst_dims, src_packet, src_block, dst_packet, dst_block); |
696 | if (!success) { return false; } |
697 | // Below: unpack std vectors into POD arrays |
698 | |
699 | cfg.src_vect_limit = (int)check_burst_length(src_packet, src_packet); |
700 | cfg.dst_vect_limit = (int)check_burst_length(dst_packet, dst_packet); |
701 | |
702 | // reset packet and loop |
703 | for (size_t i = 0; i < LOOP_NEST_LEVEL; i++) { |
704 | cfg.src_vct[i].blk_size = 1; |
705 | cfg.dst_vct[i].blk_size = 1; |
706 | cfg.src_blk[i].blk_size = 1; |
707 | cfg.dst_blk[i].blk_size = 1; |
708 | cfg.src_vct[i].step_size = 1; |
709 | cfg.dst_vct[i].step_size = 1; |
710 | cfg.src_blk[i].step_size = 1; |
711 | cfg.dst_blk[i].step_size = 1; |
712 | cfg.src_vct[i].dim_idx = 0; |
713 | cfg.dst_vct[i].dim_idx = 0; |
714 | cfg.src_blk[i].dim_idx = 0; |
715 | cfg.dst_blk[i].dim_idx = 0; |
716 | } |
717 | cfg.src_vct[0].blk_size = src_packet[0].size; |
718 | cfg.src_vct[0].dim_idx = src_packet[0].idx; |
719 | cfg.dst_vct[0].blk_size = dst_packet[0].size; |
720 | cfg.dst_vct[0].dim_idx = dst_packet[0].idx; |
721 | for (size_t i = 0; i < src_packet.size(); i++) { |
722 | cfg.src_vct[i].dim_idx = src_packet[i].idx; |
723 | cfg.src_vct[i].blk_size = src_packet[i].size; |
724 | cfg.src_vct[i].step_size = src_packet[i].step; |
725 | } |
726 | for (size_t i = 0; i < dst_packet.size(); i++) { |
727 | cfg.dst_vct[i].dim_idx = dst_packet[i].idx; |
728 | cfg.dst_vct[i].blk_size = dst_packet[i].size; |
729 | cfg.dst_vct[i].step_size = dst_packet[i].step; |
730 | } |
731 | |
732 | // fill src's and dst's loop recipe |
733 | for (size_t i = 0; i < src_block.size(); i++) { |
734 | cfg.src_blk[i].dim_idx = src_block[i].idx; |
735 | cfg.src_blk[i].blk_size = src_block[i].size; |
736 | cfg.src_blk[i].step_size = src_block[i].step; |
737 | } |
738 | for (size_t i = 0; i < dst_block.size(); i++) { |
739 | cfg.dst_blk[i].dim_idx = dst_block[i].idx; |
740 | cfg.dst_blk[i].blk_size = dst_block[i].size; |
741 | cfg.dst_blk[i].step_size = dst_block[i].step; |
742 | } |
743 | cfg.vector_dim = dst_packet[0].idx; |
744 | vect_dim = dst_packet[0].idx; |
745 | vect_size = 16; |
746 | for (int i = 0; i < LOOP_NEST_LEVEL; i++) { |
747 | if (cfg.dst_blk[i].blk_size != 1) { |
748 | blocks[cfg.dst_blk[i].dim_idx] *= cfg.dst_blk[i].blk_size; |
749 | } |
750 | } |
    // Multiply the size of the dimension that will be vectorized by 16.
    // This is a workaround for two dispatcher problems:
    // - it doesn't allow vectorization of dims that are not divisible by 16
    // - the vectorized dim's coordinate returned on the OpenCL side is rounded
    //   to 16
    // Multiplying the dim-to-be-vectorized by 16 immediately solves the first
    // issue; we declare a larger block on that dim to prevent the dispatcher
    // from spawning too many work items over the enlarged dim; and later, on
    // the OpenCL side, we divide the dim's coordinate by 16 to recover
    // fine-grained coordinates that are not rounded to 16.
760 | cfg.rescale_coeff = 16; |
761 | |
762 | for (int i = 0; i < LOOP_NEST_LEVEL; i++) { |
763 | auto db = cfg.dst_vct[i]; |
764 | blocks[db.dim_idx] *= db.blk_size; |
765 | } |
766 | |
767 | return true; |
768 | } |
769 | |
770 | status_t generic_reorder_t::pd_t::init_conf(engine_t *engine) { |
771 | using namespace format_tag; |
772 | |
773 | size_t memlimit_bytes; |
774 | size_t optimal_burst_bytes; |
775 | |
776 | const memory_desc_wrapper original_src_mdw(src_md()); |
777 | const memory_desc_wrapper original_dst_mdw(dst_md()); |
778 | quantization_t src_quant(attr(), original_src_mdw, DNNL_ARG_SRC); |
779 | quantization_t dst_quant(attr(), original_dst_mdw, DNNL_ARG_DST); |
780 | |
781 | auto src_mask = src_quant.scale_mask(); |
782 | auto dst_mask = dst_quant.scale_mask(); |
783 | |
784 | memory_desc_t new_a; |
785 | memory_desc_t new_b; |
786 | primitive_attr_t attr_copy = *attr(); |
787 | memcpy(&new_a, src_md(), sizeof(new_a)); |
788 | memcpy(&new_b, dst_md(), sizeof(new_b)); |
789 | compress(new_a, new_b, src_mask, dst_mask); |
790 | if (src_mask) attr_copy.scales_.set(DNNL_ARG_SRC, src_mask); |
791 | if (dst_mask) attr_copy.scales_.set(DNNL_ARG_DST, dst_mask); |
792 | |
793 | if (!is_generic_faster_than_ref(new_a, new_b)) return status::unimplemented; |
794 | |
795 | const memory_desc_wrapper src_mdw(new_a); |
796 | const memory_desc_wrapper dst_mdw(new_b); |
797 | conf.src_md_info = memory_desc_info_t::create(src_mdw); |
798 | conf.dst_md_info = memory_desc_info_t::create(dst_mdw); |
799 | |
800 | conf.src_quant = {&attr_copy, src_mdw, DNNL_ARG_SRC}; |
801 | conf.dst_quant = {&attr_copy, dst_mdw, DNNL_ARG_DST}; |
802 | conf.sum_quant = {&attr_copy}; |
803 | |
804 | status_t status = status::success; |
805 | |
806 | const auto &padded_dims = dst_mdw.padded_dims(); |
807 | conf.has_padding = !src_mdw.is_dense() || !dst_mdw.is_dense(); |
808 | conf.ndims = src_mdw.ndims(); |
809 | conf.nelems = utils::array_product(padded_dims, conf.ndims); |
810 | |
811 | conf.sub_group_size = 1; |
812 | if (conf.nelems == 0) { return status::success; } |
813 | auto *compute_engine = utils::downcast<compute::compute_engine_t *>(engine); |
814 | |
    // Theoretically, bursts should be at least big enough to span a whole
    // cache line, and bigger bursts should give better performance as long as
    // local memory capacity is not exceeded. However, all tests show that a
    // burst size of 64 gives the best performance regardless of cache line
    // size.
819 | memlimit_bytes = 2048; |
820 | optimal_burst_bytes = 64; |
821 | |
822 | dim_t blocks[MAX_NDIMS] = {1, 1, 1, 1, 1, 1}; |
823 | int vect_size = 1; |
824 | int vect_dim = 0; |
825 | |
826 | if (!fill_conf_vld(src_mdw, dst_mdw, src_mask | dst_mask, memlimit_bytes, |
827 | optimal_burst_bytes, conf.aux_data.vld, vect_dim, vect_size, |
828 | &blocks[0])) { |
829 | return status::unimplemented; |
830 | } |
831 | |
832 | conf.sub_group_size = vect_size; |
833 | |
834 | conf.dispatch = compute_engine->create_dispatch(dst_mdw.md_); |
835 | |
836 | for (int i = 0; i < MAX_NDIMS; ++i) { |
        auto dim_str = utils::format("D%d", i);
838 | if (i < dst_mdw.ndims()) { |
839 | uint64_t dim = padded_dims[i]; |
            // Pad the vectorized dim to a multiple of the block size (to make
            // sure that enough work items are generated to have only full
            // subgroups, no fractions), then multiply it by the vector size
            // (to work around the dispatcher's limitation that the vectorized
            // dim must be divisible by the vector size).
845 | if (i == vect_dim) { |
846 | dim = utils::rnd_up(dim, blocks[i]); |
847 | dim *= 16; |
848 | } |
849 | conf.dispatch.define_dim(dim_str, i, dim, blocks[i]); |
850 | } else { |
851 | conf.dispatch.define_dim(dim_str, 1); |
852 | } |
853 | } |
854 | if (vect_size != 1) { |
        const auto dim_str = utils::format("D%d", vect_dim);
856 | conf.dispatch.vectorize_dim(dim_str, vect_size); |
857 | } |
858 | |
859 | conf.dispatch.generate(); |
860 | |
861 | return status; |
862 | } |
863 | |
864 | status_t generic_reorder_t::pd_t::init_kernel_ctx( |
865 | compute::kernel_ctx_t &kernel_ctx) const { |
866 | using namespace format_tag; |
867 | |
868 | const memory_desc_wrapper src_mdw(src_md()); |
869 | const memory_desc_wrapper dst_mdw(dst_md()); |
870 | |
871 | if (conf.nelems == 0) return status::success; |
872 | |
    kernel_ctx.define_int("NDIMS", conf.ndims);
    kernel_ctx.add_option("-cl-std=CL2.0");
875 | |
    conf.src_quant.define_macros(kernel_ctx, "SRC");
    conf.dst_quant.define_macros(kernel_ctx, "DST");
    conf.sum_quant.define_macros(kernel_ctx, "SUM");
879 | |
880 | def_dispatch(kernel_ctx, conf.dispatch); |
881 | |
    kernel_ctx.define_int("SUB_GROUP_SIZE", conf.sub_group_size);

    kernel_ctx.define_int("PAD_FILL_ZERO", conf.has_padding);

    def_memory_desc_info(kernel_ctx, conf.src_md_info, "SRC");
    def_memory_desc_info(kernel_ctx, conf.dst_md_info, "DST");

    kernel_ctx.define_int("GENERIC_REORDER", 1);
    kernel_ctx.define_int("VECT_DIM", conf.aux_data.vld.vector_dim);
    kernel_ctx.define_int("VECT_SIZE", conf.sub_group_size);
    kernel_ctx.define_int("RESCALE_COEFF", conf.aux_data.vld.rescale_coeff);
    kernel_ctx.define_int("LIMIT_SSGID", conf.aux_data.vld.src_vect_limit);
    kernel_ctx.define_int("LIMIT_DSGID", conf.aux_data.vld.dst_vect_limit);
895 | auto r = conf.dispatch.nd_range(); |
896 | auto *lr = r.local_range(); |
    kernel_ctx.define_int(
            "SG_PER_WG", (lr[0] * lr[1] * lr[2]) / conf.sub_group_size);
899 | int i = 0; |
900 | int cache_dim[MAX_NDIMS] = {1, 1, 1, 1, 1, 1}; |
901 | while (i < LOOP_NEST_LEVEL) { |
902 | cache_dim[conf.aux_data.vld.dst_vct[i].dim_idx] |
903 | *= conf.aux_data.vld.dst_vct[i].blk_size; |
904 | cache_dim[conf.aux_data.vld.dst_blk[i].dim_idx] |
905 | *= conf.aux_data.vld.dst_blk[i].blk_size; |
        kernel_ctx.define_int(std::string("S_BLK_SIZE_") + std::to_string(i),
                conf.aux_data.vld.src_blk[i].blk_size);
        kernel_ctx.define_int(std::string("S_BLK_STEP_") + std::to_string(i),
                conf.aux_data.vld.src_blk[i].step_size);
        kernel_ctx.define_int(std::string("S_BLK_IDX_") + std::to_string(i),
                conf.aux_data.vld.src_blk[i].dim_idx);
        kernel_ctx.define_int(std::string("D_BLK_SIZE_") + std::to_string(i),
                conf.aux_data.vld.dst_blk[i].blk_size);
        kernel_ctx.define_int(std::string("D_BLK_STEP_") + std::to_string(i),
                conf.aux_data.vld.dst_blk[i].step_size);
        kernel_ctx.define_int(std::string("D_BLK_IDX_") + std::to_string(i),
                conf.aux_data.vld.dst_blk[i].dim_idx);
918 | i++; |
919 | } |
920 | int cache_stride = 1; |
921 | for (int i = 0; i < MAX_NDIMS; i++) { |
922 | kernel_ctx.define_int( |
                std::string("CACHE_STRIDE_") + std::to_string(i), cache_stride);
924 | cache_stride *= cache_dim[i]; |
925 | } |
926 | int s_size_so_far = 1; |
927 | int d_size_so_far = 1; |
928 | for (int i = 0; i < LOOP_NEST_LEVEL; i++) { |
929 | auto s = conf.aux_data.vld.src_vct[i]; |
930 | auto d = conf.aux_data.vld.dst_vct[i]; |
        kernel_ctx.define_int(
                std::string("S_MOD_") + std::to_string(i), s.blk_size);
        kernel_ctx.define_int(
                std::string("S_DIV_") + std::to_string(i), s_size_so_far);
        kernel_ctx.define_int(
                std::string("S_MUL_") + std::to_string(i), s.step_size);
        kernel_ctx.define_int(
                std::string("S_IDX_") + std::to_string(i), s.dim_idx);
        kernel_ctx.define_int(
                std::string("D_MOD_") + std::to_string(i), d.blk_size);
        kernel_ctx.define_int(
                std::string("D_DIV_") + std::to_string(i), d_size_so_far);
        kernel_ctx.define_int(
                std::string("D_MUL_") + std::to_string(i), d.step_size);
        kernel_ctx.define_int(
                std::string("D_IDX_") + std::to_string(i), d.dim_idx);
947 | |
948 | s_size_so_far *= s.blk_size; |
949 | d_size_so_far *= d.blk_size; |
950 | } |
951 | |
952 | kernel_ctx.print_options(); |
953 | return status::success; |
954 | } |
955 | |
956 | void generic_reorder_t::pd_t::init_scratchpad() { |
957 | if (conf.src_quant.with_scale()) { |
958 | auto scratchpad = scratchpad_registry().registrar(); |
959 | scratchpad.book(memory_tracking::names::key_reorder_src_scales, |
960 | conf.src_quant.num_scales(), sizeof(float), |
961 | OCL_BUFFER_ALIGNMENT); |
962 | } |
963 | if (conf.dst_quant.with_scale()) { |
964 | auto scratchpad = scratchpad_registry().registrar(); |
965 | scratchpad.book(memory_tracking::names::key_reorder_dst_scales, |
966 | conf.dst_quant.num_scales(), sizeof(float), |
967 | OCL_BUFFER_ALIGNMENT); |
968 | } |
969 | } |
970 | |
971 | status_t generic_reorder_t::execute(const exec_ctx_t &ctx) const { |
972 | |
973 | status_t status = status::success; |
974 | |
975 | auto &src = CTX_IN_STORAGE(DNNL_ARG_FROM); |
976 | auto &dst = CTX_OUT_STORAGE(DNNL_ARG_TO); |
977 | CHECK(status); |
978 | |
979 | const auto &conf = pd()->conf; |
980 | if (conf.nelems == 0) { return status::success; } |
981 | |
982 | compute::kernel_arg_list_t arg_list; |
983 | arg_list.set(0, src); |
984 | arg_list.set(1, dst); |
985 | |
986 | arg_list.set(2, conf.src_quant.scales(ctx)); |
987 | arg_list.set(3, conf.src_quant.zero_points(ctx)); |
988 | arg_list.set(4, conf.dst_quant.scales(ctx)); |
989 | arg_list.set(5, conf.dst_quant.zero_points(ctx)); |
990 | |
991 | arg_list.set(6, conf.sum_quant.scales()); |
992 | arg_list.set(7, conf.sum_quant.zero_points()); |
993 | |
994 | auto nd_range = conf.dispatch.nd_range(); |
995 | |
996 | status = parallel_for(ctx, nd_range, kernel_, arg_list); |
997 | return status; |
998 | } |
999 | |
1000 | } // namespace ocl |
1001 | } // namespace gpu |
1002 | } // namespace impl |
1003 | } // namespace dnnl |
1004 | |