/*******************************************************************************
* Copyright 2021-2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

/*
 Fast, generic reorder kernel.

 The reorder kernel is strictly memory-bound: performance is determined only
 by memory access efficiency.

 There is no performance difference between single-word reads and block read
 functions (intel_sub_group_block_read8 etc.). The only thing that matters is
 that whenever data is read from or written to global memory, the code must
 utilize whole cache lines. Using smaller chunks causes cache eviction and
 requires the same data to be accessed again later, wasting bandwidth.

 This kernel therefore tries to load/store data in packets that are at least
 as large as a cache line.

 Example: abc -> bca, 32x32x32, data type f32.
 Assume SIMD16 and a cache line size of 64 B.
 To fill a cache line the kernel must load 16 consecutive items (16c) and
 store 16 consecutive items (16a), so it needs to operate on a matrix of
 16a x 16c.
 It will load 16 non-adjacent (strided by A) sets of 16 adjacent items
 (strided by C, src's innermost dimension), perform an internal transposition,
 then store 16 non-adjacent (strided by C) sets of 16 adjacent items (strided
 by A, dst's innermost dimension).

 The difficulty lies in determining how to achieve the above for any
 combination of tensor sizes and format tags.
*/
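/*
 A hypothetical walk-through of the numbers above (illustration only, not part
 of the implementation): with f32 data, 16 consecutive items are 16 * 4 B
 = 64 B, i.e., exactly one cache line. A SIMD16 subgroup working on a 16a x 16c
 tile therefore issues 16 full-cache-line reads from src (one per value of a)
 and, after transposing the tile internally, 16 full-cache-line writes to dst
 (one per value of c). Accessing the same tile element by element would touch
 the same cache lines repeatedly while using only 4 B of each 64 B transfer.
*/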

#include <algorithm>
#include "gpu/ocl/generic_reorder.hpp"

#include "common/utils.hpp"
#include "gpu/ocl/ocl_stream.hpp"
#include "gpu/ocl/ocl_utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace ocl {

using namespace dnnl::impl::memory_tracking::names;

struct dimension_t {
    dim_t size;
    dim_t step;
    int idx;
};

using dimensions_t = std::vector<dimension_t>;

// Return a description of dimensions sorted by stride, i.e., nesting order.
dimensions_t dims_by_stride(const memory_desc_wrapper &mdw) {
    const auto &desc = mdw.blocking_desc();
    const auto &strides = desc.strides;

    // Sort blocks by stride.
    const auto cmp = [&](const dimension_t &a, const dimension_t &b) {
        // Order by stride. Ties mean that we have at least one dim of size 1.
        // We don't care about the order of those dims, just that the dim
        // with size > 1 is sorted last.
        const auto a_stride = strides[a.idx];
        const auto b_stride = strides[b.idx];
        return a_stride < b_stride || (a_stride == b_stride && a.size < b.size);
    };

    const int ndims = mdw.ndims();
    dimensions_t dims(ndims);
    for (int d = 0; d < ndims; ++d) {
        auto &blk = dims[d];
        blk.idx = d;
        blk.size = mdw.padded_dims()[d];
    }
    std::sort(dims.begin(), dims.end(), cmp);
    return dims;
}

// Returns a description of the blocks and dimensions that constitute the
// tensor's format tag, starting from the innermost. Blocks, if present, take
// precedence over dimensions. The order of dimensions is determined by sorting
// strides; the smallest stride is the innermost dimension. Dimensions of size
// 1 are ignored. This may lead to illegal tensor tags, where the innermost dim
// is the same as the outermost block:
// 8x8x1x1 aBcd8b becomes cdaB8b (note: "B8b").
// In such cases, this function combines the last dim with the first block(s),
// so xB8b becomes xb. Dimensions are treated like blocks, i.e., they don't
// report the whole tensor size across a given axis but rather the number of
// underlying blocks for the given dimension.
// Example: ABcd8b8a2b 32x48x1x7 will return a description that amounts to...
// outermost-> 1c:4a:3b:7d:8b:8a:2b <-innermost
dimensions_t query_dims_and_blocks(const memory_desc_wrapper &mdw) {
    auto blocks = dims_by_stride(mdw);
    const int ndims = mdw.ndims();
    const auto &desc = mdw.blocking_desc();
    const int nblks = desc.inner_nblks;

    // Calculate info for inner blocks
    dimensions_t inner_blks(nblks);
    std::vector<int> steps(ndims, 1);
    dim_t blks_size = 1;
    for (int i = nblks - 1; i >= 0; --i) {
        auto &blk = inner_blks[i];
        blk.idx = desc.inner_idxs[i];
        blk.size = desc.inner_blks[i];
        blk.step = steps[blk.idx];
        // steps increase in reverse order of how blocks are listed
        steps[blk.idx] *= blk.size;
        blks_size *= blk.size;
    }

    // Divide dim by its step to get block size
    for (auto &blk : blocks) {
        blk.step = steps[blk.idx];
        blk.size = utils::div_up(blk.size, blk.step);
    }

    // If we have any dims with block size 1, we ignore them.
    const auto size_1 = [](const dimension_t &b) { return b.size == 1; };
    const auto end = blocks.end();
    blocks.erase(std::remove_if(blocks.begin(), end, size_1), end);

    dim_t stride = blocks.empty() ? 1 : desc.strides[blocks[0].idx];
    for (auto &blk : inner_blks) {
        if (blk.size == 1) continue; // Can safely ignore blocks of size 1
        if (blocks.empty() || blocks[0].idx != blk.idx || blks_size != stride) {
            blocks.insert(blocks.begin(), blk);
        } else {
            // Combine blocks with repeated index if there is no extra padding
            blk.size *= blocks[0].size;
            blocks[0] = blk;
        }
        blks_size /= blk.size;
        stride = blks_size;
    }

    if (blocks.empty() && ndims > 0) {
        dimension_t blk;
        blk.idx = 0;
        blk.size = 1;
        blk.step = 1;
        blocks.push_back(blk);
    }
    return blocks;
}

dimensions_t query_dims_and_blocks(const memory_desc_t &md) {
    const memory_desc_wrapper mdw(md);
    return query_dims_and_blocks(mdw);
}

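// Heuristic, not a measured model: the generic kernel is assumed to beat the
// reference kernel only for large enough tensors, with a doubled threshold
// when src and dst have different innermost dimensions (i.e., the reorder
// transposes data). E.g., a 256x256 ab -> ba reorder has nelems = 65536
// <= 2 * 512 * 512, so it is left to the reference kernel.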
bool is_generic_faster_than_ref(
        const memory_desc_t &src_md, const memory_desc_t &dst_md) {
    const dim_t max_1d_ref_nelems = 512;
    const dim_t max_nd_ref_nelems = 512 * 512;
    auto nelems
            = std::max(utils::array_product(src_md.padded_dims, src_md.ndims),
                    utils::array_product(dst_md.padded_dims, dst_md.ndims));
    if (src_md.ndims == 1 && dst_md.ndims == 1)
        return nelems > max_1d_ref_nelems;
    auto src_blks = query_dims_and_blocks(src_md);
    auto dst_blks = query_dims_and_blocks(dst_md);
    if (src_blks.empty() || dst_blks.empty()) return false;
    auto src_inner_idx = src_blks[0].idx;
    auto dst_inner_idx = dst_blks[0].idx;
    auto scale = (src_inner_idx != dst_inner_idx) ? 2 : 1;
    return nelems > scale * max_nd_ref_nelems;
}

using dim_pair_t = std::array<dimension_t, 2>;

// Return whether the two blocks represent an equal part of the same dimension.
bool equal_blocks(const dim_pair_t &a, const dim_pair_t &b) {
    return (a[0].size == b[0].size && a[1].size == b[1].size);
}

// Combine dimension j into dimension i.
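// Example (hypothetical, plain dense case): for a 2x3x4 "abc" tensor with
// strides {12, 4, 1}, combine(md, 1, 2) merges dim c into dim b, yielding a
// 2x12 "ab" tensor with strides {12, 1}; blocked layouts additionally merge or
// renumber the affected inner_blks/inner_idxs entries.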
void combine(memory_desc_t &md, int i, int j) {
    const int new_ndims = md.ndims - 1;
    if (new_ndims == 0) return; // Don't delete the only dimension.
    auto &desc = md.format_desc.blocking;
    auto &strides = desc.strides;
    const int outer = strides[i] < strides[j] ? j : i;
    const int inner = strides[i] < strides[j] ? i : j;

    const auto outer_stride = strides[outer];
    const auto outer_size = md.padded_dims[outer];
    md.offset0 += strides[outer] * md.padded_offsets[outer];
    md.dims[i] = md.dims[outer] * md.padded_dims[inner];
    md.padded_dims[i] = md.padded_dims[i] * md.padded_dims[j];
    md.padded_offsets[i] = md.padded_offsets[inner];
    strides[i] = strides[inner];
    for (int k = j; k < new_ndims; ++k) {
        md.dims[k] = md.dims[k + 1];
        md.padded_dims[k] = md.padded_dims[k + 1];
        md.padded_offsets[k] = md.padded_offsets[k + 1];
        strides[k] = strides[k + 1];
    }
    md.dims[new_ndims] = 0;
    md.padded_dims[new_ndims] = 0;
    md.padded_offsets[new_ndims] = 0;
    strides[new_ndims] = 0;

    auto &idxs = desc.inner_idxs;
    auto &blks = desc.inner_blks;
    int nblks = desc.inner_nblks;
    auto blks_size = utils::array_product(blks, nblks);
    int count = 0;
    bool last_is_combined = false;
    dim_t blocks = 1;
    for (int k = 0; k < nblks; ++k) {
        if (idxs[k] == i || idxs[k] == j) {
            blocks *= blks[k];
            // Combine the innermost dim and outermost block when they have the
            // same index and no extra padding, e.g., ...A8a... -> ...a...
            if (count == 0 && strides[i] == blks_size) {
                md.dims[i] = md.padded_dims[i];
                strides[i] /= blks[k];
                blks_size /= blks[k];
            } else if (last_is_combined) {
                blks[count - 1] *= blks[k];
            } else {
                last_is_combined = true;
                blks[count] = blks[k];
                idxs[count] = i;
                count++;
            }
            continue;
        }
        last_is_combined = false;
        blks[count] = blks[k];
        idxs[count] = (idxs[k] > j ? idxs[k] - 1 : idxs[k]);
        count++;
    }
    // We've changed Nx1x...x1xM to 1x...x1xNM by combining dims, now fix the
    // strides of the size-1 dims by multiplying by the step of the size-N dim.
    auto outer_step = utils::div_up(outer_size, blocks);
    for (int k = 0; k < new_ndims; ++k) {
        if (strides[k] == outer_stride) strides[k] *= outer_step;
    }
    desc.inner_nblks = count;
    md.ndims = new_ndims;
}

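// Drop the given bit from a dimension mask and shift the higher bits down by
// one, mirroring the removal of that dimension from the memory descriptor.
// E.g., remove_bit(mask = 0b1101, bit = 1) leaves mask == 0b111.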
void remove_bit(int &mask, int bit) {
    const int lower_bits = (1 << bit) - 1;
    mask = (mask & lower_bits) | ((mask >> 1) & ~lower_bits);
}

// For each dimension, determine whether the inner dimensions fully account for
// its stride. We cannot combine a dimension whose stride does not align with
// the stride of the next outer dimension; such dimensions are flagged in the
// returned mask.
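// Example (hypothetical): a 2x3x4 "abc" tensor whose dim c is padded so that
// the strides are {24, 8, 1} instead of the dense {12, 4, 1}. Walking from the
// innermost dim, the stride of b (8) exceeds the 4 elements accounted for by
// c, so bit 1 is set and the function returns 0b010.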
int extended_dims(const memory_desc_t &md) {
    int mask = 0;
    const int ndims = md.ndims;
    const auto &blkg = md.format_desc.blocking;
    const int nblks = blkg.inner_nblks;

    auto dims = dims_by_stride(md);
    std::vector<dim_t> blocks(ndims, 1);
    dim_t expected_stride = 1;
    for (int i = 0; i < nblks; ++i) {
        auto idx = blkg.inner_idxs[i];
        auto blks = blkg.inner_blks[i];
        blocks[idx] *= blks;
        expected_stride *= blks;
    }

    for (int i = 0; i < ndims; ++i) {
        const auto &dim = dims[i];
        auto stride = blkg.strides[dim.idx];
        auto step = utils::div_up(dim.size, blocks[dim.idx]);
        if (stride != expected_stride) {
            mask |= (1 << dim.idx);
            expected_stride = stride;
        }
        expected_stride *= step;
    }
    return mask;
}

struct pair_filter_t {
public:
    using value_type = dim_pair_t;

private:
    using const_dim_iterator_t = typename dimensions_t::const_iterator;
    using predicate_t = std::function<bool(const value_type &)>;

public:
    struct iterator_t {
        bool operator==(const iterator_t &o) const { return it == o.it; }
        bool operator!=(const iterator_t &o) const { return it != o.it; }
        value_type operator*() const { return {*it, *(it + 1)}; }
        iterator_t &operator++() {
            advance();
            return *this;
        }
        iterator_t operator++(int) {
            auto cpy = *this;
            advance();
            return cpy;
        }
        iterator_t(const_dim_iterator_t it, const_dim_iterator_t end,
                predicate_t pred)
            : it(it), end(end), pred(std::move(pred)) {
            advance(true);
        }

    private:
        void advance(bool check_first = false) {
            if (it == end || (check_first && pred(operator*()))) return;
            while (++it != end && !pred(operator*())) {}
        }

        const_dim_iterator_t it, end;
        predicate_t pred;
    };

    iterator_t begin() const { return {begin_, end_ - 1, pred}; }
    iterator_t end() const { return {end_ - 1, end_ - 1, pred}; }
    bool empty() const { return begin() == end(); }

    pair_filter_t(const dimensions_t &iter, const predicate_t &pred)
        : begin_(iter.begin()), end_(iter.end()), pred(pred) {}

private:
    const_dim_iterator_t begin_, end_;
    predicate_t pred;
};

#define NO_IDX (-1)
// Find the index of the dimension that always and only follows the dimension
// with index idx. If none exists, return NO_IDX. If no dimension with index
// idx is present in the given block representation, return idx to delete the
// dimension.
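// Example (hypothetical): for a plain 32x32 "ab" tensor the innermost-first
// block list is (32b, 32a), so successor(blocks, 1) == 0; dim a always follows
// dim b, and the two can be merged. For a blocked layout like aBc16b, dim b's
// blocks are interleaved with dim c, so successor(blocks, 1) == NO_IDX.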
int successor(const dimensions_t &a, int idx) {
    int succ;
    auto match_idx = [&](const dim_pair_t &p) { return p[0].idx == idx; };
    auto match_xor = [&](const dim_pair_t &p) {
        return match_idx(p) ^ (p[1].idx == succ);
    };
    // idx is the index of the outermost dim; it has no successor
    if (a.back().idx == idx) return NO_IDX;
    auto filtered = pair_filter_t(a, match_idx);
    // no dim with index idx appears in the block representation; delete it
    if (filtered.empty()) return idx;
    succ = (*filtered.begin())[1].idx;
    // succ is the index of the innermost dim; it has no predecessor
    if (a.front().idx == succ) return NO_IDX;
    if (!pair_filter_t(a, match_xor).empty()) return NO_IDX;
    return succ;
}

// Find the index of the dimension that ALWAYS follows dimension `idx` in the
// given block representations. The successor dimension will be combined with
// the given dimension, or, in the case that the given dimension does not
// appear in the block representation, it will be deleted.
int successor(const dimensions_t &a, const dimensions_t &b, int idx) {
    auto succ = successor(a, idx);
    if (succ == NO_IDX || succ != successor(b, idx)) return NO_IDX;

    auto pred = [&](const dim_pair_t &p) { return p[0].idx == idx; };
    pair_filter_t iter_a(a, pred);
    pair_filter_t iter_b(b, pred);

    auto it_a = iter_a.begin();
    auto it_b = iter_b.begin();
    const auto end_a = iter_a.end();
    const auto end_b = iter_b.end();

    for (; it_a != end_a && it_b != end_b; ++it_a, ++it_b) {
        if (!equal_blocks(*it_a, *it_b)) return NO_IDX;
    }
    return (it_a != end_a || it_b != end_b) ? NO_IDX : succ;
}

bool can_be_combined(int idx, int mask) {
    return !(idx == NO_IDX || (mask & (1 << idx)));
}

void compress(memory_desc_t &a, memory_desc_t &b, int &a_mask, int &b_mask) {
    const auto blks_a = query_dims_and_blocks(a);
    const auto blks_b = query_dims_and_blocks(b);
    const int skip_mask = a_mask | b_mask | extended_dims(a) | extended_dims(b);

    const int ndims = a.ndims;
    std::vector<int> successors(ndims, NO_IDX);
    std::vector<int> aliases(ndims);
    for (int i = 0; i < ndims; ++i) {
        aliases[i] = i;
        if ((a_mask | b_mask) & (1 << i)) continue;
        auto succ = successor(blks_a, blks_b, i);
        if (!can_be_combined(succ, skip_mask)) continue;
        successors[i] = succ;
    }

    for (int i = ndims - 1; i >= 0; --i) {
        int succ = successors[i];
        if (succ == NO_IDX) continue;
        while (succ != aliases[succ])
            succ = aliases[succ];
        int from = std::max(i, succ);
        int into = std::min(i, succ);
        combine(a, into, from);
        combine(b, into, from);
        remove_bit(a_mask, from);
        remove_bit(b_mask, from);
        aliases[from] = into;
    }
}
#undef NO_IDX

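// Set each block's step to the product of the sizes of all same-index entries
// iterated before it (packet entries first, then earlier block entries).
// Illustrative example: with packet = 16b and block = 2b:4a, the 2b entry gets
// step 16 and the 4a entry gets step 1.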
void fix_steps(dimensions_t &blk, dimensions_t pkt) {
    int steps[MAX_NDIMS] = {1, 1, 1, 1, 1, 1};
    for (size_t i = 0; i < pkt.size(); i++) {
        steps[pkt[i].idx] *= pkt[i].size;
    }
    for (size_t i = 0; i < blk.size(); i++) {
        blk[i].step = steps[blk[i].idx];
        steps[blk[i].idx] *= blk[i].size;
    }
}

// Returns the blocks that are present in `all` but missing from `subset`.
dimensions_t find_missing_blocks(
        dimensions_t all, dimensions_t subset, bool round_up) {
    dimensions_t ret;
    for (size_t ia = 0; ia < all.size(); ia++) {
        dimension_t from_a = all[ia];
        for (size_t ib = 0; ib < subset.size(); ib++) {
            if (subset[ib].idx == from_a.idx) {
                auto smaller = std::min(from_a.size, subset[ib].size);
                if (round_up) {
                    from_a.size = utils::div_up(from_a.size, smaller);
                    subset[ib].size = utils::div_up(subset[ib].size, smaller);
                } else {
                    from_a.size /= smaller;
                    subset[ib].size /= smaller;
                }
            }
        }
        if (from_a.size > 1) { ret.push_back(from_a); }
    }
    return ret;
}

enum order_t { none, a_then_b, b_then_a };

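// Returns what is left of `all` after `subset` is taken away, entry by entry;
// assumes `subset` was built as a prefix of `all` (as fill_to_vect does).
// E.g. (hypothetical): for all = 16c:8b:8a and subset = 4c, the result is
// 4c:8b:8a, with the leftover 4c entry's step scaled by 4 to skip past the
// part covered by the subset.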
dimensions_t remainder(dimensions_t all, dimensions_t subset) {
    dimensions_t ret;
    for (size_t i = 0; i < all.size(); i++) {
        if (i < subset.size()) {
            if (all[i].size == subset[i].size) {
                continue;
            } else {
                dimension_t item;
                item.idx = all[i].idx;
                item.size = all[i].size / subset[i].size;
                item.step = all[i].step * subset[i].size;
                ret.push_back(item);
            }
        } else {
            ret.push_back(all[i]);
        }
    }
    return ret;
}

// Given a format description, try to find a formula for 16 adjacent items to
// vectorize across.
// Examples:
// For 1024x1024 ab, it will be (16b)
// For 16x16x16 ABc2a2b, it will be (4c2a2b)
bool fill_to_vect(
        int simd_size, const dimensions_t &all, dimensions_t &subset) {
    const int min_full_vecs = 5; // TODO: tune me
    dim_t current_size = 1;
    subset.clear();
    for (auto &dim : all) {
        dim_t next_size = current_size * dim.size;
        int next_full_vecs = next_size / simd_size;
        if (next_full_vecs >= min_full_vecs || next_size % simd_size == 0) {
            // Vectorize innermost dim(s). If they are not divisible by the
            // simd size, they will need to be padded. And for that the
            // vectorized dim(s) should be large enough, because otherwise the
            // padding would be too significant a fraction of the tensor and
            // it would hurt performance.
            dimension_t tmp = dim;
            tmp.size = simd_size / current_size;
            subset.push_back(tmp);
            return true;
        }
        // No hope of properly filling the vector.
        if (simd_size % next_size != 0) return false;
        current_size = next_size;
        subset.push_back(dim);
    }
    // There was not enough data in the tensor to fill even a single packet.
    return false;
}

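// Append `item` to `v`, merging it into the last entry when both refer to the
// same dimension; e.g., appending 2b to a vector ending in 4b yields a single
// 8b entry. Returns false if a new entry would exceed LOOP_NEST_LEVEL.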
bool add_to_vector(dimensions_t &v, dimension_t item) {
    if (v.empty() || item.idx != v.back().idx) {
        if (v.size() >= LOOP_NEST_LEVEL) { return false; }
        v.push_back(item);
        v.back().size = item.size;
    } else {
        v.back().size *= item.size;
    }
    return true;
}

bool no_more_such_idx(dimensions_t &vect, size_t iter) {
    const int idx_to_search_for = vect[iter].idx;
    for (size_t i = iter + 1; i < vect.size(); i++) {
        if (vect[i].idx == idx_to_search_for) { return false; }
    }
    return true;
}

// Given the full description of a tensor and a subset of that description,
// sort the subset in such a way that it describes the longest possible
// sequence of contiguous memory addresses.
// Example: full 32a32b4c4a, subset 12a2b4c,
// result = 3a2b4c4a; it gives 3 distant sets of 2*4*4 adjacent items.
dimensions_t fix_order_to(dimensions_t input, dimensions_t ref) {
    dimensions_t ret;
    for (size_t i = 0; i < ref.size(); i++) {
        for (size_t j = 0; j < input.size(); j++) {
            if (ref[i].size != 1 && input[j].size != 1
                    && ref[i].idx == input[j].idx) {
                int smaller = std::min(ref[i].size, input[j].size);
                if (no_more_such_idx(ref, i) || j == input.size() - 1) {
                    smaller = input[j].size;
                }
                dimension_t item = ref[i];
                item.size = smaller;
                ref[i].size = utils::div_up(ref[i].size, smaller);
                input[j].size = utils::div_up(input[j].size, smaller);
                add_to_vector(ret, item);
            }
        }
    }
    // It is possible that the requested block on a dimension of src is bigger
    // than the whole dimension in src. That happens when there is large
    // padding in dst. Add this block at the end; it will be handled by padding
    // in the OpenCL code.
    for (size_t i = 0; i < input.size(); i++) {
        if (input[i].size > 1) { add_to_vector(ret, input[i]); }
    }
    return ret;
}

int check_size(dimensions_t block) {
    int length = 1;
    for (size_t i = 0; i < block.size(); i++) {
        length *= block[i].size;
    }
    return length;
}

// Given the full tensor description and a subset of that description, find
// how many items are adjacent in memory.
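// Example (hypothetical): all = 32a32b4c4a, subset = 4c4a -> burst length 16;
// the innermost 4a and 4c blocks are covered by the subset and the burst ends
// at the uncovered 32b dimension.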
size_t check_burst_length(dimensions_t all, dimensions_t subset) {
    size_t length = 1;
    for (size_t i = 0; i < all.size(); i++) {
        for (size_t j = 0; j < subset.size(); j++) {
            if (all[i].idx == subset[j].idx) {
                auto smaller = std::min(all[i].size, subset[j].size);
                length *= (int)smaller;
                all[i].size /= smaller;
                subset[j].size /= smaller;
            }
        }
        // dim not covered in block, so burst ends
        if (all[i].size != 1) { return length; }
    }
    return length;
}

// Given the full tensor description and a subset of that description which
// determines how many items will be read in a burst, try to enlarge the subset
// to increase the burst size and achieve better cache line utilization.
// Example: full 32a32b4c4a, subset 12a2b2c,
// current burst size = 8 (2c*4a); enlarge subset to 12a2b4c to achieve
// burst size = 32 (2b*4c*4a)
bool increase_burst(dimensions_t all, dimensions_t &subset, dimensions_t &other,
        size_t itemlimit, size_t current_size, size_t optimal_size) {
    const dim_t space_coeff = itemlimit / check_size(subset);
    const dim_t request_coeff = utils::div_up(optimal_size, current_size);
    dimensions_t subset_copy = subset;
    if (space_coeff < 2) { return false; }
    for (size_t i = 0; i < all.size(); i++) {
        for (size_t j = 0; j < subset_copy.size(); j++) {
            if (all[i].idx == subset_copy[j].idx) {
                auto smaller = std::min(all[i].size, subset_copy[j].size);
                all[i].size /= smaller;
                subset_copy[j].size /= smaller;
            }
        }
        if (all[i].size != 1) {
            // add a new item to the subset, or enlarge its last item if it
            // refers to the same dim
            auto incr = std::min(space_coeff, all[i].size);
            incr = std::min(incr, request_coeff);
            all[i].size = incr;
            bool success = add_to_vector(subset, all[i]);
            if (!success) { return false; }
            add_to_vector(other, all[i]);
            return true;
        }
    }
    return false;
}

// "packet" - a set of 16 adjacent items to be read in one go by a subgroup
// "block" - how many iterations of packet reads a subgroup should do
// This function splits the tensor description into blocks and packets in a way
// that optimizes burst length.
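// Rough illustration (assuming f32 src/dst, vect = 16, optimal burst = 64 B):
// a single packet already spans 16 * 4 B = 64 B, so the loop below may not
// need to grow the blocks at all, whereas for 1- or 2-byte data types it keeps
// enlarging the blocks until the burst reaches the target or the local memory
// limit is hit.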
bool split_into_blocks_and_packets(size_t vect, size_t optimal_burst_bytes,
        size_t memlimit_bytes, size_t sizeof_src, size_t sizeof_dst,
        const dimensions_t &src, const dimensions_t &dst,
        dimensions_t &src_packet, dimensions_t &src_block,
        dimensions_t &dst_packet, dimensions_t &dst_block) {

    // 1. determine composition of src and dst packet
    if (!fill_to_vect((int)vect, src, src_packet)) { return false; }
    if (!fill_to_vect((int)vect, dst, dst_packet)) { return false; }
    // 2. determine which parts of the tensor format tag are left after taking
    //    away the packet
    dimensions_t sremainder = remainder(src, src_packet);
    dimensions_t dremainder = remainder(dst, dst_packet);
    // 3. The same amount of data will be read and written. So, every dimension
    //    that's in the src packet and not in the dst packet must be in the dst
    //    block, and vice versa.
    src_block = find_missing_blocks(dst_packet, src_packet, true);
    dst_block = find_missing_blocks(src_packet, dst_packet, false);
    // 4a. Check how much contiguous data will be read/written...
    size_t burst_size_src
            = vect * sizeof_src * check_burst_length(sremainder, src_block);
    size_t burst_size_dst
            = vect * sizeof_dst * check_burst_length(dremainder, dst_block);
    bool success = true;
    // TODO: use the smaller of the SRC_T, DST_T types to conserve local memory
    size_t itemlimit = memlimit_bytes / (vect * sizeof_src);
    // 4b. ... and determine if that's long enough to achieve good performance
    while (success
            && (burst_size_src < optimal_burst_bytes
                    || burst_size_dst < optimal_burst_bytes)) {
        // 5. If the burst needs to be longer, attempt to increase the block
        //    size (but don't exceed local memory limits as that would hurt
        //    performance)
        if (burst_size_src < burst_size_dst) {
            success = increase_burst(sremainder, src_block, dst_block,
                    itemlimit, burst_size_src, optimal_burst_bytes);
        } else {
            success = increase_burst(dremainder, dst_block, src_block,
                    itemlimit, burst_size_dst, optimal_burst_bytes);
        }
        burst_size_src
                = vect * sizeof_src * check_burst_length(sremainder, src_block);
        burst_size_dst
                = vect * sizeof_dst * check_burst_length(dremainder, dst_block);
    }
    // 6. At this point the contents of the src and dst blocks are not sorted.
    //    Sort each of them according to the tensor format tag to make the
    //    longest possible contiguous memory accesses.
    src_block = fix_order_to(src_block, sremainder);
    dst_block = fix_order_to(dst_block, dremainder);
    fix_steps(src_block, src_packet);
    fix_steps(dst_block, dst_packet);
    return true;
}

bool fill_conf_vld(const memory_desc_wrapper &src,
        const memory_desc_wrapper &dst, int scale_mask, size_t memlimit_bytes,
        size_t optimal_burst_bytes, vectorize_last_dim_t &cfg, int &vect_dim,
        int &vect_size, dim_t *blocks) {

    const dimensions_t src_dims = query_dims_and_blocks(src);
    const dimensions_t dst_dims = query_dims_and_blocks(dst);
    dimensions_t src_packet, src_block, dst_packet, dst_block;
    bool success = split_into_blocks_and_packets(16, optimal_burst_bytes,
            memlimit_bytes, src.data_type_size(), dst.data_type_size(),
            src_dims, dst_dims, src_packet, src_block, dst_packet, dst_block);
    if (!success) { return false; }
    // Below: unpack std vectors into POD arrays

    cfg.src_vect_limit = (int)check_burst_length(src_packet, src_packet);
    cfg.dst_vect_limit = (int)check_burst_length(dst_packet, dst_packet);

    // reset packet and loop
    for (size_t i = 0; i < LOOP_NEST_LEVEL; i++) {
        cfg.src_vct[i].blk_size = 1;
        cfg.dst_vct[i].blk_size = 1;
        cfg.src_blk[i].blk_size = 1;
        cfg.dst_blk[i].blk_size = 1;
        cfg.src_vct[i].step_size = 1;
        cfg.dst_vct[i].step_size = 1;
        cfg.src_blk[i].step_size = 1;
        cfg.dst_blk[i].step_size = 1;
        cfg.src_vct[i].dim_idx = 0;
        cfg.dst_vct[i].dim_idx = 0;
        cfg.src_blk[i].dim_idx = 0;
        cfg.dst_blk[i].dim_idx = 0;
    }
    cfg.src_vct[0].blk_size = src_packet[0].size;
    cfg.src_vct[0].dim_idx = src_packet[0].idx;
    cfg.dst_vct[0].blk_size = dst_packet[0].size;
    cfg.dst_vct[0].dim_idx = dst_packet[0].idx;
    for (size_t i = 0; i < src_packet.size(); i++) {
        cfg.src_vct[i].dim_idx = src_packet[i].idx;
        cfg.src_vct[i].blk_size = src_packet[i].size;
        cfg.src_vct[i].step_size = src_packet[i].step;
    }
    for (size_t i = 0; i < dst_packet.size(); i++) {
        cfg.dst_vct[i].dim_idx = dst_packet[i].idx;
        cfg.dst_vct[i].blk_size = dst_packet[i].size;
        cfg.dst_vct[i].step_size = dst_packet[i].step;
    }

    // fill src's and dst's loop recipe
    for (size_t i = 0; i < src_block.size(); i++) {
        cfg.src_blk[i].dim_idx = src_block[i].idx;
        cfg.src_blk[i].blk_size = src_block[i].size;
        cfg.src_blk[i].step_size = src_block[i].step;
    }
    for (size_t i = 0; i < dst_block.size(); i++) {
        cfg.dst_blk[i].dim_idx = dst_block[i].idx;
        cfg.dst_blk[i].blk_size = dst_block[i].size;
        cfg.dst_blk[i].step_size = dst_block[i].step;
    }
    cfg.vector_dim = dst_packet[0].idx;
    vect_dim = dst_packet[0].idx;
    vect_size = 16;
    for (int i = 0; i < LOOP_NEST_LEVEL; i++) {
        if (cfg.dst_blk[i].blk_size != 1) {
            blocks[cfg.dst_blk[i].dim_idx] *= cfg.dst_blk[i].blk_size;
        }
    }
    // Multiply the size of the dimension that will be vectorized by 16.
    // This is a workaround for two dispatcher problems:
    // - it doesn't allow vectorization of dims that are not divisible by 16
    // - the vectorized dim's coordinate returned on the OpenCL side is rounded
    //   to a multiple of 16
    // Multiplying the dim-to-be-vectorized by 16 immediately solves the first
    // issue; we declare a larger block on the dim-to-be-vectorized to prevent
    // the dispatcher from spawning too many work items over this enlarged dim;
    // and later, on the OpenCL side, we divide this dim's coordinate by 16 to
    // get fine-grained coordinates not rounded to 16.
    cfg.rescale_coeff = 16;

    for (int i = 0; i < LOOP_NEST_LEVEL; i++) {
        auto db = cfg.dst_vct[i];
        blocks[db.dim_idx] *= db.blk_size;
    }

    return true;
}

status_t generic_reorder_t::pd_t::init_conf(engine_t *engine) {
    using namespace format_tag;

    size_t memlimit_bytes;
    size_t optimal_burst_bytes;

    const memory_desc_wrapper original_src_mdw(src_md());
    const memory_desc_wrapper original_dst_mdw(dst_md());
    quantization_t src_quant(attr(), original_src_mdw, DNNL_ARG_SRC);
    quantization_t dst_quant(attr(), original_dst_mdw, DNNL_ARG_DST);

    auto src_mask = src_quant.scale_mask();
    auto dst_mask = dst_quant.scale_mask();

    memory_desc_t new_a;
    memory_desc_t new_b;
    primitive_attr_t attr_copy = *attr();
    memcpy(&new_a, src_md(), sizeof(new_a));
    memcpy(&new_b, dst_md(), sizeof(new_b));
    compress(new_a, new_b, src_mask, dst_mask);
    if (src_mask) attr_copy.scales_.set(DNNL_ARG_SRC, src_mask);
    if (dst_mask) attr_copy.scales_.set(DNNL_ARG_DST, dst_mask);

    if (!is_generic_faster_than_ref(new_a, new_b)) return status::unimplemented;

    const memory_desc_wrapper src_mdw(new_a);
    const memory_desc_wrapper dst_mdw(new_b);
    conf.src_md_info = memory_desc_info_t::create(src_mdw);
    conf.dst_md_info = memory_desc_info_t::create(dst_mdw);

    conf.src_quant = {&attr_copy, src_mdw, DNNL_ARG_SRC};
    conf.dst_quant = {&attr_copy, dst_mdw, DNNL_ARG_DST};
    conf.sum_quant = {&attr_copy};

    status_t status = status::success;

    const auto &padded_dims = dst_mdw.padded_dims();
    conf.has_padding = !src_mdw.is_dense() || !dst_mdw.is_dense();
    conf.ndims = src_mdw.ndims();
    conf.nelems = utils::array_product(padded_dims, conf.ndims);

    conf.sub_group_size = 1;
    if (conf.nelems == 0) { return status::success; }
    auto *compute_engine = utils::downcast<compute::compute_engine_t *>(engine);

    // Theoretically, bursts should be at least big enough to span a whole
    // cache line, and bigger bursts should give better performance as long as
    // local memory capacity is not exceeded. However, all tests show that a
    // burst size of 64 bytes gives the best performance regardless of cache
    // line size.
    memlimit_bytes = 2048;
    optimal_burst_bytes = 64;

    dim_t blocks[MAX_NDIMS] = {1, 1, 1, 1, 1, 1};
    int vect_size = 1;
    int vect_dim = 0;

    if (!fill_conf_vld(src_mdw, dst_mdw, src_mask | dst_mask, memlimit_bytes,
                optimal_burst_bytes, conf.aux_data.vld, vect_dim, vect_size,
                &blocks[0])) {
        return status::unimplemented;
    }

    conf.sub_group_size = vect_size;

    conf.dispatch = compute_engine->create_dispatch(dst_mdw.md_);

    for (int i = 0; i < MAX_NDIMS; ++i) {
        auto dim_str = utils::format("D%d", i);
        if (i < dst_mdw.ndims()) {
            uint64_t dim = padded_dims[i];
            // Pad the vectorized dim to a multiple of its block size (to make
            // sure that enough work items are generated to have only full
            // subgroups, no fractions), then multiply it by the vector size
            // (to work around the dispatcher's limitation that a vectorized
            // dim must be divisible by the vector size).
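            // E.g. (hypothetical sizes): a padded dim of 20 with block size 8
            // is rounded up to 24 and then multiplied by 16, so the dispatcher
            // sees 384; the kernel divides the coordinate back down by
            // RESCALE_COEFF (16).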
            if (i == vect_dim) {
                dim = utils::rnd_up(dim, blocks[i]);
                dim *= 16;
            }
            conf.dispatch.define_dim(dim_str, i, dim, blocks[i]);
        } else {
            conf.dispatch.define_dim(dim_str, 1);
        }
    }
    if (vect_size != 1) {
        const auto dim_str = utils::format("D%d", vect_dim);
        conf.dispatch.vectorize_dim(dim_str, vect_size);
    }

    conf.dispatch.generate();

    return status;
}

status_t generic_reorder_t::pd_t::init_kernel_ctx(
        compute::kernel_ctx_t &kernel_ctx) const {
    using namespace format_tag;

    const memory_desc_wrapper src_mdw(src_md());
    const memory_desc_wrapper dst_mdw(dst_md());

    if (conf.nelems == 0) return status::success;

    kernel_ctx.define_int("NDIMS", conf.ndims);
    kernel_ctx.add_option("-cl-std=CL2.0");

    conf.src_quant.define_macros(kernel_ctx, "SRC");
    conf.dst_quant.define_macros(kernel_ctx, "DST");
    conf.sum_quant.define_macros(kernel_ctx, "SUM");

    def_dispatch(kernel_ctx, conf.dispatch);

    kernel_ctx.define_int("SUB_GROUP_SIZE", conf.sub_group_size);

    kernel_ctx.define_int("PAD_FILL_ZERO", conf.has_padding);

    def_memory_desc_info(kernel_ctx, conf.src_md_info, "SRC");
    def_memory_desc_info(kernel_ctx, conf.dst_md_info, "DST");

    kernel_ctx.define_int("GENERIC_REORDER", 1);
    kernel_ctx.define_int("VECT_DIM", conf.aux_data.vld.vector_dim);
    kernel_ctx.define_int("VECT_SIZE", conf.sub_group_size);
    kernel_ctx.define_int("RESCALE_COEFF", conf.aux_data.vld.rescale_coeff);
    kernel_ctx.define_int("LIMIT_SSGID", conf.aux_data.vld.src_vect_limit);
    kernel_ctx.define_int("LIMIT_DSGID", conf.aux_data.vld.dst_vect_limit);
    auto r = conf.dispatch.nd_range();
    auto *lr = r.local_range();
    kernel_ctx.define_int(
            "SG_PER_WG", (lr[0] * lr[1] * lr[2]) / conf.sub_group_size);
    int i = 0;
    int cache_dim[MAX_NDIMS] = {1, 1, 1, 1, 1, 1};
    while (i < LOOP_NEST_LEVEL) {
        cache_dim[conf.aux_data.vld.dst_vct[i].dim_idx]
                *= conf.aux_data.vld.dst_vct[i].blk_size;
        cache_dim[conf.aux_data.vld.dst_blk[i].dim_idx]
                *= conf.aux_data.vld.dst_blk[i].blk_size;
        kernel_ctx.define_int(std::string("S_BLK_SIZE_") + std::to_string(i),
                conf.aux_data.vld.src_blk[i].blk_size);
        kernel_ctx.define_int(std::string("S_BLK_STEP_") + std::to_string(i),
                conf.aux_data.vld.src_blk[i].step_size);
        kernel_ctx.define_int(std::string("S_BLK_IDX_") + std::to_string(i),
                conf.aux_data.vld.src_blk[i].dim_idx);
        kernel_ctx.define_int(std::string("D_BLK_SIZE_") + std::to_string(i),
                conf.aux_data.vld.dst_blk[i].blk_size);
        kernel_ctx.define_int(std::string("D_BLK_STEP_") + std::to_string(i),
                conf.aux_data.vld.dst_blk[i].step_size);
        kernel_ctx.define_int(std::string("D_BLK_IDX_") + std::to_string(i),
                conf.aux_data.vld.dst_blk[i].dim_idx);
        i++;
    }
    int cache_stride = 1;
    for (int i = 0; i < MAX_NDIMS; i++) {
        kernel_ctx.define_int(
                std::string("CACHE_STRIDE_") + std::to_string(i), cache_stride);
        cache_stride *= cache_dim[i];
    }
    int s_size_so_far = 1;
    int d_size_so_far = 1;
    for (int i = 0; i < LOOP_NEST_LEVEL; i++) {
        auto s = conf.aux_data.vld.src_vct[i];
        auto d = conf.aux_data.vld.dst_vct[i];
        kernel_ctx.define_int(
                std::string("S_MOD_") + std::to_string(i), s.blk_size);
        kernel_ctx.define_int(
                std::string("S_DIV_") + std::to_string(i), s_size_so_far);
        kernel_ctx.define_int(
                std::string("S_MUL_") + std::to_string(i), s.step_size);
        kernel_ctx.define_int(
                std::string("S_IDX_") + std::to_string(i), s.dim_idx);
        kernel_ctx.define_int(
                std::string("D_MOD_") + std::to_string(i), d.blk_size);
        kernel_ctx.define_int(
                std::string("D_DIV_") + std::to_string(i), d_size_so_far);
        kernel_ctx.define_int(
                std::string("D_MUL_") + std::to_string(i), d.step_size);
        kernel_ctx.define_int(
                std::string("D_IDX_") + std::to_string(i), d.dim_idx);

        s_size_so_far *= s.blk_size;
        d_size_so_far *= d.blk_size;
    }

    kernel_ctx.print_options();
    return status::success;
}

void generic_reorder_t::pd_t::init_scratchpad() {
    if (conf.src_quant.with_scale()) {
        auto scratchpad = scratchpad_registry().registrar();
        scratchpad.book(memory_tracking::names::key_reorder_src_scales,
                conf.src_quant.num_scales(), sizeof(float),
                OCL_BUFFER_ALIGNMENT);
    }
    if (conf.dst_quant.with_scale()) {
        auto scratchpad = scratchpad_registry().registrar();
        scratchpad.book(memory_tracking::names::key_reorder_dst_scales,
                conf.dst_quant.num_scales(), sizeof(float),
                OCL_BUFFER_ALIGNMENT);
    }
}

status_t generic_reorder_t::execute(const exec_ctx_t &ctx) const {

    status_t status = status::success;

    auto &src = CTX_IN_STORAGE(DNNL_ARG_FROM);
    auto &dst = CTX_OUT_STORAGE(DNNL_ARG_TO);
    CHECK(status);

    const auto &conf = pd()->conf;
    if (conf.nelems == 0) { return status::success; }

    compute::kernel_arg_list_t arg_list;
    arg_list.set(0, src);
    arg_list.set(1, dst);

    arg_list.set(2, conf.src_quant.scales(ctx));
    arg_list.set(3, conf.src_quant.zero_points(ctx));
    arg_list.set(4, conf.dst_quant.scales(ctx));
    arg_list.set(5, conf.dst_quant.zero_points(ctx));

    arg_list.set(6, conf.sum_quant.scales());
    arg_list.set(7, conf.sum_quant.zero_points());

    auto nd_range = conf.dispatch.nd_range();

    status = parallel_for(ctx, nd_range, kernel_, arg_list);
    return status;
}

} // namespace ocl
} // namespace gpu
} // namespace impl
} // namespace dnnl