1 | /******************************************************************************* |
2 | * Copyright 2021-2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
17 | /* |
18 | Fast, generic reorder kernel. |
19 | |
The reorder kernel is strictly memory-bound; performance is determined only by
memory access efficiency.
22 | |
There's no performance difference between single-word reads and block read
functions (intel_sub_group_block_read8 etc.). The only thing that matters is
that code reading/writing global memory utilizes whole cache lines. Using
smaller chunks causes cache eviction and requires the same data to be accessed
again later, wasting bandwidth.
28 | |
This kernel tries to load/store data in packets that are at least as large as
a cache line.
31 | |
32 | Example: abc -> bca, 32x32x32, data type f32 |
33 | Assume SIMD16 and cache line size = 64B |
To fill a cache line the kernel must load 16 consecutive items (16c) and
store 16 consecutive items (16a), so it needs to operate on a matrix of
16a x 16c.
It will load 16 non-adjacent (strided by A) sets of 16 adjacent items
(strided by C, src's innermost dimension), perform an internal transposition,
then store 16 non-adjacent (strided by C) sets of 16 adjacent items (strided
by A, dst's innermost dimension).

The difficulty is in determining how to achieve the above for any combination
of tensor sizes and format tags.
44 | */ |
45 | |
46 | #include <algorithm> |
47 | #include "gpu/ocl/generic_reorder.hpp" |
48 | |
49 | #include "common/utils.hpp" |
50 | #include "gpu/ocl/ocl_stream.hpp" |
51 | #include "gpu/ocl/ocl_utils.hpp" |
52 | namespace dnnl { |
53 | namespace impl { |
54 | namespace gpu { |
55 | namespace ocl { |
56 | |
57 | using namespace dnnl::impl::memory_tracking::names; |
58 | |
59 | struct dimension_t { |
60 | dim_t size; |
61 | dim_t step; |
62 | int idx; |
63 | }; |
64 | |
65 | using dimensions_t = std::vector<dimension_t>; |
66 | |
67 | // Return a description of dimensions sorted by stride, i.e., nesting order. |
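// For illustration: a 32x16 tensor with tag "ba" (strides {1, 32}) is returned
// as 32a:16b, i.e., the dim with the smallest stride comes first.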
68 | dimensions_t dims_by_stride(const memory_desc_wrapper &mdw) { |
69 | const auto &desc = mdw.blocking_desc(); |
70 | const auto &strides = desc.strides; |
71 | |
72 | // Sort blocks by stride. |
73 | const auto cmp = [&](const dimension_t &a, const dimension_t &b) { |
74 | // Order by stride. Ties mean that we have at least one dim of size 1. |
        // We don't care about the order of those dims, just that the dim
        // with size > 1 is sorted last.
77 | const auto a_stride = strides[a.idx]; |
78 | const auto b_stride = strides[b.idx]; |
79 | return a_stride < b_stride || (a_stride == b_stride && a.size < b.size); |
80 | }; |
81 | |
82 | const int ndims = mdw.ndims(); |
83 | dimensions_t dims(ndims); |
84 | for (int d = 0; d < ndims; ++d) { |
85 | auto &blk = dims[d]; |
86 | blk.idx = d; |
87 | blk.size = mdw.padded_dims()[d]; |
88 | } |
89 | std::sort(dims.begin(), dims.end(), cmp); |
90 | return dims; |
91 | } |
92 | |
// Returns a description of the blocks and dimensions that constitute the
// format tag of a tensor, starting from the innermost. Blocks, if present,
// take precedence over dimensions. The order of dimensions is determined by
// sorting strides; the smallest stride is the innermost dimension. Dimensions
// of size 1 are ignored. This may lead to illegal tensor tags, where the
// innermost dim is the same as the outermost block:
// 8x8x1x1 aBcd8b becomes cdaB8b (note: "B8b").
// In such cases, this function combines the last dim with the first block(s),
// so xB8b becomes xb. Dimensions are treated like blocks; that is, they don't
// report the whole tensor size along a given axis but rather the number of
// underlying blocks for that dimension.
// Example: ABcd8b8a2b 32x48x1x7 will return a description that amounts to...
// outermost-> 1c:4a:3b:7d:8b:8a:2b <-innermost
106 | dimensions_t query_dims_and_blocks(const memory_desc_wrapper &mdw) { |
107 | auto blocks = dims_by_stride(mdw); |
108 | const int ndims = mdw.ndims(); |
109 | const auto &desc = mdw.blocking_desc(); |
110 | const int nblks = desc.inner_nblks; |
111 | |
112 | // Calculate info for inner blocks |
113 | dimensions_t inner_blks(nblks); |
114 | std::vector<int> steps(ndims, 1); |
115 | dim_t blks_size = 1; |
116 | for (int i = nblks - 1; i >= 0; --i) { |
117 | auto &blk = inner_blks[i]; |
118 | blk.idx = desc.inner_idxs[i]; |
119 | blk.size = desc.inner_blks[i]; |
120 | blk.step = steps[blk.idx]; |
121 | // steps increase in reverse order of how blocks are listed |
122 | steps[blk.idx] *= blk.size; |
123 | blks_size *= blk.size; |
124 | } |
125 | |
126 | // Divide dim by its step to get block size |
127 | for (auto &blk : blocks) { |
128 | blk.step = steps[blk.idx]; |
129 | blk.size = utils::div_up(blk.size, blk.step); |
130 | } |
131 | |
132 | // If we have any dims with block size 1, we ignore them. |
133 | const auto size_1 = [](const dimension_t &b) { return b.size == 1; }; |
134 | const auto end = blocks.end(); |
135 | blocks.erase(std::remove_if(blocks.begin(), end, size_1), end); |
136 | |
137 | dim_t stride = blocks.empty() ? 1 : desc.strides[blocks[0].idx]; |
138 | for (auto &blk : inner_blks) { |
139 | if (blk.size == 1) continue; // Can safely ignore blocks of size 1 |
140 | if (blocks.empty() || blocks[0].idx != blk.idx || blks_size != stride) { |
141 | blocks.insert(blocks.begin(), blk); |
142 | } else { |
143 | // Combine blocks with repeated index if there is no extra padding |
144 | blk.size *= blocks[0].size; |
145 | blocks[0] = blk; |
146 | } |
147 | blks_size /= blk.size; |
148 | stride = blks_size; |
149 | } |
150 | |
151 | if (blocks.empty() && ndims > 0) { |
152 | dimension_t blk; |
153 | blk.idx = 0; |
154 | blk.size = 1; |
155 | blk.step = 1; |
156 | blocks.push_back(blk); |
157 | } |
158 | return blocks; |
159 | } |
160 | |
161 | dimensions_t query_dims_and_blocks(const memory_desc_t &md) { |
162 | const memory_desc_wrapper mdw(md); |
163 | return query_dims_and_blocks(mdw); |
164 | } |
165 | |
166 | bool is_generic_faster_than_ref( |
167 | const memory_desc_t &src_md, const memory_desc_t &dst_md) { |
168 | const dim_t max_1d_ref_nelems = 512; |
169 | const dim_t max_nd_ref_nelems = 512 * 512; |
170 | auto nelems |
171 | = std::max(utils::array_product(src_md.padded_dims, src_md.ndims), |
172 | utils::array_product(dst_md.padded_dims, dst_md.ndims)); |
173 | if (src_md.ndims == 1 && dst_md.ndims == 1) |
174 | return nelems > max_1d_ref_nelems; |
175 | auto src_blks = query_dims_and_blocks(src_md); |
176 | auto dst_blks = query_dims_and_blocks(dst_md); |
177 | if (src_blks.empty() || dst_blks.empty()) return false; |
178 | auto src_inner_idx = src_blks[0].idx; |
179 | auto dst_inner_idx = dst_blks[0].idx; |
180 | auto scale = (src_inner_idx != dst_inner_idx) ? 2 : 1; |
181 | return nelems > scale * max_nd_ref_nelems; |
182 | } |
183 | |
184 | using dim_pair_t = std::array<dimension_t, 2>; |
185 | |
186 | // Return whether the two blocks represent an equal part of the same dimension. |
187 | bool equal_blocks(const dim_pair_t &a, const dim_pair_t &b) { |
188 | return (a[0].size == b[0].size && a[1].size == b[1].size); |
189 | } |
190 | |
191 | // Combine dimension j into dimension i. |
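// For illustration, with a plain layout: combining j = 2 into i = 1 on a 2x3x4
// tensor with tag abc (strides {12, 4, 1}) yields a 2x12 tensor with strides
// {12, 1}.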
192 | void combine(memory_desc_t &md, int i, int j) { |
193 | const int new_ndims = md.ndims - 1; |
194 | if (new_ndims == 0) return; // Don't delete the only dimension. |
195 | auto &desc = md.format_desc.blocking; |
196 | auto &strides = desc.strides; |
197 | const int outer = strides[i] < strides[j] ? j : i; |
198 | const int inner = strides[i] < strides[j] ? i : j; |
199 | |
200 | const auto outer_stride = strides[outer]; |
201 | const auto outer_size = md.padded_dims[outer]; |
202 | md.offset0 += strides[outer] * md.padded_offsets[outer]; |
203 | md.dims[i] = md.dims[outer] * md.padded_dims[inner]; |
204 | md.padded_dims[i] = md.padded_dims[i] * md.padded_dims[j]; |
205 | md.padded_offsets[i] = md.padded_offsets[inner]; |
206 | strides[i] = strides[inner]; |
207 | for (int k = j; k < new_ndims; ++k) { |
208 | md.dims[k] = md.dims[k + 1]; |
209 | md.padded_dims[k] = md.padded_dims[k + 1]; |
210 | md.padded_offsets[k] = md.padded_offsets[k + 1]; |
211 | strides[k] = strides[k + 1]; |
212 | } |
213 | md.dims[new_ndims] = 0; |
214 | md.padded_dims[new_ndims] = 0; |
215 | md.padded_offsets[new_ndims] = 0; |
216 | strides[new_ndims] = 0; |
217 | |
218 | auto &idxs = desc.inner_idxs; |
219 | auto &blks = desc.inner_blks; |
220 | int nblks = desc.inner_nblks; |
221 | auto blks_size = utils::array_product(blks, nblks); |
222 | int count = 0; |
223 | bool last_is_combined = false; |
224 | dim_t blocks = 1; |
225 | for (int k = 0; k < nblks; ++k) { |
226 | if (idxs[k] == i || idxs[k] == j) { |
227 | blocks *= blks[k]; |
228 | // Combine the innermost dim and outermost block when they have the |
229 | // same index and no extra padding, e.g., ...A8a... -> ...a... |
230 | if (count == 0 && strides[i] == blks_size) { |
231 | md.dims[i] = md.padded_dims[i]; |
232 | strides[i] /= blks[k]; |
233 | blks_size /= blks[k]; |
234 | } else if (last_is_combined) { |
235 | blks[count - 1] *= blks[k]; |
236 | } else { |
237 | last_is_combined = true; |
238 | blks[count] = blks[k]; |
239 | idxs[count] = i; |
240 | count++; |
241 | } |
242 | continue; |
243 | } |
244 | last_is_combined = false; |
245 | blks[count] = blks[k]; |
246 | idxs[count] = (idxs[k] > j ? idxs[k] - 1 : idxs[k]); |
247 | count++; |
248 | } |
249 | // We've changed Nx1x...x1xM to 1x...x1xNM by combining dims, now fix the |
250 | // strides of the size-1 dims by multiplying by the step of the size-N dim. |
251 | auto outer_step = utils::div_up(outer_size, blocks); |
252 | for (int k = 0; k < new_ndims; ++k) { |
253 | if (strides[k] == outer_stride) strides[k] *= outer_step; |
254 | } |
255 | desc.inner_nblks = count; |
256 | md.ndims = new_ndims; |
257 | } |
258 | |
259 | void remove_bit(int &mask, int bit) { |
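    // For illustration: mask = 0b10110, bit = 2 -> mask becomes 0b1010
    // (bit 2 is dropped and the higher bits shift down by one).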
260 | const int lower_bits = (1 << bit) - 1; |
261 | mask = (mask & lower_bits) | ((mask >> 1) & ~lower_bits); |
262 | } |
263 | |
// For each dimension, determine whether the inner dimensions fail to account
// for its stride (e.g., due to padding). We cannot combine a dimension that
// does not align with the stride of the next outer dimension.
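// For illustration: dims 2x3x4 with strides {16, 4, 1} and no blocking: the
// stride of dim a (16) is not fully accounted for by the inner dims (3*4 = 12),
// so bit 0 is set in the returned mask.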
267 | int extended_dims(const memory_desc_t &md) { |
268 | int mask = 0; |
269 | const int ndims = md.ndims; |
270 | const auto &blkg = md.format_desc.blocking; |
271 | const int nblks = blkg.inner_nblks; |
272 | |
273 | auto dims = dims_by_stride(md); |
274 | std::vector<dim_t> blocks(ndims, 1); |
275 | dim_t expected_stride = 1; |
276 | for (int i = 0; i < nblks; ++i) { |
277 | auto idx = blkg.inner_idxs[i]; |
278 | auto blks = blkg.inner_blks[i]; |
279 | blocks[idx] *= blks; |
280 | expected_stride *= blks; |
281 | } |
282 | |
283 | for (int i = 0; i < ndims; ++i) { |
284 | const auto &dim = dims[i]; |
285 | auto stride = blkg.strides[dim.idx]; |
286 | auto step = utils::div_up(dim.size, blocks[dim.idx]); |
287 | if (stride != expected_stride) { |
288 | mask |= (1 << dim.idx); |
289 | expected_stride = stride; |
290 | } |
291 | expected_stride *= step; |
292 | } |
293 | return mask; |
294 | } |
295 | |
296 | struct pair_filter_t { |
297 | public: |
298 | using value_type = dim_pair_t; |
299 | |
300 | private: |
301 | using const_dim_iterator_t = typename dimensions_t::const_iterator; |
302 | using predicate_t = std::function<bool(const value_type &)>; |
303 | |
304 | public: |
305 | struct iterator_t { |
306 | bool operator==(const iterator_t &o) const { return it == o.it; } |
307 | bool operator!=(const iterator_t &o) const { return it != o.it; } |
308 | value_type operator*() const { return {*it, *(it + 1)}; } |
309 | iterator_t &operator++() { |
310 | advance(); |
311 | return *this; |
312 | } |
313 | iterator_t operator++(int) { |
314 | auto cpy = *this; |
315 | advance(); |
316 | return cpy; |
317 | } |
318 | iterator_t(const_dim_iterator_t it, const_dim_iterator_t end, |
319 | predicate_t pred) |
320 | : it(it), end(end), pred(std::move(pred)) { |
321 | advance(true); |
322 | } |
323 | |
324 | private: |
325 | void advance(bool check_first = false) { |
326 | if (it == end || (check_first && pred(operator*()))) return; |
327 | while (++it != end && !pred(operator*())) {} |
328 | } |
329 | |
330 | const_dim_iterator_t it, end; |
331 | predicate_t pred; |
332 | }; |
333 | |
334 | iterator_t begin() const { return {begin_, end_ - 1, pred}; } |
335 | iterator_t end() const { return {end_ - 1, end_ - 1, pred}; } |
336 | bool empty() const { return begin() == end(); } |
337 | |
338 | pair_filter_t(const dimensions_t &iter, const predicate_t &pred) |
339 | : begin_(iter.begin()), end_(iter.end()), pred(pred) {} |
340 | |
341 | private: |
342 | const_dim_iterator_t begin_, end_; |
343 | predicate_t pred; |
344 | }; |
345 | |
346 | #define NO_IDX (-1) |
347 | // Find the index of the dimension that always and only follows the dimension |
348 | // with index idx. If none exists, return NO_IDX. If no dimension with index idx |
349 | // is present in the given block representation, return idx to delete the |
// dimension.
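// For illustration, take the representation 8b:8a:4b:2a (listed innermost
// first): dim a always and only follows dim b, so calling successor() with
// b's index returns a's index.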
351 | int successor(const dimensions_t &a, int idx) { |
352 | int succ; |
353 | auto match_idx = [&](const dim_pair_t &p) { return p[0].idx == idx; }; |
354 | auto match_xor = [&](const dim_pair_t &p) { |
355 | return match_idx(p) ^ (p[1].idx == succ); |
356 | }; |
357 | // idx is the index of outermost dim; it has no successor |
358 | if (a.back().idx == idx) return NO_IDX; |
359 | auto filtered = pair_filter_t(a, match_idx); |
360 | // no dim with index idx appears in block representation; delete it |
361 | if (filtered.empty()) return idx; |
362 | succ = (*filtered.begin())[1].idx; |
363 | // succ is the index of the innermost dim; it has no predecessor |
364 | if (a.front().idx == succ) return NO_IDX; |
365 | if (!pair_filter_t(a, match_xor).empty()) return NO_IDX; |
366 | return succ; |
367 | } |
368 | |
369 | // Find the index of the dimension that ALWAYS follows dimension `idx` in the |
370 | // given block representations. The successor dimension will be combined with |
371 | // the given dimension, or, in the case that the given dimension does not appear |
372 | // in the block representation, it will be deleted. |
373 | int successor(const dimensions_t &a, const dimensions_t &b, int idx) { |
374 | auto succ = successor(a, idx); |
375 | if (succ == NO_IDX || succ != successor(b, idx)) return NO_IDX; |
376 | |
377 | auto pred = [&](const dim_pair_t &p) { return p[0].idx == idx; }; |
378 | pair_filter_t iter_a(a, pred); |
379 | pair_filter_t iter_b(b, pred); |
380 | |
381 | auto it_a = iter_a.begin(); |
382 | auto it_b = iter_b.begin(); |
383 | const auto end_a = iter_a.end(); |
384 | const auto end_b = iter_b.end(); |
385 | |
386 | for (; it_a != end_a && it_b != end_b; ++it_a, ++it_b) { |
387 | if (!equal_blocks(*it_a, *it_b)) return NO_IDX; |
388 | } |
389 | return (it_a != end_a || it_b != end_b) ? NO_IDX : succ; |
390 | } |
391 | |
392 | bool can_be_combined(int idx, int mask) { |
393 | return !(idx == NO_IDX || (mask & (1 << idx))); |
394 | } |
395 | |
396 | void compress(memory_desc_t &a, memory_desc_t &b, int &a_mask, int &b_mask) { |
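    // Combine pairs of dimensions that always appear together, in the same
    // order and with matching block sizes, in both layouts; this reduces the
    // number of dimensions the kernel has to handle. Dims covered by the scale
    // masks or flagged by extended_dims() are left untouched.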
397 | const auto blks_a = query_dims_and_blocks(a); |
398 | const auto blks_b = query_dims_and_blocks(b); |
399 | const int skip_mask = a_mask | b_mask | extended_dims(a) | extended_dims(b); |
400 | |
401 | const int ndims = a.ndims; |
402 | std::vector<int> successors(ndims, NO_IDX); |
403 | std::vector<int> aliases(ndims); |
404 | for (int i = 0; i < ndims; ++i) { |
405 | aliases[i] = i; |
406 | if ((a_mask | b_mask) & (1 << i)) continue; |
407 | auto succ = successor(blks_a, blks_b, i); |
408 | if (!can_be_combined(succ, skip_mask)) continue; |
409 | successors[i] = succ; |
410 | } |
411 | |
412 | for (int i = ndims - 1; i >= 0; --i) { |
413 | int succ = successors[i]; |
414 | if (succ == NO_IDX) continue; |
415 | while (succ != aliases[succ]) |
416 | succ = aliases[succ]; |
417 | int from = std::max(i, succ); |
418 | int into = std::min(i, succ); |
419 | combine(a, into, from); |
420 | combine(b, into, from); |
421 | remove_bit(a_mask, from); |
422 | remove_bit(b_mask, from); |
423 | aliases[from] = into; |
424 | } |
425 | } |
426 | #undef NO_IDX |
427 | |
428 | void fix_steps(dimensions_t &blk, dimensions_t pkt) { |
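    // Recompute each block's step: the number of elements of its dim already
    // covered by the packet and by the preceding (inner) blocks.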
429 | int steps[MAX_NDIMS] = {1, 1, 1, 1, 1, 1}; |
430 | for (size_t i = 0; i < pkt.size(); i++) { |
431 | steps[pkt[i].idx] *= pkt[i].size; |
432 | } |
433 | for (size_t i = 0; i < blk.size(); i++) { |
434 | blk[i].step = steps[blk[i].idx]; |
435 | steps[blk[i].idx] *= blk[i].size; |
436 | } |
437 | } |
438 | |
// Returns the blocks that are present in `all` but missing from `subset`.
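// For illustration: all = 8a:2b, subset = 16a -> returns 2b; the 8a part of
// `all` is already covered by the subset while dim b is not covered at all.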
440 | dimensions_t find_missing_blocks( |
441 | dimensions_t all, dimensions_t subset, bool round_up) { |
442 | dimensions_t ret; |
443 | for (size_t ia = 0; ia < all.size(); ia++) { |
444 | dimension_t from_a = all[ia]; |
445 | for (size_t ib = 0; ib < subset.size(); ib++) { |
446 | if (subset[ib].idx == from_a.idx) { |
447 | auto smaller = std::min(from_a.size, subset[ib].size); |
448 | if (round_up) { |
449 | from_a.size = utils::div_up(from_a.size, smaller); |
450 | subset[ib].size = utils::div_up(subset[ib].size, smaller); |
451 | } else { |
452 | from_a.size /= smaller; |
453 | subset[ib].size /= smaller; |
454 | } |
455 | } |
456 | } |
457 | if (from_a.size > 1) { ret.push_back(from_a); } |
458 | } |
459 | return ret; |
460 | } |
461 | |
462 | enum order_t { none, a_then_b, b_then_a }; |
463 | |
464 | dimensions_t remainder(dimensions_t all, dimensions_t subset) { |
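    // For illustration: all = 1024b:1024a (innermost first) and subset = 16b
    // leave a remainder of 64b (with step 16) followed by 1024a.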
465 | dimensions_t ret; |
466 | for (size_t i = 0; i < all.size(); i++) { |
467 | if (i < subset.size()) { |
468 | if (all[i].size == subset[i].size) { |
469 | continue; |
470 | } else { |
471 | dimension_t item; |
472 | item.idx = all[i].idx; |
473 | item.size = all[i].size / subset[i].size; |
474 | item.step = all[i].step * subset[i].size; |
475 | ret.push_back(item); |
476 | } |
477 | } else { |
478 | ret.push_back(all[i]); |
479 | } |
480 | } |
481 | return ret; |
482 | } |
483 | |
// Given a format description, try to find a formula for 16 adjacent items to
// vectorize across.
486 | // Examples: |
487 | // For 1024x1024 ab, it will be (16b) |
488 | // for 16x16x16 ABc2a2b, it will be (4c2a2b) |
489 | bool fill_to_vect( |
490 | int simd_size, const dimensions_t &all, dimensions_t &subset) { |
491 | const int min_full_vecs = 5; // TODO: tune me |
492 | dim_t current_size = 1; |
493 | subset.clear(); |
494 | for (auto &dim : all) { |
495 | dim_t next_size = current_size * dim.size; |
496 | int next_full_vecs = next_size / simd_size; |
497 | if (next_full_vecs >= min_full_vecs || next_size % simd_size == 0) { |
            // Vectorize the innermost dim(s). If they are not divisible by the
            // simd size they will need to be padded, and for that the
            // vectorized dim(s) should be large enough; otherwise the padding
            // would be too significant a fraction of the tensor and would hurt
            // performance.
502 | dimension_t tmp = dim; |
503 | tmp.size = simd_size / current_size; |
504 | subset.push_back(tmp); |
505 | return true; |
506 | } |
507 | // No hope of properly filling the vector. |
508 | if (simd_size % next_size != 0) return false; |
509 | current_size = next_size; |
510 | subset.push_back(dim); |
511 | } |
    // There was not enough data in the tensor to fill even a single packet.
513 | return false; |
514 | } |
515 | |
516 | bool add_to_vector(dimensions_t &v, dimension_t item) { |
517 | if (v.empty() || item.idx != v.back().idx) { |
518 | if (v.size() >= LOOP_NEST_LEVEL) { return false; } |
519 | v.push_back(item); |
520 | v.back().size = item.size; |
521 | } else { |
522 | v.back().size *= item.size; |
523 | } |
524 | return true; |
525 | } |
526 | |
527 | bool no_more_such_idx(dimensions_t &vect, size_t iter) { |
528 | const int idx_to_search_for = vect[iter].idx; |
529 | for (size_t i = iter + 1; i < vect.size(); i++) { |
530 | if (vect[i].idx == idx_to_search_for) { return false; } |
531 | } |
532 | return true; |
533 | } |
534 | |
// Given a full description of a tensor and a subset of that description,
// sort the subset in such a way that it describes the longest possible
// sequence of contiguous memory addresses.
// Example: full 32a32b4c4a, subset 12a2b4c,
// result = 3a2b4c4a, which gives 3 distant sets of 2*4*4 adjacent items
540 | dimensions_t fix_order_to(dimensions_t input, dimensions_t ref) { |
541 | dimensions_t ret; |
542 | for (size_t i = 0; i < ref.size(); i++) { |
543 | for (size_t j = 0; j < input.size(); j++) { |
544 | if (ref[i].size != 1 && input[j].size != 1 |
545 | && ref[i].idx == input[j].idx) { |
546 | int smaller = std::min(ref[i].size, input[j].size); |
547 | if (no_more_such_idx(ref, i) || j == input.size() - 1) { |
548 | smaller = input[j].size; |
549 | } |
550 | dimension_t item = ref[i]; |
551 | item.size = smaller; |
552 | ref[i].size = utils::div_up(ref[i].size, smaller); |
553 | input[j].size = utils::div_up(input[j].size, smaller); |
554 | add_to_vector(ret, item); |
555 | } |
556 | } |
557 | } |
    // It is possible that the requested block on a dimension of src is bigger
    // than the whole dimension in src. That happens when there's large padding
    // in dst. Add this block at the end; it will be handled by padding in the
    // OpenCL code.
561 | for (size_t i = 0; i < input.size(); i++) { |
562 | if (input[i].size > 1) { add_to_vector(ret, input[i]); } |
563 | } |
564 | return ret; |
565 | } |
566 | |
567 | int check_size(dimensions_t block) { |
568 | int length = 1; |
569 | for (size_t i = 0; i < block.size(); i++) { |
570 | length *= block[i].size; |
571 | } |
572 | return length; |
573 | } |
574 | |
// Given a full tensor description and a subset of that description, find
// how many items are adjacent in memory.
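// For illustration: all = 4a:4c:2b (innermost first), subset = 4c:4a -> 16
// adjacent items; dim b is not covered by the subset, so the burst ends there.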
577 | size_t check_burst_length(dimensions_t all, dimensions_t subset) { |
578 | size_t length = 1; |
579 | for (size_t i = 0; i < all.size(); i++) { |
580 | for (size_t j = 0; j < subset.size(); j++) { |
581 | if (all[i].idx == subset[j].idx) { |
582 | auto smaller = std::min(all[i].size, subset[j].size); |
583 | length *= (int)smaller; |
584 | all[i].size /= smaller; |
585 | subset[j].size /= smaller; |
586 | } |
587 | } |
588 | if (all[i].size != 1) { |
589 | return length; |
590 | } // dim not covered in block, so burst ends |
591 | } |
592 | return length; |
593 | } |
594 | |
// Given a full tensor description and a subset of that description which
// determines how many items will be read in a burst, try to enlarge the subset
// to increase the burst size and achieve better cache line utilization.
// Example: full 32a32b4c4a, subset 12a2b2c,
// current burst size = 8 (2c*4a); enlarge subset to 12a2b4c to achieve
// burst size = 32 (2b*4c*4a)
601 | bool increase_burst(dimensions_t all, dimensions_t &subset, dimensions_t &other, |
602 | size_t itemlimit, size_t current_size, size_t optimal_size) { |
603 | const dim_t space_coeff = itemlimit / check_size(subset); |
604 | const dim_t request_coeff = utils::div_up(optimal_size, current_size); |
605 | dimensions_t subset_copy = subset; |
606 | if (space_coeff < 2) { return false; } |
607 | for (size_t i = 0; i < all.size(); i++) { |
608 | for (size_t j = 0; j < subset_copy.size(); j++) { |
609 | if (all[i].idx == subset_copy[j].idx) { |
610 | auto smaller = std::min(all[i].size, subset_copy[j].size); |
611 | all[i].size /= smaller; |
612 | subset_copy[j].size /= smaller; |
613 | } |
614 | } |
615 | if (all[i].size != 1) { |
            // Add a new item to the subset, or enlarge its last item if it
            // has the same dim index.
617 | auto incr = std::min(space_coeff, all[i].size); |
618 | incr = std::min(incr, request_coeff); |
619 | all[i].size = incr; |
620 | bool success = add_to_vector(subset, all[i]); |
621 | if (!success) { return false; } |
622 | add_to_vector(other, all[i]); |
623 | return true; |
624 | } |
625 | } |
626 | return false; |
627 | } |
628 | |
629 | // "packet" - set of 16 adjacent data to be read in one go by a subgroup |
630 | // "block" - how many iterations of packet read should a subgroup do |
631 | // This function splits tensor description into blocks and packets in such way |
632 | // that optimizes burst length. |
633 | bool split_into_blocks_and_packets(size_t vect, size_t optimal_burst_bytes, |
634 | size_t memlimit_bytes, size_t sizeof_src, size_t sizeof_dst, |
635 | const dimensions_t &src, const dimensions_t &dst, |
636 | dimensions_t &src_packet, dimensions_t &src_block, |
637 | dimensions_t &dst_packet, dimensions_t &dst_block) { |
638 | |
639 | // 1. determine composition of src and dst packet |
640 | if (!fill_to_vect((int)vect, src, src_packet)) { return false; } |
641 | if (!fill_to_vect((int)vect, dst, dst_packet)) { return false; } |
    // 2. determine which parts of the tensor format tag are left after taking
    // away the packet
643 | dimensions_t sremainder = remainder(src, src_packet); |
644 | dimensions_t dremainder = remainder(dst, dst_packet); |
    // 3. The same amount of data will be read and written, so every dimension
    // that's in the src packet and not in the dst packet must be in the dst
    // block, and vice versa.
647 | src_block = find_missing_blocks(dst_packet, src_packet, true); |
648 | dst_block = find_missing_blocks(src_packet, dst_packet, false); |
649 | // 4a. Check how much continuous data will be read/written... |
650 | size_t burst_size_src |
651 | = vect * sizeof_src * check_burst_length(sremainder, src_block); |
652 | size_t burst_size_dst |
653 | = vect * sizeof_dst * check_burst_length(dremainder, dst_block); |
654 | bool success = true; |
655 | // TODO: use smaller of SRC_T, DST_T type to conserve local mem |
656 | size_t itemlimit = memlimit_bytes / (vect * sizeof_src); |
657 | // 4b. ... and determine if that's long enough to achieve good performance |
658 | while (success |
659 | && (burst_size_src < optimal_burst_bytes |
660 | || burst_size_dst < optimal_burst_bytes)) { |
661 | // 5. If burst needs to be longer, attempt to increase block size (but |
662 | // don't exceed local memory limits as that would hurt performance) |
663 | if (burst_size_src < burst_size_dst) { |
664 | success = increase_burst(sremainder, src_block, dst_block, |
665 | itemlimit, burst_size_src, optimal_burst_bytes); |
666 | } else { |
667 | success = increase_burst(dremainder, dst_block, src_block, |
668 | itemlimit, burst_size_dst, optimal_burst_bytes); |
669 | } |
670 | burst_size_src |
671 | = vect * sizeof_src * check_burst_length(sremainder, src_block); |
672 | burst_size_dst |
673 | = vect * sizeof_dst * check_burst_length(dremainder, dst_block); |
674 | } |
    // 6. At this point the contents of the src and dst blocks are not sorted.
    // Sort each of them according to the tensor format tag to create the
    // longest possible contiguous memory accesses.
678 | src_block = fix_order_to(src_block, sremainder); |
679 | dst_block = fix_order_to(dst_block, dremainder); |
680 | fix_steps(src_block, src_packet); |
681 | fix_steps(dst_block, dst_packet); |
682 | return true; |
683 | } |
684 | |
685 | bool fill_conf_vld(const memory_desc_wrapper &src, |
686 | const memory_desc_wrapper &dst, int scale_mask, size_t memlimit_bytes, |
687 | size_t optimal_burst_bytes, vectorize_last_dim_t &cfg, int &vect_dim, |
688 | int &vect_size, dim_t *blocks) { |
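    // Translate the packet/block split into the plain-array config consumed
    // by the OpenCL kernel: vectorized (packet) dims go to src_vct/dst_vct,
    // loop (block) dims go to src_blk/dst_blk.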
689 | |
690 | const dimensions_t src_dims = query_dims_and_blocks(src); |
691 | const dimensions_t dst_dims = query_dims_and_blocks(dst); |
692 | dimensions_t src_packet, src_block, dst_packet, dst_block; |
    bool success = split_into_blocks_and_packets(16, optimal_burst_bytes,
            memlimit_bytes, src.data_type_size(), dst.data_type_size(),
695 | src_dims, dst_dims, src_packet, src_block, dst_packet, dst_block); |
696 | if (!success) { return false; } |
697 | // Below: unpack std vectors into POD arrays |
698 | |
699 | cfg.src_vect_limit = (int)check_burst_length(src_packet, src_packet); |
700 | cfg.dst_vect_limit = (int)check_burst_length(dst_packet, dst_packet); |
701 | |
702 | // reset packet and loop |
703 | for (size_t i = 0; i < LOOP_NEST_LEVEL; i++) { |
704 | cfg.src_vct[i].blk_size = 1; |
705 | cfg.dst_vct[i].blk_size = 1; |
706 | cfg.src_blk[i].blk_size = 1; |
707 | cfg.dst_blk[i].blk_size = 1; |
708 | cfg.src_vct[i].step_size = 1; |
709 | cfg.dst_vct[i].step_size = 1; |
710 | cfg.src_blk[i].step_size = 1; |
711 | cfg.dst_blk[i].step_size = 1; |
712 | cfg.src_vct[i].dim_idx = 0; |
713 | cfg.dst_vct[i].dim_idx = 0; |
714 | cfg.src_blk[i].dim_idx = 0; |
715 | cfg.dst_blk[i].dim_idx = 0; |
716 | } |
717 | cfg.src_vct[0].blk_size = src_packet[0].size; |
718 | cfg.src_vct[0].dim_idx = src_packet[0].idx; |
719 | cfg.dst_vct[0].blk_size = dst_packet[0].size; |
720 | cfg.dst_vct[0].dim_idx = dst_packet[0].idx; |
721 | for (size_t i = 0; i < src_packet.size(); i++) { |
722 | cfg.src_vct[i].dim_idx = src_packet[i].idx; |
723 | cfg.src_vct[i].blk_size = src_packet[i].size; |
724 | cfg.src_vct[i].step_size = src_packet[i].step; |
725 | } |
726 | for (size_t i = 0; i < dst_packet.size(); i++) { |
727 | cfg.dst_vct[i].dim_idx = dst_packet[i].idx; |
728 | cfg.dst_vct[i].blk_size = dst_packet[i].size; |
729 | cfg.dst_vct[i].step_size = dst_packet[i].step; |
730 | } |
731 | |
732 | // fill src's and dst's loop recipe |
733 | for (size_t i = 0; i < src_block.size(); i++) { |
734 | cfg.src_blk[i].dim_idx = src_block[i].idx; |
735 | cfg.src_blk[i].blk_size = src_block[i].size; |
736 | cfg.src_blk[i].step_size = src_block[i].step; |
737 | } |
738 | for (size_t i = 0; i < dst_block.size(); i++) { |
739 | cfg.dst_blk[i].dim_idx = dst_block[i].idx; |
740 | cfg.dst_blk[i].blk_size = dst_block[i].size; |
741 | cfg.dst_blk[i].step_size = dst_block[i].step; |
742 | } |
743 | cfg.vector_dim = dst_packet[0].idx; |
744 | vect_dim = dst_packet[0].idx; |
745 | vect_size = 16; |
746 | for (int i = 0; i < LOOP_NEST_LEVEL; i++) { |
747 | if (cfg.dst_blk[i].blk_size != 1) { |
748 | blocks[cfg.dst_blk[i].dim_idx] *= cfg.dst_blk[i].blk_size; |
749 | } |
750 | } |
    // Multiply the size of the dimension that will be vectorized by 16.
    // This is a workaround for two dispatcher problems:
    // - it doesn't allow vectorization of dims that are not divisible by 16
    // - the vectorized dim's coordinate returned on the OpenCL side is rounded
    //   to 16
    // Multiplying the dim-to-be-vectorized by 16 immediately solves the first
    // issue; we declare a larger block on that dim to prevent the dispatcher
    // from spawning too many work items over the enlarged dim; and later, on
    // the OpenCL side, we divide the dim's coordinate by 16 to recover
    // fine-grained coordinates that are not rounded to 16.
760 | cfg.rescale_coeff = 16; |
761 | |
762 | for (int i = 0; i < LOOP_NEST_LEVEL; i++) { |
763 | auto db = cfg.dst_vct[i]; |
764 | blocks[db.dim_idx] *= db.blk_size; |
765 | } |
766 | |
767 | return true; |
768 | } |
769 | |
770 | status_t generic_reorder_t::pd_t::init_conf(engine_t *engine) { |
771 | using namespace format_tag; |
772 | |
773 | size_t memlimit_bytes; |
774 | size_t optimal_burst_bytes; |
775 | |
776 | const memory_desc_wrapper original_src_mdw(src_md()); |
777 | const memory_desc_wrapper original_dst_mdw(dst_md()); |
778 | quantization_t src_quant(attr(), original_src_mdw, DNNL_ARG_SRC); |
779 | quantization_t dst_quant(attr(), original_dst_mdw, DNNL_ARG_DST); |
780 | |
781 | auto src_mask = src_quant.scale_mask(); |
782 | auto dst_mask = dst_quant.scale_mask(); |
783 | |
784 | memory_desc_t new_a; |
785 | memory_desc_t new_b; |
786 | primitive_attr_t attr_copy = *attr(); |
787 | memcpy(&new_a, src_md(), sizeof(new_a)); |
788 | memcpy(&new_b, dst_md(), sizeof(new_b)); |
789 | compress(new_a, new_b, src_mask, dst_mask); |
790 | if (src_mask) attr_copy.scales_.set(DNNL_ARG_SRC, src_mask); |
791 | if (dst_mask) attr_copy.scales_.set(DNNL_ARG_DST, dst_mask); |
792 | |
793 | if (!is_generic_faster_than_ref(new_a, new_b)) return status::unimplemented; |
794 | |
795 | const memory_desc_wrapper src_mdw(new_a); |
796 | const memory_desc_wrapper dst_mdw(new_b); |
797 | conf.src_md_info = memory_desc_info_t::create(src_mdw); |
798 | conf.dst_md_info = memory_desc_info_t::create(dst_mdw); |
799 | |
800 | conf.src_quant = {&attr_copy, src_mdw, DNNL_ARG_SRC}; |
801 | conf.dst_quant = {&attr_copy, dst_mdw, DNNL_ARG_DST}; |
802 | conf.sum_quant = {&attr_copy}; |
803 | |
804 | status_t status = status::success; |
805 | |
806 | const auto &padded_dims = dst_mdw.padded_dims(); |
807 | conf.has_padding = !src_mdw.is_dense() || !dst_mdw.is_dense(); |
808 | conf.ndims = src_mdw.ndims(); |
809 | conf.nelems = utils::array_product(padded_dims, conf.ndims); |
810 | |
811 | conf.sub_group_size = 1; |
812 | if (conf.nelems == 0) { return status::success; } |
813 | auto *compute_engine = utils::downcast<compute::compute_engine_t *>(engine); |
814 | |
    // Theoretically, bursts should be at least big enough to span a whole
    // cache line, and bigger bursts should give better performance as long as
    // local memory capacity is not exceeded. However, all tests show that a
    // burst size of 64 gives the best performance regardless of cache line
    // size.
819 | memlimit_bytes = 2048; |
820 | optimal_burst_bytes = 64; |
821 | |
822 | dim_t blocks[MAX_NDIMS] = {1, 1, 1, 1, 1, 1}; |
823 | int vect_size = 1; |
824 | int vect_dim = 0; |
825 | |
826 | if (!fill_conf_vld(src_mdw, dst_mdw, src_mask | dst_mask, memlimit_bytes, |
827 | optimal_burst_bytes, conf.aux_data.vld, vect_dim, vect_size, |
828 | &blocks[0])) { |
829 | return status::unimplemented; |
830 | } |
831 | |
832 | conf.sub_group_size = vect_size; |
833 | |
834 | conf.dispatch = compute_engine->create_dispatch(dst_mdw.md_); |
835 | |
836 | for (int i = 0; i < MAX_NDIMS; ++i) { |
        auto dim_str = utils::format("D%d", i);
838 | if (i < dst_mdw.ndims()) { |
839 | uint64_t dim = padded_dims[i]; |
            // Pad the vectorized dim to a multiple of the block size (to make
            // sure that enough work items are generated to have only full
            // subgroups, no fractions), then multiply it by the vector size
            // (to work around the dispatcher's limitation that the vectorized
            // dim must be divisible by the vector size).
845 | if (i == vect_dim) { |
846 | dim = utils::rnd_up(dim, blocks[i]); |
847 | dim *= 16; |
848 | } |
849 | conf.dispatch.define_dim(dim_str, i, dim, blocks[i]); |
850 | } else { |
851 | conf.dispatch.define_dim(dim_str, 1); |
852 | } |
853 | } |
854 | if (vect_size != 1) { |
        const auto dim_str = utils::format("D%d", vect_dim);
856 | conf.dispatch.vectorize_dim(dim_str, vect_size); |
857 | } |
858 | |
859 | conf.dispatch.generate(); |
860 | |
861 | return status; |
862 | } |
863 | |
864 | status_t generic_reorder_t::pd_t::init_kernel_ctx( |
865 | compute::kernel_ctx_t &kernel_ctx) const { |
866 | using namespace format_tag; |
867 | |
868 | const memory_desc_wrapper src_mdw(src_md()); |
869 | const memory_desc_wrapper dst_mdw(dst_md()); |
870 | |
871 | if (conf.nelems == 0) return status::success; |
872 | |
    kernel_ctx.define_int("NDIMS", conf.ndims);
    kernel_ctx.add_option("-cl-std=CL2.0");
875 | |
    conf.src_quant.define_macros(kernel_ctx, "SRC");
    conf.dst_quant.define_macros(kernel_ctx, "DST");
    conf.sum_quant.define_macros(kernel_ctx, "SUM");
879 | |
880 | def_dispatch(kernel_ctx, conf.dispatch); |
881 | |
    kernel_ctx.define_int("SUB_GROUP_SIZE", conf.sub_group_size);

    kernel_ctx.define_int("PAD_FILL_ZERO", conf.has_padding);

    def_memory_desc_info(kernel_ctx, conf.src_md_info, "SRC");
    def_memory_desc_info(kernel_ctx, conf.dst_md_info, "DST");

    kernel_ctx.define_int("GENERIC_REORDER", 1);
    kernel_ctx.define_int("VECT_DIM", conf.aux_data.vld.vector_dim);
    kernel_ctx.define_int("VECT_SIZE", conf.sub_group_size);
    kernel_ctx.define_int("RESCALE_COEFF", conf.aux_data.vld.rescale_coeff);
    kernel_ctx.define_int("LIMIT_SSGID", conf.aux_data.vld.src_vect_limit);
    kernel_ctx.define_int("LIMIT_DSGID", conf.aux_data.vld.dst_vect_limit);
895 | auto r = conf.dispatch.nd_range(); |
896 | auto *lr = r.local_range(); |
    kernel_ctx.define_int(
            "SG_PER_WG", (lr[0] * lr[1] * lr[2]) / conf.sub_group_size);
899 | int i = 0; |
900 | int cache_dim[MAX_NDIMS] = {1, 1, 1, 1, 1, 1}; |
901 | while (i < LOOP_NEST_LEVEL) { |
902 | cache_dim[conf.aux_data.vld.dst_vct[i].dim_idx] |
903 | *= conf.aux_data.vld.dst_vct[i].blk_size; |
904 | cache_dim[conf.aux_data.vld.dst_blk[i].dim_idx] |
905 | *= conf.aux_data.vld.dst_blk[i].blk_size; |
        kernel_ctx.define_int(std::string("S_BLK_SIZE_") + std::to_string(i),
                conf.aux_data.vld.src_blk[i].blk_size);
        kernel_ctx.define_int(std::string("S_BLK_STEP_") + std::to_string(i),
                conf.aux_data.vld.src_blk[i].step_size);
        kernel_ctx.define_int(std::string("S_BLK_IDX_") + std::to_string(i),
                conf.aux_data.vld.src_blk[i].dim_idx);
        kernel_ctx.define_int(std::string("D_BLK_SIZE_") + std::to_string(i),
                conf.aux_data.vld.dst_blk[i].blk_size);
        kernel_ctx.define_int(std::string("D_BLK_STEP_") + std::to_string(i),
                conf.aux_data.vld.dst_blk[i].step_size);
        kernel_ctx.define_int(std::string("D_BLK_IDX_") + std::to_string(i),
                conf.aux_data.vld.dst_blk[i].dim_idx);
918 | i++; |
919 | } |
920 | int cache_stride = 1; |
921 | for (int i = 0; i < MAX_NDIMS; i++) { |
922 | kernel_ctx.define_int( |
                std::string("CACHE_STRIDE_") + std::to_string(i), cache_stride);
924 | cache_stride *= cache_dim[i]; |
925 | } |
926 | int s_size_so_far = 1; |
927 | int d_size_so_far = 1; |
928 | for (int i = 0; i < LOOP_NEST_LEVEL; i++) { |
929 | auto s = conf.aux_data.vld.src_vct[i]; |
930 | auto d = conf.aux_data.vld.dst_vct[i]; |
        kernel_ctx.define_int(
                std::string("S_MOD_") + std::to_string(i), s.blk_size);
        kernel_ctx.define_int(
                std::string("S_DIV_") + std::to_string(i), s_size_so_far);
        kernel_ctx.define_int(
                std::string("S_MUL_") + std::to_string(i), s.step_size);
        kernel_ctx.define_int(
                std::string("S_IDX_") + std::to_string(i), s.dim_idx);
        kernel_ctx.define_int(
                std::string("D_MOD_") + std::to_string(i), d.blk_size);
        kernel_ctx.define_int(
                std::string("D_DIV_") + std::to_string(i), d_size_so_far);
        kernel_ctx.define_int(
                std::string("D_MUL_") + std::to_string(i), d.step_size);
        kernel_ctx.define_int(
                std::string("D_IDX_") + std::to_string(i), d.dim_idx);
947 | |
948 | s_size_so_far *= s.blk_size; |
949 | d_size_so_far *= d.blk_size; |
950 | } |
951 | |
952 | kernel_ctx.print_options(); |
953 | return status::success; |
954 | } |
955 | |
956 | void generic_reorder_t::pd_t::init_scratchpad() { |
957 | if (conf.src_quant.with_scale()) { |
958 | auto scratchpad = scratchpad_registry().registrar(); |
959 | scratchpad.book(memory_tracking::names::key_reorder_src_scales, |
960 | conf.src_quant.num_scales(), sizeof(float), |
961 | OCL_BUFFER_ALIGNMENT); |
962 | } |
963 | if (conf.dst_quant.with_scale()) { |
964 | auto scratchpad = scratchpad_registry().registrar(); |
965 | scratchpad.book(memory_tracking::names::key_reorder_dst_scales, |
966 | conf.dst_quant.num_scales(), sizeof(float), |
967 | OCL_BUFFER_ALIGNMENT); |
968 | } |
969 | } |
970 | |
971 | status_t generic_reorder_t::execute(const exec_ctx_t &ctx) const { |
972 | |
973 | status_t status = status::success; |
974 | |
975 | auto &src = CTX_IN_STORAGE(DNNL_ARG_FROM); |
976 | auto &dst = CTX_OUT_STORAGE(DNNL_ARG_TO); |
977 | CHECK(status); |
978 | |
979 | const auto &conf = pd()->conf; |
980 | if (conf.nelems == 0) { return status::success; } |
981 | |
982 | compute::kernel_arg_list_t arg_list; |
983 | arg_list.set(0, src); |
984 | arg_list.set(1, dst); |
985 | |
986 | arg_list.set(2, conf.src_quant.scales(ctx)); |
987 | arg_list.set(3, conf.src_quant.zero_points(ctx)); |
988 | arg_list.set(4, conf.dst_quant.scales(ctx)); |
989 | arg_list.set(5, conf.dst_quant.zero_points(ctx)); |
990 | |
991 | arg_list.set(6, conf.sum_quant.scales()); |
992 | arg_list.set(7, conf.sum_quant.zero_points()); |
993 | |
994 | auto nd_range = conf.dispatch.nd_range(); |
995 | |
996 | status = parallel_for(ctx, nd_range, kernel_, arg_list); |
997 | return status; |
998 | } |
999 | |
1000 | } // namespace ocl |
1001 | } // namespace gpu |
1002 | } // namespace impl |
1003 | } // namespace dnnl |
1004 | |