/*******************************************************************************
* Copyright 2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef GPU_JIT_CODEGEN_REORDER_HPP
#define GPU_JIT_CODEGEN_REORDER_HPP

#include "gpu/jit/codegen/operand.hpp"
#include "gpu/jit/codegen/register_scope.hpp"
#include "gpu/jit/ir/reorder.hpp"
#include "gpu/jit/ir/tensor.hpp"
#include "gpu/jit/ngen/ngen.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace jit {

// Rewrites a single-src instruction to avoid GRF boundary issues.
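// Illustrative example (assuming a 32-byte GRF): a SIMD16 move of :f data
// starting at a non-zero subregister offset can touch three GRFs, which the
// hardware does not allow for a single instruction. The plan splits such an
// op into smaller SIMD chunks (adjusting the source region when needed) so
// that every emitted instruction stays within a two-GRF window.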
struct op_plan_t {
private:
    template <typename... ArgT>
    using op_t = std::function<void(ArgT...)>;

    using inst_mod_t = ngen::InstructionModifier;
    using reg_data_t = ngen::RegData;
    using single_src_op_t = op_t<inst_mod_t, reg_data_t, reg_data_t>;

public:
    op_plan_t(int grf_size) : grf_size_(grf_size) {}

    void operator()(const single_src_op_t &op, inst_mod_t mod, reg_data_t dst,
            reg_data_t src) const {
        // Rewrite a single-src instruction that spans more than 2 GRFs into
        // multiple ops
        auto dst_esize = max_esize(dst, /*is_dst=*/true);
        auto src_esize = max_esize(src, /*is_dst=*/false);
        auto original_esize = mod.getExecSize();
        auto original_width = src.getWidth();
        auto esize = std::min(std::min(dst_esize, src_esize), original_esize);

        mod.setExecSize(esize);
        if (esize < original_width)
            // Width must be at most esize
            set_contiguous_region(src, esize, src.getHS());

        for (int i = 0; i < original_esize; i += esize) {
            fixup(op, mod, dst, src);
            shift_offset(dst, esize);
            shift_offset(src, esize);
        }
    }

private:
    int max_esize(const reg_data_t &reg, bool is_dst) const {
        auto size = reg.getBytes();
        auto width = reg.getWidth();
        auto hs = reg.getHS();
        auto vs = reg.getVS();
        auto remaining_bytes = 2 * grf_size_ - reg.getByteOffset();
        auto stride = hs;
        if (!is_dst && width == 1) stride = vs;
        if (is_dst && stride == 0) stride = 1;
        if (stride == 0) return 16; // Broadcast can have max step
        auto max_step = (remaining_bytes - 1) / (stride * size) + 1;
        return utils::rnd_down_pow2(max_step);
    }

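    // fixup() in a nutshell: when a source row of the region straddles a GRF
    // boundary, the op is rewritten. If rows are contiguous (vs == width * hs)
    // the region is narrowed so that each row ends at the boundary; otherwise
    // the op is re-emitted row by row. (Summary of the logic below.)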
    void fixup(const single_src_op_t &op, inst_mod_t mod, reg_data_t dst,
            reg_data_t src) const {
        // Rewrite src0 to cross GRF boundaries using vertical striding
        auto exec_size = mod.getExecSize();
        auto offset = src.getOffset();
        auto width = src.getWidth();
        auto hs = src.getHS();
        auto vs = src.getVS();
        auto size = src.getBytes();

        if (!width) width = exec_size;
        auto height = exec_size / width;
        auto grf_elems = grf_size_ / size;

        bool crosses_grf_boundary = false;
        auto begin = offset;
        for (int i = 0; i < height; ++i) {
            auto reg_off = begin % grf_elems;
            crosses_grf_boundary
                    |= (reg_off + (width - 1) * hs + 1 > grf_elems);
            begin += vs;
        }

        if (!crosses_grf_boundary) {
            // op is valid
            op(mod, dst, src);
        } else if (vs == width * hs) {
            // rewrite src as a valid access with shorter width and vs
            auto elems_to_grf_boundary = (grf_elems - offset - 1) / hs + 1;
            auto tentative_width = utils::rnd_down_pow2(elems_to_grf_boundary);
            while (tentative_width > 1) {
                if (elems_to_grf_boundary % tentative_width == 0) break;
                tentative_width /= 2;
            }

            set_contiguous_region(src, tentative_width, hs);
            op(mod, dst, src);
        } else {
            // break op into multiple row-wise ops
            mod.setExecSize(width);
            set_contiguous_region(src, width, hs);
            for (int i = 0; i < height; ++i) {
                fixup(op, mod, dst, src);
                shift_offset(dst, width * dst.getHS());
                shift_offset(src, vs);
            }
        }
    }

    void set_contiguous_region(reg_data_t &rr, int width, int hs) const {
        if (width > 1)
            rr.setRegion(width * hs, width, hs);
        else
            // Each element occupies its own row. width = 1 requires hs = 0
            rr.setRegion(hs, 1, 0);
    }

    void shift_offset(reg_data_t &rr, int offset) const {
        auto new_offset = rr.getOffset() + offset;
        auto type_size = rr.getBytes();
        auto grf_elems = grf_size_ / type_size;
        rr.setBase(rr.getBase() + new_offset / grf_elems);
        rr.setOffset(new_offset % grf_elems);
    }

    int grf_size_;
};

// Aligns src offset with dst offset when src is not broadcasted.
template <typename GeneratorT>
void align_src_dst_offset(GeneratorT *host, ngen_register_scope_t &scope,
        const ngen::InstructionModifier &mod, const reg_buf_data_t &dst,
        reg_buf_data_t &src);

template <typename GeneratorT>
bool try_emit_batched_reorder_1d_tile(ngen::HW hw, GeneratorT *host,
        ngen_register_scope_t &scope, int width, const reg_buf_data_t &src,
        int src_stride, const reg_buf_data_t &dst, int dst_stride) {
    ngen::DataType src_type = src.type();
    ngen::DataType dst_type = dst.type();
    int src_type_size = ngen::getBytes(src_type);
    int dst_type_size = ngen::getBytes(dst_type);
    auto large_type = (src_type_size > dst_type_size) ? src_type : dst_type;
    auto small_type = (src_type_size < dst_type_size) ? src_type : dst_type;
    ngen_register_scope_t lex_scope {scope.register_allocator()};

    if (!utils::one_of(large_type, ngen::DataType::f, ngen::DataType::d))
        return false;
    if (!utils::one_of(small_type, ngen::DataType::b, ngen::DataType::ub))
        return false;
    if (src_stride != 1) return false;
    if (dst_stride != 1) return false;
    // XeHPG seems to have a problem with scalar byte writes with saturation;
    // defer to the non-batched implementation's workaround
    if (width == 1) return false;

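    // Strategy: stage the data through a dword-strided temporary so the
    // byte <-> dword/float conversion uses a legal region, then move the
    // converted values into the packed destination in a second pass. `batch`
    // bounds the temporary buffer size and `max_step` the per-instruction
    // SIMD width.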
    int batch = 128;
    int max_step = 8;

    const int grf_size = ngen::GRF::bytes(hw);
    op_plan_t plan = grf_size;
    auto tmp = lex_scope.alloc_reg_buf_data(
            utils::div_up(int(batch * sizeof(uint32_t)), grf_size));
    using inst_mod_t = ngen::InstructionModifier;
    using reg_data_t = ngen::RegData;
    auto mov = [&](inst_mod_t mod, reg_data_t dst, reg_data_t src) {
        host->emov(mod, dst, src);
    };

    for (int i = 0; i < width; i += batch) {
        int i_beg = i;
        int i_end = std::min(width, i + batch);

        for (int ii = i_beg; ii < i_end;) {
            int esize = std::min(max_step, i_end - ii);
            esize = utils::rnd_down_pow2(esize);

            auto s = src.subregister(ii, esize, src_type_size);
            auto t = tmp.subregister((ii - i_beg) * 4, small_type)(4);
            ngen::InstructionModifier mod = esize;
            if (dst_type == small_type) mod |= host->sat;
            plan(mov, mod, t, s(1));
            ii += esize;
        }
        for (int ii = i_beg; ii < i_end;) {
            int esize = std::min(max_step, i_end - ii);
            esize = utils::rnd_down_pow2(esize);

            auto d = dst.subregister(ii, esize, dst_type_size);
            auto t = tmp.subregister((ii - i_beg) * 4, small_type)(4);
            plan(mov, esize, d(1), t);
            ii += esize;
        }
    }
    return true;
}

// Performs 1D reorder, possibly with strides and type conversion.
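// A minimal usage sketch (hypothetical operands, for illustration only):
//
//   emit_reorder_1d_tile(hw, host, scope, /*width=*/16,
//           src_rd, /*src_stride=*/1, dst_rd, /*dst_stride=*/1);
//
// Strides are given in elements of the corresponding type; the helper picks
// SIMD sizes and temporaries as needed.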
template <typename GeneratorT>
void emit_reorder_1d_tile(ngen::HW hw, GeneratorT *host,
        ngen_register_scope_t &scope, int width, const reg_buf_data_t &_src,
        int src_stride, const reg_buf_data_t &_dst, int dst_stride) {

    if (try_emit_batched_reorder_1d_tile(
                hw, host, scope, width, _src, src_stride, _dst, dst_stride))
        return;

    auto src = _src;
    auto dst = _dst;
    ngen::DataType src_type = src.type();
    ngen::DataType dst_type = dst.type();
    // Replace (float -> float) by (int -> int) as word/dword moves have fewer
    // restrictions.
    if (src_type == dst_type
            && utils::one_of(src_type, ngen::DataType::bf, ngen::DataType::hf,
                    ngen::DataType::f, ngen::DataType::df)) {
        int factor = (src_type == ngen::DataType::df ? 2 : 1);
        if (factor == 1 || (src_stride == 1 && dst_stride == 1)) {
            src_type
                    = to_ngen(type_t::u(ngen::getBytes(src_type) / factor * 8));
            dst_type = src_type;
            width *= factor;
            src = src.reinterpret(src_type);
            dst = dst.reinterpret(dst_type);
        }
    }

    const int grf_size = ngen::GRF::bytes(hw);
    int src_type_size = ngen::getBytes(src_type);
    int dst_type_size = ngen::getBytes(dst_type);
    int src_stride_bytes = src_stride * src_type_size;
    int dst_stride_bytes = dst_stride * dst_type_size;
    bool dst_b = ngen_is_b(dst_type);
    bool dst_d = ngen_is_dw(dst_type);
    bool dst_q = ngen_is_qw(dst_type);
    bool dst_f = (dst_type == ngen::DataType::f);
    bool dst_hf = (dst_type == ngen::DataType::hf);
    bool dst_bf = (dst_type == ngen::DataType::bf);
    bool dst_df = (dst_type == ngen::DataType::df);
    bool dst_xf = dst_bf || dst_f || dst_hf || dst_df;
    bool src_b = ngen_is_b(src_type);
    bool src_d = ngen_is_dw(src_type);
    bool src_q = ngen_is_qw(src_type);
    bool src_f = (src_type == ngen::DataType::f);
    bool src_hf = (src_type == ngen::DataType::hf);
    bool src_bf = (src_type == ngen::DataType::bf);
    bool src_df = (src_type == ngen::DataType::df);
    bool src_xf = src_bf || src_f || src_hf || src_df;
    bool f_to_xf = (src_f && (dst_bf || dst_hf));
    op_plan_t plan = grf_size;
    ngen_register_scope_t lex_scope {scope.register_allocator()};

    auto get_step = [&]() {
        int step = (width < 16 ? 8 : 16);

        // f32 -> bf16 or f32 -> f16: SIMD16 does not support mixed mode move.
        if (hw < ngen::HW::XeHPC)
            if (f_to_xf) step = 8;

        if (src_df || dst_df) step = 8;

        // Max supported stride is 4.
        if (src_stride > 4 || dst_stride > 4) step = 1;

        // Qword does not appear to support swizzling.
        if (src_q && dst_q && src_stride != dst_stride) step = 1;

        return step;
    };

    using inst_mod_t = ngen::InstructionModifier;
    using reg_data_t = ngen::RegData;
    auto shl16 = [&](inst_mod_t mod, reg_data_t dst, reg_data_t src) {
        host->eshl(mod, dst, src, 16);
    };
    auto mov = [&](inst_mod_t mod, reg_data_t dst, reg_data_t src) {
        host->emov(mod, dst, src);
    };

    // bf16 -> f32:
    // - bf16 must be packed: use left shift instead.
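    //   (bf16 is the upper 16 bits of the corresponding f32 value, so shifting
    //   the raw bits left by 16 into a dword destination reproduces the f32
    //   encoding.)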
    if (src_bf && dst_f) {
        int step = get_step();
        for (int i = 0; i < width; i += step) {
            step = std::min(step, width - i);
            step = utils::rnd_down_pow2(step);
            int esize = step;
            auto s = src.subregister(
                    i, esize, src_stride_bytes, ngen::DataType::uw);
            auto d = dst.subregister(
                    i, esize, dst_stride_bytes, ngen::DataType::ud);
            plan(shl16, esize, d(dst_stride), s(src_stride));
        }
        return;
    }

    // d -> bf/hf:
    // - Use d -> f -> bf/hf conversion with temporary
    if (src_d && (dst_bf || dst_hf)) {
        const int nregs = utils::div_up(width * (int)sizeof(float), grf_size);
        auto tmp = lex_scope.alloc_reg_buf_data(nregs).format(
                0, ngen::DataType::f);
        emit_reorder_1d_tile(hw, host, scope, width, src, src_stride, tmp, 1);
        emit_reorder_1d_tile(hw, host, scope, width, tmp, 1, dst, dst_stride);
        return;
    }

    // b -> hf
    // - Direct b -> float conversion not supported: use s16 temporary
    // - int -> hf must be DW-aligned & strided: use f temporary
    // - Use b -> w -> f -> hf
    if (src_b && dst_hf) {
        ir_assert(utils::one_of(dst_stride_bytes, 2, 4));
        ir_assert(utils::one_of(src_stride_bytes, 1, 4));
        int step = get_step();
        const int align_boundary = grf_size / 2;
        const int step_size = step * (int)sizeof(uint32_t);
        const int nregs = utils::div_up(step_size, grf_size);
        auto tmp1 = lex_scope.alloc_reg_buf_data(nregs);
        auto tmp2 = lex_scope.alloc_reg_buf_data(nregs);
        for (int i = 0; i < width; i += step) {
            step = std::min(step, width - i);
            step = utils::rnd_down_pow2(step);
            int esize = step;

            auto s = src.subregister(i, esize, src_stride_bytes);
            auto d = dst.subregister(i, esize, dst_stride_bytes);
            auto byte_offset = 2 * (d.getByteOffset() % align_boundary);
            auto t1 = tmp1.subregister(byte_offset, ngen::DataType::w);
            auto t2 = tmp2.subregister(byte_offset, ngen::DataType::f);
            auto t1_as_hf = t1.reinterpret(0, ngen::DataType::hf);
            auto d_as_w = d.reinterpret(0, ngen::DataType::w);

            plan(mov, esize, t1(2), s(src_stride));
            plan(mov, esize, t2(1), t1(2));
            plan(mov, esize, t1_as_hf(2), t2(1));
            plan(mov, esize, d_as_w(dst_stride), t1(2));
        }
        return;
    }

    // hf -> b
    if (src_hf && dst_b) {
        ir_assert(utils::one_of(src_stride_bytes, 2, 4));
        ir_assert(utils::one_of(dst_stride_bytes, 1, 4));
        int step = get_step();
        const int tmp_stride = 4;
        const int tmp_stride_bytes = tmp_stride * dst_type_size;
        const int step_size = step * tmp_stride_bytes;
        const int nregs = 1 + utils::div_up(step_size, grf_size);
        auto tmp1 = lex_scope.alloc_reg_buf_data(nregs);
        auto tmp2 = lex_scope.alloc_reg_buf_data(nregs);
        for (int i = 0; i < width; i += step) {
            step = std::min(step, width - i);
            step = utils::rnd_down_pow2(step);
            int esize = step;

            auto s = src.subregister(i, esize, src_stride_bytes);
            auto d = dst.subregister(i, esize, dst_stride_bytes);
            const int t1_offset = s.getByteOffset();
            const int t2_offset = (d.getOffset() % 16) * tmp_stride_bytes;
            auto t1 = tmp1.subregister(t1_offset, dst_type);
            auto t2 = tmp2.subregister(t2_offset, dst_type);

            if (dst_stride_bytes >= tmp_stride_bytes && esize > 1) {
                plan(mov, esize | host->sat, d(dst_stride), s(src_stride));
                continue;
            }
            auto wa_esize = std::max(2, esize);
            plan(mov, wa_esize | host->sat, t1(tmp_stride), s(src_stride));
            if (t1_offset != t2_offset)
                plan(mov, esize, t2(tmp_stride), t1(tmp_stride));
            else
                std::swap(t1, t2);
            plan(mov, esize, d(dst_stride), t2(tmp_stride));
        }
        return;
    }

    // f -> df
    // - f/df mixed operands must be qword aligned
    // - f -> f striding: use s32
    if (src_f && dst_df) {
        int step = get_step();
        const auto tmp_type = src_type;
        const int tmp_stride = 2;
        const int tmp_stride_bytes = tmp_stride * src_type_size;
        const int reg_size = dst.byte_offset() + width * tmp_stride_bytes;
        const int nregs = utils::div_up(reg_size, grf_size);
        auto tmp = lex_scope.alloc_reg_buf_data(nregs);
        for (int i = 0; i < width; i += step) {
            step = std::min(step, width - i);
            step = utils::rnd_down_pow2(step);
            int esize = step;

            auto s = src.subregister(i, esize, src_stride_bytes);
            auto d = dst.subregister(i, esize, dst_stride_bytes);
            auto t = tmp.subregister(d.getByteOffset(), tmp_type);
            plan(mov, esize, t.d()(tmp_stride), s.d()(src_stride));
            plan(mov, esize, d(dst_stride), t(tmp_stride));
        }
        return;
    }

    // df -> f
    // - f/df mixed operands must be qword aligned
    // - f -> f packing: use s32
    if (dst_f && src_df) {
        int step = get_step();
        const auto tmp_type = dst_type;
        const int tmp_stride = 2;
        const int tmp_stride_bytes = tmp_stride * src_type_size;
        const int reg_size = dst.byte_offset() + width * tmp_stride_bytes;
        const int nregs = utils::div_up(reg_size, grf_size);
        auto tmp = lex_scope.alloc_reg_buf_data(nregs);
        for (int i = 0; i < width; i += step) {
            step = std::min(step, width - i);
            step = utils::rnd_down_pow2(step);
            int esize = step;

            auto s = src.subregister(i, esize, src_stride_bytes);
            auto d = dst.subregister(i, esize, dst_stride_bytes);
            auto t = tmp.subregister(s.getByteOffset(), tmp_type);
            plan(mov, esize, t(tmp_stride), s(src_stride));
            // df -> f uses the float pipe. Override nGEN's choice of the long
            // pipe.
            auto mod_with_pipe = esize | ngen::SWSB<float>(1);
            plan(mov, mod_with_pipe, d.d()(dst_stride), t.d()(tmp_stride));
        }
        return;
    }

    // f -> hf
    if (src_f && dst_hf) {
        int step = get_step();
        const auto tmp_type = dst_type;
        const int tmp_stride = 2;
        const int reg_size = step * tmp_stride * dst_type_size;
        const int nregs = utils::div_up(reg_size, grf_size);
        auto tmp = lex_scope.alloc_reg_buf_data(nregs);
        for (int i = 0; i < width; i += step) {
            step = std::min(step, width - i);
            step = utils::rnd_down_pow2(step);
            int esize = step;

            auto s = src.subregister(i, esize, src_stride_bytes);
            auto d = dst.subregister(i, esize, dst_stride_bytes);

            const auto align_boundary = grf_size / 2;
            bool aligned = d.getByteOffset() % align_boundary == 0
                    && s.getByteOffset() == 0;
            if (esize > 1 && dst_stride == 1 && !aligned) {
                auto t = tmp.subregister(s.getByteOffset(), tmp_type);
                plan(mov, esize, t(tmp_stride), s(src_stride));
                plan(mov, esize, d.w()(dst_stride), t.w()(tmp_stride));
                continue;
            }
            plan(mov, esize, d(dst_stride), s(src_stride));
        }
        return;
    }

    // hf -> f
    if (dst_f && src_hf) {
        int step = get_step();
        const auto tmp_type = src_type;
        const int tmp_stride = 2;
        const int reg_size = step * tmp_stride * src_type_size;
        const int nregs = utils::div_up(reg_size, grf_size);
        auto tmp = lex_scope.alloc_reg_buf_data(nregs);
        for (int i = 0; i < width; i += step) {
            step = std::min(step, width - i);
            step = utils::rnd_down_pow2(step);
            int esize = step;

            auto s = src.subregister(i, esize, src_stride_bytes);
            auto d = dst.subregister(i, esize, dst_stride_bytes);

            const auto align_boundary = grf_size / 2;
            bool aligned = s.getByteOffset() % align_boundary == 0
                    && d.getByteOffset() == 0;
            if (esize > 1 && src_stride == 1 && !aligned) {
                auto t = tmp.subregister(d.getByteOffset(), tmp_type);
                plan(mov, esize, t.w()(tmp_stride), s.w()(src_stride));
                plan(mov, esize, d(dst_stride), t(tmp_stride));
                continue;
            }
            plan(mov, esize, d(dst_stride), s(src_stride));
        }
        return;
    }

    // f32/f16/s32 -> s8/u8 and s8/u8 -> f32/s32
    // - Use saturation
    // - s8/u8 must be DW-strided: use temporary
    bool d_or_f_to_b = (src_d || src_f) && dst_b;
    bool b_to_d_or_f = (dst_d || dst_f) && src_b;
    bool hf_to_b = src_hf && dst_b;
    if (d_or_f_to_b || b_to_d_or_f || hf_to_b) {
        if (dst_d || dst_f) ir_assert(dst_stride_bytes == 4);
        if (src_d || src_f) ir_assert(src_stride_bytes == 4);
        if (src_hf) ir_assert(utils::one_of(src_stride_bytes, 2, 4));
        if (dst_b) ir_assert(utils::one_of(dst_stride_bytes, 1, 4));
        if (src_b) ir_assert(utils::one_of(src_stride_bytes, 1, 4));
        int step = get_step();
        const int step_size = step * (int)sizeof(uint32_t);
        const int nregs = 1 + utils::div_up(step_size, grf_size);
        auto tmp = lex_scope.alloc_reg_buf_data(nregs);
        for (int i = 0; i < width; i += step) {
            step = std::min(step, width - i);
            step = utils::rnd_down_pow2(step);
            int esize = step;

            auto s = src.subregister(i, esize, src_stride_bytes);
            auto d = dst.subregister(i, esize, dst_stride_bytes);
            if (src_d || src_f || src_hf) {
                // d -> b.
                if (dst_stride_bytes == 1 || esize == 1) {
                    auto offset_bytes = src_f ? s.getByteOffset()
                                              : 4 * (d.getByteOffset() % 16);
                    auto t = tmp.subregister(offset_bytes, dst_type)(2);
                    plan(mov, std::max(2, esize) | host->sat, t, s);
                    plan(mov, esize, d(dst_stride), t);
                } else {
                    plan(mov, esize | host->sat, d(dst_stride), s(src_stride));
                }
            } else {
                if (esize == 1) {
                    // Direct x8 -> x32 scalar cast is not always
                    // supported. Use intermediate cast to s16.
                    auto t = tmp.subregister(0, ngen::DataType::w)(1);
                    plan(mov, esize, t, s(src_stride));
                    plan(mov, esize, d(dst_stride), t);
                } else if (src_b) {
                    auto offset_bytes = dst_f ? d.getByteOffset() : 0;
                    auto t = tmp.subregister(offset_bytes, src_type)(4);
                    plan(mov, esize, t, s(src_stride));
                    plan(mov, esize, d(dst_stride), t);
                } else {
                    plan(mov, esize, d(dst_stride), s(src_stride));
                }
            }
        }
        return;
    }

    // Handle mov(dst.uw(y)(1), src.uw(x)(2)).
    if (src_type_size == 2 && dst_type_size == 2 && src_stride == 2
            && dst_stride == 1 && width > 1) {
        int step = get_step();
        auto step_size = step * src_type_size * src_stride;
        auto nregs = utils::div_up(step_size, grf_size);
        auto tmp = lex_scope.alloc_reg_buf_data(nregs);
        for (int i = 0; i < width; i += step) {
            step = std::min(step, width - i);
            step = utils::rnd_down_pow2(step);
            int esize = step;
            auto s = src.format(i * src_stride_bytes, ngen::DataType::invalid,
                    esize, src_stride);
            auto d = dst.format(i * dst_stride_bytes, ngen::DataType::invalid,
                    esize, dst_stride);
            auto d_old = d;
            bool d_half_grf_aligned
                    = utils::one_of(d.byte_offset(), 0, grf_size / 2);
            if (!d_half_grf_aligned) {
                d = scope.alloc_reg_data(to_ir(dst_type).with_elems(esize));
            }
            if (s.offset() != 0) {
                auto t = tmp.format(0, src_type, esize, src_stride);
                plan(mov, esize, t, s);
                s = t;
            }
            plan(mov, esize, d, s);
            if (!d_half_grf_aligned) plan(mov, esize, d_old, d);
        }
        return;
    }

    // Perform FP to FP move.
    // The float pipe has some register regioning limitations. If the mov is
    // not allowed, fix the regioning by switching to the integer pipe, which
    // has fewer limitations.
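    // Approach below: when the FP regioning would be illegal, reinterpret the
    // source as a same-width unsigned integer type and copy it into a
    // temporary whose offset matches the destination (align_src_dst_offset),
    // then perform the FP move from there.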
    if (src_xf || dst_xf) {
        int step = get_step();
        for (int i = 0; i < width; i += step) {
            step = std::min(step, width - i);
            step = utils::rnd_down_pow2(step);
            int esize = step;
            ir_assert(math::is_pow2(esize));
            auto s = src.format(i * src_stride_bytes, ngen::DataType::invalid,
                    esize, src_stride);
            auto d = dst.format(i * dst_stride_bytes, ngen::DataType::invalid,
                    esize, dst_stride);
            auto d_old = d;

            bool do_d0_align = false;
            if (esize > 1 && dst_bf) {
                bool d_0_aligned = (d.byte_offset() == 0);
                bool d_half_grf_aligned = (d.byte_offset() == grf_size / 2);
                if (!d_0_aligned && (!d_half_grf_aligned || dst_stride != 1)) {
                    do_d0_align = true;
                }
            }
            if (do_d0_align) {
                d = lex_scope.alloc_reg_data(to_ir(dst_type).with_elems(esize));
            }

            bool do_align = false;
            if (esize > 1 && s.hs() != 0) {
                if ((src_f && dst_hf) || (dst_f && src_hf)) {
                    if (s.byte_offset() != d.byte_offset()) do_align = true;
                } else if (s.offset() != d.offset())
                    do_align = true;
            }
            if (do_align) {
                bool s_half_grf_aligned = (src_hf || src_bf)
                        && utils::one_of(s.byte_offset(), 0, grf_size / 2);
                bool d_half_grf_aligned = (dst_hf || dst_bf)
                        && utils::one_of(d.byte_offset(), 0, grf_size / 2);
                if (dst_f && d.offset() == 0 && s_half_grf_aligned)
                    do_align = false;
                if (src_f && s.offset() == 0 && d_half_grf_aligned)
                    do_align = false;
            }

            if (do_align) {
                auto i_type = to_ngen(type_t::u(ngen::getBytes(src_type) * 8));
                s = s.reinterpret(i_type);
                align_src_dst_offset(host, scope, esize, d, s);
                s = s.reinterpret(src_type);
            }
            plan(mov, esize, d, s);

            if (do_d0_align) {
                auto i_type = to_ngen(type_t::u(ngen::getBytes(dst_type) * 8));
                auto d_int = d_old.reinterpret(i_type);
                auto s_int = d.reinterpret(i_type);
                plan(mov, esize, d_int, s_int);
            }
        }
        return;
    }

    if (src_b && dst_b) {
        const int tmp_stride = 4;
        const int tmp_stride_bytes = tmp_stride * dst_type_size;
        // Any byte conversion requires saturation:
        // - ub -> b loses 1 bit of precision
        // - b -> ub loses sign bit
        const bool needs_saturation = src_type != dst_type;

        int step = get_step();
        const int nregs = 1 + utils::div_up(step * tmp_stride_bytes, grf_size);
        auto tmp = lex_scope.alloc_reg_buf_data(nregs);
        for (int i = 0; i < width; i += step) {
            step = std::min(step, width - i);
            step = utils::rnd_down_pow2(step);
            int esize = step;
            auto s = src.format(i * src_stride_bytes, ngen::DataType::invalid,
                    esize, src_stride);
            auto d = dst.format(i * dst_stride_bytes, ngen::DataType::invalid,
                    esize, dst_stride);
            ngen::InstructionModifier mod = esize;
            if (needs_saturation) mod |= host->sat;

            bool aligned = true;
            const bool needs_raw_mov = dst_stride == 1 || esize == 1;
            if ((src_stride == 1 || esize == 1) && needs_raw_mov) {
                // Note: This case does not appear to be documented.
                // Experiments seem to indicate that a packed byte-to-packed
                // byte move must be word-aligned on the destination.
                aligned = d.offset() % 2 == 0;
            } else if (dst_stride <= 2 && src_stride >= 2 * dst_stride) {
                const int rel_stride = src_stride / dst_stride;
                const int alignment_bdy = grf_size / rel_stride;
                const int dst_aligned_offset = d.offset() % alignment_bdy;
                const int src_aligned_offset = s.offset() / rel_stride;
                aligned = dst_aligned_offset == src_aligned_offset;
            }

            // Workaround for scalar byte conversion:
            // - Broadcast to two locations with stride and conversion
            // - Move one copy to the destination
            if (needs_saturation && esize == 1) mod.setExecSize(2);

            if (!aligned || (needs_saturation && needs_raw_mov)) {
                const int tmp_rel_stride = tmp_stride / dst_stride;
                const int tmp_alignment_bdy = grf_size / tmp_rel_stride;
                const int tmp_aligned_offset = d.offset() % tmp_alignment_bdy;
                const int tmp_offset = tmp_rel_stride * tmp_aligned_offset;
                const int allowed_bytes = 2 * grf_size - tmp_offset;

                if ((mod.getExecSize() - 1) * tmp_stride + 1 > allowed_bytes) {
                    // Workaround for cases where the temporary is not GRF
                    // aligned and esize == 16 on XeHPG and below
                    auto max_width = (allowed_bytes - 1) / tmp_stride + 1;
                    auto tmp_esize = utils::rnd_down_pow2(max_width);
                    mod.setExecSize(tmp_esize);
                    esize = tmp_esize;
                    step = tmp_esize;
                }

                auto t = tmp.format(
                        tmp_offset, dst_type, mod.getExecSize(), tmp_stride);
                plan(mov, mod, t, s);
                mod = esize;
                s = tmp.format(tmp_offset, dst_type, esize, tmp_stride);
            }
            plan(mov, mod, d, s);
        }
        return;
    }

    // Perform regular move.
    int step = get_step();
    for (int i = 0; i < width; i += step) {
        step = std::min(step, width - i);
        step = utils::rnd_down_pow2(step);
        int esize = step;
        ir_assert(math::is_pow2(esize));
        auto s = src.format(i * src_stride_bytes, ngen::DataType::invalid,
                esize, src_stride);
        auto d = dst.format(i * dst_stride_bytes, ngen::DataType::invalid,
                esize, dst_stride);
        plan(mov, esize, d, s);
    }
}

template <typename GeneratorT>
void align_src_dst_offset(GeneratorT *host, ngen_register_scope_t &scope,
        const ngen::InstructionModifier &mod, const reg_buf_data_t &dst,
        reg_buf_data_t &src) {
    int src_stride = src.hs();
    // src is broadcast; no need to align, return.
    if (src_stride == 0) return;

    bool is_xf = ngen_is_xf(src.type()) || ngen_is_xf(dst.type());
    bool is_bf_to_f = (src.type() == ngen::DataType::bf)
            && (dst.type() == ngen::DataType::f);
    int src_type_size = ngen::getBytes(src.type());
    int src_off = src.offset();
    int dst_off = dst.offset();
    int src_byte_off = src.byte_offset();
    int dst_byte_off = dst.byte_offset();

    // If src is aligned with dst, return.
    if ((is_xf || is_bf_to_f) && src_off == dst_off) return;
    if (!is_xf && src_byte_off == dst_byte_off) return;

    int new_src_byte_off = (is_xf ? dst_off * src_type_size : dst_byte_off);

    int esize = mod.getExecSize();
    int grf_size = ngen::GRF::bytes(scope.hw());
    int src_size = std::max(src_type_size * esize * src_stride, src_type_size);

    auto new_src = scope.alloc_reg_buf_data(
            utils::div_up(src_size + new_src_byte_off, grf_size));
    new_src = new_src.format(new_src_byte_off, src.type(), esize, src_stride);
    emit_reorder_1d_tile(scope.hw(), host, scope, esize, src, src_stride,
            new_src, src_stride);
    src = new_src;
}

template <typename GeneratorT>
void align_src_dst_offset(GeneratorT *host, ngen_register_scope_t &scope,
        const ngen::InstructionModifier &mod, const reg_buf_data_t &dst,
        reg_buf_data_t &src0, reg_buf_data_t &src1) {
    align_src_dst_offset(host, scope, mod, dst, src0);
    align_src_dst_offset(host, scope, mod, dst, src1);
}

template <typename GeneratorT>
void align_src_dst_offset(GeneratorT *host, ngen_register_scope_t &scope,
        const ngen::InstructionModifier &mod, const ngen_operand_t &dst,
        ngen_operand_t &src) {
    if (!dst.is_reg_data()) return;
    if (!src.is_reg_data()) return;

    auto rd = src.reg_buf_data();
    align_src_dst_offset(host, scope, mod, dst.reg_buf_data(), rd);
    if (rd == src.reg_buf_data()) return;

    bool is_negated = src.is_negated();
    src = ngen_operand_t(rd, src.mod());
    if (is_negated) src = -src;
}

template <typename GeneratorT>
void align_src_dst_offset(GeneratorT *host, ngen_register_scope_t &scope,
        const ngen::InstructionModifier &mod, const ngen_operand_t &dst,
        ngen_operand_t &src0, ngen_operand_t &src1) {
    align_src_dst_offset(host, scope, mod, dst, src0);
    align_src_dst_offset(host, scope, mod, dst, src1);
}

// Implementation of GRF reorder between 2D dense layouts.
// Requirements for A -> B reorder:
// - A and B must have the same data type
// - Layouts must be 2D and dense
// Reorder may require several steps; in that case a temporary buffer T is
// allocated. For example: A -> T -> B or A -> B -> T -> B
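// The reorder path is found on a graph where vertices are candidate layouts
// of the 2D tile and edges are (a x b) sub-tile reorders; Dijkstra's
// algorithm (see find_min_cost_path) picks the path with the fewest estimated
// instructions. Depending on the hardware and data type, an indirect path
// through an intermediate layout reinterpreted to a wider type may be cheaper
// than direct element-by-element moves.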
class reorder_2d_impl_t {
public:
    reorder_2d_impl_t(
            ngen::HW hw, const layout_t &src_layout, const layout_t &dst_layout)
        : hw_(hw), src_(src_layout), dst_(dst_layout) {
        ir_assert(src_.type() == dst_.type());
        tile_ = find_2d_tile(src_, dst_);
    }

    const tensor_t &tile() const { return tile_; }

    template <typename GeneratorT>
    void emit(GeneratorT *host, ngen_register_scope_t &scope,
            const reg_buf_data_t &src_rd, const reg_buf_data_t &dst_rd) {
        int a_idx, b_idx;
        int tile_a, tile_b;
        tile_to_2d_dims(tile_, a_idx, b_idx, tile_a, tile_b);

        // Convert src/dst to 2D layouts.
        dim_assignment_t to_ab(src_.ndims(), 2);
        to_ab.assign(a_idx, 0);
        to_ab.assign(b_idx, 1);
        auto src_ab = to_ab.map(src_);
        auto dst_ab = to_ab.map(dst_);

        // Find minimal cost reorder path between layouts.
        auto path = find_min_cost_path(hw_, src_ab, dst_ab, tile_a, tile_b);

        // Allocate a temporary GRF buffer if needed.
        reg_buf_data_t tmp;
        if (path.size() > 1) {
            const int grf_size = ngen::GRF::bytes(hw_);
            tmp = scope.alloc_reg_buf_data(
                    utils::div_up(dst_ab.size(), grf_size));
        }

        // Iterate through found reorders.
        auto *prev_layout = &src_ab;
        auto prev_rd = src_rd;
        int path_len = int(path.size());
        auto &orig_type = src_ab.type();
        for (int i = 0; i < path_len; i++) {
            auto &step = path[i];
            auto &tile = step.tile;
            auto &type = step.type;
            auto *next_layout = &step.layout;

            // x -> y reorder.
            auto x = prev_layout->map(tile).reinterpret(type);
            auto y = next_layout->map(tile).reinterpret(type);

            bool use_dst = ((path_len - i) % 2 == 1);
            auto next_rd = (use_dst ? dst_rd : tmp);
            auto &x_blocks = x.blocks();
            auto &y_blocks = y.blocks();
            ir_assert(x_blocks.size() <= 1);
            ir_assert(y_blocks.size() <= 1);
            int x_stride = (x_blocks.empty() ? 1 : int(x_blocks[0].stride));
            int y_stride = (y_blocks.empty() ? 1 : int(y_blocks[0].stride));
            int width = int(tile.elems()) * orig_type.size() / type.size();
            next_layout->for_each_tile(
                    tile, [&](const std::vector<dim_t> &start) {
                        int prev_off = int(prev_layout->offset_in_bytes(start));
                        int next_off = int(next_layout->offset_in_bytes(start));
                        auto x_sub = prev_rd.format(prev_off, to_ngen(type), 1);
                        auto y_sub = next_rd.format(next_off, to_ngen(type), 1);
                        emit_reorder_1d_tile(hw_, host, scope, width, x_sub,
                                x_stride, y_sub, y_stride);
                    });
            prev_layout = next_layout;
            prev_rd = next_rd;
        }
    }

    // Returns the biggest common 2D tile that is innermost for both layouts.
    // The returned tile contains at most max_elems elements. If match_outer is
    // true, then outer parts of both layouts are required to be equal.
    // Returns an empty tensor if the requested tile is not found.
    static tensor_t find_2d_tile(const layout_t &a, const layout_t &b,
            int max_elems = std::numeric_limits<int>::max(),
            bool match_outer = false) {
        std::vector<dim_t> tile_dims(a.ndims(), 1);
        if (a.blocks().empty() || b.blocks().empty())
            return tensor_t(tile_dims);

        auto non_one_ndims = [](const tensor_t &t) {
            int ret = 0;
            for (dim_t d : t.dims())
                ret += (d != 1 ? 1 : 0);
            return ret;
        };

        layout_iterator_t a_it(a);
        layout_iterator_t b_it(b);

        tensor_t max_tile;
        for (;;) {
            auto a_tile = a_it.tile();
            auto b_tile = b_it.tile();
            if (non_one_ndims(a_tile) > 2 || non_one_ndims(b_tile) > 2) break;
            if (!a.map(a_tile).is_dense() || !b.map(b_tile).is_dense()) break;
            dim_t a_elems = a_tile.elems();
            dim_t b_elems = b_tile.elems();

            bool tile_ok = true;
            if (!a_tile.is_equal(b_tile)) tile_ok = false;
            if (match_outer) {
                auto a_outer = a_it.outer_layout();
                auto b_outer = b_it.outer_layout();
                if (!a_outer.is_equal(b_outer)) tile_ok = false;
            }
            if (tile_ok) {
                if (a_it.nblocks() > max_tile_blocks) break;
                if (b_it.nblocks() > max_tile_blocks) break;
                if (a_tile.elems() > max_elems) break;
                max_tile = a_tile;
                if (!a_it.has_next() || !b_it.has_next()) break;
                ++a_it;
                ++b_it;
            } else if (a_elems <= b_elems) {
                if (!a_it.has_next()) break;
                ++a_it;
            } else {
                if (!b_it.has_next()) break;
                ++b_it;
            }
        }
        return max_tile;
    }

    static const int max_tile_blocks = 4;

private:
    // Represents 2D reorder corresponding to (a x b) tile.
    struct edge_t {
        edge_t() = default;
        edge_t(int idx, int a, int b) : idx(idx), a(a), b(b) {}

        tensor_t tile() const { return tensor_t({a, b}); }

        std::string str() const {
            std::ostringstream oss;
            oss << "edge(idx = " << idx << ", a = " << a << ", b = " << b
                << ")";
            return oss.str();
        }

        int idx; // Identifier of the edge.
        int a = 0, b = 0; // Specify tile (a x b).
    };

    // Represents a GRF layout between reorder steps (edges).
    struct vertex_t {
        vertex_t(ngen::HW hw, int idx, const layout_t &layout)
            : hw(hw), idx(idx), layout(layout) {}

        std::string str() const {
            std::ostringstream oss;
            oss << "vertex(idx = " << idx << ", layout = " << layout << ")";
            return oss.str();
        }

        void set_edges(const std::vector<edge_t> &edges) {
            adj_edge_type_masks.resize(edges.size());
            int type_size = layout.type().size();
            for (int i = 0; i < int(edges.size()); i++) {
                auto &e = edges[i];
                auto tile = e.tile();
                int max_type_size;
                bool ok = layout_t::try_reinterpret_to_wider_type(
                        layout, layout, tile, false, &max_type_size);
                if (!ok) max_type_size = type_size;
                int from = math::ilog2q(type_size);
                int to = math::ilog2q(max_type_size);
                for (int j = from; j <= to; j++) {
                    type_t type = type_t::u(8 << j);
                    if (can_reorder(tile, type))
                        adj_edge_type_masks[i] |= (1 << j);
                }
            }
        }

        void add_neighbor(const vertex_t *v) { adj_vertices.push_back(v); }

        bool is_neighbor(const vertex_t &v) const {
            for (auto *n : adj_vertices)
                if (n == &v) return true;
            return false;
        }

        // Check the following limitations:
        // - Assume at most one block (maybe with non-dense stride)
        // - Horizontal stride must be <= 4 for GRF region
        // - GRF region can't span more than 2 registers
        bool can_reorder(const tensor_t &tile, const type_t &type) const {
            auto ab_layout = layout.map(tile).reinterpret(type);
            int nblocks = int(ab_layout.blocks().size());
            if (nblocks == 0) return true;
            if (nblocks > 1) return false;
            auto &last = ab_layout.blocks().back();
            int max_stride = int(last.stride * last.block);
            if (last.stride > 4) return false;
            if ((int)last.stride == 4 && type.size() <= 2) return false;
            if (!math::is_pow2(last.stride)) return false;
            int max_stride_bytes = max_stride * type.size();
            int grf_size = ngen::GRF::bytes(hw);
            if (max_stride_bytes > 2 * grf_size) return false;
            return true;
        }

        // Finds the minimal cost of reordering from this vertex to vertex v.
        int cost(const vertex_t &v, const std::vector<edge_t> &edges,
                edge_t &min_edge, type_t &min_type) const {
            int min_cost = std::numeric_limits<int>::max();
            for (int i = 0; i < int(edges.size()); i++) {
                type_t i_min_type;
                int new_cost = cost(edges[i], v, i_min_type);
                if (new_cost < min_cost) {
                    min_cost = new_cost;
                    min_edge = edges[i];
                    min_type = i_min_type;
                }
            }
            return min_cost;
        }

        // Finds the minimal cost of reordering from this vertex to vertex `v`
        // through edge `e`. If the reorder is possible, `type` contains the
        // reorder type with the minimal cost.
        int cost(const edge_t &e, const vertex_t &v, type_t &type) const {
            uint32_t mask = (adj_edge_type_masks[e.idx]
                    & v.adj_edge_type_masks[e.idx]);
            if (mask == 0) return std::numeric_limits<int>::max();
            int cur_size = layout.type().size();
            int cur_cost = layout.elems() / (e.a * e.b);
            int min_log_bytes = math::ilog2q(cur_size);
            int max_log_bytes = 3;
            int min_cost = std::numeric_limits<int>::max();
            for (int i = min_log_bytes; i <= max_log_bytes; i++) {
                if ((mask & (1 << i)) == 0) continue;
                if (i > min_log_bytes) {
                    ir_assert(!layout.blocks().empty());
                    ir_assert(!v.layout.blocks().empty());
                    int dim_idx0 = layout.blocks()[0].dim_idx;
                    int dim_idx1 = v.layout.blocks()[0].dim_idx;
                    if (dim_idx0 != dim_idx1) continue;
                }
                min_cost = cur_cost;
                type = type_t::u(8 << i);
                break;
            }
            return min_cost;
        }

        ngen::HW hw;
        int idx; // Identifier of the vertex.
        layout_t layout; // Layout of the vertex.
        // Specifies a bitmask for every edge: if adj_edge_type_masks[E_idx]
        // has the b-th bit set, then this vertex can be reordered through
        // edge E using a data type of size 2^b bytes.
        std::vector<uint32_t> adj_edge_type_masks;
        std::vector<const vertex_t *> adj_vertices; // Adjacent vertices.
    };

    // Represents a reorder step.
    struct reorder_step_t {
        reorder_step_t() = default;
        reorder_step_t(const layout_t &layout, const tensor_t &tile,
                const type_t &type)
            : layout(layout), tile(tile), type(type) {}

        layout_t layout; // Destination layout.
        tensor_t tile; // Tile corresponding to one instruction.
        type_t type; // Registers should be reinterpreted to `type` for reorder.
    };

    // Extracts dimension sizes and their indices from a multidimensional
    // tensor.
    static void tile_to_2d_dims(
            const tensor_t &tile, int &a_idx, int &b_idx, int &a, int &b) {
        a_idx = -1;
        b_idx = -1;
        for (int i = 0; i < tile.ndims(); i++) {
            if (tile.dims()[i] == 1) continue;
            if (a_idx == -1) {
                a_idx = i;
                continue;
            }
            if (b_idx == -1) {
                b_idx = i;
                continue;
            }
            ir_error_not_expected();
        }

        for (int i = 0; i < tile.ndims(); i++) {
            if (utils::one_of(i, a_idx, b_idx)) continue;
            if (a_idx == -1) {
                a_idx = i;
                continue;
            }
            if (b_idx == -1) {
                b_idx = i;
                continue;
            }
        }

        if (a_idx > b_idx) std::swap(a_idx, b_idx);

        a = tile.dims()[a_idx];
        b = tile.dims()[b_idx];
    }

    // Finds the optimal sequence of reorders between src and dst layouts.
    static std::vector<reorder_step_t> find_min_cost_path(ngen::HW hw,
            const layout_t &src, const layout_t &dst, int tile_a, int tile_b) {
        // Create all possible edges - 2D reorders.
        std::vector<edge_t> edges;
        for (int a = 1; a <= tile_a; a *= 2) {
            for (int b = 1; b <= tile_b; b *= 2) {
                if (src.dim(0) % a != 0) continue;
                if (src.dim(1) % b != 0) continue;
                int idx = int(edges.size());
                edges.emplace_back(idx, a, b);
            }
        }

        int nedges = int(edges.size());

        // Create all possible layouts for tile_a x tile_b tensor.
        std::vector<vertex_t> vertices;
        std::vector<std::vector<std::pair<int, uint32_t>>> edge_vertices(
                nedges);
        auto all_layouts = generate_all_layouts(src.type(), tile_a, tile_b);
        for (auto &l : all_layouts) {
            // Skip if too many blocks.
            if (int(l.blocks().size()) > max_tile_blocks) continue;
            int v_idx = int(vertices.size());
            vertices.emplace_back(hw, v_idx, l);
            auto &v = vertices.back();
            // Pass all known reorders; the vertex/layout will filter out
            // incompatible ones.
            v.set_edges(edges);
            // Store all vertices adjacent to a specific edge.
            for (int i = 0; i < nedges; i++) {
                uint32_t mask = v.adj_edge_type_masks[i];
                if (mask != 0) edge_vertices[i].emplace_back(v_idx, mask);
            }
        }

        // Find neighbors between all vertices.
        int nvertices = int(vertices.size());
        for (int i = 0; i < nvertices; i++) {
            auto &v = vertices[i];
            for (int j = 0; j < nedges; j++) {
                uint32_t mask = v.adj_edge_type_masks[j];
                if (mask != 0) {
                    for (auto &idx_mask : edge_vertices[j]) {
                        int v_idx = idx_mask.first;
                        if (v_idx == i) continue;
                        uint32_t common_mask = (mask
                                & vertices[v_idx].adj_edge_type_masks[j]);
                        if (common_mask != 0) v.add_neighbor(&vertices[v_idx]);
                    }
                }
            }
        }

        // Identify source and destination vertices.
        int src_idx = -1;
        int dst_idx = -1;
        for (int i = 0; i < nvertices; i++) {
            auto &v = vertices[i];
            if (src_idx == -1
                    && v.layout.is_strictly_equal(
                            src, /*compare_offset=*/false))
                src_idx = i;
            if (dst_idx == -1
                    && v.layout.is_strictly_equal(
                            dst, /*compare_offset=*/false))
                dst_idx = i;
        }

        ir_assert(src_idx != -1);
        ir_assert(dst_idx != -1);

        // Layouts are the same, just copy.
        if (src_idx == dst_idx) {
            auto &v = vertices[src_idx];
            edge_t min_edge;
            type_t min_type;
            v.cost(v, edges, min_edge, min_type);
            reorder_step_t step(v.layout, min_edge.tile(), min_type);
            return {step};
        }

        // Dijkstra's algorithm: find the minimal cost path between src and
        // dst. The number of instructions is used to estimate the cost.
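        // Each step's cost is approximated as layout.elems() / (a * b), i.e.
        // the number of (a x b) tile moves needed to cover the full tile (see
        // vertex_t::cost()).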
        int inf_cost = std::numeric_limits<int>::max();
        std::vector<int> cost(nvertices, inf_cost);
        std::vector<int> prev(nvertices);
        std::vector<reorder_step_t> reorder_steps(nvertices);
        std::vector<bool> seen(nvertices, false);
        cost[src_idx] = 0;
        for (int i = 0; i < nvertices; i++) {
            int min_idx = -1;
            int min_cost = inf_cost;
            for (int j = 0; j < nvertices; j++) {
                if (seen[j]) continue;
                if (cost[j] < min_cost) {
                    min_idx = j;
                    min_cost = cost[j];
                }
            }
            seen[min_idx] = true;
            auto &v_min = vertices[min_idx];
            for (auto *v : v_min.adj_vertices) {
                edge_t min_edge;
                type_t min_type;
                int new_cost = cost[min_idx]
                        + v_min.cost(*v, edges, min_edge, min_type);
                if (new_cost < cost[v->idx]) {
                    cost[v->idx] = new_cost;
                    prev[v->idx] = min_idx;
                    reorder_steps[v->idx] = reorder_step_t(
                            v->layout, min_edge.tile(), min_type);
                }
            }
        }

        // Sanity check, ensure the reorder sequence is not too long.
        int max_cost = 256;
        ir_assert(cost[dst_idx] <= max_cost);
        MAYBE_UNUSED(max_cost);

        // Restore the shortest reorder path.
        std::vector<reorder_step_t> ret;
        int idx = dst_idx;
        while (idx != src_idx) {
            ret.push_back(reorder_steps[idx]);
            idx = prev[idx];
        }
        std::reverse(ret.begin(), ret.end());
        return ret;
    }

    // Returns all possible layouts for (a x b) tensor.
    static std::vector<layout_t> generate_all_layouts(
            const type_t &type, int a, int b) {
        std::vector<layout_t> ret;
        std::vector<block_t> blocks;
        generate_all_layouts_impl(ret, blocks, type, a, b, 1);
        return ret;
    }

    static void generate_all_layouts_impl(std::vector<layout_t> &layouts,
            std::vector<block_t> &blocks, const type_t &type, int a, int b,
            int stride) {
        if (a == 1 && b == 1) {
            layouts.emplace_back(type, 2, 0, blocks);
            return;
        }
        bool iterate_a = true;
        bool iterate_b = true;

        // Avoid repeating indices to keep only unique layouts.
        if (!blocks.empty()) {
            auto &last = blocks.back();
            iterate_a &= (last.dim_idx != 0);
            iterate_b &= (last.dim_idx != 1);
        }

        if (iterate_a) {
            for (int a_blk = 2; a_blk <= a; a_blk++) {
                if (a % a_blk != 0) continue;
                blocks.emplace_back(0, a_blk, stride);
                generate_all_layouts_impl(
                        layouts, blocks, type, a / a_blk, b, stride * a_blk);
                blocks.pop_back();
            }
        }
        if (iterate_b) {
            for (int b_blk = 2; b_blk <= b; b_blk++) {
                if (b % b_blk != 0) continue;
                blocks.emplace_back(1, b_blk, stride);
                generate_all_layouts_impl(
                        layouts, blocks, type, a, b / b_blk, stride * b_blk);
                blocks.pop_back();
            }
        }
    }

    ngen::HW hw_;

    layout_t src_;
    layout_t dst_;

    tensor_t tile_;
};

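// Predicate for filter_t below: keeps only the innermost blocks that remain
// dense (block stride equals the product of the previously accepted blocks)
// and that span at most two distinct dimensions, i.e. blocks usable for a
// dense 2D inner tile.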
class dense_2d_block_filter_t {
public:
    bool operator()(const block_t &b) {
        if (b.block == 1) return false;
        if ((dim_t)b.stride != stride_) return false;
        stride_ *= b.block;
        if (!have_seen(b.dim_idx)) non_one_ndims_++;
        return non_one_ndims_ <= 2;
    }

    dense_2d_block_filter_t() = default;

private:
    bool have_seen(int idx) {
        auto ret = seen_.insert(idx);
        return !ret.second;
    }

    dim_t stride_ = 1;
    int non_one_ndims_ = 0;
    std::unordered_set<int> seen_;
};

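// Lazily iterates over the elements of a container that satisfy a predicate.
// Sketch of intended use (mirrors how shared_inner_tiles_t uses it below):
//
//   filter_t<std::vector<block_t>> dense(blocks, dense_2d_block_filter_t());
//   for (auto &b : dense) { /* only blocks accepted by the filter */ }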
template <typename IterT>
class filter_t {
    using inner_iter_t = decltype(std::declval<const IterT>().begin());
    using iter_value_t = decltype(*std::declval<inner_iter_t>());
    using predicate_t = std::function<bool(const iter_value_t &)>;

public:
    class iterator_t {
    public:
        bool operator==(const iterator_t &it) const { return it_ == it.it_; }
        bool operator!=(const iterator_t &it) const { return !operator==(it); }
        const iter_value_t &operator*() const { return *it_; }
        iterator_t &operator++() {
            if (it_ == end_) return *this;
            while (++it_ != end_ && !predicate_(*it_))
                ;
            return *this;
        }

        iterator_t(inner_iter_t it, inner_iter_t end, predicate_t predicate)
            : it_(it), end_(end), predicate_(predicate) {
            if (it_ != end_ && !predicate_(*it_)) operator++();
        }

    private:
        inner_iter_t it_, end_;
        std::function<bool(const iter_value_t &)> predicate_;
    };

    iterator_t begin() const { return {begin_, end_, predicate_}; }
    iterator_t end() const { return {end_, end_, predicate_}; }

    filter_t(const IterT &it, predicate_t predicate)
        : begin_(it.begin()), end_(it.end()), predicate_(predicate) {}

private:
    inner_iter_t begin_, end_;
    predicate_t predicate_;
};

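// Walks the filtered inner blocks of two layouts in parallel and yields the
// inner tiles (dense, at most 2D) that both layouts share, in increasing
// size. Used by find_max_2d_dense_tile below.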
class shared_inner_tiles_t {
    using blocks_t = std::vector<block_t>;
    using block_iterator_t = filter_t<blocks_t>::iterator_t;

    class inner_tile_iterator_t {
    public:
        bool operator==(const inner_tile_iterator_t &it) const {
            return curr_ == it.curr_;
        }
        bool operator!=(const inner_tile_iterator_t &it) const {
            return !operator==(it);
        }

        inner_tile_iterator_t &operator++() {
            if (curr_ == end_) return *this;

            auto size = (*curr_).block;
            while (++factor_ <= size) {
                if (size % factor_ == 0) return *this;
            }

            dims_[(*curr_).dim_idx] *= size;
            ++curr_;
            factor_ = 1;
            return operator++();
        }

        tensor_t operator*() const {
            auto dims = dims_;
            dims[(*curr_).dim_idx] *= factor_;
            return tensor_t(dims);
        }

        inner_tile_iterator_t(block_iterator_t curr, block_iterator_t end,
                const tensor_t &tile)
            : curr_(std::move(curr))
            , end_(std::move(end))
            , dims_(tile.dims())
            , factor_(1) {}

    private:
        block_iterator_t curr_, end_;
        std::vector<dim_t> dims_;
        dim_t factor_;
    };

    class iterator_t {
    public:
        bool operator==(const iterator_t &it) const {
            return a_it_ == it.a_it_ && b_it_ == it.b_it_;
        }
        bool operator!=(const iterator_t &it) const { return !operator==(it); }

        iterator_t &operator++() {
            bool adv_a = true, adv_b = true;
            while (adv_a || adv_b) {
                adv_a = (a_it_ != a_end_)
                        && (b_it_ == b_end_
                                || (*a_it_).elems() <= (*b_it_).elems());
                adv_b = (b_it_ != b_end_)
                        && (a_it_ == a_end_
                                || (*b_it_).elems() <= (*a_it_).elems());
                if (adv_a) ++a_it_;
                if (adv_b) ++b_it_;
                if (a_it_ != a_end_) a_tile_ = *a_it_;
                if (b_it_ != b_end_) b_tile_ = *b_it_;
                if (a_tile_.is_equal(b_tile_)) break;
            }
            return *this;
        }

        const tensor_t &operator*() const { return a_tile_; }

        iterator_t(const block_iterator_t &a_it, const block_iterator_t &a_end,
                const block_iterator_t &b_it, const block_iterator_t &b_end,
                const tensor_t &tile)
            : a_it_(a_it, a_end, tile)
            , a_end_(a_end, a_end, tile)
            , b_it_(b_it, b_end, tile)
            , b_end_(b_end, b_end, tile) {}

    private:
        inner_tile_iterator_t a_it_, a_end_;
        inner_tile_iterator_t b_it_, b_end_;
        tensor_t a_tile_, b_tile_;
    };

public:
    iterator_t begin() const {
        return {a_.begin(), a_.end(), b_.begin(), b_.end(), default_tile_};
    }
    iterator_t end() const {
        return {a_.end(), a_.end(), b_.end(), b_.end(), default_tile_};
    }

    shared_inner_tiles_t(const layout_t &a, const layout_t &b)
        : a_(a.blocks(), dense_2d_block_filter_t())
        , b_(b.blocks(), dense_2d_block_filter_t())
        , default_tile_(std::vector<dim_t>(a.ndims(), 1)) {}

private:
    filter_t<blocks_t> a_;
    filter_t<blocks_t> b_;
    tensor_t default_tile_;
};

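// Entry point for emitting a GRF reorder: tries the 2D implementation first
// and falls back to a tile-by-tile 1D reorder. Rough usage sketch
// (hypothetical host/scope/register operands):
//
//   reorder_impl_t(hw, reorder_ir_obj).emit(host, scope, src_rbd, dst_rbd);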
class reorder_impl_t {
public:
    reorder_impl_t(ngen::HW hw, const reorder_t &reorder)
        : hw_(hw)
        , src_layout_(reorder.src_layout)
        , dst_layout_(reorder.dst_layout) {
        layout_t::try_reinterpret_to_wider_type(src_layout_, dst_layout_);

        // Pure bf moves are not supported.
        if (utils::everyone_is(
                    type_t::bf16(), src_layout_.type(), dst_layout_.type())) {
            src_layout_ = src_layout_.retype(type_t::u16());
            dst_layout_ = dst_layout_.retype(type_t::u16());
        }
    }

    template <typename GeneratorT>
    void emit(GeneratorT *host, ngen_register_scope_t &scope,
            const reg_buf_data_t &src, const reg_buf_data_t &dst) {
        if (try_emit_2d(host, scope, src, dst)) return;
        emit_1d(host, scope, src, dst);
    }

private:
    template <typename GeneratorT>
    void emit_1d(GeneratorT *host, ngen_register_scope_t &scope,
            const reg_buf_data_t &src_rd, const reg_buf_data_t &dst_rd) {
        int src_stride;
        int dst_stride;
        auto tile = find_max_tile_with_fixed_stride(
                src_layout_, dst_layout_, src_stride, dst_stride);

        int tile_elems = int(tile.elems());
        auto &src_type = src_layout_.type();
        auto &dst_type = dst_layout_.type();
        dst_layout_.for_each_tile(tile, [&](const std::vector<dim_t> &start) {
            int src_off = int(src_layout_(start) * src_type.size());
            int dst_off = int(dst_layout_(start) * dst_type.size());
            auto sub_src = src_rd.format(src_off, to_ngen(src_type), 1);
            auto sub_dst = dst_rd.format(dst_off, to_ngen(dst_type), 1);

            ngen_register_scope_t tile_scope(scope.register_allocator());
            emit_reorder_1d_tile(hw_, host, tile_scope, tile_elems, sub_src,
                    src_stride, sub_dst, dst_stride);
        });
    }

    static tensor_t find_max_2d_dense_tile(const layout_t &a_layout,
            const layout_t &b_layout, dim_t max_elems) {
        shared_inner_tiles_t tiles {a_layout, b_layout};
        tensor_t max_tile;
        auto all_pow2 = [](const tensor_t &tile) {
            for (auto d : tile.dims())
                if (!math::is_pow2(d)) return false;
            return true;
        };

        for (auto &tile : tiles) {
            if (tile.elems() > max_elems) break;
            if (all_pow2(tile)) max_tile = tile;
        }
        // No point in tiling with a 1x1 tile
        return max_tile.elems() > 1 ? max_tile : tensor_t();
    }

    template <typename GeneratorT>
    bool try_emit_2d(GeneratorT *host, ngen_register_scope_t &scope,
            const reg_buf_data_t &src_rd, const reg_buf_data_t &dst_rd) {
        if (src_layout_.type() != dst_layout_.type()) return false;
        if (!src_layout_.is_dense()) return false;
        if (!dst_layout_.is_dense()) return false;

        int max_tile_size = 512;
        int max_tile_elems = max_tile_size / src_layout_.type().size();
        auto tile = find_max_2d_dense_tile(
                src_layout_, dst_layout_, max_tile_elems);

        // Couldn't find a tile; 2D reorder is not supported.
        if (tile.is_empty()) return false;

        auto src_tile_layout = src_layout_.map(tile);
        auto dst_tile_layout = dst_layout_.map(tile);
        if (!dst_tile_layout.is_dense()) return false;
        auto layout_ok = [](const layout_t &l) {
            if (l.blocks().size() < 2) return false;
            for (auto &b : l.blocks()) {
                if (math::is_pow2(b.block)) continue;
                for (int i = 2; i < (int)b.block / 2; i++)
                    if (b.block % i != 0) return false;
            }
            return true;
        };

        if (!layout_ok(src_tile_layout)) return false;
        if (!layout_ok(dst_tile_layout)) return false;

        // Set layout offset to 0 since the offset is handled by fixing up the
        // register input to try_emit_2d_impl
        src_tile_layout.set_offset(0);
        dst_tile_layout.set_offset(0);

        bool ok = true;
        auto type = to_ngen(src_layout_.type());
        src_layout_.for_each_tile(tile, [&](const std::vector<dim_t> &start) {
            auto src_off = src_layout_.offset_in_bytes<dim_t>(start);
            auto dst_off = dst_layout_.offset_in_bytes<dim_t>(start);
            auto src_tile_rd = src_rd.format(int(src_off), type);
            auto dst_tile_rd = dst_rd.format(int(dst_off), type);

            ngen_register_scope_t tile_scope(scope.register_allocator());
            ok &= try_emit_2d_impl(host, tile_scope, src_tile_layout,
                    dst_tile_layout, src_tile_rd, dst_tile_rd);
        });
        return ok;
    }

    template <typename GeneratorT>
    bool try_emit_2d_impl(GeneratorT *host, ngen_register_scope_t &scope,
            const layout_t &src_layout, const layout_t &dst_layout,
            const reg_buf_data_t &src_rd, const reg_buf_data_t &dst_rd) {
        // Try to allocate/release a temporary buffer to avoid out_of_registers
        // exception.
        const int grf_size = ngen::GRF::bytes(hw_);
        auto dummy = scope.try_alloc_range(
                utils::div_up(dst_layout.size(), grf_size));
        if (dummy.isInvalid()) {
            ir_warning() << "Can't allocate buffer for 2D reorder. Reorder "
                            "performance may be suboptimal.\n";
            return false;
        }

        // Allocation succeeded, can proceed further.
        scope.safeRelease(dummy);

        reorder_2d_impl_t r(hw_, src_layout, dst_layout);
        int tile_elems = int(r.tile().elems());
        if (tile_elems < 16 || tile_elems > 512) return false;

        r.emit(host, scope, src_rd, dst_rd);
        return true;
    }

    static tensor_t find_max_tile_with_fixed_stride(const layout_t &src,
            const layout_t &dst, int &src_stride, int &dst_stride) {
        // 1. Split layouts to have aligned blocks.
        auto a = src;
        auto b = dst;
        layout_t::align_layouts(a, b);

        // 2. Find the max innermost tile.
        auto a_blocks = a.blocks();
        auto b_blocks = b.blocks();

        std::vector<dim_t> tile_dims(a.ndims(), 1);
        src_stride = (a_blocks.empty() ? 1 : int(a_blocks[0].stride));
        dst_stride = (b_blocks.empty() ? 1 : int(b_blocks[0].stride));
        int src_cur_stride = src_stride;
        int dst_cur_stride = dst_stride;

        int min_blocks = int(std::min(a_blocks.size(), b_blocks.size()));
        for (int i = 0; i < min_blocks; i++) {
            auto &ab = a_blocks[i];
            auto &bb = b_blocks[i];
            if (ab.dim_idx != bb.dim_idx || ab.block != bb.block) break;

            // Strides are supported for the innermost block only.
            if (src_cur_stride != int(ab.stride)) break;
            if (dst_cur_stride != int(bb.stride)) break;

            src_cur_stride = int(ab.block * ab.stride);
            dst_cur_stride = int(bb.block * bb.stride);
            tile_dims[ab.dim_idx] *= ab.block;
        }
        return tensor_t(tile_dims);
    }

    ngen::HW hw_;
    layout_t src_layout_;
    layout_t dst_layout_;
};

} // namespace jit
} // namespace gpu
} // namespace impl
} // namespace dnnl

#endif