1 | /******************************************************************************* |
2 | * Copyright 2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
17 | #ifndef GPU_JIT_CODEGEN_KERNEL_HPP |
18 | #define GPU_JIT_CODEGEN_KERNEL_HPP |
19 | |
#include <memory>

#include "common/cpp_compat.hpp"

#include "gpu/jit/codegen/operand.hpp"
#include "gpu/jit/codegen/register_allocator.hpp"
#include "gpu/jit/ir/ir.hpp"
#include "gpu/jit/ir/kernel_info.hpp"
#include "gpu/jit/ir/message.hpp"
#include "gpu/jit/ir/tensor.hpp"
#include "gpu/jit/jit_generator.hpp"
#include "gpu/jit/ngen/ngen.hpp"
#include "gpu/jit/ngen/ngen_register_allocator.hpp"

#include "gpu/jit/gemm/emulation.hpp"
33 | |
34 | namespace dnnl { |
35 | namespace impl { |
36 | namespace gpu { |
37 | namespace jit { |
38 | |
39 | inline size_t icache_size(ngen::HW arch) { |
40 | switch (arch) { |
41 | case gpu_gen9: return 48 * 1024; |
42 | case gpu_gen11: return 48 * 1024; |
43 | case gpu_xe_lp: return 48 * 1024; |
44 | case gpu_xe_hp: return 48 * 1024; |
45 | case gpu_xe_hpg: return 96 * 1024; |
46 | case gpu_xe_hpc: return 80 * 1024; |
47 | default: return 0; |
48 | } |
49 | } |
50 | |
51 | template <template <ngen::HW> class KernelT, ngen::HW arch, typename... ArgsT> |
52 | std::unique_ptr<jit::jit_generator_base> make_generator(ArgsT &&... args) { |
53 | |
54 | auto raw_kernel = new KernelT<arch>(std::forward<ArgsT>(args)...); |
55 | if (raw_kernel->getRootStreamLength() > icache_size(arch)) { |
56 | ir_warning() << raw_kernel->kernel_name() |
57 | << " larger than icache, kernel: " |
58 | << raw_kernel->getRootStreamLength() |
59 | << " bytes, icache: " << icache_size(arch) << " bytes\n" ; |
60 | } |
61 | return std::unique_ptr<jit::jit_generator_base>(raw_kernel); |
62 | } |
63 | |
// Creates a compute kernel from the KernelT generator template, dispatching
// on the engine's GPU architecture. Returns an empty kernel_t on failure.
template <template <ngen::HW> class KernelT, typename... ArgsT>
compute::kernel_t make_kernel(
        gpu_primitive_t *primitive, engine_t *engine, ArgsT &&... args) {
    using namespace compute;
    kernel_t kernel;

    // When a cache blob is available, the kernel binary is recreated from it
    // directly; no JIT generator is needed.
    if (primitive->cache_blob()) {
        status_t status = primitive->create_kernel(engine, &kernel, nullptr);
        if (status != status::success) return kernel_t();
        return kernel;
    }

    auto *compute_engine = utils::downcast<compute_engine_t *>(engine);
    auto *device_info = compute_engine->device_info();
    auto arch = convert_dnnl_arch_to_ngen(device_info->gpu_arch());

    // Instantiate the generator for the detected architecture. Each CASE is
    // guarded by the corresponding REG_*_ISA macro so unsupported ISAs are
    // compiled out; jit_kernel stays empty for unhandled architectures.
    std::unique_ptr<jit::jit_generator_base> jit_kernel;
#define CASE(gpu_arch) \
    case gpu_arch: \
        jit_kernel = make_generator<KernelT, gpu_arch>( \
                std::forward<ArgsT>(args)...); \
        break;
    switch (arch) {
        REG_GEN9_ISA(CASE(gpu_gen9));
        REG_GEN11_ISA(CASE(gpu_gen11));
        REG_XELP_ISA(CASE(gpu_xe_lp));
        REG_XEHP_ISA(CASE(gpu_xe_hp));
        REG_XEHPG_ISA(CASE(gpu_xe_hpg));
        REG_XEHPC_ISA(CASE(gpu_xe_hpc));
        default: break;
    }
#undef CASE

#ifdef GEN_CONV_DEBUG
    // Debug builds verify that the architecture selected above matches the
    // actual device architecture (emulating a different arch is not allowed).
    gpu_gen_t actual_arch = ngen::HW::Unknown;
    switch (device_info->gpu_arch()) {
        case gpu_arch_t::gen9: actual_arch = gpu_gen9; break;
        case gpu_arch_t::gen11: actual_arch = gpu_gen11; break;
        case gpu_arch_t::xe_lp: actual_arch = gpu_xe_lp; break;
        case gpu_arch_t::xe_hp: actual_arch = gpu_xe_hp; break;
        case gpu_arch_t::xe_hpg: actual_arch = gpu_xe_hpg; break;
        case gpu_arch_t::xe_hpc: actual_arch = gpu_xe_hpc; break;
        case gpu_arch_t::unknown: actual_arch = ngen::HW::Unknown; break;
    }
    ir_assert(actual_arch == arch)
            << "Cannot emulate executing gpu_arch environment" ;
#endif

    if (!jit_kernel) return kernel_t();

    status_t status
            = primitive->create_kernel(engine, &kernel, jit_kernel.get());
    if (status != status::success) return kernel_t();
    return kernel;
}
119 | |
120 | class expr_binding_t { |
121 | public: |
122 | expr_binding_t(ngen::HW hw) : hw_(hw) {} |
123 | |
124 | ~expr_binding_t() { |
125 | if (!cpp_compat::uncaught_exceptions()) { |
126 | ir_assert(expr2dst_.empty()) << "Detected missing unbind_dst()." ; |
127 | } |
128 | } |
129 | |
130 | bool is_dst_bound(const expr_t &expr) const { |
131 | return expr2dst_.count(expr) == 1; |
132 | } |
133 | |
134 | ngen_operand_t get_dst(const expr_t &expr) const { |
135 | ir_assert(is_dst_bound(expr)) << "Destination is not bound: " << expr; |
136 | return expr2dst_.at(expr); |
137 | } |
138 | |
139 | void bind_dst(const expr_t &expr, const ngen_operand_t &operand) { |
140 | ir_assert(!expr.is_empty()); |
141 | auto ret = expr2dst_.insert({expr, operand}); |
142 | ir_assert(ret.second) << "Already bound: " << expr; |
143 | } |
144 | |
145 | void unbind_dst(const expr_t &expr) { |
146 | ir_assert(!expr.is_empty()); |
147 | auto it = expr2dst_.find(expr); |
148 | ir_assert(it != expr2dst_.end()); |
149 | expr2dst_.erase(it); |
150 | } |
151 | |
152 | bool is_bound(const expr_t &expr) const { |
153 | return expr2operand_.count(expr) == 1; |
154 | } |
155 | |
156 | ngen_operand_t get(const expr_t &expr, bool allow_empty = false) const { |
157 | if (expr.is_empty()) return ngen_operand_t(); |
158 | if (!is_bound(expr)) { |
159 | if (!allow_empty) |
160 | ir_assert(false) << "Operand is not bound: " << expr; |
161 | return ngen_operand_t(); |
162 | } |
163 | return expr2operand_.at(expr); |
164 | } |
165 | |
166 | void bind(const expr_t &expr, const ngen::Subregister &sub) { |
167 | bind(expr, ngen_operand_t(reg_buf_data_t(hw_, sub))); |
168 | } |
169 | |
170 | void bind(const expr_t &expr, const ngen_operand_t &operand) { |
171 | if (is_dst_bound(expr)) unbind_dst(expr); |
172 | |
173 | auto op_to_bind = operand; |
174 | |
175 | // Operand is with predicate - can't bind. |
176 | if (operand.mod().getPredCtrl() != ngen::PredCtrl::None) return; |
177 | |
178 | int esize = operand.mod().getExecSize(); |
179 | if (esize == 0) esize = 1; |
180 | if (esize != expr.type().elems()) { |
181 | ir_assert(expr.type().is_scalar() || esize == 1) |
182 | << "Expected broadcast." ; |
183 | if (operand.is_reg_buf_data() && esize != 1) { |
184 | // Bind scalar expression to the first vector element. |
185 | op_to_bind = operand.reg_buf_data().format( |
186 | 0, ngen::DataType::invalid, 1); |
187 | } |
188 | } |
189 | |
190 | auto ret = expr2operand_.insert({expr, op_to_bind}); |
191 | ir_assert(ret.second) << "Already bound: " << expr; |
192 | } |
193 | |
194 | void unbind(const expr_t &expr) { |
195 | ir_assert(!expr.is_empty()); |
196 | |
197 | auto it = expr2operand_.find(expr); |
198 | ir_assert(it != expr2operand_.end()); |
199 | expr2operand_.erase(it); |
200 | } |
201 | |
202 | private: |
203 | ngen::HW hw_; |
204 | object_map_t<expr_t, ngen_operand_t> expr2dst_; |
205 | object_map_t<expr_t, ngen_operand_t> expr2operand_; |
206 | }; |
207 | |
208 | template <ngen::HW hw> |
209 | class expr_evaluator_t; |
210 | |
211 | template <ngen::HW hw> |
212 | class ir_to_ngen_t; |
213 | |
// Selects how many GRF registers the generated kernel is built with.
enum class grf_mode_t {
    any, // Kernel sets optimal grf mode
    matches, // Propagate grf mode to avoid context switch
    small, // Force small grf_mode
    large, // Force large grf_mode
};
220 | |
221 | template <ngen::HW hw> |
222 | class ir_kernel_t : public jit_generator<hw> { |
223 | public: |
224 | NGEN_FORWARD_OPENCL(hw); |
225 | |
226 | friend class expr_evaluator_t<hw>; |
227 | friend class ir_to_ngen_t<hw>; |
228 | friend class send_impl_t; |
229 | |
230 | ir_kernel_t(const std::string &kernel_name, const exec_config_t &exec_cfg, |
231 | const kernel_info_t &kernel_info, bool require_dpas, |
232 | grf_mode_t grf_mode = grf_mode_t::any) |
233 | : kernel_name_(kernel_name) |
234 | , exec_cfg_(exec_cfg) |
235 | , kernel_info_(kernel_info) |
236 | , require_dpas_(require_dpas) |
237 | , regs_((grf_mode == grf_mode_t::large) |
238 | ? 256 |
239 | : (grf_mode == grf_mode_t::small) ? 128 |
240 | : exec_cfg.regs()) |
241 | , ra_(hw, kernel_name, |
242 | grf_mode == grf_mode_t::any ? reg_allocator_t::warn_all |
243 | : reg_allocator_t::warn_default) |
244 | , emu_strategy(hw, exec_cfg.hw_cfg().stepping_id()) { |
245 | ra_.setRegisterCount(regs_); |
246 | } |
247 | |
248 | void setup_interface(const stmt_t &kernel_body = stmt_t()) { |
249 | externalName(kernel_name_); |
250 | requireLocalID(3); |
251 | requireLocalSize(); |
252 | requireGRF(regs_); |
253 | requireSIMD(exec_cfg_.simd()); |
254 | requireBarrier(); |
255 | if (require_dpas_) requireDPAS(); |
256 | if (has_send_atomics(kernel_body)) requireGlobalAtomics(); |
257 | |
258 | for (int i = 0; i < kernel_info_.nargs(); i++) { |
259 | auto &name = kernel_info_.arg_name(i); |
260 | auto &type = kernel_info_.arg_type(i); |
261 | if (type.is_ptr()) { |
262 | newArgument(name, ngen::ExternalArgumentType::GlobalPtr); |
263 | } else { |
264 | newArgument(name, to_ngen(type)); |
265 | } |
266 | } |
267 | |
268 | if (!kernel_body.is_empty()) { |
269 | int slm_size = alloc_manager_t(kernel_body) |
270 | .total_size(alloc_kind_t::slm); |
271 | requireSLM(slm_size); |
272 | } |
273 | |
274 | finalizeInterface(); |
275 | } |
276 | |
277 | void generate_prologue() { |
278 | setDefaultNoMask(); |
279 | setDefaultAutoSWSB(true); |
280 | |
281 | prologue(); |
282 | |
283 | // Claim registers. |
284 | ra_.claim(r0); |
285 | for (int i = 0; i < 3; i++) |
286 | ra_.claim(getLocalID(i)); |
287 | |
288 | for (int i = 0; i < kernel_info_.nargs(); i++) { |
289 | ra_.claim(getArgument(kernel_info_.arg_name(i))); |
290 | } |
291 | |
292 | if (emu_strategy.emulate64) { |
293 | emu_state.temp[0] = ra_.alloc(); |
294 | emu_state.temp[1] = ra_.alloc(); |
295 | } |
296 | // Enable IEEE f32 -> s32 rounding and f32/f16 denormals. |
297 | or_(1, cr0, cr0, uint16_t(0x1480)); |
298 | |
299 | // Allocate and initialize signal header for future use. |
300 | if (require_signal_header_) { |
301 | signal_header_ = ra_.alloc(); |
302 | barrierheader(signal_header_); |
303 | } |
304 | } |
305 | |
306 | void bind_external_vars(const stmt_t &kernel_body, |
307 | const grid_info_t &kernel_grid, |
308 | const std::array<expr_t, 3> &local_id, |
309 | expr_binding_t &expr_binding) { |
310 | alloc_manager_t alloc_mgr(kernel_body); |
311 | |
312 | // Bind grid indices. |
313 | int r0_sub_idxs[] = {1, 6, 7}; |
314 | for (int i = 0; i < 3; i++) { |
315 | auto tmp = ra_.template alloc_sub<int32_t>(); |
316 | mov(1, tmp, r0.ud(r0_sub_idxs[i])); |
317 | expr_binding.bind(kernel_grid.idx(i), tmp); |
318 | } |
319 | |
320 | // Bind local IDs. |
321 | for (int i = 0; i < 3; i++) { |
322 | expr_binding.bind(local_id[i], getLocalID(i).uw(0)); |
323 | } |
324 | |
325 | // Bind arguments. |
326 | for (int i = 0; i < kernel_info_.nargs(); i++) { |
327 | auto &arg_var = kernel_info_.arg_var(i); |
328 | auto &name = kernel_info_.arg_name(i); |
329 | if (arg_var.type().is_ptr()) { |
330 | auto alloc_buf = alloc_mgr.find_buffer(name); |
331 | ir_assert(alloc_buf.is_same(arg_var)); |
332 | } |
333 | expr_binding.bind(arg_var, getArgument(name)); |
334 | } |
335 | |
336 | // Bind SLM buffer (SLM loads/stores use 0-based offsets). |
337 | auto slm_buf = alloc_mgr.find_buffer("slm" , /*allow_empty=*/true); |
338 | if (!slm_buf.is_empty()) expr_binding.bind(slm_buf, to_ngen(expr_t(0))); |
339 | } |
340 | |
341 | void generate_epilogue() { |
342 | epilogue(); |
343 | pad_kernel(); |
344 | } |
345 | |
346 | // Kernel padding for instruction prefetch. |
347 | void pad_kernel() { |
348 | for (int rep = 0; rep < 8; rep++) |
349 | nop(); |
350 | } |
351 | |
352 | void emov(const ngen::InstructionModifier &mod, const ngen_operand_t &dst, |
353 | const ngen_operand_t &src0) { |
354 | if (dst.is_reg_data()) { |
355 | if (src0.is_reg_data()) { |
356 | emov(mod, dst.reg_data(), src0.reg_data()); |
357 | } else if (src0.is_reg_buf_data()) { |
358 | emov(mod, dst.reg_data(), src0.reg_buf_data().reg_data()); |
359 | } else if (src0.is_immediate()) { |
360 | emov(mod, dst.reg_data(), src0.immediate()); |
361 | } else if (dst.type() == ngen::DataType::uw) { |
362 | emov(mod, dst.reg_data(), src0.flag_register()); |
363 | } else { |
364 | emov(mod | src0.flag_register_mod(), dst.reg_data(), 1); |
365 | emov(mod | ~src0.flag_register_mod(), dst.reg_data(), 0); |
366 | } |
367 | } else { |
368 | // dst is a flag register. |
369 | ir_assert(!dst.is_negated()); |
370 | auto _mod = mod; |
371 | _mod.setExecSize(1); |
372 | if (src0.is_reg_data()) { |
373 | emov(_mod, dst.flag_register(), src0.reg_data()); |
374 | } else { |
375 | emov(_mod, dst.flag_register(), src0.immediate()); |
376 | } |
377 | } |
378 | } |
379 | |
380 | void eadd(const ngen::InstructionModifier &mod, const ngen_operand_t &dst, |
381 | const ngen_operand_t &src0, const ngen_operand_t &src1) { |
382 | if (src0.is_immediate()) { |
383 | ir_assert(src1.is_reg_data()); |
384 | eadd(mod, dst, src1, src0); |
385 | return; |
386 | } |
387 | if (src1.is_reg_data()) { |
388 | eadd(mod, dst.reg_data(), src0.reg_data(), src1.reg_data()); |
389 | } else { |
390 | eadd(mod, dst.reg_data(), src0.reg_data(), src1.immediate()); |
391 | } |
392 | } |
393 | |
394 | void emul(const ngen::InstructionModifier &mod, const ngen_operand_t &dst, |
395 | const ngen_operand_t &src0, const ngen_operand_t &src1) { |
396 | if (src0.is_immediate()) { |
397 | ir_assert(src1.is_reg_data()); |
398 | emul(mod, dst, src1, src0); |
399 | return; |
400 | } |
401 | if (src1.is_reg_data()) { |
402 | if (ngen_is_dw(src1.type()) && ngen_is_w(src0.type())) { |
403 | emul(mod, dst.reg_data(), src1.reg_data(), src0.reg_data()); |
404 | } else { |
405 | emul(mod, dst.reg_data(), src0.reg_data(), src1.reg_data()); |
406 | } |
407 | } else { |
408 | auto &src1_imm = src1.immediate(); |
409 | if (ngen_is_qw(dst.type()) || ngen_is_w(src1_imm.getType())) { |
410 | emul(mod, dst.reg_data(), src0.reg_data(), src1.immediate()); |
411 | return; |
412 | } |
413 | if (ngen_is_dw(src1_imm.getType())) { |
414 | ir_assert(mod.getExecSize() == 1); |
415 | auto tmp = ra_.alloc_sub<int64_t>(); |
416 | if (ngen_is_w(src0.type())) { |
417 | auto tmp_src1 = ra_.alloc_sub<int32_t>(); |
418 | emov(mod, tmp_src1.d(0), src0.reg_data()); |
419 | emul(mod, tmp.q(0), tmp_src1.d(0), src1_imm); |
420 | ra_.safeRelease(tmp_src1); |
421 | } else { |
422 | emul(mod, tmp.q(0), src0.reg_data(), src1_imm); |
423 | } |
424 | emov(mod, dst.reg_data(), tmp.reinterpret(0, dst.type())); |
425 | ra_.safeRelease(tmp); |
426 | return; |
427 | } |
428 | emul(mod, dst.reg_data(), src0.reg_data(), src1.immediate()); |
429 | } |
430 | } |
431 | |
432 | void edp4a(const ngen::InstructionModifier &mod, const ngen_operand_t &dst, |
433 | const ngen_operand_t &src0, const ngen_operand_t &src1, |
434 | const ngen_operand_t &src2) { |
435 | ir_assert(!src0.is_immediate() || !src2.is_immediate()); |
436 | if (src0.is_immediate()) { |
437 | dp4a(mod, dst.reg_data(), src0.immediate(), src1.reg_data(), |
438 | src2.reg_data()); |
439 | } else if (src2.is_immediate()) { |
440 | dp4a(mod, dst.reg_data(), src0.reg_data(), src1.reg_data(), |
441 | src2.immediate()); |
442 | } else { |
443 | dp4a(mod, dst.reg_data(), src0.reg_data(), src1.reg_data(), |
444 | src2.reg_data()); |
445 | } |
446 | } |
447 | |
448 | void eadd3(const ngen::InstructionModifier &mod, const ngen_operand_t &dst, |
449 | const ngen_operand_t &src0, const ngen_operand_t &src1, |
450 | const ngen_operand_t &src2) { |
451 | if (hw >= ngen::HW::XeHP) { |
452 | if (src2.is_reg_data()) { |
453 | add3(mod, dst.reg_data(), src0.reg_data(), src1.reg_data(), |
454 | src2.reg_data()); |
455 | } else { |
456 | add3(mod, dst.reg_data(), src0.reg_data(), src1.reg_data(), |
457 | src2.immediate()); |
458 | } |
459 | return; |
460 | } |
461 | add(mod, dst.reg_data(), src0.reg_data(), src1.reg_data()); |
462 | if (src2.is_reg_data()) { |
463 | add(mod, dst.reg_data(), dst.reg_data(), src2.reg_data()); |
464 | } else { |
465 | add(mod, dst.reg_data(), dst.reg_data(), src2.immediate()); |
466 | } |
467 | } |
468 | |
469 | void emad(const ngen::InstructionModifier &mod, const ngen_operand_t &dst, |
470 | const ngen_operand_t &src0, const ngen_operand_t &src1, |
471 | const ngen_operand_t &src2) { |
472 | if (src2.is_reg_data()) { |
473 | mad(mod, dst.reg_data(), src0.reg_data(), src1.reg_data(), |
474 | src2.reg_data()); |
475 | } else if (hw < ngen::HW::XeLP) { |
476 | mul(mod, dst.reg_data(), src1.reg_data(), src2.immediate()); |
477 | add(mod, dst.reg_data(), dst.reg_data(), src0.reg_data()); |
478 | } else if (src0.is_immediate() |
479 | && (ngen_is_dw(src0.type()) |
480 | || src0.type() == ngen::DataType::uw)) { |
481 | // dword immediate src0 is not supported, move to a register. |
482 | auto tmp_src0 = ra_.alloc_sub(src0.type()); |
483 | mov(1, tmp_src0, src0.immediate()); |
484 | mad(mod, dst.reg_data(), tmp_src0, src1.reg_data(), |
485 | src2.immediate()); |
486 | ra_.safeRelease(tmp_src0); |
487 | } else { |
488 | mad(mod, dst.reg_data(), src0.reg_data(), src1.reg_data(), |
489 | src2.immediate()); |
490 | } |
491 | } |
492 | |
493 | void ediv(const ngen::InstructionModifier &mod, const ngen_operand_t &dst, |
494 | const ngen_operand_t &src0, const ngen_operand_t &src1) { |
495 | if (!src1.is_immediate()) { |
496 | efdiv(mod, dst, src0, src1); |
497 | } else { |
498 | auto &src1_imm = src1.immediate(); |
499 | int32_t src1_value = to_cpp<int32_t>(src1_imm); |
500 | ir_assert(0 < src1_value && src1_value <= INT32_MAX) << src1_value; |
501 | eidiv(mod, dst.reg_data(), ngen::Subregister(), src0.reg_data(), |
502 | src1_value); |
503 | } |
504 | } |
505 | |
506 | void efdiv(const ngen::InstructionModifier &mod, const ngen_operand_t &dst, |
507 | const ngen_operand_t &src0, const ngen_operand_t &src1) { |
508 | ir_assert(!src1.is_immediate()); |
509 | auto one = ra_.alloc().f(); |
510 | auto zero = ra_.alloc().f(); |
511 | |
512 | auto tmp = ra_.alloc_range(4); |
513 | |
514 | int esize = mod.getExecSize(); |
515 | int grf_size = ngen::GRF::bytes(hw); |
516 | int div_esize = std::min(esize, grf_size / int(sizeof(float))); |
517 | |
518 | int tmp_regs = utils::div_up(esize * int(sizeof(float)), grf_size); |
519 | auto src0_tmp = ra_.alloc_range(tmp_regs); |
520 | auto src1_tmp = ra_.alloc_range(tmp_regs); |
521 | |
522 | // Copy to temporary registers to ensure dst, num and denom are |
523 | // distinct as required for fdiv_ieee. |
524 | mov(mod, src0_tmp[0].f(), src0.reg_data()); |
525 | mov(mod, src1_tmp[0].f(), src1.reg_data()); |
526 | |
527 | auto div_mod = ngen::InstructionModifier(mod); |
528 | div_mod.setExecSize(div_esize); |
529 | |
530 | mov(div_mod, one, ngen::Immediate(1)); |
531 | mov(div_mod, zero, ngen::Immediate(0)); |
532 | |
533 | // Enable mask as fdiv_ieee relies on masked if/endif flow. |
534 | setDefaultNoMask(false); |
535 | |
536 | for (int i = 0; i < mod.getExecSize(); i += div_esize) { |
537 | fdiv_ieee(div_mod, f0[0], dst.sub_reg_data(i, div_esize).reg_data(), |
538 | src0_tmp[i / div_esize].f(), src1_tmp[i / div_esize].f(), |
539 | zero, one, tmp); |
540 | } |
541 | |
542 | ra_.safeRelease(one); |
543 | ra_.safeRelease(zero); |
544 | ra_.safeRelease(src0_tmp); |
545 | ra_.safeRelease(src1_tmp); |
546 | ra_.safeRelease(tmp); |
547 | |
548 | setDefaultNoMask(true); |
549 | } |
550 | |
551 | void emod(const ngen::InstructionModifier &mod, const ngen_operand_t &dst, |
552 | const ngen_operand_t &src0, const ngen_operand_t &src1) { |
553 | ir_assert(src1.is_immediate()); |
554 | auto &src1_imm = src1.immediate(); |
555 | int32_t src1_value = to_cpp<int32_t>(src1_imm); |
556 | ir_assert(0 < src1_value && src1_value <= INT32_MAX) << src1_value; |
557 | eidiv(mod, ngen::Subregister(), dst.reg_data(), src0.reg_data(), |
558 | src1_value); |
559 | } |
560 | |
561 | void eshl(const ngen::InstructionModifier &mod, const ngen_operand_t &dst, |
562 | const ngen_operand_t &src0, const ngen_operand_t &src1) { |
563 | if (src1.is_reg_data()) { |
564 | shl(mod, dst.reg_data(), src0.reg_data(), src1.reg_data()); |
565 | } else { |
566 | shl(mod, dst.reg_data(), src0.reg_data(), src1.immediate()); |
567 | } |
568 | } |
569 | |
570 | void eshr(const ngen::InstructionModifier &mod, const ngen_operand_t &dst, |
571 | const ngen_operand_t &src0, const ngen_operand_t &src1) { |
572 | if (src1.is_reg_data()) { |
573 | shr(mod, dst.reg_data(), src0.reg_data(), src1.reg_data()); |
574 | } else { |
575 | shr(mod, dst.reg_data(), src0.reg_data(), src1.immediate()); |
576 | } |
577 | } |
578 | |
579 | void emin(const ngen::InstructionModifier &mod, const ngen_operand_t &dst, |
580 | const ngen_operand_t &src0, const ngen_operand_t &src1) { |
581 | if (src1.is_reg_data()) { |
582 | min_(mod, dst.reg_data(), src0.reg_data(), src1.reg_data()); |
583 | } else { |
584 | min_(mod, dst.reg_data(), src0.reg_data(), src1.immediate()); |
585 | } |
586 | } |
587 | |
588 | void emax(const ngen::InstructionModifier &mod, const ngen_operand_t &dst, |
589 | const ngen_operand_t &src0, const ngen_operand_t &src1) { |
590 | if (src1.is_reg_data()) { |
591 | max_(mod, dst.reg_data(), src0.reg_data(), src1.reg_data()); |
592 | } else { |
593 | max_(mod, dst.reg_data(), src0.reg_data(), src1.immediate()); |
594 | } |
595 | } |
596 | |
597 | void ecmp(const ngen::InstructionModifier &mod, const ngen_operand_t &src0, |
598 | const ngen_operand_t &src1) { |
599 | if (src1.is_reg_data()) { |
600 | cmp(mod, src0.reg_data(), src1.reg_data()); |
601 | } else { |
602 | cmp(mod, src0.reg_data(), src1.immediate()); |
603 | } |
604 | } |
605 | |
606 | void ecmp(const ngen::InstructionModifier &mod, const ngen_operand_t &dst, |
607 | const ngen_operand_t &src0, const ngen_operand_t &src1) { |
608 | if (src1.is_reg_data()) { |
609 | cmp(mod, dst.reg_data(), src0.reg_data(), src1.reg_data()); |
610 | } else { |
611 | cmp(mod, dst.reg_data(), src0.reg_data(), src1.immediate()); |
612 | } |
613 | } |
614 | |
615 | void eand(const ngen::InstructionModifier &mod, const ngen_operand_t &dst, |
616 | const ngen_operand_t &src0, const ngen_operand_t &src1) { |
617 | if (src1.is_reg_data()) { |
618 | and_(mod, dst.reg_data(), src0.reg_data(), src1.reg_data()); |
619 | } else { |
620 | and_(mod, dst.reg_data(), src0.reg_data(), src1.immediate()); |
621 | } |
622 | } |
623 | |
624 | // Adapted version of magicgu function from Hacker's Delight 10-15. |
625 | static void eidiv_magicgu(uint32_t d, uint32_t &m, uint32_t &p) { |
626 | uint32_t s32_max = std::numeric_limits<int32_t>::max(); |
627 | ir_assert(d != 0 && d <= s32_max); |
628 | uint64_t nc = (s32_max / d) * d - 1; |
629 | for (p = 32; p < 64; p++) { |
630 | uint64_t _2p = 1LL << p; |
631 | if (_2p > nc * (d - 1 - (_2p - 1) % d)) { |
632 | m = (_2p + d - 1 - (_2p - 1) % d) / d; |
633 | return; |
634 | } |
635 | } |
636 | ir_error_not_expected(); |
637 | } |
638 | |
639 | // Emulates integer division by a constant. |
640 | // Requirements: |
641 | // 0 <= x <= UINT32_MAX |
642 | // 0 < y <= INT32_MAX |
643 | // Computes: |
644 | // qot = x / y |
645 | // rem = x % y |
646 | void eidiv(const ngen::InstructionModifier &mod, const ngen::RegData &qot, |
647 | const ngen::RegData &rem, const ngen::RegData &x, uint32_t y) { |
648 | ir_assert(x.getHS() == 0); |
649 | if (ngen::utils::is_zero_or_pow2(y)) { |
650 | auto _x = get_subregister(x); |
651 | if (x.getNeg()) { |
652 | // Negation modifier has bitwise semantics with shr/and so x |
653 | // needs to be arithmetically negated first. |
654 | _x = ra_.alloc_sub(x.getType()); |
655 | mov(1, _x, x); |
656 | } |
657 | if (!qot.isInvalid()) shr(mod, qot, _x, ngen::utils::log2(y)); |
658 | if (!rem.isInvalid()) and_(mod, rem, _x, y - 1); |
659 | if (_x != x) ra_.safeRelease(_x); |
660 | return; |
661 | } |
662 | |
663 | uint32_t m = 0, p = 0; |
664 | eidiv_magicgu(y, m, p); |
665 | |
666 | auto x_tmp = ra_.alloc().ud(); |
667 | auto qot_tmp = ra_.alloc().ud(); |
668 | auto _x = x_tmp[0]; |
669 | auto _qot = qot_tmp[0]; |
670 | mov(1, _x, x); |
671 | |
672 | // qot = (x * m) >> p |
673 | mul(1, acc0.ud(0), _x, m & 0xFFFF); |
674 | mach(1, _qot, _x, m); |
675 | shr<uint32_t>(1, _qot, _qot, p - 32); |
676 | if (!qot.isInvalid()) mov(mod, qot, _qot); |
677 | |
678 | if (!rem.isInvalid()) { |
679 | // rem = x - qot * y |
680 | bool y_is_16_bit = (y <= static_cast<uint32_t>( |
681 | std::numeric_limits<int16_t>::max())); |
682 | if (hw >= ngen::HW::XeLP && y_is_16_bit) { |
683 | mad(mod, rem, x, _qot, -int16_t(y)); |
684 | } else { |
685 | auto tmp = ra_.alloc_sub<uint64_t>(); |
686 | mul(1, tmp.ud(0), _qot, y & 0xFFFF); |
687 | mul(1, tmp.ud(1), _qot, y >> 16); |
688 | shl<uint32_t>(1, tmp.ud(1), tmp.ud(1), 16); |
689 | add(1, tmp.ud(0), tmp.ud(1), tmp.ud(0)); |
690 | add(mod, rem, x, -tmp.ud(0)); |
691 | ra_.safeRelease(tmp); |
692 | } |
693 | } |
694 | |
695 | ra_.safeRelease(x_tmp); |
696 | ra_.safeRelease(qot_tmp); |
697 | } |
698 | |
699 | template <typename DT = void> |
700 | void emov(const ngen::InstructionModifier &mod, ngen::RegData dst, |
701 | ngen::RegData src0) { |
702 | EmulationImplementation::emov<DT>(*this, mod, dst, src0, emu_strategy); |
703 | } |
704 | template <typename DT = void> |
705 | void emov(const ngen::InstructionModifier &mod, ngen::RegData dst, |
706 | ngen::Immediate src0) { |
707 | EmulationImplementation::emov<DT>(*this, mod, dst, src0, emu_strategy); |
708 | } |
709 | template <typename DT = void> |
710 | void eadd(const ngen::InstructionModifier &mod, const ngen::RegData &dst, |
711 | const ngen::RegData &src0, const ngen::RegData &src1) { |
712 | EmulationImplementation::eadd<DT>( |
713 | *this, mod, dst, src0, src1, emu_strategy, emu_state); |
714 | } |
715 | template <typename DT = void> |
716 | void eadd(const ngen::InstructionModifier &mod, const ngen::RegData &dst, |
717 | const ngen::RegData &src0, ngen::Immediate src1) { |
718 | EmulationImplementation::eadd<DT>( |
719 | *this, mod, dst, src0, src1, emu_strategy, emu_state); |
720 | } |
721 | template <typename DT = void> |
722 | void emul(const ngen::InstructionModifier &mod, const ngen::RegData &dst, |
723 | const ngen::RegData &src0, const ngen::RegData &src1) { |
724 | if (ngen_is_xf(dst.getType())) { |
725 | mul(mod, dst, src0, src1); |
726 | return; |
727 | } |
728 | EmulationImplementation::emul<DT>( |
729 | *this, mod, dst, src0, src1, emu_strategy, emu_state); |
730 | } |
731 | template <typename DT = void> |
732 | void emul(const ngen::InstructionModifier &mod, const ngen::RegData &dst, |
733 | const ngen::RegData &src0, ngen::Immediate src1) { |
734 | if (ngen_is_xf(dst.getType())) { |
735 | mul(mod, dst, src0, src1); |
736 | return; |
737 | } |
738 | EmulationImplementation::emul<DT>( |
739 | *this, mod, dst, src0, src1, emu_strategy, emu_state); |
740 | } |
741 | template <typename DT = void> |
742 | void eshl(const ngen::InstructionModifier &mod, ngen::RegData dst, |
743 | ngen::RegData src0, uint16_t src1) { |
744 | EmulationImplementation::eshl<DT>( |
745 | *this, mod, dst, src0, src1, emu_strategy, emu_state); |
746 | } |
747 | template <typename DT = void> |
748 | void eshr(const ngen::InstructionModifier &mod, ngen::RegData dst, |
749 | ngen::RegData src0, uint16_t src1) { |
750 | EmulationImplementation::eshr<DT>( |
751 | *this, mod, dst, src0, src1, emu_strategy, emu_state); |
752 | } |
753 | |
754 | protected: |
755 | std::string kernel_name_; |
756 | exec_config_t exec_cfg_; |
757 | kernel_info_t kernel_info_; |
758 | bool require_dpas_; |
759 | bool = false; |
760 | int regs_; |
761 | reg_allocator_t ra_; |
762 | ngen::GRF ; |
763 | |
764 | EmulationStrategy emu_strategy; |
765 | EmulationState emu_state; |
766 | }; |
767 | |
// Re-exports the emulation arithmetic helpers from ir_kernel_t into a
// derived kernel class (needed because the derived class's own overloads
// would otherwise hide the base-class templates).
#define IR_KERNEL_EMULATION_FORWARD(hw) \
    using ir_kernel_t<hw>::emov; \
    using ir_kernel_t<hw>::eadd; \
    using ir_kernel_t<hw>::emul; \
    using ir_kernel_t<hw>::eshl; \
    using ir_kernel_t<hw>::eshr;

// Re-exports the full ir_kernel_t generation API (interface setup, variable
// binding, prologue/epilogue, emulation helpers, register allocator) into a
// derived kernel class.
#define IR_KERNEL_FORWARD(hw) \
    NGEN_FORWARD_OPENCL(hw) \
    IR_KERNEL_EMULATION_FORWARD(hw) \
    using ir_kernel_t<hw>::setup_interface; \
    using ir_kernel_t<hw>::bind_external_vars; \
    using ir_kernel_t<hw>::generate_prologue; \
    using ir_kernel_t<hw>::generate_epilogue; \
    using ir_kernel_t<hw>::emu_strategy; \
    using ir_kernel_t<hw>::ra_;
784 | |
785 | } // namespace jit |
786 | } // namespace gpu |
787 | } // namespace impl |
788 | } // namespace dnnl |
789 | |
790 | #endif |
791 | |