/*******************************************************************************
* Copyright 2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef GPU_JIT_CODEGEN_KERNEL_HPP
#define GPU_JIT_CODEGEN_KERNEL_HPP

#include "common/cpp_compat.hpp"

#include "gpu/jit/codegen/operand.hpp"
#include "gpu/jit/codegen/register_allocator.hpp"
#include "gpu/jit/ir/ir.hpp"
#include "gpu/jit/ir/kernel_info.hpp"
#include "gpu/jit/ir/message.hpp"
#include "gpu/jit/ir/tensor.hpp"
#include "gpu/jit/jit_generator.hpp"
#include "gpu/jit/ngen/ngen.hpp"
#include "gpu/jit/ngen/ngen_register_allocator.hpp"

#include "gpu/jit/gemm/emulation.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace jit {

inline size_t icache_size(ngen::HW arch) {
    switch (arch) {
        case gpu_gen9: return 48 * 1024;
        case gpu_gen11: return 48 * 1024;
        case gpu_xe_lp: return 48 * 1024;
        case gpu_xe_hp: return 48 * 1024;
        case gpu_xe_hpg: return 96 * 1024;
        case gpu_xe_hpc: return 80 * 1024;
        default: return 0;
    }
}

template <template <ngen::HW> class KernelT, ngen::HW arch, typename... ArgsT>
std::unique_ptr<jit::jit_generator_base> make_generator(ArgsT &&... args) {

    auto raw_kernel = new KernelT<arch>(std::forward<ArgsT>(args)...);
    if (raw_kernel->getRootStreamLength() > icache_size(arch)) {
        ir_warning() << raw_kernel->kernel_name()
                     << " larger than icache, kernel: "
                     << raw_kernel->getRootStreamLength()
                     << " bytes, icache: " << icache_size(arch) << " bytes\n";
    }
    return std::unique_ptr<jit::jit_generator_base>(raw_kernel);
}

template <template <ngen::HW> class KernelT, typename... ArgsT>
compute::kernel_t make_kernel(
        gpu_primitive_t *primitive, engine_t *engine, ArgsT &&... args) {
    using namespace compute;
    kernel_t kernel;

    if (primitive->cache_blob()) {
        status_t status = primitive->create_kernel(engine, &kernel, nullptr);
        if (status != status::success) return kernel_t();
        return kernel;
    }

    auto *compute_engine = utils::downcast<compute_engine_t *>(engine);
    auto *device_info = compute_engine->device_info();
    auto arch = convert_dnnl_arch_to_ngen(device_info->gpu_arch());

    std::unique_ptr<jit::jit_generator_base> jit_kernel;
#define CASE(gpu_arch) \
    case gpu_arch: \
        jit_kernel = make_generator<KernelT, gpu_arch>( \
                std::forward<ArgsT>(args)...); \
        break;
    switch (arch) {
        REG_GEN9_ISA(CASE(gpu_gen9));
        REG_GEN11_ISA(CASE(gpu_gen11));
        REG_XELP_ISA(CASE(gpu_xe_lp));
        REG_XEHP_ISA(CASE(gpu_xe_hp));
        REG_XEHPG_ISA(CASE(gpu_xe_hpg));
        REG_XEHPC_ISA(CASE(gpu_xe_hpc));
        default: break;
    }
#undef CASE

#ifdef GEN_CONV_DEBUG
    gpu_gen_t actual_arch = ngen::HW::Unknown;
    switch (device_info->gpu_arch()) {
        case gpu_arch_t::gen9: actual_arch = gpu_gen9; break;
        case gpu_arch_t::gen11: actual_arch = gpu_gen11; break;
        case gpu_arch_t::xe_lp: actual_arch = gpu_xe_lp; break;
        case gpu_arch_t::xe_hp: actual_arch = gpu_xe_hp; break;
        case gpu_arch_t::xe_hpg: actual_arch = gpu_xe_hpg; break;
        case gpu_arch_t::xe_hpc: actual_arch = gpu_xe_hpc; break;
        case gpu_arch_t::unknown: actual_arch = ngen::HW::Unknown; break;
    }
    ir_assert(actual_arch == arch)
            << "Cannot emulate the requested gpu_arch in this environment";
#endif

    if (!jit_kernel) return kernel_t();

    status_t status
            = primitive->create_kernel(engine, &kernel, jit_kernel.get());
    if (status != status::success) return kernel_t();
    return kernel;
}
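
// Usage sketch (illustrative, not part of this header): a primitive would
// typically create its kernel through make_kernel(). The kernel template
// and trailing arguments below are hypothetical placeholders; they are
// forwarded unchanged to the KernelT<arch> constructor for the selected
// architecture.
//
//   compute::kernel_t kernel
//           = make_kernel<my_kernel_t>(this, engine, cfg, kernel_info);
//
// An empty kernel_t is returned when no ISA case matches or when kernel
// creation fails.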

class expr_binding_t {
public:
    expr_binding_t(ngen::HW hw) : hw_(hw) {}

    ~expr_binding_t() {
        if (!cpp_compat::uncaught_exceptions()) {
            ir_assert(expr2dst_.empty()) << "Detected missing unbind_dst().";
        }
    }

    bool is_dst_bound(const expr_t &expr) const {
        return expr2dst_.count(expr) == 1;
    }

    ngen_operand_t get_dst(const expr_t &expr) const {
        ir_assert(is_dst_bound(expr)) << "Destination is not bound: " << expr;
        return expr2dst_.at(expr);
    }

    void bind_dst(const expr_t &expr, const ngen_operand_t &operand) {
        ir_assert(!expr.is_empty());
        auto ret = expr2dst_.insert({expr, operand});
        ir_assert(ret.second) << "Already bound: " << expr;
    }

    void unbind_dst(const expr_t &expr) {
        ir_assert(!expr.is_empty());
        auto it = expr2dst_.find(expr);
        ir_assert(it != expr2dst_.end());
        expr2dst_.erase(it);
    }

    bool is_bound(const expr_t &expr) const {
        return expr2operand_.count(expr) == 1;
    }

    ngen_operand_t get(const expr_t &expr, bool allow_empty = false) const {
        if (expr.is_empty()) return ngen_operand_t();
        if (!is_bound(expr)) {
            if (!allow_empty)
                ir_assert(false) << "Operand is not bound: " << expr;
            return ngen_operand_t();
        }
        return expr2operand_.at(expr);
    }

    void bind(const expr_t &expr, const ngen::Subregister &sub) {
        bind(expr, ngen_operand_t(reg_buf_data_t(hw_, sub)));
    }

    void bind(const expr_t &expr, const ngen_operand_t &operand) {
        if (is_dst_bound(expr)) unbind_dst(expr);

        auto op_to_bind = operand;

        // Operand has a predicate - can't bind.
        if (operand.mod().getPredCtrl() != ngen::PredCtrl::None) return;

        int esize = operand.mod().getExecSize();
        if (esize == 0) esize = 1;
        if (esize != expr.type().elems()) {
            ir_assert(expr.type().is_scalar() || esize == 1)
                    << "Expected broadcast.";
            if (operand.is_reg_buf_data() && esize != 1) {
                // Bind a scalar expression to the first vector element.
                op_to_bind = operand.reg_buf_data().format(
                        0, ngen::DataType::invalid, 1);
            }
        }

        auto ret = expr2operand_.insert({expr, op_to_bind});
        ir_assert(ret.second) << "Already bound: " << expr;
    }

    void unbind(const expr_t &expr) {
        ir_assert(!expr.is_empty());

        auto it = expr2operand_.find(expr);
        ir_assert(it != expr2operand_.end());
        expr2operand_.erase(it);
    }

private:
    ngen::HW hw_;
    object_map_t<expr_t, ngen_operand_t> expr2dst_;
    object_map_t<expr_t, ngen_operand_t> expr2operand_;
};
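
// Usage sketch (illustrative; names are placeholders): expression
// evaluation typically pre-binds a destination, emits instructions into it,
// and then publishes the result:
//
//   expr_binding_t binding(hw);
//   binding.bind_dst(e, dst_op); // Request that e be evaluated into dst_op.
//   /* ...emit instructions computing e into dst_op... */
//   binding.bind(e, dst_op); // Consumes the dst binding via unbind_dst().
//   auto op = binding.get(e);
//   binding.unbind(e);
//
// The destructor asserts that no dst bindings remain, so every bind_dst()
// must eventually be matched by bind() or unbind_dst().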

template <ngen::HW hw>
class expr_evaluator_t;

template <ngen::HW hw>
class ir_to_ngen_t;

enum class grf_mode_t {
    any, // Kernel sets the optimal GRF mode.
    matches, // Propagate the GRF mode to avoid context switches.
    small, // Force small GRF mode.
    large, // Force large GRF mode.
};

template <ngen::HW hw>
class ir_kernel_t : public jit_generator<hw> {
public:
    NGEN_FORWARD_OPENCL(hw);

    friend class expr_evaluator_t<hw>;
    friend class ir_to_ngen_t<hw>;
    friend class send_impl_t;

    ir_kernel_t(const std::string &kernel_name, const exec_config_t &exec_cfg,
            const kernel_info_t &kernel_info, bool require_dpas,
            grf_mode_t grf_mode = grf_mode_t::any)
        : kernel_name_(kernel_name)
        , exec_cfg_(exec_cfg)
        , kernel_info_(kernel_info)
        , require_dpas_(require_dpas)
        , regs_((grf_mode == grf_mode_t::large)
                        ? 256
                        : (grf_mode == grf_mode_t::small) ? 128
                                                          : exec_cfg.regs())
        , ra_(hw, kernel_name,
                grf_mode == grf_mode_t::any ? reg_allocator_t::warn_all
                                            : reg_allocator_t::warn_default)
        , emu_strategy(hw, exec_cfg.hw_cfg().stepping_id()) {
        ra_.setRegisterCount(regs_);
    }

    void setup_interface(const stmt_t &kernel_body = stmt_t()) {
        externalName(kernel_name_);
        requireLocalID(3);
        requireLocalSize();
        requireGRF(regs_);
        requireSIMD(exec_cfg_.simd());
        requireBarrier();
        if (require_dpas_) requireDPAS();
        if (has_send_atomics(kernel_body)) requireGlobalAtomics();

        for (int i = 0; i < kernel_info_.nargs(); i++) {
            auto &name = kernel_info_.arg_name(i);
            auto &type = kernel_info_.arg_type(i);
            if (type.is_ptr()) {
                newArgument(name, ngen::ExternalArgumentType::GlobalPtr);
            } else {
                newArgument(name, to_ngen(type));
            }
        }

        if (!kernel_body.is_empty()) {
            int slm_size = alloc_manager_t(kernel_body)
                                   .total_size(alloc_kind_t::slm);
            requireSLM(slm_size);
        }

        finalizeInterface();
    }

    void generate_prologue() {
        setDefaultNoMask();
        setDefaultAutoSWSB(true);

        prologue();

        // Claim registers.
        ra_.claim(r0);
        for (int i = 0; i < 3; i++)
            ra_.claim(getLocalID(i));

        for (int i = 0; i < kernel_info_.nargs(); i++) {
            ra_.claim(getArgument(kernel_info_.arg_name(i)));
        }

        if (emu_strategy.emulate64) {
            emu_state.temp[0] = ra_.alloc();
            emu_state.temp[1] = ra_.alloc();
        }
        // Enable IEEE f32 -> s32 rounding and f32/f16 denormals.
        or_(1, cr0, cr0, uint16_t(0x1480));

        // Allocate and initialize the signal header for future use.
        if (require_signal_header_) {
            signal_header_ = ra_.alloc();
            barrierheader(signal_header_);
        }
    }

    void bind_external_vars(const stmt_t &kernel_body,
            const grid_info_t &kernel_grid,
            const std::array<expr_t, 3> &local_id,
            expr_binding_t &expr_binding) {
        alloc_manager_t alloc_mgr(kernel_body);

        // Bind grid indices; the thread group IDs live in r0 dwords 1, 6
        // and 7.
        int r0_sub_idxs[] = {1, 6, 7};
        for (int i = 0; i < 3; i++) {
            auto tmp = ra_.template alloc_sub<int32_t>();
            mov(1, tmp, r0.ud(r0_sub_idxs[i]));
            expr_binding.bind(kernel_grid.idx(i), tmp);
        }

        // Bind local IDs.
        for (int i = 0; i < 3; i++) {
            expr_binding.bind(local_id[i], getLocalID(i).uw(0));
        }

        // Bind arguments.
        for (int i = 0; i < kernel_info_.nargs(); i++) {
            auto &arg_var = kernel_info_.arg_var(i);
            auto &name = kernel_info_.arg_name(i);
            if (arg_var.type().is_ptr()) {
                auto alloc_buf = alloc_mgr.find_buffer(name);
                ir_assert(alloc_buf.is_same(arg_var));
            }
            expr_binding.bind(arg_var, getArgument(name));
        }

        // Bind the SLM buffer (SLM loads/stores use 0-based offsets).
        auto slm_buf = alloc_mgr.find_buffer("slm", /*allow_empty=*/true);
        if (!slm_buf.is_empty()) expr_binding.bind(slm_buf, to_ngen(expr_t(0)));
    }

    void generate_epilogue() {
        epilogue();
        pad_kernel();
    }

    // Pad the kernel with nops so instruction prefetch does not run past
    // the end of the binary.
    void pad_kernel() {
        for (int rep = 0; rep < 8; rep++)
            nop();
    }

    void emov(const ngen::InstructionModifier &mod, const ngen_operand_t &dst,
            const ngen_operand_t &src0) {
        if (dst.is_reg_data()) {
            if (src0.is_reg_data()) {
                emov(mod, dst.reg_data(), src0.reg_data());
            } else if (src0.is_reg_buf_data()) {
                emov(mod, dst.reg_data(), src0.reg_buf_data().reg_data());
            } else if (src0.is_immediate()) {
                emov(mod, dst.reg_data(), src0.immediate());
            } else if (dst.type() == ngen::DataType::uw) {
                emov(mod, dst.reg_data(), src0.flag_register());
            } else {
                // src0 is a flag register: materialize it as 0/1 via
                // predicated moves.
                emov(mod | src0.flag_register_mod(), dst.reg_data(), 1);
                emov(mod | ~src0.flag_register_mod(), dst.reg_data(), 0);
            }
        } else {
            // dst is a flag register.
            ir_assert(!dst.is_negated());
            auto _mod = mod;
            _mod.setExecSize(1);
            if (src0.is_reg_data()) {
                emov(_mod, dst.flag_register(), src0.reg_data());
            } else {
                emov(_mod, dst.flag_register(), src0.immediate());
            }
        }
    }

    void eadd(const ngen::InstructionModifier &mod, const ngen_operand_t &dst,
            const ngen_operand_t &src0, const ngen_operand_t &src1) {
        if (src0.is_immediate()) {
            ir_assert(src1.is_reg_data());
            eadd(mod, dst, src1, src0);
            return;
        }
        if (src1.is_reg_data()) {
            eadd(mod, dst.reg_data(), src0.reg_data(), src1.reg_data());
        } else {
            eadd(mod, dst.reg_data(), src0.reg_data(), src1.immediate());
        }
    }

    void emul(const ngen::InstructionModifier &mod, const ngen_operand_t &dst,
            const ngen_operand_t &src0, const ngen_operand_t &src1) {
        if (src0.is_immediate()) {
            ir_assert(src1.is_reg_data());
            emul(mod, dst, src1, src0);
            return;
        }
        if (src1.is_reg_data()) {
            // Keep the dword operand in src0: integer multiply expects the
            // word operand as src1.
            if (ngen_is_dw(src1.type()) && ngen_is_w(src0.type())) {
                emul(mod, dst.reg_data(), src1.reg_data(), src0.reg_data());
            } else {
                emul(mod, dst.reg_data(), src0.reg_data(), src1.reg_data());
            }
        } else {
            auto &src1_imm = src1.immediate();
            if (ngen_is_qw(dst.type()) || ngen_is_w(src1_imm.getType())) {
                emul(mod, dst.reg_data(), src0.reg_data(), src1.immediate());
                return;
            }
            if (ngen_is_dw(src1_imm.getType())) {
                ir_assert(mod.getExecSize() == 1);
                auto tmp = ra_.alloc_sub<int64_t>();
                if (ngen_is_w(src0.type())) {
                    auto tmp_src1 = ra_.alloc_sub<int32_t>();
                    emov(mod, tmp_src1.d(0), src0.reg_data());
                    emul(mod, tmp.q(0), tmp_src1.d(0), src1_imm);
                    ra_.safeRelease(tmp_src1);
                } else {
                    emul(mod, tmp.q(0), src0.reg_data(), src1_imm);
                }
                emov(mod, dst.reg_data(), tmp.reinterpret(0, dst.type()));
                ra_.safeRelease(tmp);
                return;
            }
            emul(mod, dst.reg_data(), src0.reg_data(), src1.immediate());
        }
    }

    void edp4a(const ngen::InstructionModifier &mod, const ngen_operand_t &dst,
            const ngen_operand_t &src0, const ngen_operand_t &src1,
            const ngen_operand_t &src2) {
        ir_assert(!src0.is_immediate() || !src2.is_immediate());
        if (src0.is_immediate()) {
            dp4a(mod, dst.reg_data(), src0.immediate(), src1.reg_data(),
                    src2.reg_data());
        } else if (src2.is_immediate()) {
            dp4a(mod, dst.reg_data(), src0.reg_data(), src1.reg_data(),
                    src2.immediate());
        } else {
            dp4a(mod, dst.reg_data(), src0.reg_data(), src1.reg_data(),
                    src2.reg_data());
        }
    }

    void eadd3(const ngen::InstructionModifier &mod, const ngen_operand_t &dst,
            const ngen_operand_t &src0, const ngen_operand_t &src1,
            const ngen_operand_t &src2) {
        // Native add3 is available on XeHP and above; otherwise emulate it
        // with two adds.
        if (hw >= ngen::HW::XeHP) {
            if (src2.is_reg_data()) {
                add3(mod, dst.reg_data(), src0.reg_data(), src1.reg_data(),
                        src2.reg_data());
            } else {
                add3(mod, dst.reg_data(), src0.reg_data(), src1.reg_data(),
                        src2.immediate());
            }
            return;
        }
        add(mod, dst.reg_data(), src0.reg_data(), src1.reg_data());
        if (src2.is_reg_data()) {
            add(mod, dst.reg_data(), dst.reg_data(), src2.reg_data());
        } else {
            add(mod, dst.reg_data(), dst.reg_data(), src2.immediate());
        }
    }

    void emad(const ngen::InstructionModifier &mod, const ngen_operand_t &dst,
            const ngen_operand_t &src0, const ngen_operand_t &src1,
            const ngen_operand_t &src2) {
        if (src2.is_reg_data()) {
            mad(mod, dst.reg_data(), src0.reg_data(), src1.reg_data(),
                    src2.reg_data());
        } else if (hw < ngen::HW::XeLP) {
            // mad with an immediate src2 is not available here; emulate it
            // with mul + add.
            mul(mod, dst.reg_data(), src1.reg_data(), src2.immediate());
            add(mod, dst.reg_data(), dst.reg_data(), src0.reg_data());
        } else if (src0.is_immediate()
                && (ngen_is_dw(src0.type())
                        || src0.type() == ngen::DataType::uw)) {
            // Dword immediate src0 is not supported, move it to a register.
            auto tmp_src0 = ra_.alloc_sub(src0.type());
            mov(1, tmp_src0, src0.immediate());
            mad(mod, dst.reg_data(), tmp_src0, src1.reg_data(),
                    src2.immediate());
            ra_.safeRelease(tmp_src0);
        } else {
            mad(mod, dst.reg_data(), src0.reg_data(), src1.reg_data(),
                    src2.immediate());
        }
    }

    void ediv(const ngen::InstructionModifier &mod, const ngen_operand_t &dst,
            const ngen_operand_t &src0, const ngen_operand_t &src1) {
        if (!src1.is_immediate()) {
            efdiv(mod, dst, src0, src1);
        } else {
            auto &src1_imm = src1.immediate();
            int32_t src1_value = to_cpp<int32_t>(src1_imm);
            ir_assert(0 < src1_value && src1_value <= INT32_MAX) << src1_value;
            eidiv(mod, dst.reg_data(), ngen::Subregister(), src0.reg_data(),
                    src1_value);
        }
    }

    void efdiv(const ngen::InstructionModifier &mod, const ngen_operand_t &dst,
            const ngen_operand_t &src0, const ngen_operand_t &src1) {
        ir_assert(!src1.is_immediate());
        auto one = ra_.alloc().f();
        auto zero = ra_.alloc().f();

        auto tmp = ra_.alloc_range(4);

        int esize = mod.getExecSize();
        int grf_size = ngen::GRF::bytes(hw);
        int div_esize = std::min(esize, grf_size / int(sizeof(float)));

        int tmp_regs = utils::div_up(esize * int(sizeof(float)), grf_size);
        auto src0_tmp = ra_.alloc_range(tmp_regs);
        auto src1_tmp = ra_.alloc_range(tmp_regs);

        // Copy to temporary registers to ensure dst, num and denom are
        // distinct as required for fdiv_ieee.
        mov(mod, src0_tmp[0].f(), src0.reg_data());
        mov(mod, src1_tmp[0].f(), src1.reg_data());

        auto div_mod = ngen::InstructionModifier(mod);
        div_mod.setExecSize(div_esize);

        mov(div_mod, one, ngen::Immediate(1));
        mov(div_mod, zero, ngen::Immediate(0));

        // Enable mask as fdiv_ieee relies on masked if/endif flow.
        setDefaultNoMask(false);

        for (int i = 0; i < mod.getExecSize(); i += div_esize) {
            fdiv_ieee(div_mod, f0[0], dst.sub_reg_data(i, div_esize).reg_data(),
                    src0_tmp[i / div_esize].f(), src1_tmp[i / div_esize].f(),
                    zero, one, tmp);
        }

        ra_.safeRelease(one);
        ra_.safeRelease(zero);
        ra_.safeRelease(src0_tmp);
        ra_.safeRelease(src1_tmp);
        ra_.safeRelease(tmp);

        setDefaultNoMask(true);
    }

    void emod(const ngen::InstructionModifier &mod, const ngen_operand_t &dst,
            const ngen_operand_t &src0, const ngen_operand_t &src1) {
        ir_assert(src1.is_immediate());
        auto &src1_imm = src1.immediate();
        int32_t src1_value = to_cpp<int32_t>(src1_imm);
        ir_assert(0 < src1_value && src1_value <= INT32_MAX) << src1_value;
        eidiv(mod, ngen::Subregister(), dst.reg_data(), src0.reg_data(),
                src1_value);
    }

    void eshl(const ngen::InstructionModifier &mod, const ngen_operand_t &dst,
            const ngen_operand_t &src0, const ngen_operand_t &src1) {
        if (src1.is_reg_data()) {
            shl(mod, dst.reg_data(), src0.reg_data(), src1.reg_data());
        } else {
            shl(mod, dst.reg_data(), src0.reg_data(), src1.immediate());
        }
    }

    void eshr(const ngen::InstructionModifier &mod, const ngen_operand_t &dst,
            const ngen_operand_t &src0, const ngen_operand_t &src1) {
        if (src1.is_reg_data()) {
            shr(mod, dst.reg_data(), src0.reg_data(), src1.reg_data());
        } else {
            shr(mod, dst.reg_data(), src0.reg_data(), src1.immediate());
        }
    }

    void emin(const ngen::InstructionModifier &mod, const ngen_operand_t &dst,
            const ngen_operand_t &src0, const ngen_operand_t &src1) {
        if (src1.is_reg_data()) {
            min_(mod, dst.reg_data(), src0.reg_data(), src1.reg_data());
        } else {
            min_(mod, dst.reg_data(), src0.reg_data(), src1.immediate());
        }
    }

    void emax(const ngen::InstructionModifier &mod, const ngen_operand_t &dst,
            const ngen_operand_t &src0, const ngen_operand_t &src1) {
        if (src1.is_reg_data()) {
            max_(mod, dst.reg_data(), src0.reg_data(), src1.reg_data());
        } else {
            max_(mod, dst.reg_data(), src0.reg_data(), src1.immediate());
        }
    }

    void ecmp(const ngen::InstructionModifier &mod, const ngen_operand_t &src0,
            const ngen_operand_t &src1) {
        if (src1.is_reg_data()) {
            cmp(mod, src0.reg_data(), src1.reg_data());
        } else {
            cmp(mod, src0.reg_data(), src1.immediate());
        }
    }

    void ecmp(const ngen::InstructionModifier &mod, const ngen_operand_t &dst,
            const ngen_operand_t &src0, const ngen_operand_t &src1) {
        if (src1.is_reg_data()) {
            cmp(mod, dst.reg_data(), src0.reg_data(), src1.reg_data());
        } else {
            cmp(mod, dst.reg_data(), src0.reg_data(), src1.immediate());
        }
    }

    void eand(const ngen::InstructionModifier &mod, const ngen_operand_t &dst,
            const ngen_operand_t &src0, const ngen_operand_t &src1) {
        if (src1.is_reg_data()) {
            and_(mod, dst.reg_data(), src0.reg_data(), src1.reg_data());
        } else {
            and_(mod, dst.reg_data(), src0.reg_data(), src1.immediate());
        }
    }

    // Adapted version of the magicgu function from Hacker's Delight 10-15.
    static void eidiv_magicgu(uint32_t d, uint32_t &m, uint32_t &p) {
        uint32_t s32_max = std::numeric_limits<int32_t>::max();
        ir_assert(d != 0 && d <= s32_max);
        uint64_t nc = (s32_max / d) * d - 1;
        for (p = 32; p < 64; p++) {
            uint64_t _2p = 1ULL << p; // Unsigned shift: p can reach 63.
            if (_2p > nc * (d - 1 - (_2p - 1) % d)) {
                m = (_2p + d - 1 - (_2p - 1) % d) / d;
                return;
            }
        }
        ir_error_not_expected();
    }
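
    // Worked example (illustrative): for d = 7 the search stops at p = 34
    // with m = 0x92492493, so x / 7 equals the high 32 bits of x * m
    // shifted right by (p - 32) = 2 - exactly the mul/mach/shr sequence
    // emitted by eidiv() below.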

    // Emulates integer division by a constant.
    // Requirements:
    //     0 <= x <= UINT32_MAX
    //     0 <  y <= INT32_MAX
    // Computes:
    //     qot = x / y
    //     rem = x % y
    void eidiv(const ngen::InstructionModifier &mod, const ngen::RegData &qot,
            const ngen::RegData &rem, const ngen::RegData &x, uint32_t y) {
        ir_assert(x.getHS() == 0);
        if (ngen::utils::is_zero_or_pow2(y)) {
            auto _x = get_subregister(x);
            if (x.getNeg()) {
                // Negation modifier has bitwise semantics with shr/and, so x
                // needs to be arithmetically negated first.
                _x = ra_.alloc_sub(x.getType());
                mov(1, _x, x);
            }
            if (!qot.isInvalid()) shr(mod, qot, _x, ngen::utils::log2(y));
            if (!rem.isInvalid()) and_(mod, rem, _x, y - 1);
            if (_x != x) ra_.safeRelease(_x);
            return;
        }

        uint32_t m = 0, p = 0;
        eidiv_magicgu(y, m, p);

        auto x_tmp = ra_.alloc().ud();
        auto qot_tmp = ra_.alloc().ud();
        auto _x = x_tmp[0];
        auto _qot = qot_tmp[0];
        mov(1, _x, x);

        // qot = (x * m) >> p, computed as a 32x32 -> 64-bit multiply: mul
        // seeds acc0 with the low partial product, mach yields the high 32
        // bits of x * m.
        mul(1, acc0.ud(0), _x, m & 0xFFFF);
        mach(1, _qot, _x, m);
        shr<uint32_t>(1, _qot, _qot, p - 32);
        if (!qot.isInvalid()) mov(mod, qot, _qot);

        if (!rem.isInvalid()) {
            // rem = x - qot * y
            bool y_is_16_bit = (y <= static_cast<uint32_t>(
                                        std::numeric_limits<int16_t>::max()));
            if (hw >= ngen::HW::XeLP && y_is_16_bit) {
                mad(mod, rem, x, _qot, -int16_t(y));
            } else {
                // Compute qot * y from 16-bit partial products to stay
                // within integer mul restrictions.
                auto tmp = ra_.alloc_sub<uint64_t>();
                mul(1, tmp.ud(0), _qot, y & 0xFFFF);
                mul(1, tmp.ud(1), _qot, y >> 16);
                shl<uint32_t>(1, tmp.ud(1), tmp.ud(1), 16);
                add(1, tmp.ud(0), tmp.ud(1), tmp.ud(0));
                add(mod, rem, x, -tmp.ud(0));
                ra_.safeRelease(tmp);
            }
        }

        ra_.safeRelease(x_tmp);
        ra_.safeRelease(qot_tmp);
    }
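
    // Usage sketch (illustrative; names are placeholders): dividing a GRF
    // subregister x by a compile-time constant without hardware division:
    //
    //   auto q = ra_.alloc_sub<uint32_t>();
    //   auto r = ra_.alloc_sub<uint32_t>();
    //   eidiv(1, q, r, x, 7); // q = x / 7, r = x % 7
    //
    // Passing an invalid register (ngen::Subregister()) as qot or rem skips
    // that output; ediv() and emod() above reuse this routine that way.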

    template <typename DT = void>
    void emov(const ngen::InstructionModifier &mod, ngen::RegData dst,
            ngen::RegData src0) {
        EmulationImplementation::emov<DT>(*this, mod, dst, src0, emu_strategy);
    }
    template <typename DT = void>
    void emov(const ngen::InstructionModifier &mod, ngen::RegData dst,
            ngen::Immediate src0) {
        EmulationImplementation::emov<DT>(*this, mod, dst, src0, emu_strategy);
    }
    template <typename DT = void>
    void eadd(const ngen::InstructionModifier &mod, const ngen::RegData &dst,
            const ngen::RegData &src0, const ngen::RegData &src1) {
        EmulationImplementation::eadd<DT>(
                *this, mod, dst, src0, src1, emu_strategy, emu_state);
    }
    template <typename DT = void>
    void eadd(const ngen::InstructionModifier &mod, const ngen::RegData &dst,
            const ngen::RegData &src0, ngen::Immediate src1) {
        EmulationImplementation::eadd<DT>(
                *this, mod, dst, src0, src1, emu_strategy, emu_state);
    }
    template <typename DT = void>
    void emul(const ngen::InstructionModifier &mod, const ngen::RegData &dst,
            const ngen::RegData &src0, const ngen::RegData &src1) {
        // Floating-point multiplies are native; no emulation needed.
        if (ngen_is_xf(dst.getType())) {
            mul(mod, dst, src0, src1);
            return;
        }
        EmulationImplementation::emul<DT>(
                *this, mod, dst, src0, src1, emu_strategy, emu_state);
    }
    template <typename DT = void>
    void emul(const ngen::InstructionModifier &mod, const ngen::RegData &dst,
            const ngen::RegData &src0, ngen::Immediate src1) {
        if (ngen_is_xf(dst.getType())) {
            mul(mod, dst, src0, src1);
            return;
        }
        EmulationImplementation::emul<DT>(
                *this, mod, dst, src0, src1, emu_strategy, emu_state);
    }
    template <typename DT = void>
    void eshl(const ngen::InstructionModifier &mod, ngen::RegData dst,
            ngen::RegData src0, uint16_t src1) {
        EmulationImplementation::eshl<DT>(
                *this, mod, dst, src0, src1, emu_strategy, emu_state);
    }
    template <typename DT = void>
    void eshr(const ngen::InstructionModifier &mod, ngen::RegData dst,
            ngen::RegData src0, uint16_t src1) {
        EmulationImplementation::eshr<DT>(
                *this, mod, dst, src0, src1, emu_strategy, emu_state);
    }

protected:
    std::string kernel_name_;
    exec_config_t exec_cfg_;
    kernel_info_t kernel_info_;
    bool require_dpas_;
    bool require_signal_header_ = false;
    int regs_;
    reg_allocator_t ra_;
    ngen::GRF signal_header_;

    EmulationStrategy emu_strategy;
    EmulationState emu_state;
};

#define IR_KERNEL_EMULATION_FORWARD(hw) \
    using ir_kernel_t<hw>::emov; \
    using ir_kernel_t<hw>::eadd; \
    using ir_kernel_t<hw>::emul; \
    using ir_kernel_t<hw>::eshl; \
    using ir_kernel_t<hw>::eshr;

#define IR_KERNEL_FORWARD(hw) \
    NGEN_FORWARD_OPENCL(hw) \
    IR_KERNEL_EMULATION_FORWARD(hw) \
    using ir_kernel_t<hw>::setup_interface; \
    using ir_kernel_t<hw>::bind_external_vars; \
    using ir_kernel_t<hw>::generate_prologue; \
    using ir_kernel_t<hw>::generate_epilogue; \
    using ir_kernel_t<hw>::emu_strategy; \
    using ir_kernel_t<hw>::ra_;

} // namespace jit
} // namespace gpu
} // namespace impl
} // namespace dnnl

#endif