1 | /******************************************************************************* |
2 | * Copyright 2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
17 | #ifndef GPU_JIT_CODEGEN_KERNEL_HPP |
18 | #define GPU_JIT_CODEGEN_KERNEL_HPP |
19 | |
#include <memory>

#include "common/cpp_compat.hpp"

#include "gpu/jit/codegen/operand.hpp"
#include "gpu/jit/codegen/register_allocator.hpp"
#include "gpu/jit/ir/ir.hpp"
#include "gpu/jit/ir/kernel_info.hpp"
#include "gpu/jit/ir/message.hpp"
#include "gpu/jit/ir/tensor.hpp"
#include "gpu/jit/jit_generator.hpp"
#include "gpu/jit/ngen/ngen.hpp"
#include "gpu/jit/ngen/ngen_register_allocator.hpp"

#include "gpu/jit/gemm/emulation.hpp"
33 | |
34 | namespace dnnl { |
35 | namespace impl { |
36 | namespace gpu { |
37 | namespace jit { |
38 | |
39 | inline size_t icache_size(ngen::HW arch) { |
40 | switch (arch) { |
41 | case gpu_gen9: return 48 * 1024; |
42 | case gpu_gen11: return 48 * 1024; |
43 | case gpu_xe_lp: return 48 * 1024; |
44 | case gpu_xe_hp: return 48 * 1024; |
45 | case gpu_xe_hpg: return 96 * 1024; |
46 | case gpu_xe_hpc: return 80 * 1024; |
47 | default: return 0; |
48 | } |
49 | } |
50 | |
51 | template <template <ngen::HW> class KernelT, ngen::HW arch, typename... ArgsT> |
52 | std::unique_ptr<jit::jit_generator_base> make_generator(ArgsT &&... args) { |
53 | |
54 | auto raw_kernel = new KernelT<arch>(std::forward<ArgsT>(args)...); |
55 | if (raw_kernel->getRootStreamLength() > icache_size(arch)) { |
56 | ir_warning() << raw_kernel->kernel_name() |
57 | << " larger than icache, kernel: " |
58 | << raw_kernel->getRootStreamLength() |
59 | << " bytes, icache: " << icache_size(arch) << " bytes\n" ; |
60 | } |
61 | return std::unique_ptr<jit::jit_generator_base>(raw_kernel); |
62 | } |
63 | |
// Creates a compute kernel from the KernelT generator template, dispatching
// on the engine's GPU architecture. Returns an empty kernel_t on failure.
template <template <ngen::HW> class KernelT, typename... ArgsT>
compute::kernel_t make_kernel(
        gpu_primitive_t *primitive, engine_t *engine, ArgsT &&... args) {
    using namespace compute;
    kernel_t kernel;

    // When a cache blob is available, the kernel binary is recreated from it
    // directly; no JIT generator is needed.
    if (primitive->cache_blob()) {
        status_t status = primitive->create_kernel(engine, &kernel, nullptr);
        if (status != status::success) return kernel_t();
        return kernel;
    }

    auto *compute_engine = utils::downcast<compute_engine_t *>(engine);
    auto *device_info = compute_engine->device_info();
    auto arch = convert_dnnl_arch_to_ngen(device_info->gpu_arch());

    // Instantiate the generator for the detected architecture. Each CASE is
    // guarded by the corresponding REG_*_ISA macro so unsupported ISAs are
    // compiled out; jit_kernel stays empty for unhandled architectures.
    std::unique_ptr<jit::jit_generator_base> jit_kernel;
#define CASE(gpu_arch) \
    case gpu_arch: \
        jit_kernel = make_generator<KernelT, gpu_arch>( \
                std::forward<ArgsT>(args)...); \
        break;
    switch (arch) {
        REG_GEN9_ISA(CASE(gpu_gen9));
        REG_GEN11_ISA(CASE(gpu_gen11));
        REG_XELP_ISA(CASE(gpu_xe_lp));
        REG_XEHP_ISA(CASE(gpu_xe_hp));
        REG_XEHPG_ISA(CASE(gpu_xe_hpg));
        REG_XEHPC_ISA(CASE(gpu_xe_hpc));
        default: break;
    }
#undef CASE

#ifdef GEN_CONV_DEBUG
    // Debug builds verify that the architecture selected above matches the
    // actual device architecture (emulating a different arch is not allowed).
    gpu_gen_t actual_arch = ngen::HW::Unknown;
    switch (device_info->gpu_arch()) {
        case gpu_arch_t::gen9: actual_arch = gpu_gen9; break;
        case gpu_arch_t::gen11: actual_arch = gpu_gen11; break;
        case gpu_arch_t::xe_lp: actual_arch = gpu_xe_lp; break;
        case gpu_arch_t::xe_hp: actual_arch = gpu_xe_hp; break;
        case gpu_arch_t::xe_hpg: actual_arch = gpu_xe_hpg; break;
        case gpu_arch_t::xe_hpc: actual_arch = gpu_xe_hpc; break;
        case gpu_arch_t::unknown: actual_arch = ngen::HW::Unknown; break;
    }
    ir_assert(actual_arch == arch)
            << "Cannot emulate executing gpu_arch environment" ;
#endif

    if (!jit_kernel) return kernel_t();

    status_t status
            = primitive->create_kernel(engine, &kernel, jit_kernel.get());
    if (status != status::success) return kernel_t();
    return kernel;
}
119 | |
120 | class expr_binding_t { |
121 | public: |
122 | expr_binding_t(ngen::HW hw) : hw_(hw) {} |
123 | |
124 | ~expr_binding_t() { |
125 | if (!cpp_compat::uncaught_exceptions()) { |
126 | ir_assert(expr2dst_.empty()) << "Detected missing unbind_dst()." ; |
127 | } |
128 | } |
129 | |
130 | bool is_dst_bound(const expr_t &expr) const { |
131 | return expr2dst_.count(expr) == 1; |
132 | } |
133 | |
134 | ngen_operand_t get_dst(const expr_t &expr) const { |
135 | ir_assert(is_dst_bound(expr)) << "Destination is not bound: " << expr; |
136 | return expr2dst_.at(expr); |
137 | } |
138 | |
139 | void bind_dst(const expr_t &expr, const ngen_operand_t &operand) { |
140 | ir_assert(!expr.is_empty()); |
141 | auto ret = expr2dst_.insert({expr, operand}); |
142 | ir_assert(ret.second) << "Already bound: " << expr; |
143 | } |
144 | |
145 | void unbind_dst(const expr_t &expr) { |
146 | ir_assert(!expr.is_empty()); |
147 | auto it = expr2dst_.find(expr); |
148 | ir_assert(it != expr2dst_.end()); |
149 | expr2dst_.erase(it); |
150 | } |
151 | |
152 | bool is_bound(const expr_t &expr) const { |
153 | return expr2operand_.count(expr) == 1; |
154 | } |
155 | |
156 | ngen_operand_t get(const expr_t &expr, bool allow_empty = false) const { |
157 | if (expr.is_empty()) return ngen_operand_t(); |
158 | if (!is_bound(expr)) { |
159 | if (!allow_empty) |
160 | ir_assert(false) << "Operand is not bound: " << expr; |
161 | return ngen_operand_t(); |
162 | } |
163 | return expr2operand_.at(expr); |
164 | } |
165 | |
166 | void bind(const expr_t &expr, const ngen::Subregister &sub) { |
167 | bind(expr, ngen_operand_t(reg_buf_data_t(hw_, sub))); |
168 | } |
169 | |
170 | void bind(const expr_t &expr, const ngen_operand_t &operand) { |
171 | if (is_dst_bound(expr)) unbind_dst(expr); |
172 | |
173 | auto op_to_bind = operand; |
174 | |
175 | // Operand is with predicate - can't bind. |
176 | if (operand.mod().getPredCtrl() != ngen::PredCtrl::None) return; |
177 | |
178 | int esize = operand.mod().getExecSize(); |
179 | if (esize == 0) esize = 1; |
180 | if (esize != expr.type().elems()) { |
181 | ir_assert(expr.type().is_scalar() || esize == 1) |
182 | << "Expected broadcast." ; |
183 | if (operand.is_reg_buf_data() && esize != 1) { |
184 | // Bind scalar expression to the first vector element. |
185 | op_to_bind = operand.reg_buf_data().format( |
186 | 0, ngen::DataType::invalid, 1); |
187 | } |
188 | } |
189 | |
190 | auto ret = expr2operand_.insert({expr, op_to_bind}); |
191 | ir_assert(ret.second) << "Already bound: " << expr; |
192 | } |
193 | |
194 | void unbind(const expr_t &expr) { |
195 | ir_assert(!expr.is_empty()); |
196 | |
197 | auto it = expr2operand_.find(expr); |
198 | ir_assert(it != expr2operand_.end()); |
199 | expr2operand_.erase(it); |
200 | } |
201 | |
202 | private: |
203 | ngen::HW hw_; |
204 | object_map_t<expr_t, ngen_operand_t> expr2dst_; |
205 | object_map_t<expr_t, ngen_operand_t> expr2operand_; |
206 | }; |
207 | |
208 | template <ngen::HW hw> |
209 | class expr_evaluator_t; |
210 | |
211 | template <ngen::HW hw> |
212 | class ir_to_ngen_t; |
213 | |
// Selects how many GRF registers the generated kernel is built with.
enum class grf_mode_t {
    any, // Kernel sets optimal grf mode
    matches, // Propagate grf mode to avoid context switch
    small, // Force small grf_mode
    large, // Force large grf_mode
};
220 | |
221 | template <ngen::HW hw> |
222 | class ir_kernel_t : public jit_generator<hw> { |
223 | public: |
224 | NGEN_FORWARD_OPENCL(hw); |
225 | |
226 | friend class expr_evaluator_t<hw>; |
227 | friend class ir_to_ngen_t<hw>; |
228 | friend class send_impl_t; |
229 | |
230 | ir_kernel_t(const std::string &kernel_name, const exec_config_t &exec_cfg, |
231 | const kernel_info_t &kernel_info, bool require_dpas, |
232 | grf_mode_t grf_mode = grf_mode_t::any) |
233 | : kernel_name_(kernel_name) |
234 | , exec_cfg_(exec_cfg) |
235 | , kernel_info_(kernel_info) |
236 | , require_dpas_(require_dpas) |
237 | , regs_((grf_mode == grf_mode_t::large) |
238 | ? 256 |
239 | : (grf_mode == grf_mode_t::small) ? 128 |
240 | : exec_cfg.regs()) |
241 | , ra_(hw, kernel_name, |
242 | grf_mode == grf_mode_t::any ? reg_allocator_t::warn_all |
243 | : reg_allocator_t::warn_default) |
244 | , emu_strategy(hw, exec_cfg.hw_cfg().stepping_id()) { |
245 | ra_.setRegisterCount(regs_); |
246 | } |
247 | |
248 | void setup_interface(const stmt_t &kernel_body = stmt_t()) { |
249 | externalName(kernel_name_); |
250 | requireLocalID(3); |
251 | requireLocalSize(); |
252 | requireGRF(regs_); |
253 | requireSIMD(exec_cfg_.simd()); |
254 | requireBarrier(); |
255 | if (require_dpas_) requireDPAS(); |
256 | if (has_send_atomics(kernel_body)) requireGlobalAtomics(); |
257 | |
258 | for (int i = 0; i < kernel_info_.nargs(); i++) { |
259 | auto &name = kernel_info_.arg_name(i); |
260 | auto &type = kernel_info_.arg_type(i); |
261 | if (type.is_ptr()) { |
262 | newArgument(name, ngen::ExternalArgumentType::GlobalPtr); |
263 | } else { |
264 | newArgument(name, to_ngen(type)); |
265 | } |
266 | } |
267 | |
268 | if (!kernel_body.is_empty()) { |
269 | int slm_size = alloc_manager_t(kernel_body) |
270 | .total_size(alloc_kind_t::slm); |
271 | requireSLM(slm_size); |
272 | } |
273 | |
274 | finalizeInterface(); |
275 | } |
276 | |
277 | void generate_prologue() { |
278 | setDefaultNoMask(); |
279 | setDefaultAutoSWSB(true); |
280 | |
281 | prologue(); |
282 | |
283 | // Claim registers. |
284 | ra_.claim(r0); |
285 | for (int i = 0; i < 3; i++) |
286 | ra_.claim(getLocalID(i)); |
287 | |
288 | for (int i = 0; i < kernel_info_.nargs(); i++) { |
289 | ra_.claim(getArgument(kernel_info_.arg_name(i))); |
290 | } |
291 | |
292 | if (emu_strategy.emulate64) { |
293 | emu_state.temp[0] = ra_.alloc(); |
294 | emu_state.temp[1] = ra_.alloc(); |
295 | } |
296 | // Enable IEEE f32 -> s32 rounding and f32/f16 denormals. |
297 | or_(1, cr0, cr0, uint16_t(0x1480)); |
298 | |
299 | // Allocate and initialize signal header for future use. |
300 | if (require_signal_header_) { |
301 | signal_header_ = ra_.alloc(); |
302 | barrierheader(signal_header_); |
303 | } |
304 | } |
305 | |
306 | void bind_external_vars(const stmt_t &kernel_body, |
307 | const grid_info_t &kernel_grid, |
308 | const std::array<expr_t, 3> &local_id, |
309 | expr_binding_t &expr_binding) { |
310 | alloc_manager_t alloc_mgr(kernel_body); |
311 | |
312 | // Bind grid indices. |
313 | int r0_sub_idxs[] = {1, 6, 7}; |
314 | for (int i = 0; i < 3; i++) { |
315 | auto tmp = ra_.template alloc_sub<int32_t>(); |
316 | mov(1, tmp, r0.ud(r0_sub_idxs[i])); |
317 | expr_binding.bind(kernel_grid.idx(i), tmp); |
318 | } |
319 | |
320 | // Bind local IDs. |
321 | for (int i = 0; i < 3; i++) { |
322 | expr_binding.bind(local_id[i], getLocalID(i).uw(0)); |
323 | } |
324 | |
325 | // Bind arguments. |
326 | for (int i = 0; i < kernel_info_.nargs(); i++) { |
327 | auto &arg_var = kernel_info_.arg_var(i); |
328 | auto &name = kernel_info_.arg_name(i); |
329 | if (arg_var.type().is_ptr()) { |
330 | auto alloc_buf = alloc_mgr.find_buffer(name); |
331 | ir_assert(alloc_buf.is_same(arg_var)); |
332 | } |
333 | expr_binding.bind(arg_var, getArgument(name)); |
334 | } |
335 | |
336 | // Bind SLM buffer (SLM loads/stores use 0-based offsets). |
337 | auto slm_buf = alloc_mgr.find_buffer("slm" , /*allow_empty=*/true); |
338 | if (!slm_buf.is_empty()) expr_binding.bind(slm_buf, to_ngen(expr_t(0))); |
339 | } |
340 | |
341 | void generate_epilogue() { |
342 | epilogue(); |
343 | pad_kernel(); |
344 | } |
345 | |
346 | // Kernel padding for instruction prefetch. |
347 | void pad_kernel() { |
348 | for (int rep = 0; rep < 8; rep++) |
349 | nop(); |
350 | } |
351 | |
352 | void emov(const ngen::InstructionModifier &mod, const ngen_operand_t &dst, |
353 | const ngen_operand_t &src0) { |
354 | if (dst.is_reg_data()) { |
355 | if (src0.is_reg_data()) { |
356 | emov(mod, dst.reg_data(), src0.reg_data()); |
357 | } else if (src0.is_reg_buf_data()) { |
358 | emov(mod, dst.reg_data(), src0.reg_buf_data().reg_data()); |
359 | } else if (src0.is_immediate()) { |
360 | emov(mod, dst.reg_data(), src0.immediate()); |
361 | } else if (dst.type() == ngen::DataType::uw) { |
362 | emov(mod, dst.reg_data(), src0.flag_register()); |
363 | } else { |
364 | emov(mod | src0.flag_register_mod(), dst.reg_data(), 1); |
365 | emov(mod | ~src0.flag_register_mod(), dst.reg_data(), 0); |
366 | } |
367 | } else { |
368 | // dst is a flag register. |
369 | ir_assert(!dst.is_negated()); |
370 | auto _mod = mod; |
371 | _mod.setExecSize(1); |
372 | if (src0.is_reg_data()) { |
373 | emov(_mod, dst.flag_register(), src0.reg_data()); |
374 | } else { |
375 | emov(_mod, dst.flag_register(), src0.immediate()); |
376 | } |
377 | } |
378 | } |
379 | |
380 | void eadd(const ngen::InstructionModifier &mod, const ngen_operand_t &dst, |
381 | const ngen_operand_t &src0, const ngen_operand_t &src1) { |
382 | if (src0.is_immediate()) { |
383 | ir_assert(src1.is_reg_data()); |
384 | eadd(mod, dst, src1, src0); |
385 | return; |
386 | } |
387 | if (src1.is_reg_data()) { |
388 | eadd(mod, dst.reg_data(), src0.reg_data(), src1.reg_data()); |
389 | } else { |
390 | eadd(mod, dst.reg_data(), src0.reg_data(), src1.immediate()); |
391 | } |
392 | } |
393 | |
394 | void emul(const ngen::InstructionModifier &mod, const ngen_operand_t &dst, |
395 | const ngen_operand_t &src0, const ngen_operand_t &src1) { |
396 | if (src0.is_immediate()) { |
397 | ir_assert(src1.is_reg_data()); |
398 | emul(mod, dst, src1, src0); |
399 | return; |
400 | } |
401 | if (src1.is_reg_data()) { |
402 | if (ngen_is_dw(src1.type()) && ngen_is_w(src0.type())) { |
403 | emul(mod, dst.reg_data(), src1.reg_data(), src0.reg_data()); |
404 | } else { |
405 | emul(mod, dst.reg_data(), src0.reg_data(), src1.reg_data()); |
406 | } |
407 | } else { |
408 | auto &src1_imm = src1.immediate(); |
409 | if (ngen_is_qw(dst.type()) || ngen_is_w(src1_imm.getType())) { |
410 | emul(mod, dst.reg_data(), src0.reg_data(), src1.immediate()); |
411 | return; |
412 | } |
413 | if (ngen_is_dw(src1_imm.getType())) { |
414 | ir_assert(mod.getExecSize() == 1); |
415 | auto tmp = ra_.alloc_sub<int64_t>(); |
416 | if (ngen_is_w(src0.type())) { |
417 | auto tmp_src1 = ra_.alloc_sub<int32_t>(); |
418 | emov(mod, tmp_src1.d(0), src0.reg_data()); |
419 | emul(mod, tmp.q(0), tmp_src1.d(0), src1_imm); |
420 | ra_.safeRelease(tmp_src1); |
421 | } else { |
422 | emul(mod, tmp.q(0), src0.reg_data(), src1_imm); |
423 | } |
424 | emov(mod, dst.reg_data(), tmp.reinterpret(0, dst.type())); |
425 | ra_.safeRelease(tmp); |
426 | return; |
427 | } |
428 | emul(mod, dst.reg_data(), src0.reg_data(), src1.immediate()); |
429 | } |
430 | } |
431 | |
432 | void edp4a(const ngen::InstructionModifier &mod, const ngen_operand_t &dst, |
433 | const ngen_operand_t &src0, const ngen_operand_t &src1, |
434 | const ngen_operand_t &src2) { |
435 | ir_assert(!src0.is_immediate() || !src2.is_immediate()); |
436 | if (src0.is_immediate()) { |
437 | dp4a(mod, dst.reg_data(), src0.immediate(), src1.reg_data(), |
438 | src2.reg_data()); |
439 | } else if (src2.is_immediate()) { |
440 | dp4a(mod, dst.reg_data(), src0.reg_data(), src1.reg_data(), |
441 | src2.immediate()); |
442 | } else { |
443 | dp4a(mod, dst.reg_data(), src0.reg_data(), src1.reg_data(), |
444 | src2.reg_data()); |
445 | } |
446 | } |
447 | |
448 | void eadd3(const ngen::InstructionModifier &mod, const ngen_operand_t &dst, |
449 | const ngen_operand_t &src0, const ngen_operand_t &src1, |
450 | const ngen_operand_t &src2) { |
451 | if (hw >= ngen::HW::XeHP) { |
452 | if (src2.is_reg_data()) { |
453 | add3(mod, dst.reg_data(), src0.reg_data(), src1.reg_data(), |
454 | src2.reg_data()); |
455 | } else { |
456 | add3(mod, dst.reg_data(), src0.reg_data(), src1.reg_data(), |
457 | src2.immediate()); |
458 | } |
459 | return; |
460 | } |
461 | add(mod, dst.reg_data(), src0.reg_data(), src1.reg_data()); |
462 | if (src2.is_reg_data()) { |
463 | add(mod, dst.reg_data(), dst.reg_data(), src2.reg_data()); |
464 | } else { |
465 | add(mod, dst.reg_data(), dst.reg_data(), src2.immediate()); |
466 | } |
467 | } |
468 | |
469 | void emad(const ngen::InstructionModifier &mod, const ngen_operand_t &dst, |
470 | const ngen_operand_t &src0, const ngen_operand_t &src1, |
471 | const ngen_operand_t &src2) { |
472 | if (src2.is_reg_data()) { |
473 | mad(mod, dst.reg_data(), src0.reg_data(), src1.reg_data(), |
474 | src2.reg_data()); |
475 | } else if (hw < ngen::HW::XeLP) { |
476 | mul(mod, dst.reg_data(), src1.reg_data(), src2.immediate()); |
477 | add(mod, dst.reg_data(), dst.reg_data(), src0.reg_data()); |
478 | } else if (src0.is_immediate() |
479 | && (ngen_is_dw(src0.type()) |
480 | || src0.type() == ngen::DataType::uw)) { |
481 | // dword immediate src0 is not supported, move to a register. |
482 | auto tmp_src0 = ra_.alloc_sub(src0.type()); |
483 | mov(1, tmp_src0, src0.immediate()); |
484 | mad(mod, dst.reg_data(), tmp_src0, src1.reg_data(), |
485 | src2.immediate()); |
486 | ra_.safeRelease(tmp_src0); |
487 | } else { |
488 | mad(mod, dst.reg_data(), src0.reg_data(), src1.reg_data(), |
489 | src2.immediate()); |
490 | } |
491 | } |
492 | |
493 | void ediv(const ngen::InstructionModifier &mod, const ngen_operand_t &dst, |
494 | const ngen_operand_t &src0, const ngen_operand_t &src1) { |
495 | if (!src1.is_immediate()) { |
496 | efdiv(mod, dst, src0, src1); |
497 | } else { |
498 | auto &src1_imm = src1.immediate(); |
499 | int32_t src1_value = to_cpp<int32_t>(src1_imm); |
500 | ir_assert(0 < src1_value && src1_value <= INT32_MAX) << src1_value; |
501 | eidiv(mod, dst.reg_data(), ngen::Subregister(), src0.reg_data(), |
502 | src1_value); |
503 | } |
504 | } |
505 | |
506 | void efdiv(const ngen::InstructionModifier &mod, const ngen_operand_t &dst, |
507 | const ngen_operand_t &src0, const ngen_operand_t &src1) { |
508 | ir_assert(!src1.is_immediate()); |
509 | auto one = ra_.alloc().f(); |
510 | auto zero = ra_.alloc().f(); |
511 | |
512 | auto tmp = ra_.alloc_range(4); |
513 | |
514 | int esize = mod.getExecSize(); |
515 | int grf_size = ngen::GRF::bytes(hw); |
516 | int div_esize = std::min(esize, grf_size / int(sizeof(float))); |
517 | |
518 | int tmp_regs = utils::div_up(esize * int(sizeof(float)), grf_size); |
519 | auto src0_tmp = ra_.alloc_range(tmp_regs); |
520 | auto src1_tmp = ra_.alloc_range(tmp_regs); |
521 | |
522 | // Copy to temporary registers to ensure dst, num and denom are |
523 | // distinct as required for fdiv_ieee. |
524 | mov(mod, src0_tmp[0].f(), src0.reg_data()); |
525 | mov(mod, src1_tmp[0].f(), src1.reg_data()); |
526 | |
527 | auto div_mod = ngen::InstructionModifier(mod); |
528 | div_mod.setExecSize(div_esize); |
529 | |
530 | mov(div_mod, one, ngen::Immediate(1)); |
531 | mov(div_mod, zero, ngen::Immediate(0)); |
532 | |
533 | // Enable mask as fdiv_ieee relies on masked if/endif flow. |
534 | setDefaultNoMask(false); |
535 | |
536 | for (int i = 0; i < mod.getExecSize(); i += div_esize) { |
537 | fdiv_ieee(div_mod, f0[0], dst.sub_reg_data(i, div_esize).reg_data(), |
538 | src0_tmp[i / div_esize].f(), src1_tmp[i / div_esize].f(), |
539 | zero, one, tmp); |
540 | } |
541 | |
542 | ra_.safeRelease(one); |
543 | ra_.safeRelease(zero); |
544 | ra_.safeRelease(src0_tmp); |
545 | ra_.safeRelease(src1_tmp); |
546 | ra_.safeRelease(tmp); |
547 | |
548 | setDefaultNoMask(true); |
549 | } |
550 | |
551 | void emod(const ngen::InstructionModifier &mod, const ngen_operand_t &dst, |
552 | const ngen_operand_t &src0, const ngen_operand_t &src1) { |
553 | ir_assert(src1.is_immediate()); |
554 | auto &src1_imm = src1.immediate(); |
555 | int32_t src1_value = to_cpp<int32_t>(src1_imm); |
556 | ir_assert(0 < src1_value && src1_value <= INT32_MAX) << src1_value; |
557 | eidiv(mod, ngen::Subregister(), dst.reg_data(), src0.reg_data(), |
558 | src1_value); |
559 | } |
560 | |
561 | void eshl(const ngen::InstructionModifier &mod, const ngen_operand_t &dst, |
562 | const ngen_operand_t &src0, const ngen_operand_t &src1) { |
563 | if (src1.is_reg_data()) { |
564 | shl(mod, dst.reg_data(), src0.reg_data(), src1.reg_data()); |
565 | } else { |
566 | shl(mod, dst.reg_data(), src0.reg_data(), src1.immediate()); |
567 | } |
568 | } |
569 | |
570 | void eshr(const ngen::InstructionModifier &mod, const ngen_operand_t &dst, |
571 | const ngen_operand_t &src0, const ngen_operand_t &src1) { |
572 | if (src1.is_reg_data()) { |
573 | shr(mod, dst.reg_data(), src0.reg_data(), src1.reg_data()); |
574 | } else { |
575 | shr(mod, dst.reg_data(), src0.reg_data(), src1.immediate()); |
576 | } |
577 | } |
578 | |
579 | void emin(const ngen::InstructionModifier &mod, const ngen_operand_t &dst, |
580 | const ngen_operand_t &src0, const ngen_operand_t &src1) { |
581 | if (src1.is_reg_data()) { |
582 | min_(mod, dst.reg_data(), src0.reg_data(), src1.reg_data()); |
583 | } else { |
584 | min_(mod, dst.reg_data(), src0.reg_data(), src1.immediate()); |
585 | } |
586 | } |
587 | |
588 | void emax(const ngen::InstructionModifier &mod, const ngen_operand_t &dst, |
589 | const ngen_operand_t &src0, const ngen_operand_t &src1) { |
590 | if (src1.is_reg_data()) { |
591 | max_(mod, dst.reg_data(), src0.reg_data(), src1.reg_data()); |
592 | } else { |
593 | max_(mod, dst.reg_data(), src0.reg_data(), src1.immediate()); |
594 | } |
595 | } |
596 | |
597 | void ecmp(const ngen::InstructionModifier &mod, const ngen_operand_t &src0, |
598 | const ngen_operand_t &src1) { |
599 | if (src1.is_reg_data()) { |
600 | cmp(mod, src0.reg_data(), src1.reg_data()); |
601 | } else { |
602 | cmp(mod, src0.reg_data(), src1.immediate()); |
603 | } |
604 | } |
605 | |
606 | void ecmp(const ngen::InstructionModifier &mod, const ngen_operand_t &dst, |
607 | const ngen_operand_t &src0, const ngen_operand_t &src1) { |
608 | if (src1.is_reg_data()) { |
609 | cmp(mod, dst.reg_data(), src0.reg_data(), src1.reg_data()); |
610 | } else { |
611 | cmp(mod, dst.reg_data(), src0.reg_data(), src1.immediate()); |
612 | } |
613 | } |
614 | |
615 | void eand(const ngen::InstructionModifier &mod, const ngen_operand_t &dst, |
616 | const ngen_operand_t &src0, const ngen_operand_t &src1) { |
617 | if (src1.is_reg_data()) { |
618 | and_(mod, dst.reg_data(), src0.reg_data(), src1.reg_data()); |
619 | } else { |
620 | and_(mod, dst.reg_data(), src0.reg_data(), src1.immediate()); |
621 | } |
622 | } |
623 | |
624 | // Adapted version of magicgu function from Hacker's Delight 10-15. |
625 | static void eidiv_magicgu(uint32_t d, uint32_t &m, uint32_t &p) { |
626 | uint32_t s32_max = std::numeric_limits<int32_t>::max(); |
627 | ir_assert(d != 0 && d <= s32_max); |
628 | uint64_t nc = (s32_max / d) * d - 1; |
629 | for (p = 32; p < 64; p++) { |
630 | uint64_t _2p = 1LL << p; |
631 | if (_2p > nc * (d - 1 - (_2p - 1) % d)) { |
632 | m = (_2p + d - 1 - (_2p - 1) % d) / d; |
633 | return; |
634 | } |
635 | } |
636 | ir_error_not_expected(); |
637 | } |
638 | |
639 | // Emulates integer division by a constant. |
640 | // Requirements: |
641 | // 0 <= x <= UINT32_MAX |
642 | // 0 < y <= INT32_MAX |
643 | // Computes: |
644 | // qot = x / y |
645 | // rem = x % y |
646 | void eidiv(const ngen::InstructionModifier &mod, const ngen::RegData &qot, |
647 | const ngen::RegData &rem, const ngen::RegData &x, uint32_t y) { |
648 | ir_assert(x.getHS() == 0); |
649 | if (ngen::utils::is_zero_or_pow2(y)) { |
650 | auto _x = get_subregister(x); |
651 | if (x.getNeg()) { |
652 | // Negation modifier has bitwise semantics with shr/and so x |
653 | // needs to be arithmetically negated first. |
654 | _x = ra_.alloc_sub(x.getType()); |
655 | mov(1, _x, x); |
656 | } |
657 | if (!qot.isInvalid()) shr(mod, qot, _x, ngen::utils::log2(y)); |
658 | if (!rem.isInvalid()) and_(mod, rem, _x, y - 1); |
659 | if (_x != x) ra_.safeRelease(_x); |
660 | return; |
661 | } |
662 | |
663 | uint32_t m = 0, p = 0; |
664 | eidiv_magicgu(y, m, p); |
665 | |
666 | auto x_tmp = ra_.alloc().ud(); |
667 | auto qot_tmp = ra_.alloc().ud(); |
668 | auto _x = x_tmp[0]; |
669 | auto _qot = qot_tmp[0]; |
670 | mov(1, _x, x); |
671 | |
672 | // qot = (x * m) >> p |
673 | mul(1, acc0.ud(0), _x, m & 0xFFFF); |
674 | mach(1, _qot, _x, m); |
675 | shr<uint32_t>(1, _qot, _qot, p - 32); |
676 | if (!qot.isInvalid()) mov(mod, qot, _qot); |
677 | |
678 | if (!rem.isInvalid()) { |
679 | // rem = x - qot * y |
680 | bool y_is_16_bit = (y <= static_cast<uint32_t>( |
681 | std::numeric_limits<int16_t>::max())); |
682 | if (hw >= ngen::HW::XeLP && y_is_16_bit) { |
683 | mad(mod, rem, x, _qot, -int16_t(y)); |
684 | } else { |
685 | auto tmp = ra_.alloc_sub<uint64_t>(); |
686 | mul(1, tmp.ud(0), _qot, y & 0xFFFF); |
687 | mul(1, tmp.ud(1), _qot, y >> 16); |
688 | shl<uint32_t>(1, tmp.ud(1), tmp.ud(1), 16); |
689 | add(1, tmp.ud(0), tmp.ud(1), tmp.ud(0)); |
690 | add(mod, rem, x, -tmp.ud(0)); |
691 | ra_.safeRelease(tmp); |
692 | } |
693 | } |
694 | |
695 | ra_.safeRelease(x_tmp); |
696 | ra_.safeRelease(qot_tmp); |
697 | } |
698 | |
699 | template <typename DT = void> |
700 | void emov(const ngen::InstructionModifier &mod, ngen::RegData dst, |
701 | ngen::RegData src0) { |
702 | EmulationImplementation::emov<DT>(*this, mod, dst, src0, emu_strategy); |
703 | } |
704 | template <typename DT = void> |
705 | void emov(const ngen::InstructionModifier &mod, ngen::RegData dst, |
706 | ngen::Immediate src0) { |
707 | EmulationImplementation::emov<DT>(*this, mod, dst, src0, emu_strategy); |
708 | } |
709 | template <typename DT = void> |
710 | void eadd(const ngen::InstructionModifier &mod, const ngen::RegData &dst, |
711 | const ngen::RegData &src0, const ngen::RegData &src1) { |
712 | EmulationImplementation::eadd<DT>( |
713 | *this, mod, dst, src0, src1, emu_strategy, emu_state); |
714 | } |
715 | template <typename DT = void> |
716 | void eadd(const ngen::InstructionModifier &mod, const ngen::RegData &dst, |
717 | const ngen::RegData &src0, ngen::Immediate src1) { |
718 | EmulationImplementation::eadd<DT>( |
719 | *this, mod, dst, src0, src1, emu_strategy, emu_state); |
720 | } |
721 | template <typename DT = void> |
722 | void emul(const ngen::InstructionModifier &mod, const ngen::RegData &dst, |
723 | const ngen::RegData &src0, const ngen::RegData &src1) { |
724 | if (ngen_is_xf(dst.getType())) { |
725 | mul(mod, dst, src0, src1); |
726 | return; |
727 | } |
728 | EmulationImplementation::emul<DT>( |
729 | *this, mod, dst, src0, src1, emu_strategy, emu_state); |
730 | } |
731 | template <typename DT = void> |
732 | void emul(const ngen::InstructionModifier &mod, const ngen::RegData &dst, |
733 | const ngen::RegData &src0, ngen::Immediate src1) { |
734 | if (ngen_is_xf(dst.getType())) { |
735 | mul(mod, dst, src0, src1); |
736 | return; |
737 | } |
738 | EmulationImplementation::emul<DT>( |
739 | *this, mod, dst, src0, src1, emu_strategy, emu_state); |
740 | } |
741 | template <typename DT = void> |
742 | void eshl(const ngen::InstructionModifier &mod, ngen::RegData dst, |
743 | ngen::RegData src0, uint16_t src1) { |
744 | EmulationImplementation::eshl<DT>( |
745 | *this, mod, dst, src0, src1, emu_strategy, emu_state); |
746 | } |
747 | template <typename DT = void> |
748 | void eshr(const ngen::InstructionModifier &mod, ngen::RegData dst, |
749 | ngen::RegData src0, uint16_t src1) { |
750 | EmulationImplementation::eshr<DT>( |
751 | *this, mod, dst, src0, src1, emu_strategy, emu_state); |
752 | } |
753 | |
754 | protected: |
755 | std::string kernel_name_; |
756 | exec_config_t exec_cfg_; |
757 | kernel_info_t kernel_info_; |
758 | bool require_dpas_; |
759 | bool = false; |
760 | int regs_; |
761 | reg_allocator_t ra_; |
762 | ngen::GRF ; |
763 | |
764 | EmulationStrategy emu_strategy; |
765 | EmulationState emu_state; |
766 | }; |
767 | |
// Re-exports the emulation arithmetic helpers from ir_kernel_t into a
// derived kernel class (needed because the derived class's own overloads
// would otherwise hide the base-class templates).
#define IR_KERNEL_EMULATION_FORWARD(hw) \
    using ir_kernel_t<hw>::emov; \
    using ir_kernel_t<hw>::eadd; \
    using ir_kernel_t<hw>::emul; \
    using ir_kernel_t<hw>::eshl; \
    using ir_kernel_t<hw>::eshr;

// Re-exports the full ir_kernel_t generation API (interface setup, variable
// binding, prologue/epilogue, emulation helpers, register allocator) into a
// derived kernel class.
#define IR_KERNEL_FORWARD(hw) \
    NGEN_FORWARD_OPENCL(hw) \
    IR_KERNEL_EMULATION_FORWARD(hw) \
    using ir_kernel_t<hw>::setup_interface; \
    using ir_kernel_t<hw>::bind_external_vars; \
    using ir_kernel_t<hw>::generate_prologue; \
    using ir_kernel_t<hw>::generate_epilogue; \
    using ir_kernel_t<hw>::emu_strategy; \
    using ir_kernel_t<hw>::ra_;
784 | |
785 | } // namespace jit |
786 | } // namespace gpu |
787 | } // namespace impl |
788 | } // namespace dnnl |
789 | |
790 | #endif |
791 | |