1/*******************************************************************************
2* Copyright 2019-2022 Intel Corporation
3*
4* Licensed under the Apache License, Version 2.0 (the "License");
5* you may not use this file except in compliance with the License.
6* You may obtain a copy of the License at
7*
8* http://www.apache.org/licenses/LICENSE-2.0
9*
10* Unless required by applicable law or agreed to in writing, software
11* distributed under the License is distributed on an "AS IS" BASIS,
12* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13* See the License for the specific language governing permissions and
14* limitations under the License.
15*******************************************************************************/
16
17#ifndef GPU_JIT_JIT_GENERATOR_HPP
18#define GPU_JIT_JIT_GENERATOR_HPP
19
#include <cstring>
#include <memory>

#include "common/bfloat16.hpp"
#include "common/float16.hpp"
#include "common/nstl.hpp"
#include "gpu/jit/jit_generator_base.hpp"
#include "oneapi/dnnl/dnnl_config.h"
27
28namespace ngen {
29using bfloat16 = dnnl::impl::bfloat16_t;
30using half = dnnl::impl::float16_t;
31} // namespace ngen
32
33#define NGEN_BFLOAT16_TYPE
34#define NGEN_HALF_TYPE
35
36#include "gpu/jit/ngen/ngen_opencl.hpp"
37
38namespace dnnl {
39namespace impl {
40namespace gpu {
41namespace jit {
42
43using gpu_gen_t = ngen::HW;
44constexpr gpu_gen_t gpu_gen9 = ngen::HW::Gen9;
45constexpr gpu_gen_t gpu_gen11 = ngen::HW::Gen11;
46constexpr gpu_gen_t gpu_xe_lp = ngen::HW::XeLP;
47constexpr gpu_gen_t gpu_xe_hp = ngen::HW::XeHP;
48constexpr gpu_gen_t gpu_xe_hpg = ngen::HW::XeHPG;
49constexpr gpu_gen_t gpu_xe_hpc = ngen::HW::XeHPC;
50
51// nGEN jit generator
52//
53// The main purpose of this header file is to provide extra features for nGEN
54// kernel generator, e.g. additional macros and debugging capabilities.
55//
56// Jit generator provides additional memory to simplify kernel debugging. This
57// memory is allocated using Shared Virtual Memory (SVM) feature in OpenCL 2.0.
58// SVM enables the host and device portions of an OpenCL application to
59// seamlessly share pointers and complex pointer-containing data-structures.
60// This memory can be used to dump state of GPU registers or view GPU memory on
61// the host in debugger.
62//
63// In order to use debug memory:
64// 1. Allocate it using 'void jit_generator::dbg_alloc(cl_context context)'
65// 2. Get memory pointer using 'void* jit_generator::dbg_memory()'
// 3. Pass it as an extra OpenCL kernel argument and declare it as a new
// argument in the kernel interface at the corresponding position.
68// 4. Set a breakpoint after 'dnnl_stream_wait()', memory will be available on
69// the host side after kernel execution.
70//
71// A short example below demonstrates how to use debug memory:
72//
73// ``` c++
74// status_t primitive_impl_t::execute(const exec_ctx_t &ctx) {
75// ...
76// auto gpu_engine = utils::downcast<ocl_gpu_engine*>(engine);
77// jit_generator->dbg_alloc(gpu_engine->context());
78// void* dbg_mem = jit_generator->dbg_memory();
79// ...
80// compute::kernel_arg_list_t arg_list;
81// arg_list.set(0, src);
82// arg_list.set(1, dst);
83// arg_list.set(2, dbg_mem, kernel_arg_t::kind_t::svm);
84// ...
85// parallel_for(ctx, nd_range, kernel_, arg_list);
86// }
87//
88// ngen_kernel_t() : jit_generator<...>() {
89// externalName("ngen_kernel");
90// newArgument("src", GlobalPtr);
91// newArgument("dst", GlobalPtr);
92// newArgument("dbg_mem", GlobalPtr);
93// finalizeInterface();
94// ...
95// auto header = r32;
96// auto data = r64;
// mov<uint64_t>(1, header, getArgument("dbg_mem"));
98// store(1, scattered_dword(), A64, header, data);
99// ...
100// }
101// ```
102//
103
104template <gpu_gen_t hw>
105struct jit_eltwise_injector_f32;
106
107template <gpu_gen_t hw>
108struct jit_post_op_injector;
109
110template <gpu_gen_t hw>
111class jit_generator : public ngen::OpenCLCodeGenerator<hw>,
112 public jit_generator_base {
113 friend struct jit_eltwise_injector_f32<hw>;
114
115 friend struct jit_post_op_injector<hw>;
116
117private:
118#ifdef CL_VERSION_2_0
119 struct svm_deleter {
120 cl_context context_;
121
122 void operator()(void *ptr) noexcept {
123 if (ptr) clSVMFree(context_, ptr);
124 }
125 };
126 std::unique_ptr<void, svm_deleter> dbg_memory_;
127#endif
128
129public:
130 jit_generator() = default;
131
132 const char *kernel_name() const override {
133 return ngen::OpenCLCodeGenerator<hw>::getExternalName().c_str();
134 }
135
136 cl_kernel get_kernel(cl_context context, cl_device_id device) override {
137 return ngen::OpenCLCodeGenerator<hw>::getKernel(context, device);
138 }
139
140#ifdef CL_VERSION_2_0
141 void dbg_alloc(cl_context context);
142 void *dbg_memory() const { return dbg_memory_.get(); }
143#endif
144
145 void emath(ngen::MathFunction fc, int simd, ngen::GRF dst, ngen::GRF src) {
146 const int max_exec_size = ngen::GRF::bytes(hw) / sizeof(float);
147 for (; simd > 0; simd -= max_exec_size, dst++, src++)
148 this->math(nstl::min(simd, max_exec_size), fc, dst, src);
149 }
150 void eexp(int simd, const ngen::GRF &dst, const ngen::GRF &src) {
151 emath(ngen::MathFunction::exp, simd, dst, src);
152 }
153 void einv(int simd, const ngen::GRF &dst, const ngen::GRF &src) {
154 emath(ngen::MathFunction::inv, simd, dst, src);
155 }
156};
157
#ifdef CL_VERSION_2_0
// Allocates a 1 MiB fine-grained SVM buffer in `context` and stores it as the
// generator's debug memory, replacing any buffer held before.
//
// If the allocation fails, the debug memory is left null and dbg_memory()
// returns nullptr; callers should check for that before passing the pointer
// to a kernel.
template <gpu_gen_t hw>
void jit_generator<hw>::dbg_alloc(cl_context context) {
    constexpr size_t size = 1048576;
    void *mem = clSVMAlloc(
            context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, size, 0);
    // Take ownership right away. The move assignment frees a previously held
    // buffer with its own deleter before the new deleter is installed.
    dbg_memory_ = decltype(dbg_memory_)(mem, svm_deleter {context});
    // clSVMAlloc() returns NULL on failure (e.g. when fine-grained SVM
    // buffers are not supported), so the fill must be guarded.
    if (mem == nullptr) return;
    // Pre-fill with 0xcd bytes; presumably so that bytes the kernel never
    // wrote are easy to spot when inspecting the buffer.
    std::memset(mem, 0xcd, size);
}
#endif
168
169} // namespace jit
170} // namespace gpu
171} // namespace impl
172} // namespace dnnl
173
174#endif // GPU_JIT_JIT_GENERATOR_HPP
175