1 | /******************************************************************************* |
2 | * Copyright 2019-2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
17 | #ifndef GPU_JIT_JIT_GENERATOR_HPP |
18 | #define GPU_JIT_JIT_GENERATOR_HPP |
19 | |
20 | #include <memory> |
21 | |
22 | #include "common/bfloat16.hpp" |
23 | #include "common/float16.hpp" |
24 | #include "common/nstl.hpp" |
25 | #include "gpu/jit/jit_generator_base.hpp" |
26 | #include "oneapi/dnnl/dnnl_config.h" |
27 | |
28 | namespace ngen { |
29 | using bfloat16 = dnnl::impl::bfloat16_t; |
30 | using half = dnnl::impl::float16_t; |
31 | } // namespace ngen |
32 | |
33 | #define NGEN_BFLOAT16_TYPE |
34 | #define NGEN_HALF_TYPE |
35 | |
36 | #include "gpu/jit/ngen/ngen_opencl.hpp" |
37 | |
38 | namespace dnnl { |
39 | namespace impl { |
40 | namespace gpu { |
41 | namespace jit { |
42 | |
43 | using gpu_gen_t = ngen::HW; |
44 | constexpr gpu_gen_t gpu_gen9 = ngen::HW::Gen9; |
45 | constexpr gpu_gen_t gpu_gen11 = ngen::HW::Gen11; |
46 | constexpr gpu_gen_t gpu_xe_lp = ngen::HW::XeLP; |
47 | constexpr gpu_gen_t gpu_xe_hp = ngen::HW::XeHP; |
48 | constexpr gpu_gen_t gpu_xe_hpg = ngen::HW::XeHPG; |
49 | constexpr gpu_gen_t gpu_xe_hpc = ngen::HW::XeHPC; |
50 | |
51 | // nGEN jit generator |
52 | // |
53 | // The main purpose of this header file is to provide extra features for nGEN |
54 | // kernel generator, e.g. additional macros and debugging capabilities. |
55 | // |
// Jit generator provides additional memory to simplify kernel debugging. This
// memory is allocated using the Shared Virtual Memory (SVM) feature of OpenCL
// 2.0. SVM enables the host and device portions of an OpenCL application to
// seamlessly share pointers and complex pointer-containing data structures.
// This memory can be used to dump the state of GPU registers or to view GPU
// memory on the host in a debugger.
62 | // |
// In order to use debug memory:
// 1. Allocate it using 'void jit_generator::dbg_alloc(cl_context context)'
// 2. Get the memory pointer using 'void* jit_generator::dbg_memory()'
// 3. Pass it as an extra OpenCL kernel argument and declare it as a new
//    argument in the kernel interface at the corresponding position.
// 4. Set a breakpoint after 'dnnl_stream_wait()'; the memory will be available
//    on the host side after kernel execution.
70 | // |
71 | // A short example below demonstrates how to use debug memory: |
72 | // |
73 | // ``` c++ |
74 | // status_t primitive_impl_t::execute(const exec_ctx_t &ctx) { |
75 | // ... |
76 | // auto gpu_engine = utils::downcast<ocl_gpu_engine*>(engine); |
77 | // jit_generator->dbg_alloc(gpu_engine->context()); |
78 | // void* dbg_mem = jit_generator->dbg_memory(); |
79 | // ... |
80 | // compute::kernel_arg_list_t arg_list; |
81 | // arg_list.set(0, src); |
82 | // arg_list.set(1, dst); |
83 | // arg_list.set(2, dbg_mem, kernel_arg_t::kind_t::svm); |
84 | // ... |
85 | // parallel_for(ctx, nd_range, kernel_, arg_list); |
86 | // } |
87 | // |
88 | // ngen_kernel_t() : jit_generator<...>() { |
89 | // externalName("ngen_kernel"); |
90 | // newArgument("src", GlobalPtr); |
91 | // newArgument("dst", GlobalPtr); |
92 | // newArgument("dbg_mem", GlobalPtr); |
93 | // finalizeInterface(); |
94 | // ... |
95 | // auto header = r32; |
96 | // auto data = r64; |
// mov<uint64_t>(1, header, getArgument("dbg_mem"));
98 | // store(1, scattered_dword(), A64, header, data); |
99 | // ... |
100 | // } |
101 | // ``` |
102 | // |
103 | |
104 | template <gpu_gen_t hw> |
105 | struct jit_eltwise_injector_f32; |
106 | |
107 | template <gpu_gen_t hw> |
108 | struct jit_post_op_injector; |
109 | |
110 | template <gpu_gen_t hw> |
111 | class jit_generator : public ngen::OpenCLCodeGenerator<hw>, |
112 | public jit_generator_base { |
113 | friend struct jit_eltwise_injector_f32<hw>; |
114 | |
115 | friend struct jit_post_op_injector<hw>; |
116 | |
117 | private: |
118 | #ifdef CL_VERSION_2_0 |
119 | struct svm_deleter { |
120 | cl_context context_; |
121 | |
122 | void operator()(void *ptr) noexcept { |
123 | if (ptr) clSVMFree(context_, ptr); |
124 | } |
125 | }; |
126 | std::unique_ptr<void, svm_deleter> dbg_memory_; |
127 | #endif |
128 | |
129 | public: |
130 | jit_generator() = default; |
131 | |
132 | const char *kernel_name() const override { |
133 | return ngen::OpenCLCodeGenerator<hw>::getExternalName().c_str(); |
134 | } |
135 | |
136 | cl_kernel get_kernel(cl_context context, cl_device_id device) override { |
137 | return ngen::OpenCLCodeGenerator<hw>::getKernel(context, device); |
138 | } |
139 | |
140 | #ifdef CL_VERSION_2_0 |
141 | void dbg_alloc(cl_context context); |
142 | void *dbg_memory() const { return dbg_memory_.get(); } |
143 | #endif |
144 | |
145 | void emath(ngen::MathFunction fc, int simd, ngen::GRF dst, ngen::GRF src) { |
146 | const int max_exec_size = ngen::GRF::bytes(hw) / sizeof(float); |
147 | for (; simd > 0; simd -= max_exec_size, dst++, src++) |
148 | this->math(nstl::min(simd, max_exec_size), fc, dst, src); |
149 | } |
150 | void eexp(int simd, const ngen::GRF &dst, const ngen::GRF &src) { |
151 | emath(ngen::MathFunction::exp, simd, dst, src); |
152 | } |
153 | void einv(int simd, const ngen::GRF &dst, const ngen::GRF &src) { |
154 | emath(ngen::MathFunction::inv, simd, dst, src); |
155 | } |
156 | }; |
157 | |
#ifdef CL_VERSION_2_0
// Allocates a 1 MiB fine-grained SVM buffer in `context` and stores it as the
// generator's debug memory.
//
// Any buffer held before the call is released when the new one is assigned.
// If the allocation fails, the debug memory is left as nullptr, which
// dbg_memory() then returns; callers should check for that.
template <gpu_gen_t hw>
void jit_generator<hw>::dbg_alloc(cl_context context) {
    constexpr size_t size = 1048576; // 1 MiB
    void *mem = clSVMAlloc(
            context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, size, 0);
    // Hand ownership to the smart pointer right away; the deleter keeps the
    // context needed by clSVMFree().
    dbg_memory_ = decltype(dbg_memory_)(mem, svm_deleter {context});
    // clSVMAlloc() returns NULL on failure, so only fill a real allocation.
    // The buffer is filled with the 0xcd byte pattern (presumably so that
    // bytes never written by the kernel stand out in a debugger).
    if (mem != nullptr) memset(mem, 0xcd, size);
}
#endif
168 | |
169 | } // namespace jit |
170 | } // namespace gpu |
171 | } // namespace impl |
172 | } // namespace dnnl |
173 | |
174 | #endif // GPU_JIT_JIT_GENERATOR_HPP |
175 | |