1/*******************************************************************************
2* Copyright 2019-2022 Intel Corporation
3*
4* Licensed under the Apache License, Version 2.0 (the "License");
5* you may not use this file except in compliance with the License.
6* You may obtain a copy of the License at
7*
8* http://www.apache.org/licenses/LICENSE-2.0
9*
10* Unless required by applicable law or agreed to in writing, software
11* distributed under the License is distributed on an "AS IS" BASIS,
12* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13* See the License for the specific language governing permissions and
14* limitations under the License.
15*******************************************************************************/
16
#include <algorithm>
#include <new>
#include <sstream>
#include <CL/cl.h>
20
21#include "gpu/ocl/ocl_gpu_engine.hpp"
22
23#include "common/type_helpers.hpp"
24#include "common/utils.hpp"
25#include "gpu/compute/kernel_list.hpp"
26#include "gpu/ocl/kernel_utils.hpp"
27#include "gpu/ocl/ocl_gpu_device_info.hpp"
28#include "gpu/ocl/ocl_gpu_engine.hpp"
29#include "gpu/ocl/ocl_memory_storage.hpp"
30#include "gpu/ocl/ocl_stream.hpp"
31#include "gpu/ocl/ocl_utils.hpp"
32
33namespace dnnl {
34namespace impl {
35namespace gpu {
36namespace ocl {
37
38status_t ocl_gpu_engine_t::init() {
39 return init({});
40}
41
status_t ocl_gpu_engine_t::init(const std::vector<uint8_t> &cache_blob) {
    // Initializes the engine: validates the device, retains the OpenCL
    // device/context handles (or creates a context), and runs the common
    // compute-engine initialization, optionally from a serialized cache blob.
    cl_int err = CL_SUCCESS;
    err = clGetDeviceInfo(device_, CL_DEVICE_PLATFORM, sizeof(platform_),
            &platform_, nullptr);
    if (err != CL_SUCCESS) {
        // Reset the handles on failure so later cleanup does not touch
        // objects this engine never successfully retained.
        device_ = nullptr;
        context_ = nullptr;
    }

    OCL_CHECK(err);

    err = clRetainDevice(device_);
    if (err != CL_SUCCESS) {
        device_ = nullptr;
        context_ = nullptr;
    }

    OCL_CHECK(err);

    if (is_user_context_) {
        // The context was supplied by the user; take shared ownership.
        err = clRetainContext(context_);
        if (err != CL_SUCCESS) context_ = nullptr;
    } else {
        // Engine owns the context it creates here.
        context_
                = clCreateContext(nullptr, 1, &device_, nullptr, nullptr, &err);
    }

    OCL_CHECK(err);

    CHECK(check_device(engine_kind::gpu, device_, context_));
    // Bug fix: the base-class initialization status was previously
    // discarded, so a failure (e.g. a corrupt cache blob) went unnoticed.
    CHECK(compute::compute_engine_t::init(cache_blob));

    return status::success;
}
76
77status_t ocl_gpu_engine_t::create_memory_storage(
78 memory_storage_t **storage, unsigned flags, size_t size, void *handle) {
79 auto _storage = new ocl_buffer_memory_storage_t(this);
80 if (_storage == nullptr) return status::out_of_memory;
81 status_t status = _storage->init(flags, size, handle);
82 if (status != status::success) {
83 delete _storage;
84 return status;
85 }
86 *storage = _storage;
87 return status::success;
88}
89
90status_t ocl_gpu_engine_t::create_stream(stream_t **stream, unsigned flags) {
91 return ocl_stream_t::create_stream(stream, this, flags);
92}
93
status_t ocl_gpu_engine_t::create_stream(
        stream_t **stream, cl_command_queue queue) {
    // Wrap a user-supplied command queue; ocl_stream_t does the work.
    const auto result = ocl_stream_t::create_stream(stream, this, queue);
    return result;
}
98
99namespace {
100
// Re-creates kernels from previously cached program binaries. For each
// non-null entry of `kernel_names`, one binary is consumed from `cache_blob`,
// built into a program, and the kernel object is stored into the matching
// slot of `kernels`.
status_t create_ocl_kernel_from_cache_blob(const ocl_gpu_engine_t *ocl_engine,
        cache_blob_t cache_blob, const std::vector<const char *> &kernel_names,
        std::vector<compute::kernel_t> *kernels) {
    auto dev = ocl_engine->device();
    auto ctx = ocl_engine->context();
    cl_int err = CL_SUCCESS;
    for (size_t i = 0; i < kernel_names.size(); i++) {
        if (!kernel_names[i]) continue;
        std::string kernel_name(kernel_names[i]);

        const uint8_t *binary = nullptr;
        size_t binary_size = 0;

        CHECK(cache_blob.get_binary(&binary, &binary_size));

        auto program = make_ocl_wrapper(clCreateProgramWithBinary(
                ctx, 1, &dev, &binary_size, &binary, nullptr, &err));
        OCL_CHECK(err);
        err = clBuildProgram(program, 1, &dev, nullptr, nullptr, nullptr);
        OCL_CHECK(err);

        if (kernel_name.empty()) {
            // Handle the ngen cases when kernel name is not available.
            // Query the kernel name from the program. It's expected that
            // an ngen based program contains only 1 kernel.
            if (kernel_names.size() != 1 || kernels->size() != 1)
                return status::invalid_arguments;
            size_t kernel_name_size = 0;
            err = clGetProgramInfo(program, CL_PROGRAM_KERNEL_NAMES, 0, nullptr,
                    &kernel_name_size);
            OCL_CHECK(err);

            kernel_name.resize(kernel_name_size);
            err = clGetProgramInfo(program, CL_PROGRAM_KERNEL_NAMES,
                    kernel_name_size, &kernel_name[0], nullptr);
            OCL_CHECK(err);
            assert(!kernel_name.empty());
            if (kernel_name.empty()) return status::runtime_error;
            // Remove the null terminator as std::string already includes it.
            kernel_name.pop_back();
        }
        auto ocl_kernel = make_ocl_wrapper(
                clCreateKernel(program, kernel_name.c_str(), &err));
        OCL_CHECK(err);

        std::vector<gpu::compute::scalar_type_t> arg_types;
        CHECK(get_kernel_arg_types(ocl_kernel, &arg_types));
        // Note: a redundant `OCL_CHECK(err)` used to follow here; `err` was
        // already validated right after clCreateKernel and is not modified
        // by get_kernel_arg_types, so the check was dead code.

        // Keep a copy of the binary alongside the kernel so it can be
        // re-serialized later.
        auto shared_binary = std::make_shared<compute::binary_t>(
                binary, binary + binary_size);
        (*kernels)[i] = compute::kernel_t(
                new ocl_gpu_kernel_t(shared_binary, kernel_name, arg_types));
        dump_kernel_binary(ocl_engine, (*kernels)[i]);
    }

    return status::success;
}
159
// Prints the OpenCL program build log when verbose mode is enabled and
// `err` indicates a failure. Always returns the original build error `err`.
cl_int maybe_print_debug_info(
        cl_int err, cl_program program, cl_device_id dev) {
    // Return error code if verbose is not enabled.
    if (err == CL_SUCCESS || get_verbose() == 0) return err;

    size_t log_length = 0;
    // Use a separate variable for the log queries: the original code
    // overwrote `err` here and returned the query status, which masked the
    // build failure (returned CL_SUCCESS) in release builds whenever the
    // log queries themselves succeeded.
    cl_int query_err = clGetProgramBuildInfo(
            program, dev, CL_PROGRAM_BUILD_LOG, 0, nullptr, &log_length);
    assert(query_err == CL_SUCCESS);

    std::vector<char> log_buf(log_length);
    query_err = clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
            log_length, log_buf.data(), nullptr);
    assert(query_err == CL_SUCCESS);
    MAYBE_UNUSED(query_err);
    printf("Error during the build of OpenCL program.\nBuild "
           "log:\n%s\n",
            log_buf.data());
    // Propagate the original build error to the caller's OCL_CHECK.
    return err;
}
179
180inline status_t preprocess_headers(
181 std::stringstream &pp_code, const char *code) {
182 std::stringstream code_stream(code);
183
184 for (std::string line; std::getline(code_stream, line);) {
185 const size_t include_pos = line.find("#include");
186 if (include_pos != std::string::npos) {
187 static constexpr size_t include_len = 8;
188 const size_t first_quote_pos
189 = line.find("\"", include_pos + include_len);
190 const size_t second_quote_pos
191 = line.find("\"", first_quote_pos + 1);
192 const size_t kernel_name_len
193 = second_quote_pos - first_quote_pos - 1;
194 const auto header_name
195 = line.substr(first_quote_pos + 1, kernel_name_len);
196 CHECK(preprocess_headers(pp_code, get_kernel_header(header_name)));
197 } else {
198 pp_code << line << std::endl;
199 }
200 }
201 return status::success;
202}
203
204} // namespace
205
status_t ocl_gpu_engine_t::create_kernel(compute::kernel_t *kernel,
        jit::jit_generator_base *jitter, cache_blob_t cache_blob) const {
    // At least one source for the kernel must be provided.
    if (!jitter && !cache_blob) return status::invalid_arguments;

    const char *kernel_name = jitter ? jitter->kernel_name() : "";

    // Fast path: reconstruct the kernel from a previously cached binary.
    if (cache_blob) {
        std::vector<compute::kernel_t> blob_kernels(1);
        CHECK(create_ocl_kernel_from_cache_blob(
                this, cache_blob, {kernel_name}, &blob_kernels));
        (*kernel) = blob_kernels[0];
        return status::success;
    }

    // Slow path: generate the kernel with the jitter, query its argument
    // types, and capture the program binary for later serialization.
    ocl_wrapper_t<cl_kernel> ocl_kernel
            = jitter->get_kernel(context(), device());

    std::vector<gpu::compute::scalar_type_t> arg_types;
    CHECK(get_kernel_arg_types(ocl_kernel, &arg_types));

    std::shared_ptr<compute::binary_t> shared_binary;
    CHECK(get_ocl_program_binary(ocl_kernel, device(), shared_binary));

    *kernel = compute::kernel_t(
            new ocl_gpu_kernel_t(shared_binary, kernel_name, arg_types));
    dump_kernel_binary(this, *kernel);

    return status::success;
}
235
236status_t ocl_gpu_engine_t::create_kernels(
237 std::vector<compute::kernel_t> *kernels,
238 const std::vector<const char *> &kernel_names,
239 const compute::kernel_ctx_t &kernel_ctx,
240 cache_blob_t cache_blob) const {
241
242 *kernels = std::vector<compute::kernel_t>(kernel_names.size());
243
244 if (cache_blob) {
245 return create_ocl_kernel_from_cache_blob(
246 this, cache_blob, kernel_names, kernels);
247 }
248
249 compute::kernel_list_t kernel_list;
250 for (size_t i = 0; i < kernels->size(); ++i) {
251 if (kernel_names[i]) kernel_list.add(kernel_names[i], &(*kernels)[i]);
252 }
253
254 return ocl::create_kernels(this, kernel_list, kernel_ctx);
255}
256
// Compiles `code_string` (OpenCL C) into a program and creates one kernel
// object per entry of `kernel_names`, storing them in `kernels`. The program
// binary is shared across all kernels so it can be serialized later.
status_t ocl_gpu_engine_t::create_kernels_from_ocl_source(
        std::vector<compute::kernel_t> *kernels,
        const std::vector<const char *> &kernel_names, const char *code_string,
        const compute::kernel_ctx_t &kernel_ctx) const {
    std::string options = kernel_ctx.options();

    // XXX: Update options by adding macros for OpenCL extensions that are not
    // handled properly by the OpenCL runtime
    auto *dev_info
            = utils::downcast<const ocl_gpu_device_info_t *>(device_info());
    options += " " + dev_info->get_cl_ext_options();

    cl_int err;
    std::stringstream pp_code;
    // The `cl_cache` requires using `clBuildProgram`. Unfortunately, unlike
    // `clCompileProgram` `clBuildProgram` doesn't take headers. Because of
    // that, a manual preprocessing of `include` header directives in the
    // OpenCL kernels is required.
    CHECK(preprocess_headers(pp_code, code_string));
    // Keep the preprocessed source alive in a named string; the raw pointer
    // below must stay valid until clCreateProgramWithSource returns.
    std::string pp_code_str = pp_code.str();
    const char *pp_code_str_ptr = pp_code_str.c_str();

    auto program = make_ocl_wrapper(clCreateProgramWithSource(
            context(), 1, &pp_code_str_ptr, nullptr, &err));
    OCL_CHECK(err);

    cl_device_id dev = device();
    err = clBuildProgram(program, 1, &dev, options.c_str(), nullptr, nullptr);
    // On failure (with verbose enabled) this prints the build log.
    OCL_CHECK(maybe_print_debug_info(err, program, dev));

    // One binary is queried for the whole program and shared by reference
    // among all kernels created below.
    std::shared_ptr<compute::binary_t> shared_binary;
    CHECK(get_ocl_program_binary(program, dev, shared_binary));

    *kernels = std::vector<compute::kernel_t>(kernel_names.size());
    for (size_t i = 0; i < kernel_names.size(); ++i) {
        // Note: this `err` intentionally shadows the outer one; it only
        // carries the per-kernel creation status.
        cl_int err;
        ocl_wrapper_t<cl_kernel> ocl_kernel
                = clCreateKernel(program, kernel_names[i], &err);
        OCL_CHECK(err);
        std::vector<gpu::compute::scalar_type_t> arg_types;
        CHECK(get_kernel_arg_types(ocl_kernel, &arg_types));

        (*kernels)[i] = compute::kernel_t(new ocl_gpu_kernel_t(
                shared_binary, kernel_names[i], arg_types));
        dump_kernel_binary(this, (*kernels)[i]);
    }

    return status::success;
}
306
// Returns the deleter used by the program list to release cached
// cl_program handles stored as void pointers.
std::function<void(void *)> ocl_gpu_engine_t::get_program_list_deleter() const {
    return [](void *p) {
        cl_int err = clReleaseProgram(reinterpret_cast<cl_program>(p));
        // Compare against the named OpenCL status instead of a magic 0.
        assert(err == CL_SUCCESS);
        MAYBE_UNUSED(err);
    };
}
314
315status_t ocl_gpu_engine_t::init_device_info() {
316 return init_device_info({});
317}
318
// Creates the OpenCL-specific device info object and initializes it,
// optionally from a serialized cache blob. Note: `device_info_` is assigned
// before init; on failure it holds a partially initialized object.
status_t ocl_gpu_engine_t::init_device_info(
        const std::vector<uint8_t> &cache_blob) {
    device_info_ = std::make_shared<ocl_gpu_device_info_t>();
    CHECK(device_info_->init(this, cache_blob));
    return status::success;
}
325
status_t ocl_gpu_engine_t::serialize_device(
        serialization_stream_t &sstream) const {
    // First query only the size of the platform name (the size reported by
    // the OpenCL runtime includes the terminating null character).
    size_t name_size = 0;
    cl_int err = clGetPlatformInfo(
            platform_, CL_PLATFORM_NAME, 0, nullptr, &name_size);
    OCL_CHECK(err);

    std::vector<char> name_buf(name_size);
    err = clGetPlatformInfo(platform_, CL_PLATFORM_NAME, name_buf.size(),
            name_buf.data(), nullptr);
    OCL_CHECK(err);

    // Serialize platform name, device name and the runtime version; these
    // together identify the device this engine was created on.
    sstream.write(name_buf.data(), name_buf.size());
    sstream.write(device_info()->name().data(), device_info()->name().size());
    sstream.write(&device_info()->runtime_version().major);
    sstream.write(&device_info()->runtime_version().minor);
    sstream.write(&device_info()->runtime_version().build);

    return status::success;
}
346
347} // namespace ocl
348} // namespace gpu
349} // namespace impl
350} // namespace dnnl
351