1 | /******************************************************************************* |
2 | * Copyright 2019-2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
#include <algorithm>
#include <memory>
#include <new>
#include <sstream>
#include <CL/cl.h>
20 | |
21 | #include "gpu/ocl/ocl_gpu_engine.hpp" |
22 | |
23 | #include "common/type_helpers.hpp" |
24 | #include "common/utils.hpp" |
25 | #include "gpu/compute/kernel_list.hpp" |
26 | #include "gpu/ocl/kernel_utils.hpp" |
27 | #include "gpu/ocl/ocl_gpu_device_info.hpp" |
28 | #include "gpu/ocl/ocl_gpu_engine.hpp" |
29 | #include "gpu/ocl/ocl_memory_storage.hpp" |
30 | #include "gpu/ocl/ocl_stream.hpp" |
31 | #include "gpu/ocl/ocl_utils.hpp" |
32 | |
33 | namespace dnnl { |
34 | namespace impl { |
35 | namespace gpu { |
36 | namespace ocl { |
37 | |
// Initializes the engine without a cached device state; simply delegates to
// the cache-blob overload with an empty blob.
status_t ocl_gpu_engine_t::init() {
    return init({});
}
41 | |
42 | status_t ocl_gpu_engine_t::init(const std::vector<uint8_t> &cache_blob) { |
43 | cl_int err = CL_SUCCESS; |
44 | err = clGetDeviceInfo(device_, CL_DEVICE_PLATFORM, sizeof(platform_), |
45 | &platform_, nullptr); |
46 | if (err != CL_SUCCESS) { |
47 | device_ = nullptr; |
48 | context_ = nullptr; |
49 | } |
50 | |
51 | OCL_CHECK(err); |
52 | |
53 | err = clRetainDevice(device_); |
54 | if (err != CL_SUCCESS) { |
55 | device_ = nullptr; |
56 | context_ = nullptr; |
57 | } |
58 | |
59 | OCL_CHECK(err); |
60 | |
61 | if (is_user_context_) { |
62 | err = clRetainContext(context_); |
63 | if (err != CL_SUCCESS) context_ = nullptr; |
64 | } else { |
65 | context_ |
66 | = clCreateContext(nullptr, 1, &device_, nullptr, nullptr, &err); |
67 | } |
68 | |
69 | OCL_CHECK(err); |
70 | |
71 | CHECK(check_device(engine_kind::gpu, device_, context_)); |
72 | compute::compute_engine_t::init(cache_blob); |
73 | |
74 | return status::success; |
75 | } |
76 | |
77 | status_t ocl_gpu_engine_t::create_memory_storage( |
78 | memory_storage_t **storage, unsigned flags, size_t size, void *handle) { |
79 | auto _storage = new ocl_buffer_memory_storage_t(this); |
80 | if (_storage == nullptr) return status::out_of_memory; |
81 | status_t status = _storage->init(flags, size, handle); |
82 | if (status != status::success) { |
83 | delete _storage; |
84 | return status; |
85 | } |
86 | *storage = _storage; |
87 | return status::success; |
88 | } |
89 | |
// Creates a stream on this engine with the given stream flags; the actual
// construction is delegated to ocl_stream_t.
status_t ocl_gpu_engine_t::create_stream(stream_t **stream, unsigned flags) {
    return ocl_stream_t::create_stream(stream, this, flags);
}
93 | |
// Creates a stream that wraps a user-provided OpenCL command queue; the
// actual construction is delegated to ocl_stream_t.
status_t ocl_gpu_engine_t::create_stream(
        stream_t **stream, cl_command_queue queue) {
    return ocl_stream_t::create_stream(stream, this, queue);
}
98 | |
99 | namespace { |
100 | |
101 | status_t create_ocl_kernel_from_cache_blob(const ocl_gpu_engine_t *ocl_engine, |
102 | cache_blob_t cache_blob, const std::vector<const char *> &kernel_names, |
103 | std::vector<compute::kernel_t> *kernels) { |
104 | auto dev = ocl_engine->device(); |
105 | auto ctx = ocl_engine->context(); |
106 | cl_int err = CL_SUCCESS; |
107 | for (size_t i = 0; i < kernel_names.size(); i++) { |
108 | if (!kernel_names[i]) continue; |
109 | std::string kernel_name(kernel_names[i]); |
110 | |
111 | const uint8_t *binary = nullptr; |
112 | size_t binary_size = 0; |
113 | |
114 | CHECK(cache_blob.get_binary(&binary, &binary_size)); |
115 | |
116 | auto program = make_ocl_wrapper(clCreateProgramWithBinary( |
117 | ctx, 1, &dev, &binary_size, &binary, nullptr, &err)); |
118 | OCL_CHECK(err); |
119 | err = clBuildProgram(program, 1, &dev, nullptr, nullptr, nullptr); |
120 | OCL_CHECK(err); |
121 | |
122 | if (kernel_name.empty()) { |
123 | // Handle the ngen cases when kernel name is not available. |
124 | // Query the kernel name from the program. It's expected that |
125 | // an ngen based program contains only 1 kernel. |
126 | if (kernel_names.size() != 1 || kernels->size() != 1) |
127 | return status::invalid_arguments; |
128 | size_t kernel_name_size = 0; |
129 | err = clGetProgramInfo(program, CL_PROGRAM_KERNEL_NAMES, 0, nullptr, |
130 | &kernel_name_size); |
131 | OCL_CHECK(err); |
132 | |
133 | kernel_name.resize(kernel_name_size); |
134 | err = clGetProgramInfo(program, CL_PROGRAM_KERNEL_NAMES, |
135 | kernel_name_size, &kernel_name[0], nullptr); |
136 | OCL_CHECK(err); |
137 | assert(!kernel_name.empty()); |
138 | if (kernel_name.empty()) return status::runtime_error; |
139 | // Remove the null terminator as std::string already includes it. |
140 | kernel_name.pop_back(); |
141 | } |
142 | auto ocl_kernel = make_ocl_wrapper( |
143 | clCreateKernel(program, kernel_name.c_str(), &err)); |
144 | OCL_CHECK(err); |
145 | |
146 | std::vector<gpu::compute::scalar_type_t> arg_types; |
147 | CHECK(get_kernel_arg_types(ocl_kernel, &arg_types)); |
148 | OCL_CHECK(err); |
149 | |
150 | auto shared_binary = std::make_shared<compute::binary_t>( |
151 | binary, binary + binary_size); |
152 | (*kernels)[i] = compute::kernel_t( |
153 | new ocl_gpu_kernel_t(shared_binary, kernel_name, arg_types)); |
154 | dump_kernel_binary(ocl_engine, (*kernels)[i]); |
155 | } |
156 | |
157 | return status::success; |
158 | } |
159 | |
160 | cl_int maybe_print_debug_info( |
161 | cl_int err, cl_program program, cl_device_id dev) { |
162 | // Return error code if verbose is not enabled. |
163 | if (err == CL_SUCCESS || get_verbose() == 0) return err; |
164 | |
165 | size_t log_length = 0; |
166 | err = clGetProgramBuildInfo( |
167 | program, dev, CL_PROGRAM_BUILD_LOG, 0, nullptr, &log_length); |
168 | assert(err == CL_SUCCESS); |
169 | |
170 | std::vector<char> log_buf(log_length); |
171 | err = clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, log_length, |
172 | log_buf.data(), nullptr); |
173 | assert(err == CL_SUCCESS); |
174 | printf("Error during the build of OpenCL program.\nBuild " |
175 | "log:\n%s\n" , |
176 | log_buf.data()); |
177 | return err; |
178 | }; |
179 | |
180 | inline status_t ( |
181 | std::stringstream &pp_code, const char *code) { |
182 | std::stringstream code_stream(code); |
183 | |
184 | for (std::string line; std::getline(code_stream, line);) { |
185 | const size_t include_pos = line.find("#include" ); |
186 | if (include_pos != std::string::npos) { |
187 | static constexpr size_t include_len = 8; |
188 | const size_t first_quote_pos |
189 | = line.find("\"" , include_pos + include_len); |
190 | const size_t second_quote_pos |
191 | = line.find("\"" , first_quote_pos + 1); |
192 | const size_t kernel_name_len |
193 | = second_quote_pos - first_quote_pos - 1; |
194 | const auto header_name |
195 | = line.substr(first_quote_pos + 1, kernel_name_len); |
196 | CHECK(preprocess_headers(pp_code, get_kernel_header(header_name))); |
197 | } else { |
198 | pp_code << line << std::endl; |
199 | } |
200 | } |
201 | return status::success; |
202 | } |
203 | |
204 | } // namespace |
205 | |
206 | status_t ocl_gpu_engine_t::create_kernel(compute::kernel_t *kernel, |
207 | jit::jit_generator_base *jitter, cache_blob_t cache_blob) const { |
208 | if (!jitter && !cache_blob) return status::invalid_arguments; |
209 | |
210 | const char *kernel_name = jitter ? jitter->kernel_name() : "" ; |
211 | |
212 | if (cache_blob) { |
213 | std::vector<compute::kernel_t> kernels(1); |
214 | auto status = create_ocl_kernel_from_cache_blob( |
215 | this, cache_blob, {kernel_name}, &kernels); |
216 | CHECK(status); |
217 | (*kernel) = kernels[0]; |
218 | return status::success; |
219 | } |
220 | |
221 | ocl_wrapper_t<cl_kernel> ocl_kernel |
222 | = jitter->get_kernel(context(), device()); |
223 | std::vector<gpu::compute::scalar_type_t> arg_types; |
224 | CHECK(get_kernel_arg_types(ocl_kernel, &arg_types)); |
225 | |
226 | std::shared_ptr<compute::binary_t> shared_binary; |
227 | CHECK(get_ocl_program_binary(ocl_kernel, device(), shared_binary)); |
228 | |
229 | *kernel = compute::kernel_t( |
230 | new ocl_gpu_kernel_t(shared_binary, kernel_name, arg_types)); |
231 | dump_kernel_binary(this, *kernel); |
232 | |
233 | return status::success; |
234 | } |
235 | |
236 | status_t ocl_gpu_engine_t::create_kernels( |
237 | std::vector<compute::kernel_t> *kernels, |
238 | const std::vector<const char *> &kernel_names, |
239 | const compute::kernel_ctx_t &kernel_ctx, |
240 | cache_blob_t cache_blob) const { |
241 | |
242 | *kernels = std::vector<compute::kernel_t>(kernel_names.size()); |
243 | |
244 | if (cache_blob) { |
245 | return create_ocl_kernel_from_cache_blob( |
246 | this, cache_blob, kernel_names, kernels); |
247 | } |
248 | |
249 | compute::kernel_list_t kernel_list; |
250 | for (size_t i = 0; i < kernels->size(); ++i) { |
251 | if (kernel_names[i]) kernel_list.add(kernel_names[i], &(*kernels)[i]); |
252 | } |
253 | |
254 | return ocl::create_kernels(this, kernel_list, kernel_ctx); |
255 | } |
256 | |
257 | status_t ocl_gpu_engine_t::create_kernels_from_ocl_source( |
258 | std::vector<compute::kernel_t> *kernels, |
259 | const std::vector<const char *> &kernel_names, const char *code_string, |
260 | const compute::kernel_ctx_t &kernel_ctx) const { |
261 | std::string options = kernel_ctx.options(); |
262 | |
263 | // XXX: Update options by adding macros for OpenCL extensions that are not |
264 | // handled properly by the OpenCL runtime |
265 | auto *dev_info |
266 | = utils::downcast<const ocl_gpu_device_info_t *>(device_info()); |
267 | options += " " + dev_info->get_cl_ext_options(); |
268 | |
269 | cl_int err; |
270 | std::stringstream pp_code; |
271 | // The `cl_cache` requires using `clBuildProgram`. Unfortunately, unlike |
272 | // `clCompileProgram` `clBuildProgram` doesn't take headers. Because of |
273 | // that, a manual preprocessing of `include` header directives in the |
274 | // OpenCL kernels is required. |
275 | CHECK(preprocess_headers(pp_code, code_string)); |
276 | std::string pp_code_str = pp_code.str(); |
277 | const char *pp_code_str_ptr = pp_code_str.c_str(); |
278 | |
279 | auto program = make_ocl_wrapper(clCreateProgramWithSource( |
280 | context(), 1, &pp_code_str_ptr, nullptr, &err)); |
281 | OCL_CHECK(err); |
282 | |
283 | cl_device_id dev = device(); |
284 | err = clBuildProgram(program, 1, &dev, options.c_str(), nullptr, nullptr); |
285 | OCL_CHECK(maybe_print_debug_info(err, program, dev)); |
286 | |
287 | std::shared_ptr<compute::binary_t> shared_binary; |
288 | CHECK(get_ocl_program_binary(program, dev, shared_binary)); |
289 | |
290 | *kernels = std::vector<compute::kernel_t>(kernel_names.size()); |
291 | for (size_t i = 0; i < kernel_names.size(); ++i) { |
292 | cl_int err; |
293 | ocl_wrapper_t<cl_kernel> ocl_kernel |
294 | = clCreateKernel(program, kernel_names[i], &err); |
295 | OCL_CHECK(err); |
296 | std::vector<gpu::compute::scalar_type_t> arg_types; |
297 | CHECK(get_kernel_arg_types(ocl_kernel, &arg_types)); |
298 | |
299 | (*kernels)[i] = compute::kernel_t(new ocl_gpu_kernel_t( |
300 | shared_binary, kernel_names[i], arg_types)); |
301 | dump_kernel_binary(this, (*kernels)[i]); |
302 | } |
303 | |
304 | return status::success; |
305 | } |
306 | |
307 | std::function<void(void *)> ocl_gpu_engine_t::get_program_list_deleter() const { |
308 | return [](void *p) { |
309 | cl_int err = clReleaseProgram(reinterpret_cast<cl_program>(p)); |
310 | assert(err == 0); |
311 | MAYBE_UNUSED(err); |
312 | }; |
313 | } |
314 | |
// Initializes device info without a cache blob; delegates to the cache-blob
// overload with an empty blob.
status_t ocl_gpu_engine_t::init_device_info() {
    return init_device_info({});
}
318 | |
319 | status_t ocl_gpu_engine_t::init_device_info( |
320 | const std::vector<uint8_t> &cache_blob) { |
321 | device_info_ = std::make_shared<ocl_gpu_device_info_t>(); |
322 | CHECK(device_info_->init(this, cache_blob)); |
323 | return status::success; |
324 | } |
325 | |
326 | status_t ocl_gpu_engine_t::serialize_device( |
327 | serialization_stream_t &sstream) const { |
328 | size_t platform_name_len; |
329 | cl_int err = clGetPlatformInfo( |
330 | platform_, CL_PLATFORM_NAME, 0, nullptr, &platform_name_len); |
331 | OCL_CHECK(err); |
332 | |
333 | std::vector<char> platform_name(platform_name_len); |
334 | err = clGetPlatformInfo(platform_, CL_PLATFORM_NAME, platform_name.size(), |
335 | platform_name.data(), nullptr); |
336 | OCL_CHECK(err); |
337 | |
338 | sstream.write(platform_name.data(), platform_name.size()); |
339 | sstream.write(device_info()->name().data(), device_info()->name().size()); |
340 | sstream.write(&device_info()->runtime_version().major); |
341 | sstream.write(&device_info()->runtime_version().minor); |
342 | sstream.write(&device_info()->runtime_version().build); |
343 | |
344 | return status::success; |
345 | } |
346 | |
347 | } // namespace ocl |
348 | } // namespace gpu |
349 | } // namespace impl |
350 | } // namespace dnnl |
351 | |