1/*******************************************************************************
2* Copyright 2019-2022 Intel Corporation
3*
4* Licensed under the Apache License, Version 2.0 (the "License");
5* you may not use this file except in compliance with the License.
6* You may obtain a copy of the License at
7*
8* http://www.apache.org/licenses/LICENSE-2.0
9*
10* Unless required by applicable law or agreed to in writing, software
11* distributed under the License is distributed on an "AS IS" BASIS,
12* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13* See the License for the specific language governing permissions and
14* limitations under the License.
15*******************************************************************************/
16
17#ifndef GPU_OCL_OCL_UTILS_HPP
18#define GPU_OCL_OCL_UTILS_HPP
19
#include <algorithm>
#include <cassert>
#include <cinttypes>
#include <initializer_list>
#include <memory>
#include <sstream>
#include <string.h>
#include <string>
#include <type_traits>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

#include <CL/cl.h>
32
33#include "common/c_types_map.hpp"
34#include "common/cpp_compat.hpp"
35#include "common/internal_defs.hpp"
36#include "common/utils.hpp"
37#include "common/verbose.hpp"
38#include "gpu/compute/kernel_arg_list.hpp"
39#include "gpu/compute/utils.hpp"
40
41namespace dnnl {
42namespace impl {
43namespace gpu {
44
namespace compute {
// Forward declaration only: this header refers to compute::kernel_t by
// reference (see dump_kernel_binary) and does not need its definition.
class kernel_t;
} // namespace compute
48
49namespace ocl {
50
51inline status_t convert_to_dnnl(cl_int cl_status) {
52 switch (cl_status) {
53 case CL_SUCCESS: return status::success;
54 case CL_MEM_OBJECT_ALLOCATION_FAILURE:
55 case CL_OUT_OF_RESOURCES:
56 case CL_OUT_OF_HOST_MEMORY: return status::out_of_memory;
57 case CL_DEVICE_NOT_FOUND:
58 case CL_DEVICE_NOT_AVAILABLE:
59 case CL_COMPILER_NOT_AVAILABLE:
60 case CL_PROFILING_INFO_NOT_AVAILABLE:
61 case CL_MEM_COPY_OVERLAP:
62 case CL_IMAGE_FORMAT_MISMATCH:
63 case CL_IMAGE_FORMAT_NOT_SUPPORTED:
64 case CL_BUILD_PROGRAM_FAILURE:
65 case CL_MAP_FAILURE:
66 case CL_MISALIGNED_SUB_BUFFER_OFFSET:
67 case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST:
68 case CL_COMPILE_PROGRAM_FAILURE:
69 case CL_LINKER_NOT_AVAILABLE:
70 case CL_LINK_PROGRAM_FAILURE:
71 case CL_DEVICE_PARTITION_FAILED:
72 case CL_KERNEL_ARG_INFO_NOT_AVAILABLE: return status::runtime_error;
73 case CL_INVALID_VALUE:
74 case CL_INVALID_DEVICE_TYPE:
75 case CL_INVALID_CONTEXT:
76 case CL_INVALID_QUEUE_PROPERTIES:
77 case CL_INVALID_COMMAND_QUEUE:
78 case CL_INVALID_HOST_PTR:
79 case CL_INVALID_MEM_OBJECT:
80 case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
81 case CL_INVALID_IMAGE_SIZE:
82 case CL_INVALID_SAMPLER:
83 case CL_INVALID_BINARY:
84 case CL_INVALID_BUILD_OPTIONS:
85 case CL_INVALID_PROGRAM:
86 case CL_INVALID_PROGRAM_EXECUTABLE:
87 case CL_INVALID_KERNEL_NAME:
88 case CL_INVALID_KERNEL_DEFINITION: // FI
89 case CL_INVALID_KERNEL:
90 case CL_INVALID_ARG_INDEX:
91 case CL_INVALID_ARG_VALUE: return status::invalid_arguments;
92
93 default: return status::runtime_error;
94 }
95}
96
// Alignment constant for OpenCL buffers.
// NOTE(review): the unit is presumably bytes - confirm against the users of
// this constant.
enum {
    OCL_BUFFER_ALIGNMENT = 128,
};
98
// Prints `msg` (a C string, formatted with %s) on an
// "onednn_verbose,gpu,error" line to stdout when get_verbose() returns a
// non-zero value; otherwise does nothing.
// get_verbose() is spelled fully qualified, like in MAYBE_REPORT_OCL_ERROR,
// so that the macro also expands correctly outside of namespace dnnl::impl.
#define MAYBE_REPORT_ERROR(msg) \
    do { \
        if (dnnl::impl::get_verbose()) \
            printf("onednn_verbose,gpu,error,%s\n", (msg)); \
    } while (0)
103
// Prints the OpenCL status `s`, converted to int, on an
// "onednn_verbose,gpu,ocl_error" line to stdout when
// dnnl::impl::get_verbose() returns a non-zero value; otherwise does nothing.
#define MAYBE_REPORT_OCL_ERROR(s) \
    do { \
        if (dnnl::impl::get_verbose()) { \
            printf("onednn_verbose,gpu,ocl_error,%d\n", \
                    static_cast<int>(s)); \
        } \
    } while (0)
109
// Evaluates the OpenCL expression `x` exactly once. If the result differs
// from CL_SUCCESS, the status is reported through MAYBE_REPORT_OCL_ERROR and
// the enclosing function is left with a plain `return;`, so the macro is
// meant for functions that return void.
// The temporary carries a macro-specific name: with a generic name such as
// `s`, an argument mentioning a caller variable `s` would be shadowed and
// the temporary would be initialized from itself.
#define OCL_CHECK_V(x) \
    do { \
        cl_int ocl_check_v_status_ = (x); \
        if (ocl_check_v_status_ != CL_SUCCESS) { \
            MAYBE_REPORT_OCL_ERROR(ocl_check_v_status_); \
            return; \
        } \
    } while (0)
118
// Evaluates the OpenCL expression `x` exactly once. If the result differs
// from CL_SUCCESS, the status is reported through MAYBE_REPORT_OCL_ERROR and
// the enclosing function returns the status translated by
// dnnl::impl::gpu::ocl::convert_to_dnnl().
// The temporary carries a macro-specific name: with a generic name such as
// `s`, an argument mentioning a caller variable `s` would be shadowed and
// the temporary would be initialized from itself.
#define OCL_CHECK(x) \
    do { \
        cl_int ocl_check_status_ = (x); \
        if (ocl_check_status_ != CL_SUCCESS) { \
            MAYBE_REPORT_OCL_ERROR(ocl_check_status_); \
            return dnnl::impl::gpu::ocl::convert_to_dnnl(ocl_check_status_); \
        } \
    } while (0)
127
128// Check for three conditions:
129// 1. Device and context are compatible, i.e. the device belongs to
130// the context devices.
131// 2. Device type matches the passed engine kind
132// 3. Device/context platfrom is an Intel platform
133status_t check_device(engine_kind_t eng_kind, cl_device_id dev, cl_context ctx);
134
135status_t get_ocl_devices(
136 std::vector<cl_device_id> *devices, cl_device_type device_type);
137
138status_t get_ocl_device_index(size_t *index, cl_device_id device);
139
140cl_platform_id get_ocl_platform(cl_device_id device);
141cl_platform_id get_ocl_platform(engine_t *engine);
142
143namespace details {
144
145// OpenCL objects reference counting traits
146template <typename T>
147struct ocl_ref_traits;
148//{
149// static void retain(T t) {}
150// static void release(T t) {}
151//};
152
153template <>
154struct ocl_ref_traits<cl_context> {
155 static void retain(cl_context t) { clRetainContext(t); }
156 static void release(cl_context t) { clReleaseContext(t); }
157};
158
159template <>
160struct ocl_ref_traits<cl_command_queue> {
161 static void retain(cl_command_queue t) { clRetainCommandQueue(t); }
162 static void release(cl_command_queue t) { clReleaseCommandQueue(t); }
163};
164
165template <>
166struct ocl_ref_traits<cl_program> {
167 static void retain(cl_program t) { clRetainProgram(t); }
168 static void release(cl_program t) { clReleaseProgram(t); }
169};
170
171template <>
172struct ocl_ref_traits<cl_kernel> {
173 static void retain(cl_kernel t) { clRetainKernel(t); }
174 static void release(cl_kernel t) { clReleaseKernel(t); }
175};
176
177template <>
178struct ocl_ref_traits<cl_mem> {
179 static void retain(cl_mem t) { clRetainMemObject(t); }
180 static void release(cl_mem t) { clReleaseMemObject(t); }
181};
182
183template <>
184struct ocl_ref_traits<cl_sampler> {
185 static void retain(cl_sampler t) { clRetainSampler(t); }
186 static void release(cl_sampler t) { clReleaseSampler(t); }
187};
188
189template <>
190struct ocl_ref_traits<cl_event> {
191 static void retain(cl_event t) { clRetainEvent(t); }
192 static void release(cl_event t) { clReleaseEvent(t); }
193};
194
195template <>
196struct ocl_ref_traits<cl_device_id> {
197 static void retain(cl_device_id t) { clRetainDevice(t); }
198 static void release(cl_device_id t) { clReleaseDevice(t); }
199};
200
201} // namespace details
202
203// Generic class providing RAII support for OpenCL objects
204template <typename T>
205struct ocl_wrapper_t {
206 ocl_wrapper_t(T t = nullptr, bool retain = false) : t_(t) {
207 if (retain) { do_retain(); }
208 }
209
210 ocl_wrapper_t(const ocl_wrapper_t &other) : t_(other.t_) { do_retain(); }
211
212 ocl_wrapper_t(ocl_wrapper_t &&other) noexcept : t_(std::move(other.t_)) {}
213
214 ocl_wrapper_t &operator=(ocl_wrapper_t other) {
215 using std::swap;
216 swap(t_, other.t_);
217 return *this;
218 }
219
220 ~ocl_wrapper_t() { do_release(); }
221
222 operator T() const { return t_; }
223 T get() const { return t_; }
224
225 T release() {
226 T t = t_;
227 t_ = nullptr;
228 return t;
229 }
230
231private:
232 T t_;
233
234 void do_retain() {
235 if (t_) { details::ocl_ref_traits<T>::retain(t_); }
236 }
237
238 void do_release() {
239 if (t_) { details::ocl_ref_traits<T>::release(t_); }
240 }
241};
242
243// Constructs an OpenCL wrapper object (providing RAII support)
244template <typename T>
245ocl_wrapper_t<T> make_ocl_wrapper(T t) {
246 return ocl_wrapper_t<T>(t);
247}
248
249template <typename F>
250struct ext_func_t {
251 ext_func_t(const char *name) : ext_func_ptrs_(intel_platforms().size()) {
252 for (size_t i = 0; i < intel_platforms().size(); ++i) {
253 auto p = intel_platforms()[i];
254 auto it = ext_func_ptrs_.insert({p, load_ext_func(p, name)});
255 assert(it.second);
256 MAYBE_UNUSED(it);
257 }
258 }
259
260 template <typename... Args>
261 typename cpp_compat::invoke_result<F, Args...>::type operator()(
262 engine_t *engine, Args... args) const {
263 auto f = get_func(engine);
264 return f(args...);
265 }
266
267 F get_func(engine_t *engine) const {
268 return get_func(get_ocl_platform(engine));
269 }
270
271 F get_func(cl_platform_id platform) const {
272 return ext_func_ptrs_.at(platform);
273 }
274
275private:
276 std::unordered_map<cl_platform_id, F> ext_func_ptrs_;
277
278 static F load_ext_func(cl_platform_id platform, const char *name) {
279 return reinterpret_cast<F>(
280 clGetExtensionFunctionAddressForPlatform(platform, name));
281 }
282
283 static const std::vector<cl_platform_id> &intel_platforms() {
284 static auto intel_platforms = get_intel_platforms();
285 return intel_platforms;
286 }
287
288 static std::vector<cl_platform_id> get_intel_platforms() {
289 cl_uint num_platforms = 0;
290 cl_int err = clGetPlatformIDs(0, nullptr, &num_platforms);
291 if (err != CL_SUCCESS) return {};
292
293 std::vector<cl_platform_id> platforms(num_platforms);
294 err = clGetPlatformIDs(num_platforms, platforms.data(), nullptr);
295 if (err != CL_SUCCESS) return {};
296
297 std::vector<cl_platform_id> intel_platforms;
298 char vendor_name[128] = {};
299 for (cl_platform_id p : platforms) {
300 err = clGetPlatformInfo(p, CL_PLATFORM_VENDOR, sizeof(vendor_name),
301 vendor_name, nullptr);
302 if (err != CL_SUCCESS) continue;
303 if (std::string(vendor_name).find("Intel") != std::string::npos)
304 intel_platforms.push_back(p);
305 }
306
307 // OpenCL can return a list of platforms that contains duplicates.
308 std::sort(intel_platforms.begin(), intel_platforms.end());
309 intel_platforms.erase(
310 std::unique(intel_platforms.begin(), intel_platforms.end()),
311 intel_platforms.end());
312 return intel_platforms;
313 }
314};
315
316status_t get_ocl_kernel_arg_type(compute::scalar_type_t *type,
317 cl_kernel ocl_kernel, int idx, bool allow_undef = false);
318
319#ifdef DNNL_ENABLE_MEM_DEBUG
320cl_mem DNNL_WEAK clCreateBuffer_wrapper(cl_context context, cl_mem_flags flags,
321 size_t size, void *host_ptr, cl_int *errcode_ret);
322#else
323cl_mem clCreateBuffer_wrapper(cl_context context, cl_mem_flags flags,
324 size_t size, void *host_ptr, cl_int *errcode_ret);
325#endif
326
327status_t get_ocl_program_binary(cl_program program, cl_device_id device,
328 std::shared_ptr<compute::binary_t> &binary);
329
330status_t get_ocl_program_binary(cl_kernel kernel, cl_device_id device,
331 std::shared_ptr<compute::binary_t> &binary);
332
333void dump_kernel_binary(cl_kernel ocl_kernel);
334void dump_kernel_binary(
335 const engine_t *engine, const compute::kernel_t &binary_kernel);
336
337status_t get_kernel_arg_types(cl_kernel ocl_kernel,
338 std::vector<gpu::compute::scalar_type_t> *arg_types);
339
340status_t get_ocl_device_eu_count(cl_device_id device, int32_t *eu_count);
341
342status_t clone_kernel(cl_kernel kernel, cl_kernel *cloned_kernel);
343
344} // namespace ocl
345} // namespace gpu
346} // namespace impl
347} // namespace dnnl
348
349#endif
350