1 | /******************************************************************************* |
2 | * Copyright 2019-2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
17 | #ifndef GPU_OCL_OCL_UTILS_HPP |
18 | #define GPU_OCL_OCL_UTILS_HPP |
19 | |
20 | #include <cinttypes> |
21 | #include <memory> |
22 | #include <sstream> |
23 | #include <string.h> |
24 | #include <string> |
25 | #include <utility> |
26 | #include <vector> |
27 | #include <CL/cl.h> |
28 | #include <initializer_list> |
29 | #include <type_traits> |
30 | #include <unordered_map> |
31 | #include <unordered_set> |
32 | |
33 | #include "common/c_types_map.hpp" |
34 | #include "common/cpp_compat.hpp" |
35 | #include "common/internal_defs.hpp" |
36 | #include "common/utils.hpp" |
37 | #include "common/verbose.hpp" |
38 | #include "gpu/compute/kernel_arg_list.hpp" |
39 | #include "gpu/compute/utils.hpp" |
40 | |
41 | namespace dnnl { |
42 | namespace impl { |
43 | namespace gpu { |
44 | |
// Forward declaration: this header only refers to compute::kernel_t by
// reference (see the dump_kernel_binary overload taking a kernel_t), so the
// full class definition is not required here.
namespace compute {
class kernel_t;
}
48 | |
49 | namespace ocl { |
50 | |
51 | inline status_t convert_to_dnnl(cl_int cl_status) { |
52 | switch (cl_status) { |
53 | case CL_SUCCESS: return status::success; |
54 | case CL_MEM_OBJECT_ALLOCATION_FAILURE: |
55 | case CL_OUT_OF_RESOURCES: |
56 | case CL_OUT_OF_HOST_MEMORY: return status::out_of_memory; |
57 | case CL_DEVICE_NOT_FOUND: |
58 | case CL_DEVICE_NOT_AVAILABLE: |
59 | case CL_COMPILER_NOT_AVAILABLE: |
60 | case CL_PROFILING_INFO_NOT_AVAILABLE: |
61 | case CL_MEM_COPY_OVERLAP: |
62 | case CL_IMAGE_FORMAT_MISMATCH: |
63 | case CL_IMAGE_FORMAT_NOT_SUPPORTED: |
64 | case CL_BUILD_PROGRAM_FAILURE: |
65 | case CL_MAP_FAILURE: |
66 | case CL_MISALIGNED_SUB_BUFFER_OFFSET: |
67 | case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: |
68 | case CL_COMPILE_PROGRAM_FAILURE: |
69 | case CL_LINKER_NOT_AVAILABLE: |
70 | case CL_LINK_PROGRAM_FAILURE: |
71 | case CL_DEVICE_PARTITION_FAILED: |
72 | case CL_KERNEL_ARG_INFO_NOT_AVAILABLE: return status::runtime_error; |
73 | case CL_INVALID_VALUE: |
74 | case CL_INVALID_DEVICE_TYPE: |
75 | case CL_INVALID_CONTEXT: |
76 | case CL_INVALID_QUEUE_PROPERTIES: |
77 | case CL_INVALID_COMMAND_QUEUE: |
78 | case CL_INVALID_HOST_PTR: |
79 | case CL_INVALID_MEM_OBJECT: |
80 | case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: |
81 | case CL_INVALID_IMAGE_SIZE: |
82 | case CL_INVALID_SAMPLER: |
83 | case CL_INVALID_BINARY: |
84 | case CL_INVALID_BUILD_OPTIONS: |
85 | case CL_INVALID_PROGRAM: |
86 | case CL_INVALID_PROGRAM_EXECUTABLE: |
87 | case CL_INVALID_KERNEL_NAME: |
88 | case CL_INVALID_KERNEL_DEFINITION: // FI |
89 | case CL_INVALID_KERNEL: |
90 | case CL_INVALID_ARG_INDEX: |
91 | case CL_INVALID_ARG_VALUE: return status::invalid_arguments; |
92 | |
93 | default: return status::runtime_error; |
94 | } |
95 | } |
96 | |
// Alignment constant for OpenCL buffers.
// NOTE(review): presumably expressed in bytes; the code consuming this value
// is not visible in this header -- confirm at the use sites.
enum { OCL_BUFFER_ALIGNMENT = 128 };
98 | |
// Prints `msg` (a C string) as an "onednn_verbose,gpu,error" line to stdout
// when dnnl::impl::get_verbose() returns non-zero.
// get_verbose() is fully qualified, as in MAYBE_REPORT_OCL_ERROR, so the
// macro expands correctly regardless of the namespace it is used from.
#define MAYBE_REPORT_ERROR(msg) \
    do { \
        if (dnnl::impl::get_verbose()) \
            printf("onednn_verbose,gpu,error,%s\n", (msg)); \
    } while (0)
103 | |
// Prints the numeric OpenCL status `s` as an "onednn_verbose,gpu,ocl_error"
// line to stdout when dnnl::impl::get_verbose() returns non-zero; otherwise
// does nothing.
#define MAYBE_REPORT_OCL_ERROR(s) \
    do { \
        if (dnnl::impl::get_verbose()) { \
            printf("onednn_verbose,gpu,ocl_error,%d\n", \
                    static_cast<int>(s)); \
        } \
    } while (0)
109 | |
// Evaluates the OpenCL call `x` exactly once. If the result is not
// CL_SUCCESS, reports it through MAYBE_REPORT_OCL_ERROR and returns from the
// enclosing function, which must return void.
//
// The status lives in a deliberately unusual local name: with a plain `s`, an
// argument expression that mentions a caller variable called `s` would be
// shadowed by the declaration and read its own uninitialized value. `x` is
// parenthesized so the complete expression is used as the initializer.
#define OCL_CHECK_V(x) \
    do { \
        const cl_int ocl_check_v_status_ = (x); \
        if (ocl_check_v_status_ != CL_SUCCESS) { \
            MAYBE_REPORT_OCL_ERROR(ocl_check_v_status_); \
            return; \
        } \
    } while (0)
118 | |
// Evaluates the OpenCL call `x` exactly once. If the result is not
// CL_SUCCESS, reports it through MAYBE_REPORT_OCL_ERROR and returns the
// status produced by dnnl::impl::gpu::ocl::convert_to_dnnl() from the
// enclosing function, which must return status_t.
//
// The status lives in a deliberately unusual local name: with a plain `s`, an
// argument expression that mentions a caller variable called `s` would be
// shadowed by the declaration and read its own uninitialized value. `x` is
// parenthesized so the complete expression is used as the initializer.
#define OCL_CHECK(x) \
    do { \
        const cl_int ocl_check_status_ = (x); \
        if (ocl_check_status_ != CL_SUCCESS) { \
            MAYBE_REPORT_OCL_ERROR(ocl_check_status_); \
            return dnnl::impl::gpu::ocl::convert_to_dnnl(ocl_check_status_); \
        } \
    } while (0)
127 | |
// Checks three conditions:
// 1. The device and the context are compatible, i.e. the device is one of
//    the context's devices.
// 2. The device type matches the passed engine kind.
// 3. The device/context platform is an Intel platform.
status_t check_device(engine_kind_t eng_kind, cl_device_id dev, cl_context ctx);
134 | |
// Presumably collects the OpenCL devices of the requested `device_type` into
// `devices`.
// NOTE(review): the definition is not in this header; which platforms are
// enumerated and whether `devices` is cleared first should be confirmed there.
status_t get_ocl_devices(
        std::vector<cl_device_id> *devices, cl_device_type device_type);
137 | |
// Writes an index for `device` to `index`.
// NOTE(review): presumably the position of `device` in the list produced by
// get_ocl_devices -- confirm against the definition.
status_t get_ocl_device_index(size_t *index, cl_device_id device);
139 | |
// Return the OpenCL platform associated with a device or with an engine.
// ext_func_t uses the engine overload to select its per-platform function
// pointer.
// NOTE(review): the value returned on failure is decided by the definition,
// which is not visible in this header.
cl_platform_id get_ocl_platform(cl_device_id device);
cl_platform_id get_ocl_platform(engine_t *engine);
142 | |
143 | namespace details { |
144 | |
145 | // OpenCL objects reference counting traits |
146 | template <typename T> |
147 | struct ocl_ref_traits; |
148 | //{ |
149 | // static void retain(T t) {} |
150 | // static void release(T t) {} |
151 | //}; |
152 | |
153 | template <> |
154 | struct ocl_ref_traits<cl_context> { |
155 | static void retain(cl_context t) { clRetainContext(t); } |
156 | static void release(cl_context t) { clReleaseContext(t); } |
157 | }; |
158 | |
159 | template <> |
160 | struct ocl_ref_traits<cl_command_queue> { |
161 | static void retain(cl_command_queue t) { clRetainCommandQueue(t); } |
162 | static void release(cl_command_queue t) { clReleaseCommandQueue(t); } |
163 | }; |
164 | |
165 | template <> |
166 | struct ocl_ref_traits<cl_program> { |
167 | static void retain(cl_program t) { clRetainProgram(t); } |
168 | static void release(cl_program t) { clReleaseProgram(t); } |
169 | }; |
170 | |
171 | template <> |
172 | struct ocl_ref_traits<cl_kernel> { |
173 | static void retain(cl_kernel t) { clRetainKernel(t); } |
174 | static void release(cl_kernel t) { clReleaseKernel(t); } |
175 | }; |
176 | |
177 | template <> |
178 | struct ocl_ref_traits<cl_mem> { |
179 | static void retain(cl_mem t) { clRetainMemObject(t); } |
180 | static void release(cl_mem t) { clReleaseMemObject(t); } |
181 | }; |
182 | |
183 | template <> |
184 | struct ocl_ref_traits<cl_sampler> { |
185 | static void retain(cl_sampler t) { clRetainSampler(t); } |
186 | static void release(cl_sampler t) { clReleaseSampler(t); } |
187 | }; |
188 | |
189 | template <> |
190 | struct ocl_ref_traits<cl_event> { |
191 | static void retain(cl_event t) { clRetainEvent(t); } |
192 | static void release(cl_event t) { clReleaseEvent(t); } |
193 | }; |
194 | |
195 | template <> |
196 | struct ocl_ref_traits<cl_device_id> { |
197 | static void retain(cl_device_id t) { clRetainDevice(t); } |
198 | static void release(cl_device_id t) { clReleaseDevice(t); } |
199 | }; |
200 | |
201 | } // namespace details |
202 | |
203 | // Generic class providing RAII support for OpenCL objects |
204 | template <typename T> |
205 | struct ocl_wrapper_t { |
206 | ocl_wrapper_t(T t = nullptr, bool retain = false) : t_(t) { |
207 | if (retain) { do_retain(); } |
208 | } |
209 | |
210 | ocl_wrapper_t(const ocl_wrapper_t &other) : t_(other.t_) { do_retain(); } |
211 | |
212 | ocl_wrapper_t(ocl_wrapper_t &&other) noexcept : t_(std::move(other.t_)) {} |
213 | |
214 | ocl_wrapper_t &operator=(ocl_wrapper_t other) { |
215 | using std::swap; |
216 | swap(t_, other.t_); |
217 | return *this; |
218 | } |
219 | |
220 | ~ocl_wrapper_t() { do_release(); } |
221 | |
222 | operator T() const { return t_; } |
223 | T get() const { return t_; } |
224 | |
225 | T release() { |
226 | T t = t_; |
227 | t_ = nullptr; |
228 | return t; |
229 | } |
230 | |
231 | private: |
232 | T t_; |
233 | |
234 | void do_retain() { |
235 | if (t_) { details::ocl_ref_traits<T>::retain(t_); } |
236 | } |
237 | |
238 | void do_release() { |
239 | if (t_) { details::ocl_ref_traits<T>::release(t_); } |
240 | } |
241 | }; |
242 | |
243 | // Constructs an OpenCL wrapper object (providing RAII support) |
244 | template <typename T> |
245 | ocl_wrapper_t<T> make_ocl_wrapper(T t) { |
246 | return ocl_wrapper_t<T>(t); |
247 | } |
248 | |
249 | template <typename F> |
250 | struct ext_func_t { |
251 | ext_func_t(const char *name) : ext_func_ptrs_(intel_platforms().size()) { |
252 | for (size_t i = 0; i < intel_platforms().size(); ++i) { |
253 | auto p = intel_platforms()[i]; |
254 | auto it = ext_func_ptrs_.insert({p, load_ext_func(p, name)}); |
255 | assert(it.second); |
256 | MAYBE_UNUSED(it); |
257 | } |
258 | } |
259 | |
260 | template <typename... Args> |
261 | typename cpp_compat::invoke_result<F, Args...>::type operator()( |
262 | engine_t *engine, Args... args) const { |
263 | auto f = get_func(engine); |
264 | return f(args...); |
265 | } |
266 | |
267 | F get_func(engine_t *engine) const { |
268 | return get_func(get_ocl_platform(engine)); |
269 | } |
270 | |
271 | F get_func(cl_platform_id platform) const { |
272 | return ext_func_ptrs_.at(platform); |
273 | } |
274 | |
275 | private: |
276 | std::unordered_map<cl_platform_id, F> ext_func_ptrs_; |
277 | |
278 | static F load_ext_func(cl_platform_id platform, const char *name) { |
279 | return reinterpret_cast<F>( |
280 | clGetExtensionFunctionAddressForPlatform(platform, name)); |
281 | } |
282 | |
283 | static const std::vector<cl_platform_id> &intel_platforms() { |
284 | static auto intel_platforms = get_intel_platforms(); |
285 | return intel_platforms; |
286 | } |
287 | |
288 | static std::vector<cl_platform_id> get_intel_platforms() { |
289 | cl_uint num_platforms = 0; |
290 | cl_int err = clGetPlatformIDs(0, nullptr, &num_platforms); |
291 | if (err != CL_SUCCESS) return {}; |
292 | |
293 | std::vector<cl_platform_id> platforms(num_platforms); |
294 | err = clGetPlatformIDs(num_platforms, platforms.data(), nullptr); |
295 | if (err != CL_SUCCESS) return {}; |
296 | |
297 | std::vector<cl_platform_id> intel_platforms; |
298 | char vendor_name[128] = {}; |
299 | for (cl_platform_id p : platforms) { |
300 | err = clGetPlatformInfo(p, CL_PLATFORM_VENDOR, sizeof(vendor_name), |
301 | vendor_name, nullptr); |
302 | if (err != CL_SUCCESS) continue; |
303 | if (std::string(vendor_name).find("Intel" ) != std::string::npos) |
304 | intel_platforms.push_back(p); |
305 | } |
306 | |
307 | // OpenCL can return a list of platforms that contains duplicates. |
308 | std::sort(intel_platforms.begin(), intel_platforms.end()); |
309 | intel_platforms.erase( |
310 | std::unique(intel_platforms.begin(), intel_platforms.end()), |
311 | intel_platforms.end()); |
312 | return intel_platforms; |
313 | } |
314 | }; |
315 | |
// Presumably reports the scalar type of argument `idx` of `ocl_kernel`
// through `type`.
// NOTE(review): `allow_undef` (default false) presumably permits argument
// types without a scalar_type_t equivalent instead of failing -- confirm in
// the definition, which is not part of this header.
status_t get_ocl_kernel_arg_type(compute::scalar_type_t *type,
        cl_kernel ocl_kernel, int idx, bool allow_undef = false);
318 | |
// Buffer-creation wrapper taking a context, flags, size, host pointer and an
// error-code output parameter. When DNNL_ENABLE_MEM_DEBUG is defined the
// declaration additionally carries DNNL_WEAK.
// NOTE(review): presumably this forwards to clCreateBuffer and the weak
// linkage lets a memory-debugging build supply its own definition -- confirm.
#ifdef DNNL_ENABLE_MEM_DEBUG
cl_mem DNNL_WEAK clCreateBuffer_wrapper(cl_context context, cl_mem_flags flags,
        size_t size, void *host_ptr, cl_int *errcode_ret);
#else
cl_mem clCreateBuffer_wrapper(cl_context context, cl_mem_flags flags,
        size_t size, void *host_ptr, cl_int *errcode_ret);
#endif
326 | |
// Presumably retrieves the binary built for `device` and stores it in
// `binary`; one overload starts from a cl_program, the other from a
// cl_kernel.
// NOTE(review): how the kernel overload locates its program is defined in
// the implementation, not visible here.
status_t get_ocl_program_binary(cl_program program, cl_device_id device,
        std::shared_ptr<compute::binary_t> &binary);

status_t get_ocl_program_binary(cl_kernel kernel, cl_device_id device,
        std::shared_ptr<compute::binary_t> &binary);
332 | |
// Debugging helpers that dump the binary of a kernel, given either a raw
// cl_kernel or an engine together with a compute::kernel_t.
// NOTE(review): destination and format of the dump, and the conditions under
// which anything is written, are defined in the implementation -- confirm.
void dump_kernel_binary(cl_kernel ocl_kernel);
void dump_kernel_binary(
        const engine_t *engine, const compute::kernel_t &binary_kernel);
336 | |
// Presumably fills `arg_types` with the scalar type of each argument of
// `ocl_kernel`.
// NOTE(review): ordering and the handling of non-scalar arguments are defined
// in the implementation -- confirm.
status_t get_kernel_arg_types(cl_kernel ocl_kernel,
        std::vector<gpu::compute::scalar_type_t> *arg_types);
339 | |
// Writes the EU count of `device` to `eu_count`.
// NOTE(review): "EU" presumably stands for execution unit; how the value is
// queried is not visible in this header.
status_t get_ocl_device_eu_count(cl_device_id device, int32_t *eu_count);
341 | |
// Produces a clone of `kernel` and returns it through `cloned_kernel`.
// NOTE(review): presumably the caller owns the returned kernel and must
// release it -- confirm against the definition.
status_t clone_kernel(cl_kernel kernel, cl_kernel *cloned_kernel);
343 | |
344 | } // namespace ocl |
345 | } // namespace gpu |
346 | } // namespace impl |
347 | } // namespace dnnl |
348 | |
349 | #endif |
350 | |