1 | /******************************************************************************* |
2 | * Copyright 2019-2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
17 | #include <algorithm> |
18 | #include <cstdio> |
19 | #include <cstring> |
20 | #include <mutex> |
21 | #include <CL/cl_ext.h> |
22 | |
23 | #include "gpu/ocl/ocl_gpu_engine.hpp" |
24 | #include "gpu/ocl/ocl_gpu_kernel.hpp" |
25 | #include "gpu/ocl/ocl_utils.hpp" |
26 | |
// Compile-time switch for the kernel binary dump helpers in this file.
// Defaults to enabled (1) unless the build has already defined it.
#ifndef DNNL_ENABLE_JIT_DUMP
#define DNNL_ENABLE_JIT_DUMP 1
#endif

// Fallback definitions for Intel-specific OpenCL query tokens. Each one is
// defined here only if the included OpenCL headers did not define it already.

// Passed to clGetKernelInfo to retrieve the kernel binary.
#ifndef CL_KERNEL_BINARY_PROGRAM_INTEL
#define CL_KERNEL_BINARY_PROGRAM_INTEL 0x407D
#endif

// Passed to clGetDeviceInfo to retrieve the number of slices.
#ifndef CL_DEVICE_NUM_SLICES_INTEL
#define CL_DEVICE_NUM_SLICES_INTEL 0x4252
#endif

// Passed to clGetDeviceInfo to retrieve the number of sub-slices per slice.
#ifndef CL_DEVICE_NUM_SUB_SLICES_PER_SLICE_INTEL
#define CL_DEVICE_NUM_SUB_SLICES_PER_SLICE_INTEL 0x4253
#endif

// Passed to clGetDeviceInfo to retrieve the number of EUs per sub-slice.
#ifndef CL_DEVICE_NUM_EUS_PER_SUB_SLICE_INTEL
#define CL_DEVICE_NUM_EUS_PER_SUB_SLICE_INTEL 0x4254
#endif
46 | |
47 | namespace dnnl { |
48 | namespace impl { |
49 | namespace gpu { |
50 | namespace ocl { |
51 | |
52 | template <typename T, typename F> |
53 | static std::string get_ocl_name(T obj, F get_func, cl_uint name_query) { |
54 | size_t name_size; |
55 | cl_int err = get_func(obj, name_query, 0, nullptr, &name_size); |
56 | // Ignore error. |
57 | if (err != CL_SUCCESS) return {}; |
58 | |
59 | // Include null terminator explicitly - to safely overwrite it in |
60 | // clGetKernelInfo |
61 | std::string name(name_size, 0); |
62 | err = get_func(obj, name_query, name_size, &name[0], nullptr); |
63 | // Ignore error. |
64 | if (err != CL_SUCCESS) return {}; |
65 | |
66 | // Remove the null terminator as std::string already includes it |
67 | name.resize(name_size - 1); |
68 | return name; |
69 | } |
70 | |
71 | static std::string get_kernel_name(cl_kernel kernel) { |
72 | return get_ocl_name(kernel, clGetKernelInfo, CL_KERNEL_FUNCTION_NAME); |
73 | } |
74 | |
75 | static std::string get_platform_name(cl_platform_id platform) { |
76 | return get_ocl_name(platform, clGetPlatformInfo, CL_PLATFORM_NAME); |
77 | } |
78 | |
79 | static bool is_intel_platform(cl_platform_id platform) { |
80 | auto name = get_platform_name(platform); |
81 | return name.find("Intel" ) != std::string::npos; |
82 | } |
83 | |
84 | status_t check_device( |
85 | engine_kind_t eng_kind, cl_device_id dev, cl_context ctx) { |
86 | assert(dev && ctx); |
87 | |
88 | // Check device and context consistency. |
89 | size_t dev_bytes; |
90 | OCL_CHECK( |
91 | clGetContextInfo(ctx, CL_CONTEXT_DEVICES, 0, nullptr, &dev_bytes)); |
92 | |
93 | std::vector<cl_device_id> ctx_devices(dev_bytes / sizeof(cl_device_id)); |
94 | OCL_CHECK(clGetContextInfo( |
95 | ctx, CL_CONTEXT_DEVICES, dev_bytes, &ctx_devices[0], nullptr)); |
96 | |
97 | bool found = false; |
98 | for (size_t i = 0; i < ctx_devices.size(); ++i) { |
99 | if (ctx_devices[i] == dev) { |
100 | found = true; |
101 | break; |
102 | } |
103 | } |
104 | if (!found) return status::invalid_arguments; |
105 | |
106 | // Check engine kind and device consistency. |
107 | cl_device_type dev_type; |
108 | OCL_CHECK(clGetDeviceInfo( |
109 | dev, CL_DEVICE_TYPE, sizeof(dev_type), &dev_type, nullptr)); |
110 | if ((eng_kind == engine_kind::cpu) |
111 | && (dev_type & CL_DEVICE_TYPE_CPU) == 0) { |
112 | return status::invalid_arguments; |
113 | } |
114 | if ((eng_kind == engine_kind::gpu) |
115 | && (dev_type & CL_DEVICE_TYPE_GPU) == 0) { |
116 | return status::invalid_arguments; |
117 | } |
118 | |
119 | // Check that the platform is an Intel platform. |
120 | cl_platform_id platform; |
121 | OCL_CHECK(clGetDeviceInfo( |
122 | dev, CL_DEVICE_PLATFORM, sizeof(platform), &platform, nullptr)); |
123 | if (!is_intel_platform(platform)) return status::invalid_arguments; |
124 | |
125 | return status::success; |
126 | } |
127 | |
128 | status_t get_ocl_devices( |
129 | std::vector<cl_device_id> *devices, cl_device_type device_type) { |
130 | cl_uint num_platforms = 0; |
131 | |
132 | cl_int err = clGetPlatformIDs(0, nullptr, &num_platforms); |
133 | // No platforms - a valid scenario |
134 | if (err == CL_PLATFORM_NOT_FOUND_KHR) return status::success; |
135 | |
136 | OCL_CHECK(err); |
137 | |
138 | std::vector<cl_platform_id> platforms(num_platforms); |
139 | OCL_CHECK(clGetPlatformIDs(num_platforms, &platforms[0], nullptr)); |
140 | |
141 | for (size_t i = 0; i < platforms.size(); ++i) { |
142 | if (!is_intel_platform(platforms[i])) continue; |
143 | |
144 | cl_uint num_devices = 0; |
145 | cl_int err = clGetDeviceIDs( |
146 | platforms[i], device_type, 0, nullptr, &num_devices); |
147 | |
148 | if (!utils::one_of(err, CL_SUCCESS, CL_DEVICE_NOT_FOUND)) { |
149 | return status::runtime_error; |
150 | } |
151 | |
152 | if (num_devices != 0) { |
153 | std::vector<cl_device_id> plat_devices; |
154 | plat_devices.resize(num_devices); |
155 | OCL_CHECK(clGetDeviceIDs(platforms[i], device_type, num_devices, |
156 | &plat_devices[0], nullptr)); |
157 | |
158 | // Use Intel devices only |
159 | for (size_t j = 0; j < plat_devices.size(); ++j) { |
160 | cl_uint vendor_id; |
161 | clGetDeviceInfo(plat_devices[j], CL_DEVICE_VENDOR_ID, |
162 | sizeof(cl_uint), &vendor_id, nullptr); |
163 | if (vendor_id == 0x8086) { |
164 | devices->push_back(plat_devices[j]); |
165 | } |
166 | } |
167 | } |
168 | } |
169 | // No devices found but still return success |
170 | return status::success; |
171 | } |
172 | |
173 | status_t get_ocl_device_index(size_t *index, cl_device_id device) { |
174 | std::vector<cl_device_id> ocl_devices; |
175 | CHECK(get_ocl_devices(&ocl_devices, CL_DEVICE_TYPE_GPU)); |
176 | |
177 | // Search the top level device unconditionally |
178 | auto parent_device = device; |
179 | auto top_level_device = device; |
180 | while (parent_device) { |
181 | top_level_device = parent_device; |
182 | OCL_CHECK(clGetDeviceInfo(top_level_device, CL_DEVICE_PARENT_DEVICE, |
183 | sizeof(cl_device_id), &parent_device, nullptr)); |
184 | } |
185 | |
186 | // Find the top level device in the list |
187 | auto it = std::find( |
188 | ocl_devices.begin(), ocl_devices.end(), top_level_device); |
189 | if (it != ocl_devices.end()) { |
190 | *index = it - ocl_devices.begin(); |
191 | return status::success; |
192 | } else { |
193 | *index = SIZE_MAX; |
194 | return status::invalid_arguments; |
195 | } |
196 | } |
197 | |
198 | cl_platform_id get_ocl_platform(cl_device_id device) { |
199 | cl_platform_id platform; |
200 | cl_int err = clGetDeviceInfo( |
201 | device, CL_DEVICE_PLATFORM, sizeof(platform), &platform, nullptr); |
202 | if (err != CL_SUCCESS) return nullptr; |
203 | return platform; |
204 | } |
205 | |
206 | cl_platform_id get_ocl_platform(engine_t *engine) { |
207 | return utils::downcast<ocl_gpu_engine_t *>(engine)->platform(); |
208 | } |
209 | |
210 | status_t get_ocl_kernel_arg_type(compute::scalar_type_t *type, |
211 | cl_kernel ocl_kernel, cl_uint idx, bool allow_undef) { |
212 | char s_type[16]; |
213 | OCL_CHECK(clGetKernelArgInfo(ocl_kernel, idx, CL_KERNEL_ARG_TYPE_NAME, |
214 | sizeof(s_type), s_type, nullptr)); |
215 | #define CASE(x) \ |
216 | if (!strcmp(STRINGIFY(x), s_type)) { \ |
217 | *type = compute::scalar_type_t::_##x; \ |
218 | return status::success; \ |
219 | } |
220 | CASE(char) |
221 | CASE(float) |
222 | CASE(half) |
223 | CASE(int) |
224 | CASE(long) |
225 | CASE(short) |
226 | CASE(uchar) |
227 | CASE(uint) |
228 | CASE(ulong) |
229 | CASE(ushort) |
230 | CASE(zero_pad_mask_t) |
231 | #undef CASE |
232 | |
233 | if (allow_undef) { |
234 | *type = compute::scalar_type_t::undef; |
235 | return status::success; |
236 | } |
237 | |
238 | assert(!"Not expected" ); |
239 | return status::runtime_error; |
240 | } |
241 | |
242 | cl_mem clCreateBuffer_wrapper(cl_context context, cl_mem_flags flags, |
243 | size_t size, void *host_ptr, cl_int *errcode_ret) { |
244 | return clCreateBuffer(context, flags, size, host_ptr, errcode_ret); |
245 | } |
246 | |
247 | status_t get_ocl_program_binary(cl_program program, cl_device_id device, |
248 | std::shared_ptr<compute::binary_t> &binary) { |
249 | |
250 | size_t n_devices = 0; |
251 | cl_int err = clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES, |
252 | sizeof(size_t), &n_devices, nullptr); |
253 | OCL_CHECK(err); |
254 | |
255 | std::vector<size_t> binarySize(n_devices); |
256 | err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, |
257 | sizeof(size_t) * n_devices, binarySize.data(), nullptr); |
258 | OCL_CHECK(err); |
259 | |
260 | std::vector<cl_device_id> devices(n_devices); |
261 | err = clGetProgramInfo(program, CL_PROGRAM_DEVICES, |
262 | sizeof(cl_device_id) * n_devices, devices.data(), nullptr); |
263 | OCL_CHECK(err); |
264 | |
265 | size_t device_idx = std::distance( |
266 | devices.begin(), std::find(devices.begin(), devices.end(), device)); |
267 | std::vector<uint8_t *> binary_pointers(n_devices); |
268 | std::vector<std::shared_ptr<compute::binary_t>> binaries(n_devices); |
269 | for (size_t i = 0; i < n_devices; ++i) { |
270 | binaries[i] = std::make_shared<compute::binary_t>(binarySize[i]); |
271 | binary_pointers[i] = binaries[i]->data(); |
272 | } |
273 | |
274 | err = clGetProgramInfo(program, CL_PROGRAM_BINARIES, |
275 | sizeof(uint8_t *) * n_devices, binary_pointers.data(), nullptr); |
276 | OCL_CHECK(err); |
277 | binary = binaries[device_idx]; |
278 | |
279 | return status::success; |
280 | } |
281 | |
282 | status_t get_ocl_program_binary(cl_kernel kernel, cl_device_id device, |
283 | std::shared_ptr<compute::binary_t> &binary) { |
284 | cl_int err; |
285 | |
286 | cl_program program; |
287 | err = clGetKernelInfo( |
288 | kernel, CL_KERNEL_PROGRAM, sizeof(program), &program, nullptr); |
289 | OCL_CHECK(err); |
290 | |
291 | return get_ocl_program_binary(program, device, binary); |
292 | } |
293 | |
294 | #if DNNL_ENABLE_JIT_DUMP |
295 | |
296 | void dump_kernel_binary(cl_kernel ocl_kernel) { |
297 | if (!get_jit_dump()) return; |
298 | |
299 | cl_int err; |
300 | |
301 | size_t binary_size; |
302 | err = clGetKernelInfo(ocl_kernel, CL_KERNEL_BINARY_PROGRAM_INTEL, 0, |
303 | nullptr, &binary_size); |
304 | // Ignore error. |
305 | if (err != CL_SUCCESS) return; |
306 | |
307 | std::vector<uint8_t> binary(binary_size); |
308 | err = clGetKernelInfo(ocl_kernel, CL_KERNEL_BINARY_PROGRAM_INTEL, |
309 | binary.size(), binary.data(), nullptr); |
310 | // Ignore error. |
311 | if (err != CL_SUCCESS) return; |
312 | |
313 | auto name = get_kernel_name(ocl_kernel); |
314 | // Ignore error. |
315 | if (name.empty()) return; |
316 | |
317 | static std::mutex m; |
318 | std::lock_guard<std::mutex> guard(m); |
319 | |
320 | static int counter = 0; |
321 | std::ostringstream fname; |
322 | fname << "dnnl_dump_gpu_" << name << "." << counter << ".bin" ; |
323 | |
324 | FILE *fp = fopen(fname.str().c_str(), "wb+" ); |
325 | |
326 | // Ignore error. |
327 | if (!fp) return; |
328 | |
329 | fwrite(binary.data(), binary.size(), 1, fp); |
330 | fclose(fp); |
331 | |
332 | counter++; |
333 | } |
334 | |
335 | void dump_kernel_binary( |
336 | const engine_t *engine, const compute::kernel_t &binary_kernel) { |
337 | if (!get_jit_dump()) return; |
338 | |
339 | compute::kernel_t realized_kernel; |
340 | auto status = binary_kernel.realize(&realized_kernel, engine, nullptr); |
341 | |
342 | // Ignore error. |
343 | if (status != status::success) return; |
344 | |
345 | auto *kernel |
346 | = utils::downcast<const ocl_gpu_kernel_t *>(realized_kernel.impl()); |
347 | dump_kernel_binary(kernel->ocl_kernel()); |
348 | } |
349 | #else |
// DNNL_ENABLE_JIT_DUMP evaluates to false in this branch: both overloads are
// kept as no-ops so that callers still compile and link.
void dump_kernel_binary(const engine_t *, const compute::kernel_t &) {}
void dump_kernel_binary(cl_kernel) {}
352 | #endif |
353 | |
354 | status_t get_kernel_arg_types(cl_kernel ocl_kernel, |
355 | std::vector<gpu::compute::scalar_type_t> *arg_types) { |
356 | cl_uint nargs; |
357 | OCL_CHECK(clGetKernelInfo( |
358 | ocl_kernel, CL_KERNEL_NUM_ARGS, sizeof(nargs), &nargs, nullptr)); |
359 | |
360 | *arg_types = std::vector<gpu::compute::scalar_type_t>(nargs); |
361 | |
362 | for (cl_uint i = 0; i < nargs; i++) { |
363 | gpu::compute::scalar_type_t type {}; |
364 | CHECK(gpu::ocl::get_ocl_kernel_arg_type( |
365 | &type, ocl_kernel, i, /*allow_undef=*/true)); |
366 | (*arg_types)[i] = type; |
367 | } |
368 | |
369 | return status::success; |
370 | } |
371 | |
372 | static status_t get_ocl_device_eu_count_intel( |
373 | cl_device_id device, int32_t *eu_count) { |
374 | cl_uint num_slices = 0; |
375 | cl_uint num_sub_slices_per_slice = 0; |
376 | cl_uint num_eus_per_sub_slice = 0; |
377 | |
378 | OCL_CHECK(clGetDeviceInfo(device, CL_DEVICE_NUM_SLICES_INTEL, |
379 | sizeof(num_slices), &num_slices, nullptr)); |
380 | OCL_CHECK(clGetDeviceInfo(device, CL_DEVICE_NUM_SUB_SLICES_PER_SLICE_INTEL, |
381 | sizeof(num_sub_slices_per_slice), &num_sub_slices_per_slice, |
382 | nullptr)); |
383 | OCL_CHECK(clGetDeviceInfo(device, CL_DEVICE_NUM_EUS_PER_SUB_SLICE_INTEL, |
384 | sizeof(num_eus_per_sub_slice), &num_eus_per_sub_slice, nullptr)); |
385 | |
386 | *eu_count = (int32_t)( |
387 | num_slices * num_sub_slices_per_slice * num_eus_per_sub_slice); |
388 | return status::success; |
389 | } |
390 | |
391 | status_t get_ocl_device_eu_count(cl_device_id device, int32_t *eu_count) { |
392 | // Try to use Intel-specific slices/sub-slices to deduce EU count. |
393 | auto status = get_ocl_device_eu_count_intel(device, eu_count); |
394 | if (status == status::success) return status; |
395 | |
396 | // If failed, fall back to common OpenCL query. |
397 | cl_uint max_compute_units = 0; |
398 | OCL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, |
399 | sizeof(max_compute_units), &max_compute_units, nullptr)); |
400 | *eu_count = (int32_t)max_compute_units; |
401 | |
402 | return status::success; |
403 | } |
404 | |
405 | status_t clone_kernel(cl_kernel kernel, cl_kernel *cloned_kernel) { |
406 | cl_int err; |
407 | #if !defined(DNNL_SYCL_HIP) && !defined(DNNL_SYCL_CUDA) \ |
408 | && defined(CL_VERSION_2_1) |
409 | *cloned_kernel = clCloneKernel(kernel, &err); |
410 | OCL_CHECK(err); |
411 | #else |
412 | // clCloneKernel is not available - recreate from the program. |
413 | auto name = get_kernel_name(kernel); |
414 | |
415 | cl_program program; |
416 | err = clGetKernelInfo( |
417 | kernel, CL_KERNEL_PROGRAM, sizeof(program), &program, nullptr); |
418 | OCL_CHECK(err); |
419 | |
420 | *cloned_kernel = clCreateKernel(program, name.c_str(), &err); |
421 | OCL_CHECK(err); |
422 | #endif |
423 | |
424 | return status::success; |
425 | } |
426 | |
427 | } // namespace ocl |
428 | } // namespace gpu |
429 | } // namespace impl |
430 | } // namespace dnnl |
431 | |