/*******************************************************************************
* Copyright 2019-2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include <algorithm>
#include <cstdio>
#include <cstring>
#include <mutex>
#include <sstream>
#include <CL/cl_ext.h>

#include "gpu/ocl/ocl_gpu_engine.hpp"
#include "gpu/ocl/ocl_gpu_kernel.hpp"
#include "gpu/ocl/ocl_utils.hpp"

#ifndef DNNL_ENABLE_JIT_DUMP
#define DNNL_ENABLE_JIT_DUMP 1
#endif

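// Fallback definitions of the Intel OpenCL extension tokens used below, for
// builds where the available CL headers do not provide them.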
#ifndef CL_KERNEL_BINARY_PROGRAM_INTEL
#define CL_KERNEL_BINARY_PROGRAM_INTEL 0x407D
#endif

#ifndef CL_DEVICE_NUM_SLICES_INTEL
#define CL_DEVICE_NUM_SLICES_INTEL 0x4252
#endif

#ifndef CL_DEVICE_NUM_SUB_SLICES_PER_SLICE_INTEL
#define CL_DEVICE_NUM_SUB_SLICES_PER_SLICE_INTEL 0x4253
#endif

#ifndef CL_DEVICE_NUM_EUS_PER_SUB_SLICE_INTEL
#define CL_DEVICE_NUM_EUS_PER_SUB_SLICE_INTEL 0x4254
#endif

namespace dnnl {
namespace impl {
namespace gpu {
namespace ocl {

template <typename T, typename F>
static std::string get_ocl_name(T obj, F get_func, cl_uint name_query) {
    size_t name_size;
    cl_int err = get_func(obj, name_query, 0, nullptr, &name_size);
    // Ignore error.
    if (err != CL_SUCCESS) return {};

    // Reserve space for the null terminator reported by the query so that the
    // second call can safely write it.
    std::string name(name_size, 0);
    err = get_func(obj, name_query, name_size, &name[0], nullptr);
    // Ignore error.
    if (err != CL_SUCCESS) return {};

    // Drop the trailing null terminator; std::string maintains its own.
    name.resize(name_size - 1);
    return name;
}

static std::string get_kernel_name(cl_kernel kernel) {
    return get_ocl_name(kernel, clGetKernelInfo, CL_KERNEL_FUNCTION_NAME);
}

static std::string get_platform_name(cl_platform_id platform) {
    return get_ocl_name(platform, clGetPlatformInfo, CL_PLATFORM_NAME);
}

static bool is_intel_platform(cl_platform_id platform) {
    auto name = get_platform_name(platform);
    return name.find("Intel") != std::string::npos;
}

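// Verifies that the device belongs to the given context, that its type matches
// the requested engine kind, and that it resides on an Intel OpenCL platform.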
status_t check_device(
        engine_kind_t eng_kind, cl_device_id dev, cl_context ctx) {
    assert(dev && ctx);

    // Check device and context consistency.
    size_t dev_bytes;
    OCL_CHECK(
            clGetContextInfo(ctx, CL_CONTEXT_DEVICES, 0, nullptr, &dev_bytes));

    std::vector<cl_device_id> ctx_devices(dev_bytes / sizeof(cl_device_id));
    OCL_CHECK(clGetContextInfo(
            ctx, CL_CONTEXT_DEVICES, dev_bytes, &ctx_devices[0], nullptr));

    bool found = false;
    for (size_t i = 0; i < ctx_devices.size(); ++i) {
        if (ctx_devices[i] == dev) {
            found = true;
            break;
        }
    }
    if (!found) return status::invalid_arguments;

    // Check engine kind and device consistency.
    cl_device_type dev_type;
    OCL_CHECK(clGetDeviceInfo(
            dev, CL_DEVICE_TYPE, sizeof(dev_type), &dev_type, nullptr));
    if ((eng_kind == engine_kind::cpu)
            && (dev_type & CL_DEVICE_TYPE_CPU) == 0) {
        return status::invalid_arguments;
    }
    if ((eng_kind == engine_kind::gpu)
            && (dev_type & CL_DEVICE_TYPE_GPU) == 0) {
        return status::invalid_arguments;
    }

    // Check that the platform is an Intel platform.
    cl_platform_id platform;
    OCL_CHECK(clGetDeviceInfo(
            dev, CL_DEVICE_PLATFORM, sizeof(platform), &platform, nullptr));
    if (!is_intel_platform(platform)) return status::invalid_arguments;

    return status::success;
}

status_t get_ocl_devices(
        std::vector<cl_device_id> *devices, cl_device_type device_type) {
    cl_uint num_platforms = 0;

    cl_int err = clGetPlatformIDs(0, nullptr, &num_platforms);
    // No platforms - a valid scenario.
    if (err == CL_PLATFORM_NOT_FOUND_KHR) return status::success;

    OCL_CHECK(err);

    std::vector<cl_platform_id> platforms(num_platforms);
    OCL_CHECK(clGetPlatformIDs(num_platforms, &platforms[0], nullptr));

    for (size_t i = 0; i < platforms.size(); ++i) {
        if (!is_intel_platform(platforms[i])) continue;

        cl_uint num_devices = 0;
        cl_int err = clGetDeviceIDs(
                platforms[i], device_type, 0, nullptr, &num_devices);

        if (!utils::one_of(err, CL_SUCCESS, CL_DEVICE_NOT_FOUND)) {
            return status::runtime_error;
        }

        if (num_devices != 0) {
            std::vector<cl_device_id> plat_devices;
            plat_devices.resize(num_devices);
            OCL_CHECK(clGetDeviceIDs(platforms[i], device_type, num_devices,
                    &plat_devices[0], nullptr));

            // Keep Intel devices only.
            for (size_t j = 0; j < plat_devices.size(); ++j) {
                cl_uint vendor_id = 0;
                err = clGetDeviceInfo(plat_devices[j], CL_DEVICE_VENDOR_ID,
                        sizeof(cl_uint), &vendor_id, nullptr);
                if (err == CL_SUCCESS && vendor_id == 0x8086) {
                    devices->push_back(plat_devices[j]);
                }
            }
        }
    }
    // Finding no devices is not an error; return success with an empty list.
    return status::success;
}

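// Returns the index of `device` (or of its top-level parent device, if
// `device` is a sub-device) within the list produced by get_ocl_devices().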
status_t get_ocl_device_index(size_t *index, cl_device_id device) {
    std::vector<cl_device_id> ocl_devices;
    CHECK(get_ocl_devices(&ocl_devices, CL_DEVICE_TYPE_GPU));

    // Walk up the sub-device hierarchy to the top-level device, which is the
    // one present in the device list.
    auto parent_device = device;
    auto top_level_device = device;
    while (parent_device) {
        top_level_device = parent_device;
        OCL_CHECK(clGetDeviceInfo(top_level_device, CL_DEVICE_PARENT_DEVICE,
                sizeof(cl_device_id), &parent_device, nullptr));
    }

    // Find the top-level device in the list.
    auto it = std::find(
            ocl_devices.begin(), ocl_devices.end(), top_level_device);
    if (it != ocl_devices.end()) {
        *index = it - ocl_devices.begin();
        return status::success;
    } else {
        *index = SIZE_MAX;
        return status::invalid_arguments;
    }
}

cl_platform_id get_ocl_platform(cl_device_id device) {
    cl_platform_id platform;
    cl_int err = clGetDeviceInfo(
            device, CL_DEVICE_PLATFORM, sizeof(platform), &platform, nullptr);
    if (err != CL_SUCCESS) return nullptr;
    return platform;
}

cl_platform_id get_ocl_platform(engine_t *engine) {
    return utils::downcast<ocl_gpu_engine_t *>(engine)->platform();
}

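// Maps the OpenCL argument type name reported by clGetKernelArgInfo to the
// corresponding compute::scalar_type_t value.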
status_t get_ocl_kernel_arg_type(compute::scalar_type_t *type,
        cl_kernel ocl_kernel, cl_uint idx, bool allow_undef) {
    char s_type[16];
    OCL_CHECK(clGetKernelArgInfo(ocl_kernel, idx, CL_KERNEL_ARG_TYPE_NAME,
            sizeof(s_type), s_type, nullptr));
#define CASE(x) \
    if (!strcmp(STRINGIFY(x), s_type)) { \
        *type = compute::scalar_type_t::_##x; \
        return status::success; \
    }
    CASE(char)
    CASE(float)
    CASE(half)
    CASE(int)
    CASE(long)
    CASE(short)
    CASE(uchar)
    CASE(uint)
    CASE(ulong)
    CASE(ushort)
    CASE(zero_pad_mask_t)
#undef CASE

    if (allow_undef) {
        *type = compute::scalar_type_t::undef;
        return status::success;
    }

    assert(!"Not expected");
    return status::runtime_error;
}

cl_mem clCreateBuffer_wrapper(cl_context context, cl_mem_flags flags,
        size_t size, void *host_ptr, cl_int *errcode_ret) {
    return clCreateBuffer(context, flags, size, host_ptr, errcode_ret);
}

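// Queries the binaries of all devices the program was built for and returns
// the one that corresponds to `device`.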
status_t get_ocl_program_binary(cl_program program, cl_device_id device,
        std::shared_ptr<compute::binary_t> &binary) {

    cl_uint n_devices = 0;
    cl_int err = clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES,
            sizeof(n_devices), &n_devices, nullptr);
    OCL_CHECK(err);

    std::vector<size_t> binary_sizes(n_devices);
    err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES,
            sizeof(size_t) * n_devices, binary_sizes.data(), nullptr);
    OCL_CHECK(err);

    std::vector<cl_device_id> devices(n_devices);
    err = clGetProgramInfo(program, CL_PROGRAM_DEVICES,
            sizeof(cl_device_id) * n_devices, devices.data(), nullptr);
    OCL_CHECK(err);

    size_t device_idx = std::distance(
            devices.begin(), std::find(devices.begin(), devices.end(), device));
    // The requested device must be among the program's devices.
    if (device_idx >= n_devices) return status::invalid_arguments;

    std::vector<uint8_t *> binary_pointers(n_devices);
    std::vector<std::shared_ptr<compute::binary_t>> binaries(n_devices);
    for (size_t i = 0; i < n_devices; ++i) {
        binaries[i] = std::make_shared<compute::binary_t>(binary_sizes[i]);
        binary_pointers[i] = binaries[i]->data();
    }

    err = clGetProgramInfo(program, CL_PROGRAM_BINARIES,
            sizeof(uint8_t *) * n_devices, binary_pointers.data(), nullptr);
    OCL_CHECK(err);
    binary = binaries[device_idx];

    return status::success;
}

status_t get_ocl_program_binary(cl_kernel kernel, cl_device_id device,
        std::shared_ptr<compute::binary_t> &binary) {
    cl_int err;

    cl_program program;
    err = clGetKernelInfo(
            kernel, CL_KERNEL_PROGRAM, sizeof(program), &program, nullptr);
    OCL_CHECK(err);

    return get_ocl_program_binary(program, device, binary);
}

#if DNNL_ENABLE_JIT_DUMP

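// Dumps the kernel binary to a file named dnnl_dump_gpu_<kernel_name>.<N>.bin
// in the current working directory, where N is a process-wide counter. Active
// only when JIT dumping is enabled via get_jit_dump().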
void dump_kernel_binary(cl_kernel ocl_kernel) {
    if (!get_jit_dump()) return;

    cl_int err;

    size_t binary_size;
    err = clGetKernelInfo(ocl_kernel, CL_KERNEL_BINARY_PROGRAM_INTEL, 0,
            nullptr, &binary_size);
    // Ignore error.
    if (err != CL_SUCCESS) return;

    std::vector<uint8_t> binary(binary_size);
    err = clGetKernelInfo(ocl_kernel, CL_KERNEL_BINARY_PROGRAM_INTEL,
            binary.size(), binary.data(), nullptr);
    // Ignore error.
    if (err != CL_SUCCESS) return;

    auto name = get_kernel_name(ocl_kernel);
    // Ignore error.
    if (name.empty()) return;

    static std::mutex m;
    std::lock_guard<std::mutex> guard(m);

    static int counter = 0;
    std::ostringstream fname;
    fname << "dnnl_dump_gpu_" << name << "." << counter << ".bin";

    FILE *fp = fopen(fname.str().c_str(), "wb+");

    // Ignore error.
    if (!fp) return;

    fwrite(binary.data(), binary.size(), 1, fp);
    fclose(fp);

    counter++;
}

void dump_kernel_binary(
        const engine_t *engine, const compute::kernel_t &binary_kernel) {
    if (!get_jit_dump()) return;

    compute::kernel_t realized_kernel;
    auto status = binary_kernel.realize(&realized_kernel, engine, nullptr);

    // Ignore error.
    if (status != status::success) return;

    auto *kernel
            = utils::downcast<const ocl_gpu_kernel_t *>(realized_kernel.impl());
    dump_kernel_binary(kernel->ocl_kernel());
}
#else
void dump_kernel_binary(const engine_t *, const compute::kernel_t &) {}
void dump_kernel_binary(cl_kernel) {}
#endif

status_t get_kernel_arg_types(cl_kernel ocl_kernel,
        std::vector<gpu::compute::scalar_type_t> *arg_types) {
    cl_uint nargs;
    OCL_CHECK(clGetKernelInfo(
            ocl_kernel, CL_KERNEL_NUM_ARGS, sizeof(nargs), &nargs, nullptr));

    *arg_types = std::vector<gpu::compute::scalar_type_t>(nargs);

    for (cl_uint i = 0; i < nargs; i++) {
        gpu::compute::scalar_type_t type {};
        CHECK(gpu::ocl::get_ocl_kernel_arg_type(
                &type, ocl_kernel, i, /*allow_undef=*/true));
        (*arg_types)[i] = type;
    }

    return status::success;
}

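// Computes the EU count from Intel-specific device attributes:
//   eu_count = num_slices * num_sub_slices_per_slice * num_eus_per_sub_slice.
// The queries fail on devices that do not expose these attributes, in which
// case the caller falls back to CL_DEVICE_MAX_COMPUTE_UNITS.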
static status_t get_ocl_device_eu_count_intel(
        cl_device_id device, int32_t *eu_count) {
    cl_uint num_slices = 0;
    cl_uint num_sub_slices_per_slice = 0;
    cl_uint num_eus_per_sub_slice = 0;

    OCL_CHECK(clGetDeviceInfo(device, CL_DEVICE_NUM_SLICES_INTEL,
            sizeof(num_slices), &num_slices, nullptr));
    OCL_CHECK(clGetDeviceInfo(device, CL_DEVICE_NUM_SUB_SLICES_PER_SLICE_INTEL,
            sizeof(num_sub_slices_per_slice), &num_sub_slices_per_slice,
            nullptr));
    OCL_CHECK(clGetDeviceInfo(device, CL_DEVICE_NUM_EUS_PER_SUB_SLICE_INTEL,
            sizeof(num_eus_per_sub_slice), &num_eus_per_sub_slice, nullptr));

    *eu_count = (int32_t)(
            num_slices * num_sub_slices_per_slice * num_eus_per_sub_slice);
    return status::success;
}

status_t get_ocl_device_eu_count(cl_device_id device, int32_t *eu_count) {
    // Try to deduce the EU count from the Intel-specific slice/sub-slice
    // attributes first.
    auto status = get_ocl_device_eu_count_intel(device, eu_count);
    if (status == status::success) return status;

    // If that fails, fall back to the generic OpenCL query.
    cl_uint max_compute_units = 0;
    OCL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS,
            sizeof(max_compute_units), &max_compute_units, nullptr));
    *eu_count = (int32_t)max_compute_units;

    return status::success;
}

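// Clones an OpenCL kernel: uses clCloneKernel where available (OpenCL 2.1+);
// otherwise re-creates the kernel from its program by name.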
status_t clone_kernel(cl_kernel kernel, cl_kernel *cloned_kernel) {
    cl_int err;
#if !defined(DNNL_SYCL_HIP) && !defined(DNNL_SYCL_CUDA) \
        && defined(CL_VERSION_2_1)
    *cloned_kernel = clCloneKernel(kernel, &err);
    OCL_CHECK(err);
#else
    // clCloneKernel is not available - recreate from the program.
    auto name = get_kernel_name(kernel);

    cl_program program;
    err = clGetKernelInfo(
            kernel, CL_KERNEL_PROGRAM, sizeof(program), &program, nullptr);
    OCL_CHECK(err);

    *cloned_kernel = clCreateKernel(program, name.c_str(), &err);
    OCL_CHECK(err);
#endif

    return status::success;
}

} // namespace ocl
} // namespace gpu
} // namespace impl
} // namespace dnnl