ocl_utils.cpp source code [oneDNN/src/gpu/ocl/ocl_utils.cpp]

1	/*******************************************************************************
2	* Copyright 2019-2022 Intel Corporation
3	*
4	* Licensed under the Apache License, Version 2.0 (the "License");
5	* you may not use this file except in compliance with the License.
6	* You may obtain a copy of the License at
7	*
8	* http://www.apache.org/licenses/LICENSE-2.0
9	*
10	* Unless required by applicable law or agreed to in writing, software
11	* distributed under the License is distributed on an "AS IS" BASIS,
12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13	* See the License for the specific language governing permissions and
14	* limitations under the License.
15	*******************************************************************************/
16
17	#include <algorithm>
18	#include <cstdio>
19	#include <cstring>
20	#include <mutex>
21	#include <CL/cl_ext.h>
22
23	#include "gpu/ocl/ocl_gpu_engine.hpp"
24	#include "gpu/ocl/ocl_gpu_kernel.hpp"
25	#include "gpu/ocl/ocl_utils.hpp"
26
// JIT dump support is compiled in unless the build explicitly defines
// DNNL_ENABLE_JIT_DUMP to another value.
27	#ifndef DNNL_ENABLE_JIT_DUMP
28	#define DNNL_ENABLE_JIT_DUMP 1
29	#endif
30
// The query tokens below are defined here only when the included OpenCL
// headers do not already provide them.
// NOTE(review): the numeric values are presumably taken from the Intel
// OpenCL extension headers - confirm against the extension specification.
31	#ifndef CL_KERNEL_BINARY_PROGRAM_INTEL
32	#define CL_KERNEL_BINARY_PROGRAM_INTEL 0x407D
33	#endif
34
35	#ifndef CL_DEVICE_NUM_SLICES_INTEL
36	#define CL_DEVICE_NUM_SLICES_INTEL 0x4252
37	#endif
38
39	#ifndef CL_DEVICE_NUM_SUB_SLICES_PER_SLICE_INTEL
40	#define CL_DEVICE_NUM_SUB_SLICES_PER_SLICE_INTEL 0x4253
41	#endif
42
43	#ifndef CL_DEVICE_NUM_EUS_PER_SUB_SLICE_INTEL
44	#define CL_DEVICE_NUM_EUS_PER_SUB_SLICE_INTEL 0x4254
45	#endif
46
47	namespace dnnl {
48	namespace impl {
49	namespace gpu {
50	namespace ocl {
51
52	template <typename T, typename F>
53	static std::string get_ocl_name(T obj, F get_func, cl_uint name_query) {
54	size_t name_size;
55	cl_int err = get_func(obj, name_query, `0`, nullptr, &name_size);
56	// Ignore error.
57	if (err != CL_SUCCESS) return {};
58
59	// Include null terminator explicitly - to safely overwrite it in
60	// clGetKernelInfo
61	std::string name(name_size, `0`);
62	err = get_func(obj, name_query, name_size, &name[`0`], nullptr);
63	// Ignore error.
64	if (err != CL_SUCCESS) return {};
65
66	// Remove the null terminator as std::string already includes it
67	name.resize(name_size - `1`);
68	return name;
69	}
70
71	static std::string get_kernel_name(cl_kernel kernel) {
72	return get_ocl_name(kernel, clGetKernelInfo, CL_KERNEL_FUNCTION_NAME);
73	}
74
75	static std::string get_platform_name(cl_platform_id platform) {
76	return get_ocl_name(platform, clGetPlatformInfo, CL_PLATFORM_NAME);
77	}
78
79	static bool is_intel_platform(cl_platform_id platform) {
80	auto name = get_platform_name(platform);
81	return name.find("Intel") != std::string::npos;
82	}
83
84	status_t check_device(
85	engine_kind_t eng_kind, cl_device_id dev, cl_context ctx) {
86	assert(dev && ctx);
87
88	// Check device and context consistency.
89	size_t dev_bytes;
90	OCL_CHECK(
91	clGetContextInfo(ctx, CL_CONTEXT_DEVICES, `0`, nullptr, &dev_bytes));
92
93	std::vector<cl_device_id> ctx_devices(dev_bytes / sizeof(cl_device_id));
94	OCL_CHECK(clGetContextInfo(
95	ctx, CL_CONTEXT_DEVICES, dev_bytes, &ctx_devices[`0`], nullptr));
96
97	bool found = false;
98	for (size_t i = `0`; i < ctx_devices.size(); ++i) {
99	if (ctx_devices[i] == dev) {
100	found = true;
101	break;
102	}
103	}
104	if (!found) return status::invalid_arguments;
105
106	// Check engine kind and device consistency.
107	cl_device_type dev_type;
108	OCL_CHECK(clGetDeviceInfo(
109	dev, CL_DEVICE_TYPE, sizeof(dev_type), &dev_type, nullptr));
110	if ((eng_kind == engine_kind::cpu)
111	&& (dev_type & CL_DEVICE_TYPE_CPU) == `0`) {
112	return status::invalid_arguments;
113	}
114	if ((eng_kind == engine_kind::gpu)
115	&& (dev_type & CL_DEVICE_TYPE_GPU) == `0`) {
116	return status::invalid_arguments;
117	}
118
119	// Check that the platform is an Intel platform.
120	cl_platform_id platform;
121	OCL_CHECK(clGetDeviceInfo(
122	dev, CL_DEVICE_PLATFORM, sizeof(platform), &platform, nullptr));
123	if (!is_intel_platform(platform)) return status::invalid_arguments;
124
125	return status::success;
126	}
127
128	status_t get_ocl_devices(
129	std::vector<cl_device_id> *devices, cl_device_type device_type) {
130	cl_uint num_platforms = `0`;
131
132	cl_int err = clGetPlatformIDs(`0`, nullptr, &num_platforms);
133	// No platforms - a valid scenario
134	if (err == CL_PLATFORM_NOT_FOUND_KHR) return status::success;
135
136	OCL_CHECK(err);
137
138	std::vector<cl_platform_id> platforms(num_platforms);
139	OCL_CHECK(clGetPlatformIDs(num_platforms, &platforms[`0`], nullptr));
140
141	for (size_t i = `0`; i < platforms.size(); ++i) {
142	if (!is_intel_platform(platforms[i])) continue;
143
144	cl_uint num_devices = `0`;
145	cl_int err = clGetDeviceIDs(
146	platforms[i], device_type, `0`, nullptr, &num_devices);
147
148	if (!utils::one_of(err, CL_SUCCESS, CL_DEVICE_NOT_FOUND)) {
149	return status::runtime_error;
150	}
151
152	if (num_devices != `0`) {
153	std::vector<cl_device_id> plat_devices;
154	plat_devices.resize(num_devices);
155	OCL_CHECK(clGetDeviceIDs(platforms[i], device_type, num_devices,
156	&plat_devices[`0`], nullptr));
157
158	// Use Intel devices only
159	for (size_t j = `0`; j < plat_devices.size(); ++j) {
160	cl_uint vendor_id;
161	clGetDeviceInfo(plat_devices[j], CL_DEVICE_VENDOR_ID,
162	sizeof(cl_uint), &vendor_id, nullptr);
163	if (vendor_id == `0x8086`) {
164	devices->push_back(plat_devices[j]);
165	}
166	}
167	}
168	}
169	// No devices found but still return success
170	return status::success;
171	}
172
173	status_t get_ocl_device_index(size_t *index, cl_device_id device) {
174	std::vector<cl_device_id> ocl_devices;
175	CHECK(get_ocl_devices(&ocl_devices, CL_DEVICE_TYPE_GPU));
176
177	// Search the top level device unconditionally
178	auto parent_device = device;
179	auto top_level_device = device;
180	while (parent_device) {
181	top_level_device = parent_device;
182	OCL_CHECK(clGetDeviceInfo(top_level_device, CL_DEVICE_PARENT_DEVICE,
183	sizeof(cl_device_id), &parent_device, nullptr));
184	}
185
186	// Find the top level device in the list
187	auto it = std::find(
188	ocl_devices.begin(), ocl_devices.end(), top_level_device);
189	if (it != ocl_devices.end()) {
190	*index = it - ocl_devices.begin();
191	return status::success;
192	} else {
193	*index = SIZE_MAX;
194	return status::invalid_arguments;
195	}
196	}
197
198	cl_platform_id get_ocl_platform(cl_device_id device) {
199	cl_platform_id platform;
200	cl_int err = clGetDeviceInfo(
201	device, CL_DEVICE_PLATFORM, sizeof(platform), &platform, nullptr);
202	if (err != CL_SUCCESS) return nullptr;
203	return platform;
204	}
205
206	cl_platform_id get_ocl_platform(engine_t *engine) {
207	return utils::downcast<ocl_gpu_engine_t *>(engine)->platform();
208	}
209
210	status_t get_ocl_kernel_arg_type(compute::scalar_type_t *type,
211	cl_kernel ocl_kernel, cl_uint idx, bool allow_undef) {
212	char s_type[`16`];
213	OCL_CHECK(clGetKernelArgInfo(ocl_kernel, idx, CL_KERNEL_ARG_TYPE_NAME,
214	sizeof(s_type), s_type, nullptr));
215	#define CASE(x) \
216	if (!strcmp(STRINGIFY(x), s_type)) { \
217	*type = compute::scalar_type_t::_##x; \
218	return status::success; \
219	}
220	CASE(char)
221	CASE(float)
222	CASE(half)
223	CASE(int)
224	CASE(long)
225	CASE(short)
226	CASE(uchar)
227	CASE(uint)
228	CASE(ulong)
229	CASE(ushort)
230	CASE(zero_pad_mask_t)
231	#undef CASE
232
233	if (allow_undef) {
234	*type = compute::scalar_type_t::undef;
235	return status::success;
236	}
237
238	assert(!"Not expected");
239	return status::runtime_error;
240	}
241
242	cl_mem clCreateBuffer_wrapper(cl_context context, cl_mem_flags flags,
243	size_t size, void host_ptr, cl_int errcode_ret) {
244	return clCreateBuffer(context, flags, size, host_ptr, errcode_ret);
245	}
246
247	status_t get_ocl_program_binary(cl_program program, cl_device_id device,
248	std::shared_ptr<compute::binary_t> &binary) {
249
250	size_t n_devices = `0`;
251	cl_int err = clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES,
252	sizeof(size_t), &n_devices, nullptr);
253	OCL_CHECK(err);
254
255	std::vector<size_t> binarySize(n_devices);
256	err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES,
257	sizeof(size_t) * n_devices, binarySize.data(), nullptr);
258	OCL_CHECK(err);
259
260	std::vector<cl_device_id> devices(n_devices);
261	err = clGetProgramInfo(program, CL_PROGRAM_DEVICES,
262	sizeof(cl_device_id) * n_devices, devices.data(), nullptr);
263	OCL_CHECK(err);
264
265	size_t device_idx = std::distance(
266	devices.begin(), std::find(devices.begin(), devices.end(), device));
267	std::vector<uint8_t *> binary_pointers(n_devices);
268	std::vector<std::shared_ptr<compute::binary_t>> binaries(n_devices);
269	for (size_t i = `0`; i < n_devices; ++i) {
270	binaries[i] = std::make_shared<compute::binary_t>(binarySize[i]);
271	binary_pointers[i] = binaries[i]->data();
272	}
273
274	err = clGetProgramInfo(program, CL_PROGRAM_BINARIES,
275	sizeof(uint8_t ) n_devices, binary_pointers.data(), nullptr);
276	OCL_CHECK(err);
277	binary = binaries[device_idx];
278
279	return status::success;
280	}
281
282	status_t get_ocl_program_binary(cl_kernel kernel, cl_device_id device,
283	std::shared_ptr<compute::binary_t> &binary) {
284	cl_int err;
285
286	cl_program program;
287	err = clGetKernelInfo(
288	kernel, CL_KERNEL_PROGRAM, sizeof(program), &program, nullptr);
289	OCL_CHECK(err);
290
291	return get_ocl_program_binary(program, device, binary);
292	}
293
294	#if DNNL_ENABLE_JIT_DUMP
295
296	void dump_kernel_binary(cl_kernel ocl_kernel) {
297	if (!get_jit_dump()) return;
298
299	cl_int err;
300
301	size_t binary_size;
302	err = clGetKernelInfo(ocl_kernel, CL_KERNEL_BINARY_PROGRAM_INTEL, `0`,
303	nullptr, &binary_size);
304	// Ignore error.
305	if (err != CL_SUCCESS) return;
306
307	std::vector<uint8_t> binary(binary_size);
308	err = clGetKernelInfo(ocl_kernel, CL_KERNEL_BINARY_PROGRAM_INTEL,
309	binary.size(), binary.data(), nullptr);
310	// Ignore error.
311	if (err != CL_SUCCESS) return;
312
313	auto name = get_kernel_name(ocl_kernel);
314	// Ignore error.
315	if (name.empty()) return;
316
317	static std::mutex m;
318	std::lock_guard<std::mutex> guard(m);
319
320	static int counter = `0`;
321	std::ostringstream fname;
322	fname << "dnnl_dump_gpu_" << name << "." << counter << ".bin";
323
324	FILE *fp = fopen(fname.str().c_str(), "wb+");
325
326	// Ignore error.
327	if (!fp) return;
328
329	fwrite(binary.data(), binary.size(), `1`, fp);
330	fclose(fp);
331
332	counter++;
333	}
334
335	void dump_kernel_binary(
336	const engine_t engine, const* compute::kernel_t &binary_kernel) {
337	if (!get_jit_dump()) return;
338
339	compute::kernel_t realized_kernel;
340	auto status = binary_kernel.realize(&realized_kernel, engine, nullptr);
341
342	// Ignore error.
343	if (status != status::success) return;
344
345	auto *kernel
346	= utils::downcast<const ocl_gpu_kernel_t *>(realized_kernel.impl());
347	dump_kernel_binary(kernel->ocl_kernel());
348	}
349	#else
350	void dump_kernel_binary(const engine_t , const* compute::kernel_t &) {}
351	void dump_kernel_binary(cl_kernel) {}
352	#endif
353
354	status_t get_kernel_arg_types(cl_kernel ocl_kernel,
355	std::vector<gpu::compute::scalar_type_t> *arg_types) {
356	cl_uint nargs;
357	OCL_CHECK(clGetKernelInfo(
358	ocl_kernel, CL_KERNEL_NUM_ARGS, sizeof(nargs), &nargs, nullptr));
359
360	*arg_types = std::vector<gpu::compute::scalar_type_t>(nargs);
361
362	for (cl_uint i = `0`; i < nargs; i++) {
363	gpu::compute::scalar_type_t type {};
364	CHECK(gpu::ocl::get_ocl_kernel_arg_type(
365	&type, ocl_kernel, i, /allow_undef=/true));
366	(*arg_types)[i] = type;
367	}
368
369	return status::success;
370	}
371
372	static status_t get_ocl_device_eu_count_intel(
373	cl_device_id device, int32_t *eu_count) {
374	cl_uint num_slices = `0`;
375	cl_uint num_sub_slices_per_slice = `0`;
376	cl_uint num_eus_per_sub_slice = `0`;
377
378	OCL_CHECK(clGetDeviceInfo(device, CL_DEVICE_NUM_SLICES_INTEL,
379	sizeof(num_slices), &num_slices, nullptr));
380	OCL_CHECK(clGetDeviceInfo(device, CL_DEVICE_NUM_SUB_SLICES_PER_SLICE_INTEL,
381	sizeof(num_sub_slices_per_slice), &num_sub_slices_per_slice,
382	nullptr));
383	OCL_CHECK(clGetDeviceInfo(device, CL_DEVICE_NUM_EUS_PER_SUB_SLICE_INTEL,
384	sizeof(num_eus_per_sub_slice), &num_eus_per_sub_slice, nullptr));
385
386	*eu_count = (int32_t)(
387	num_slices * num_sub_slices_per_slice * num_eus_per_sub_slice);
388	return status::success;
389	}
390
391	status_t get_ocl_device_eu_count(cl_device_id device, int32_t *eu_count) {
392	// Try to use Intel-specific slices/sub-slices to deduce EU count.
393	auto status = get_ocl_device_eu_count_intel(device, eu_count);
394	if (status == status::success) return status;
395
396	// If failed, fall back to common OpenCL query.
397	cl_uint max_compute_units = `0`;
398	OCL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS,
399	sizeof(max_compute_units), &max_compute_units, nullptr));
400	*eu_count = (int32_t)max_compute_units;
401
402	return status::success;
403	}
404
405	status_t clone_kernel(cl_kernel kernel, cl_kernel *cloned_kernel) {
406	cl_int err;
407	#if !defined(DNNL_SYCL_HIP) && !defined(DNNL_SYCL_CUDA) \
408	&& defined(CL_VERSION_2_1)
409	*cloned_kernel = clCloneKernel(kernel, &err);
410	OCL_CHECK(err);
411	#else
412	// clCloneKernel is not available - recreate from the program.
413	auto name = get_kernel_name(kernel);
414
415	cl_program program;
416	err = clGetKernelInfo(
417	kernel, CL_KERNEL_PROGRAM, sizeof(program), &program, nullptr);
418	OCL_CHECK(err);
419
420	*cloned_kernel = clCreateKernel(program, name.c_str(), &err);
421	OCL_CHECK(err);
422	#endif
423
424	return status::success;
425	}
426
427	} // namespace ocl
428	} // namespace gpu
429	} // namespace impl
430	} // namespace dnnl
431

Browse the source code of oneDNN/src/gpu/ocl/ocl_utils.cpp