1 | /******************************************************************************* |
2 | * Copyright 2020-2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
#include <cassert>
#include <mutex>
18 | #include <thread> |
19 | #include <type_traits> |
20 | |
21 | #include "gpu/compute/device_info.hpp" |
22 | |
23 | #ifdef DNNL_WITH_SYCL |
24 | #include "sycl/sycl_engine_base.hpp" |
25 | #endif |
26 | |
27 | namespace dnnl { |
28 | namespace impl { |
29 | namespace gpu { |
30 | namespace compute { |
31 | |
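// Returns a bitmask of device extensions that can be assumed for the given
// architecture even when the runtime does not (yet) report them.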
32 | uint64_t get_future_extensions(compute::gpu_arch_t gpu_arch) { |
33 | using namespace compute; |
34 | |
35 | uint64_t extensions = 0; |
36 | switch (gpu_arch) { |
37 | case gpu_arch_t::gen9: |
38 | case gpu_arch_t::gen11: break; |
39 | case gpu_arch_t::xe_hp: |
40 | case gpu_arch_t::xe_hpg: |
41 | case gpu_arch_t::xe_hpc: |
42 | extensions |= (uint64_t)device_ext_t::intel_global_float_atomics; |
43 | extensions |= (uint64_t) |
44 | device_ext_t::intel_subgroup_matrix_multiply_accumulate; |
45 | extensions |= (uint64_t)device_ext_t:: |
46 | intel_subgroup_split_matrix_multiply_accumulate; |
47 | extensions |
48 | |= (uint64_t)device_ext_t::intel_variable_eu_thread_count; |
49 | extensions |= (uint64_t)device_ext_t::future_bf16_cvt; |
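            // Intentional fall-through: Xe-HP and newer also get the Xe-LP
            // extensions below.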
50 | case gpu_arch_t::xe_lp: |
51 | extensions |= (uint64_t)device_ext_t::intel_subgroup_local_block_io; |
52 | extensions |= (uint64_t)device_ext_t::intel_dot_accumulate; |
53 | break; |
54 | case gpu_arch_t::unknown: break; |
55 | } |
56 | return extensions; |
57 | } |
58 | |
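// Returns whether kernels may use the given subgroup (SIMD) size on this
// device; Xe-HPC does not support SIMD8, hence the narrower set.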
59 | bool device_info_t::mayiuse_sub_group(int size) const { |
60 | switch (gpu_arch()) { |
61 | case gpu_arch_t::xe_hpc: return utils::one_of(size, 16, 32); |
62 | default: return utils::one_of(size, 8, 16, 32); |
63 | } |
64 | } |
65 | |
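// Maximum number of EUs a single work-group can occupy; this appears to
// match the EU count of one subslice (or dual subslice / Xe-core).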
66 | int device_info_t::max_eus_per_wg(gpu_arch_t gpu_arch) { |
67 | switch (gpu_arch) { |
68 | case gpu::compute::gpu_arch_t::gen9: |
69 | case gpu::compute::gpu_arch_t::gen11: |
70 | case gpu::compute::gpu_arch_t::xe_hpc: return 8; |
71 | case gpu::compute::gpu_arch_t::xe_lp: |
72 | case gpu::compute::gpu_arch_t::xe_hp: |
73 | case gpu::compute::gpu_arch_t::xe_hpg: return 16; |
74 | case gpu::compute::gpu_arch_t::unknown: return 8; |
75 | } |
76 | return 8; |
77 | } |
78 | |
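// Widest subgroup (SIMD) size supported by the architecture.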
79 | int device_info_t::max_subgroup_size(gpu_arch_t gpu_arch) { |
80 | switch (gpu_arch) { |
81 | case gpu::compute::gpu_arch_t::gen9: return 16; |
82 | case gpu::compute::gpu_arch_t::gen11: |
83 | case gpu::compute::gpu_arch_t::xe_hpc: return 32; |
84 | case gpu::compute::gpu_arch_t::xe_lp: |
85 | case gpu::compute::gpu_arch_t::xe_hp: |
86 | case gpu::compute::gpu_arch_t::xe_hpg: |
87 | case gpu::compute::gpu_arch_t::unknown: return 16; |
88 | } |
89 | return 16; |
90 | } |
91 | |
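// Number of hardware threads per EU. On Xe-HP and newer, large GRF mode
// doubles the register file available to a thread and so halves the thread
// count.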
92 | int device_info_t::threads_per_eu(gpu_arch_t gpu_arch, bool large_grf_mode) { |
93 | switch (gpu_arch) { |
94 | case gpu::compute::gpu_arch_t::gen9: |
95 | case gpu::compute::gpu_arch_t::gen11: |
96 | case gpu::compute::gpu_arch_t::xe_lp: return 7; |
97 | case gpu::compute::gpu_arch_t::xe_hp: |
98 | case gpu::compute::gpu_arch_t::xe_hpg: |
99 | case gpu::compute::gpu_arch_t::xe_hpc: return large_grf_mode ? 4 : 8; |
100 | case gpu::compute::gpu_arch_t::unknown: return 7; |
101 | } |
102 | return 7; |
103 | } |
104 | |
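// Heuristic SLM budget for a single thread group: the SLM capacity of one
// subslice (or DSS) scaled down by the number of threads per EU.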
105 | int device_info_t::max_slm_size_per_tg( |
106 | gpu_arch_t gpu_arch, bool large_grf_mode) { |
107 | int slm_size = 0; // SLM size per SS or DSS. |
108 | switch (gpu_arch) { |
109 | case gpu::compute::gpu_arch_t::gen9: |
110 | case gpu::compute::gpu_arch_t::gen11: slm_size = (1 << 16); break; |
111 | case gpu::compute::gpu_arch_t::xe_lp: |
112 | case gpu::compute::gpu_arch_t::xe_hp: |
113 | case gpu::compute::gpu_arch_t::xe_hpc: |
114 | case gpu::compute::gpu_arch_t::xe_hpg: slm_size = (1 << 17); break; |
        case gpu::compute::gpu_arch_t::unknown: assert(!"not expected");
116 | } |
117 | return slm_size / threads_per_eu(gpu_arch, large_grf_mode); |
118 | } |
119 | |
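// Number of SLM memory banks; useful for reasoning about bank conflicts.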
120 | int device_info_t::slm_memory_bank_count(gpu_arch_t gpu_arch) { |
121 | switch (gpu_arch) { |
122 | case gpu::compute::gpu_arch_t::gen9: |
123 | case gpu::compute::gpu_arch_t::gen11: |
124 | case gpu::compute::gpu_arch_t::xe_lp: return 16; |
125 | case gpu::compute::gpu_arch_t::xe_hp: return 65; |
126 | case gpu::compute::gpu_arch_t::xe_hpc: return 64; |
127 | case gpu::compute::gpu_arch_t::xe_hpg: return 32; |
        case gpu::compute::gpu_arch_t::unknown: assert(!"not expected");
129 | } |
130 | return 32; |
131 | } |

// Returns SLM bank granularity in bytes.
133 | int device_info_t::slm_memory_bank_granularity(gpu_arch_t gpu_arch) { |
134 | switch (gpu_arch) { |
135 | case gpu::compute::gpu_arch_t::gen9: |
136 | case gpu::compute::gpu_arch_t::gen11: |
137 | case gpu::compute::gpu_arch_t::xe_lp: |
138 | case gpu::compute::gpu_arch_t::xe_hp: return 4; |
139 | case gpu::compute::gpu_arch_t::xe_hpc: |
140 | case gpu::compute::gpu_arch_t::xe_hpg: return 8; |
        case gpu::compute::gpu_arch_t::unknown: assert(!"not expected");
142 | } |
143 | return 4; |
144 | } |
145 | |
146 | status_t device_info_t::init_attributes_common(engine_t *engine) { |
    // TODO: Fix for discrete GPUs. The code below is written for integrated
    // GPUs, assuming that the GPU's last-level cache is shared with the CPU
    // (where it is the CPU's L3 cache).
151 | |
    // XXX: this is the only place where the GPU runtime functionally depends
    // on the CPU runtime. llc_cache_size_ is used in only one gen9 kernel,
    // so an approximate cache size is sufficient.

    // llc_cache_size_ = cpu::platform::get_per_core_cache_size(3)
    //         * cpu::platform::get_num_cores();
    // The assumption is that hyper-threading is likely enabled on client
    // systems, so estimate ~1 MB of LLC per logical core.
159 | llc_cache_size_ = std::thread::hardware_concurrency() * (1 << 20); |
160 | |
161 | bool ocl_backend = true; |
162 | |
163 | #ifdef DNNL_WITH_SYCL |
164 | using namespace impl::sycl; |
165 | if (engine->runtime_kind() == runtime_kind::sycl) { |
166 | auto *sycl_engine = utils::downcast<const sycl_engine_base_t *>(engine); |
167 | ocl_backend = (sycl_engine->backend() == backend_t::opencl); |
168 | } |
169 | #endif |
170 | |
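    // Total hardware thread count: [0] for the default GRF mode, [1] for
    // large GRF mode (fewer threads per EU).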
171 | hw_threads_[0] = eu_count_ * threads_per_eu(gpu_arch_, false); |
172 | hw_threads_[1] = eu_count_ * threads_per_eu(gpu_arch_, true); |
173 | |
174 | max_eus_per_wg_ = max_eus_per_wg(gpu_arch_); |
175 | max_subgroup_size_ = max_subgroup_size(gpu_arch_); |
176 | |
177 | mayiuse_non_uniform_work_groups_ = ocl_backend; |
178 | |
179 | return status::success; |
180 | } |
181 | |
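// Serializes the device info so it can be stored in (and later restored
// from) a cache blob. When an existing cache blob is supplied, its contents
// are stored verbatim. The write order must match init_from_cache_blob().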
182 | status_t device_info_t::init_serialized_device_info( |
183 | const std::vector<uint8_t> &cache_blob) { |
184 | if (!cache_blob.empty()) { |
185 | serialized_device_info_.write(cache_blob.data(), cache_blob.size()); |
186 | return status::success; |
187 | } |
188 | |
189 | serialized_device_info_.write(&gpu_arch_); |
190 | serialized_device_info_.write(&stepping_id_); |
191 | serialized_device_info_.write(&runtime_version_.major); |
192 | serialized_device_info_.write(&runtime_version_.minor); |
193 | serialized_device_info_.write(&runtime_version_.build); |
194 | serialized_device_info_.write(hw_threads_, 2); |
195 | serialized_device_info_.write(&eu_count_); |
196 | serialized_device_info_.write(&max_eus_per_wg_); |
197 | serialized_device_info_.write(&max_subgroup_size_); |
198 | serialized_device_info_.write(&max_wg_size_); |
199 | serialized_device_info_.write(&llc_cache_size_); |
200 | serialized_device_info_.write(&extensions_); |
201 | serialized_device_info_.write(&mayiuse_ngen_kernels_); |
202 | serialized_device_info_.write(&mayiuse_non_uniform_work_groups_); |
203 | |
204 | const size_t name_size = name_.size(); |
205 | serialized_device_info_.write(&name_size); |
206 | serialized_device_info_.write(name_.data(), name_size); |
207 | |
208 | return status::success; |
209 | } |
210 | |
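// Restores the device info from a cache blob produced by
// init_serialized_device_info(); fields must be read in the same order and
// with the same types as they were written.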
211 | status_t device_info_t::init_from_cache_blob( |
212 | const std::vector<uint8_t> &cache_blob) { |
213 | if (cache_blob.empty()) return status::invalid_arguments; |
214 | |
215 | size_t pos = 0; |
216 | #define DESERIALIZE(val, expected_type) \ |
217 | static_assert(std::is_same<std::remove_reference<decltype(val)>::type, \ |
218 | expected_type>::value, \ |
219 | #val " has incorrect type"); \ |
220 | (val) = *reinterpret_cast<const expected_type *>(cache_blob.data() + pos); \ |
221 | pos += sizeof(expected_type); |
222 | |
223 | DESERIALIZE(gpu_arch_, compute::gpu_arch_t); |
224 | DESERIALIZE(stepping_id_, int); |
225 | DESERIALIZE(runtime_version_.major, int); |
226 | DESERIALIZE(runtime_version_.minor, int); |
227 | DESERIALIZE(runtime_version_.build, int); |
228 | DESERIALIZE(hw_threads_[0], int32_t); |
229 | DESERIALIZE(hw_threads_[1], int32_t); |
230 | DESERIALIZE(eu_count_, int32_t); |
231 | DESERIALIZE(max_eus_per_wg_, int32_t); |
232 | DESERIALIZE(max_subgroup_size_, int32_t); |
233 | DESERIALIZE(max_wg_size_, size_t); |
234 | DESERIALIZE(llc_cache_size_, size_t); |
235 | DESERIALIZE(extensions_, uint64_t); |
236 | DESERIALIZE(mayiuse_ngen_kernels_, bool); |
237 | DESERIALIZE(mayiuse_non_uniform_work_groups_, bool); |
238 | #undef DESERIALIZE |
239 | |
    // name_ is not a trivially copyable type, so deserialize it manually.
241 | const size_t name_size |
242 | = *reinterpret_cast<const size_t *>(cache_blob.data() + pos); |
243 | pos += sizeof(size_t); |
244 | name_ = std::string( |
245 | reinterpret_cast<const char *>(cache_blob.data() + pos), name_size); |
246 | pos += name_size; |
247 | assert(name_size == name_.size()); |
248 | assert(pos == cache_blob.size()); |
249 | |
250 | return status::success; |
251 | } |
252 | |
253 | } // namespace compute |
254 | } // namespace gpu |
255 | } // namespace impl |
256 | } // namespace dnnl |
257 | |