/*******************************************************************************
* Copyright 2020-2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include <mutex>
#include <thread>
#include <type_traits>

#include "gpu/compute/device_info.hpp"

#ifdef DNNL_WITH_SYCL
#include "sycl/sycl_engine_base.hpp"
#endif

namespace dnnl {
namespace impl {
namespace gpu {
namespace compute {

uint64_t get_future_extensions(compute::gpu_arch_t gpu_arch) {
    using namespace compute;

    uint64_t extensions = 0;
    switch (gpu_arch) {
        case gpu_arch_t::gen9:
        case gpu_arch_t::gen11: break;
        case gpu_arch_t::xe_hp:
        case gpu_arch_t::xe_hpg:
        case gpu_arch_t::xe_hpc:
            extensions |= (uint64_t)device_ext_t::intel_global_float_atomics;
            extensions |= (uint64_t)
                    device_ext_t::intel_subgroup_matrix_multiply_accumulate;
            extensions |= (uint64_t)device_ext_t::
                    intel_subgroup_split_matrix_multiply_accumulate;
            extensions
                    |= (uint64_t)device_ext_t::intel_variable_eu_thread_count;
            extensions |= (uint64_t)device_ext_t::future_bf16_cvt;
            // Intentional fall-through: Xe-HP and newer architectures also
            // expose all Xe-LP extensions.
        case gpu_arch_t::xe_lp:
            extensions |= (uint64_t)device_ext_t::intel_subgroup_local_block_io;
            extensions |= (uint64_t)device_ext_t::intel_dot_accumulate;
            break;
        case gpu_arch_t::unknown: break;
    }
    return extensions;
}
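
// Usage sketch (illustrative only; variable names are hypothetical): callers
// typically test individual extension bits in the returned mask, e.g.
//
//     uint64_t ext = get_future_extensions(gpu_arch_t::xe_hpg);
//     bool has_dpas = ext
//             & (uint64_t)device_ext_t::intel_subgroup_matrix_multiply_accumulate;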

bool device_info_t::mayiuse_sub_group(int size) const {
    switch (gpu_arch()) {
        case gpu_arch_t::xe_hpc: return utils::one_of(size, 16, 32);
        default: return utils::one_of(size, 8, 16, 32);
    }
}
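
// Usage sketch (illustrative only): a kernel that requires SIMD16 subgroups
// would typically be guarded as
//
//     if (!device_info->mayiuse_sub_group(16)) return status::unimplemented;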

int device_info_t::max_eus_per_wg(gpu_arch_t gpu_arch) {
    switch (gpu_arch) {
        case gpu::compute::gpu_arch_t::gen9:
        case gpu::compute::gpu_arch_t::gen11:
        case gpu::compute::gpu_arch_t::xe_hpc: return 8;
        case gpu::compute::gpu_arch_t::xe_lp:
        case gpu::compute::gpu_arch_t::xe_hp:
        case gpu::compute::gpu_arch_t::xe_hpg: return 16;
        case gpu::compute::gpu_arch_t::unknown: return 8;
    }
    return 8;
}

int device_info_t::max_subgroup_size(gpu_arch_t gpu_arch) {
    switch (gpu_arch) {
        case gpu::compute::gpu_arch_t::gen9: return 16;
        case gpu::compute::gpu_arch_t::gen11:
        case gpu::compute::gpu_arch_t::xe_hpc: return 32;
        case gpu::compute::gpu_arch_t::xe_lp:
        case gpu::compute::gpu_arch_t::xe_hp:
        case gpu::compute::gpu_arch_t::xe_hpg:
        case gpu::compute::gpu_arch_t::unknown: return 16;
    }
    return 16;
}

int device_info_t::threads_per_eu(gpu_arch_t gpu_arch, bool large_grf_mode) {
    switch (gpu_arch) {
        case gpu::compute::gpu_arch_t::gen9:
        case gpu::compute::gpu_arch_t::gen11:
        case gpu::compute::gpu_arch_t::xe_lp: return 7;
        case gpu::compute::gpu_arch_t::xe_hp:
        case gpu::compute::gpu_arch_t::xe_hpg:
        case gpu::compute::gpu_arch_t::xe_hpc: return large_grf_mode ? 4 : 8;
        case gpu::compute::gpu_arch_t::unknown: return 7;
    }
    return 7;
}

int device_info_t::max_slm_size_per_tg(
        gpu_arch_t gpu_arch, bool large_grf_mode) {
    int slm_size = 0; // SLM size per SS or DSS.
    switch (gpu_arch) {
        case gpu::compute::gpu_arch_t::gen9:
        case gpu::compute::gpu_arch_t::gen11: slm_size = (1 << 16); break;
        case gpu::compute::gpu_arch_t::xe_lp:
        case gpu::compute::gpu_arch_t::xe_hp:
        case gpu::compute::gpu_arch_t::xe_hpc:
        case gpu::compute::gpu_arch_t::xe_hpg: slm_size = (1 << 17); break;
        case gpu::compute::gpu_arch_t::unknown: assert(!"not expected");
    }
    return slm_size / threads_per_eu(gpu_arch, large_grf_mode);
}
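
// Worked example (values derived from the tables above): for xe_hp in the
// default GRF mode this yields (1 << 17) / 8 = 16 KiB per thread group, and
// (1 << 17) / 4 = 32 KiB when large GRF mode halves the thread count per EU.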

int device_info_t::slm_memory_bank_count(gpu_arch_t gpu_arch) {
    switch (gpu_arch) {
        case gpu::compute::gpu_arch_t::gen9:
        case gpu::compute::gpu_arch_t::gen11:
        case gpu::compute::gpu_arch_t::xe_lp: return 16;
        case gpu::compute::gpu_arch_t::xe_hp: return 65;
        case gpu::compute::gpu_arch_t::xe_hpc: return 64;
        case gpu::compute::gpu_arch_t::xe_hpg: return 32;
        case gpu::compute::gpu_arch_t::unknown: assert(!"not expected");
    }
    return 32;
}
// Returns SLM bank granularity in bytes.
int device_info_t::slm_memory_bank_granularity(gpu_arch_t gpu_arch) {
    switch (gpu_arch) {
        case gpu::compute::gpu_arch_t::gen9:
        case gpu::compute::gpu_arch_t::gen11:
        case gpu::compute::gpu_arch_t::xe_lp:
        case gpu::compute::gpu_arch_t::xe_hp: return 4;
        case gpu::compute::gpu_arch_t::xe_hpc:
        case gpu::compute::gpu_arch_t::xe_hpg: return 8;
        case gpu::compute::gpu_arch_t::unknown: assert(!"not expected");
    }
    return 4;
}
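
// Illustrative note (rough model, derived from the two helpers above): when
// consecutive addresses map to consecutive banks, bank_count * granularity
// bytes can be accessed without bank conflicts, e.g. 32 * 8 = 256 bytes on
// xe_hpg versus 16 * 4 = 64 bytes on xe_lp.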

status_t device_info_t::init_attributes_common(engine_t *engine) {
    // TODO: Fix for discrete GPUs. The code below is written for integrated
    // GPUs, assuming that the GPU's last-level cache (LLC) is shared with the
    // CPU; on integrated GPUs the LLC is the CPU's L3 cache.

    // XXX: this is the only place where the GPU runtime functionally depends
    // on the CPU runtime. `llc_cache_size_` is used in only one gen9 kernel,
    // so an approximate cache size is sufficient.

    // llc_cache_size_ = cpu::platform::get_per_core_cache_size(3)
    //        * cpu::platform::get_num_cores();
    // The assumption is that hyper-threading is likely enabled on client
    // systems, so the LLC size is estimated as 1 MiB per hardware thread
    // (e.g., a 16-thread client CPU yields a 16 MiB estimate).
    llc_cache_size_ = std::thread::hardware_concurrency() * (1 << 20);

    bool ocl_backend = true;

#ifdef DNNL_WITH_SYCL
    using namespace impl::sycl;
    if (engine->runtime_kind() == runtime_kind::sycl) {
        auto *sycl_engine = utils::downcast<const sycl_engine_base_t *>(engine);
        ocl_backend = (sycl_engine->backend() == backend_t::opencl);
    }
#endif

    hw_threads_[0] = eu_count_ * threads_per_eu(gpu_arch_, false);
    hw_threads_[1] = eu_count_ * threads_per_eu(gpu_arch_, true);

    max_eus_per_wg_ = max_eus_per_wg(gpu_arch_);
    max_subgroup_size_ = max_subgroup_size(gpu_arch_);

    mayiuse_non_uniform_work_groups_ = ocl_backend;

    return status::success;
}

status_t device_info_t::init_serialized_device_info(
        const std::vector<uint8_t> &cache_blob) {
    if (!cache_blob.empty()) {
        serialized_device_info_.write(cache_blob.data(), cache_blob.size());
        return status::success;
    }

    serialized_device_info_.write(&gpu_arch_);
    serialized_device_info_.write(&stepping_id_);
    serialized_device_info_.write(&runtime_version_.major);
    serialized_device_info_.write(&runtime_version_.minor);
    serialized_device_info_.write(&runtime_version_.build);
    serialized_device_info_.write(hw_threads_, 2);
    serialized_device_info_.write(&eu_count_);
    serialized_device_info_.write(&max_eus_per_wg_);
    serialized_device_info_.write(&max_subgroup_size_);
    serialized_device_info_.write(&max_wg_size_);
    serialized_device_info_.write(&llc_cache_size_);
    serialized_device_info_.write(&extensions_);
    serialized_device_info_.write(&mayiuse_ngen_kernels_);
    serialized_device_info_.write(&mayiuse_non_uniform_work_groups_);

    const size_t name_size = name_.size();
    serialized_device_info_.write(&name_size);
    serialized_device_info_.write(name_.data(), name_size);

    return status::success;
}

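// Note: the deserialization order below must stay in sync with the field
// order written by init_serialized_device_info() above.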
status_t device_info_t::init_from_cache_blob(
        const std::vector<uint8_t> &cache_blob) {
    if (cache_blob.empty()) return status::invalid_arguments;

    size_t pos = 0;
// Reads the next `expected_type` value from the blob into `val` and advances
// `pos`; the static_assert guards against a mismatch between the member type
// and the serialized type.
#define DESERIALIZE(val, expected_type) \
    static_assert(std::is_same<std::remove_reference<decltype(val)>::type, \
                          expected_type>::value, \
            #val " has incorrect type"); \
    (val) = *reinterpret_cast<const expected_type *>(cache_blob.data() + pos); \
    pos += sizeof(expected_type);

    DESERIALIZE(gpu_arch_, compute::gpu_arch_t);
    DESERIALIZE(stepping_id_, int);
    DESERIALIZE(runtime_version_.major, int);
    DESERIALIZE(runtime_version_.minor, int);
    DESERIALIZE(runtime_version_.build, int);
    DESERIALIZE(hw_threads_[0], int32_t);
    DESERIALIZE(hw_threads_[1], int32_t);
    DESERIALIZE(eu_count_, int32_t);
    DESERIALIZE(max_eus_per_wg_, int32_t);
    DESERIALIZE(max_subgroup_size_, int32_t);
    DESERIALIZE(max_wg_size_, size_t);
    DESERIALIZE(llc_cache_size_, size_t);
    DESERIALIZE(extensions_, uint64_t);
    DESERIALIZE(mayiuse_ngen_kernels_, bool);
    DESERIALIZE(mayiuse_non_uniform_work_groups_, bool);
#undef DESERIALIZE

    // name_ is not a trivially copyable type, so it is restored manually from
    // its serialized length and character data.
    const size_t name_size
            = *reinterpret_cast<const size_t *>(cache_blob.data() + pos);
    pos += sizeof(size_t);
    name_ = std::string(
            reinterpret_cast<const char *>(cache_blob.data() + pos), name_size);
    pos += name_size;
    assert(name_size == name_.size());
    assert(pos == cache_blob.size());

    return status::success;
}

} // namespace compute
} // namespace gpu
} // namespace impl
} // namespace dnnl