1 | /******************************************************************************* |
2 | * Copyright 2020-2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
#include <cassert>
#include <mutex>
18 | #include <thread> |
19 | #include <type_traits> |
20 | |
21 | #include "gpu/compute/device_info.hpp" |
22 | |
23 | #ifdef DNNL_WITH_SYCL |
24 | #include "sycl/sycl_engine_base.hpp" |
25 | #endif |
26 | |
27 | namespace dnnl { |
28 | namespace impl { |
29 | namespace gpu { |
30 | namespace compute { |
31 | |
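// Returns a bitmask of device extensions that can be assumed for the given
// architecture even when the runtime does not (yet) report them.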
32 | uint64_t get_future_extensions(compute::gpu_arch_t gpu_arch) { |
33 | using namespace compute; |
34 | |
35 | uint64_t extensions = 0; |
36 | switch (gpu_arch) { |
37 | case gpu_arch_t::gen9: |
38 | case gpu_arch_t::gen11: break; |
39 | case gpu_arch_t::xe_hp: |
40 | case gpu_arch_t::xe_hpg: |
41 | case gpu_arch_t::xe_hpc: |
42 | extensions |= (uint64_t)device_ext_t::intel_global_float_atomics; |
43 | extensions |= (uint64_t) |
44 | device_ext_t::intel_subgroup_matrix_multiply_accumulate; |
45 | extensions |= (uint64_t)device_ext_t:: |
46 | intel_subgroup_split_matrix_multiply_accumulate; |
47 | extensions |
48 | |= (uint64_t)device_ext_t::intel_variable_eu_thread_count; |
49 | extensions |= (uint64_t)device_ext_t::future_bf16_cvt; |
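            // Intentional fall-through: Xe-HP and newer also get the Xe-LP
            // extensions below.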
50 | case gpu_arch_t::xe_lp: |
51 | extensions |= (uint64_t)device_ext_t::intel_subgroup_local_block_io; |
52 | extensions |= (uint64_t)device_ext_t::intel_dot_accumulate; |
53 | break; |
54 | case gpu_arch_t::unknown: break; |
55 | } |
56 | return extensions; |
57 | } |
58 | |
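// Returns whether kernels may use the given subgroup (SIMD) size on this
// device; Xe-HPC does not support SIMD8, hence the narrower set.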
59 | bool device_info_t::mayiuse_sub_group(int size) const { |
60 | switch (gpu_arch()) { |
61 | case gpu_arch_t::xe_hpc: return utils::one_of(size, 16, 32); |
62 | default: return utils::one_of(size, 8, 16, 32); |
63 | } |
64 | } |
65 | |
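// Maximum number of EUs a single work-group can occupy; this appears to
// match the EU count of one subslice (or dual subslice / Xe-core).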
66 | int device_info_t::max_eus_per_wg(gpu_arch_t gpu_arch) { |
67 | switch (gpu_arch) { |
68 | case gpu::compute::gpu_arch_t::gen9: |
69 | case gpu::compute::gpu_arch_t::gen11: |
70 | case gpu::compute::gpu_arch_t::xe_hpc: return 8; |
71 | case gpu::compute::gpu_arch_t::xe_lp: |
72 | case gpu::compute::gpu_arch_t::xe_hp: |
73 | case gpu::compute::gpu_arch_t::xe_hpg: return 16; |
74 | case gpu::compute::gpu_arch_t::unknown: return 8; |
75 | } |
76 | return 8; |
77 | } |
78 | |
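// Widest subgroup (SIMD) size supported by the architecture.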
79 | int device_info_t::max_subgroup_size(gpu_arch_t gpu_arch) { |
80 | switch (gpu_arch) { |
81 | case gpu::compute::gpu_arch_t::gen9: return 16; |
82 | case gpu::compute::gpu_arch_t::gen11: |
83 | case gpu::compute::gpu_arch_t::xe_hpc: return 32; |
84 | case gpu::compute::gpu_arch_t::xe_lp: |
85 | case gpu::compute::gpu_arch_t::xe_hp: |
86 | case gpu::compute::gpu_arch_t::xe_hpg: |
87 | case gpu::compute::gpu_arch_t::unknown: return 16; |
88 | } |
89 | return 16; |
90 | } |
91 | |
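// Number of hardware threads per EU. On Xe-HP and newer, large GRF mode
// doubles the register file available to a thread and so halves the thread
// count.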
92 | int device_info_t::threads_per_eu(gpu_arch_t gpu_arch, bool large_grf_mode) { |
93 | switch (gpu_arch) { |
94 | case gpu::compute::gpu_arch_t::gen9: |
95 | case gpu::compute::gpu_arch_t::gen11: |
96 | case gpu::compute::gpu_arch_t::xe_lp: return 7; |
97 | case gpu::compute::gpu_arch_t::xe_hp: |
98 | case gpu::compute::gpu_arch_t::xe_hpg: |
99 | case gpu::compute::gpu_arch_t::xe_hpc: return large_grf_mode ? 4 : 8; |
100 | case gpu::compute::gpu_arch_t::unknown: return 7; |
101 | } |
102 | return 7; |
103 | } |
104 | |
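// Heuristic SLM budget for a single thread group: the SLM capacity of one
// subslice (or DSS) scaled down by the number of threads per EU.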
105 | int device_info_t::max_slm_size_per_tg( |
106 | gpu_arch_t gpu_arch, bool large_grf_mode) { |
107 | int slm_size = 0; // SLM size per SS or DSS. |
108 | switch (gpu_arch) { |
109 | case gpu::compute::gpu_arch_t::gen9: |
110 | case gpu::compute::gpu_arch_t::gen11: slm_size = (1 << 16); break; |
111 | case gpu::compute::gpu_arch_t::xe_lp: |
112 | case gpu::compute::gpu_arch_t::xe_hp: |
113 | case gpu::compute::gpu_arch_t::xe_hpc: |
114 | case gpu::compute::gpu_arch_t::xe_hpg: slm_size = (1 << 17); break; |
        case gpu::compute::gpu_arch_t::unknown: assert(!"not expected");
116 | } |
117 | return slm_size / threads_per_eu(gpu_arch, large_grf_mode); |
118 | } |
119 | |
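// Number of SLM memory banks; useful for reasoning about bank conflicts.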
120 | int device_info_t::slm_memory_bank_count(gpu_arch_t gpu_arch) { |
121 | switch (gpu_arch) { |
122 | case gpu::compute::gpu_arch_t::gen9: |
123 | case gpu::compute::gpu_arch_t::gen11: |
124 | case gpu::compute::gpu_arch_t::xe_lp: return 16; |
125 | case gpu::compute::gpu_arch_t::xe_hp: return 65; |
126 | case gpu::compute::gpu_arch_t::xe_hpc: return 64; |
127 | case gpu::compute::gpu_arch_t::xe_hpg: return 32; |
        case gpu::compute::gpu_arch_t::unknown: assert(!"not expected");
129 | } |
130 | return 32; |
131 | } |

// Returns SLM bank granularity in bytes.
133 | int device_info_t::slm_memory_bank_granularity(gpu_arch_t gpu_arch) { |
134 | switch (gpu_arch) { |
135 | case gpu::compute::gpu_arch_t::gen9: |
136 | case gpu::compute::gpu_arch_t::gen11: |
137 | case gpu::compute::gpu_arch_t::xe_lp: |
138 | case gpu::compute::gpu_arch_t::xe_hp: return 4; |
139 | case gpu::compute::gpu_arch_t::xe_hpc: |
140 | case gpu::compute::gpu_arch_t::xe_hpg: return 8; |
        case gpu::compute::gpu_arch_t::unknown: assert(!"not expected");
142 | } |
143 | return 4; |
144 | } |
145 | |
146 | status_t device_info_t::init_attributes_common(engine_t *engine) { |
    // TODO: Fix for discrete GPUs. The code below is written for integrated
    // GPUs, assuming that the GPU's last-level cache is shared with the CPU
    // (where it is the CPU's L3 cache).
151 | |
    // XXX: this is the only place where the GPU runtime functionally depends
    // on the CPU runtime. llc_cache_size_ is used in only one gen9 kernel,
    // so an approximate cache size is sufficient.

    // llc_cache_size_ = cpu::platform::get_per_core_cache_size(3)
    //         * cpu::platform::get_num_cores();
    // The assumption is that hyper-threading is likely enabled on client
    // systems, so estimate ~1 MB of LLC per logical core.
159 | llc_cache_size_ = std::thread::hardware_concurrency() * (1 << 20); |
160 | |
161 | bool ocl_backend = true; |
162 | |
163 | #ifdef DNNL_WITH_SYCL |
164 | using namespace impl::sycl; |
165 | if (engine->runtime_kind() == runtime_kind::sycl) { |
166 | auto *sycl_engine = utils::downcast<const sycl_engine_base_t *>(engine); |
167 | ocl_backend = (sycl_engine->backend() == backend_t::opencl); |
168 | } |
169 | #endif |
170 | |
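    // Total hardware thread count: [0] for the default GRF mode, [1] for
    // large GRF mode (fewer threads per EU).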
171 | hw_threads_[0] = eu_count_ * threads_per_eu(gpu_arch_, false); |
172 | hw_threads_[1] = eu_count_ * threads_per_eu(gpu_arch_, true); |
173 | |
174 | max_eus_per_wg_ = max_eus_per_wg(gpu_arch_); |
175 | max_subgroup_size_ = max_subgroup_size(gpu_arch_); |
176 | |
177 | mayiuse_non_uniform_work_groups_ = ocl_backend; |
178 | |
179 | return status::success; |
180 | } |
181 | |
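// Serializes the device info so it can be stored in (and later restored
// from) a cache blob. When an existing cache blob is supplied, its contents
// are stored verbatim. The write order must match init_from_cache_blob().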
182 | status_t device_info_t::init_serialized_device_info( |
183 | const std::vector<uint8_t> &cache_blob) { |
184 | if (!cache_blob.empty()) { |
185 | serialized_device_info_.write(cache_blob.data(), cache_blob.size()); |
186 | return status::success; |
187 | } |
188 | |
189 | serialized_device_info_.write(&gpu_arch_); |
190 | serialized_device_info_.write(&stepping_id_); |
191 | serialized_device_info_.write(&runtime_version_.major); |
192 | serialized_device_info_.write(&runtime_version_.minor); |
193 | serialized_device_info_.write(&runtime_version_.build); |
194 | serialized_device_info_.write(hw_threads_, 2); |
195 | serialized_device_info_.write(&eu_count_); |
196 | serialized_device_info_.write(&max_eus_per_wg_); |
197 | serialized_device_info_.write(&max_subgroup_size_); |
198 | serialized_device_info_.write(&max_wg_size_); |
199 | serialized_device_info_.write(&llc_cache_size_); |
200 | serialized_device_info_.write(&extensions_); |
201 | serialized_device_info_.write(&mayiuse_ngen_kernels_); |
202 | serialized_device_info_.write(&mayiuse_non_uniform_work_groups_); |
203 | |
204 | const size_t name_size = name_.size(); |
205 | serialized_device_info_.write(&name_size); |
206 | serialized_device_info_.write(name_.data(), name_size); |
207 | |
208 | return status::success; |
209 | } |
210 | |
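// Restores the device info from a cache blob produced by
// init_serialized_device_info(); fields must be read in the same order and
// with the same types as they were written.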
211 | status_t device_info_t::init_from_cache_blob( |
212 | const std::vector<uint8_t> &cache_blob) { |
213 | if (cache_blob.empty()) return status::invalid_arguments; |
214 | |
215 | size_t pos = 0; |
216 | #define DESERIALIZE(val, expected_type) \ |
217 | static_assert(std::is_same<std::remove_reference<decltype(val)>::type, \ |
218 | expected_type>::value, \ |
219 | #val " has incorrect type"); \ |
220 | (val) = *reinterpret_cast<const expected_type *>(cache_blob.data() + pos); \ |
221 | pos += sizeof(expected_type); |
222 | |
223 | DESERIALIZE(gpu_arch_, compute::gpu_arch_t); |
224 | DESERIALIZE(stepping_id_, int); |
225 | DESERIALIZE(runtime_version_.major, int); |
226 | DESERIALIZE(runtime_version_.minor, int); |
227 | DESERIALIZE(runtime_version_.build, int); |
228 | DESERIALIZE(hw_threads_[0], int32_t); |
229 | DESERIALIZE(hw_threads_[1], int32_t); |
230 | DESERIALIZE(eu_count_, int32_t); |
231 | DESERIALIZE(max_eus_per_wg_, int32_t); |
232 | DESERIALIZE(max_subgroup_size_, int32_t); |
233 | DESERIALIZE(max_wg_size_, size_t); |
234 | DESERIALIZE(llc_cache_size_, size_t); |
235 | DESERIALIZE(extensions_, uint64_t); |
236 | DESERIALIZE(mayiuse_ngen_kernels_, bool); |
237 | DESERIALIZE(mayiuse_non_uniform_work_groups_, bool); |
238 | #undef DESERIALIZE |
239 | |
    // name_ is not a trivially copyable type, so deserialize it manually.
241 | const size_t name_size |
242 | = *reinterpret_cast<const size_t *>(cache_blob.data() + pos); |
243 | pos += sizeof(size_t); |
244 | name_ = std::string( |
245 | reinterpret_cast<const char *>(cache_blob.data() + pos), name_size); |
246 | pos += name_size; |
247 | assert(name_size == name_.size()); |
248 | assert(pos == cache_blob.size()); |
249 | |
250 | return status::success; |
251 | } |
252 | |
253 | } // namespace compute |
254 | } // namespace gpu |
255 | } // namespace impl |
256 | } // namespace dnnl |
257 | |