1 | /******************************************************************************* |
2 | * Copyright 2019-2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
17 | #ifndef GPU_COMPUTE_DEVICE_INFO_HPP |
18 | #define GPU_COMPUTE_DEVICE_INFO_HPP |
19 | |
20 | #include <stdint.h> |
21 | #include <stdlib.h> |
22 | #include <string.h> |
23 | |
24 | #include "common/c_types_map.hpp" |
25 | #include "common/serialization_stream.hpp" |
26 | #include "common/utils.hpp" |
27 | #include "common/z_magic.hpp" |
28 | |
29 | #include "cpu/platform.hpp" |
30 | #include "oneapi/dnnl/dnnl_config.h" |
31 | |
32 | namespace dnnl { |
33 | namespace impl { |
34 | namespace gpu { |
35 | namespace compute { |
36 | |
// Known Intel GPU architecture generations, ordered from oldest to newest.
enum class gpu_arch_t {
    unknown,
    gen9,
    gen11,
    xe_lp,
    xe_hp,
    xe_hpg,
    xe_hpc,
};

// Parses a textual architecture name into a gpu_arch_t value. The match is
// exact and case-sensitive against the enumerator spelling; any other input
// yields gpu_arch_t::unknown.
static inline gpu_arch_t str2gpu_arch(const char *str) {
    if (!strcmp(str, "gen9")) return gpu_arch_t::gen9;
    if (!strcmp(str, "gen11")) return gpu_arch_t::gen11;
    if (!strcmp(str, "xe_lp")) return gpu_arch_t::xe_lp;
    if (!strcmp(str, "xe_hp")) return gpu_arch_t::xe_hp;
    if (!strcmp(str, "xe_hpg")) return gpu_arch_t::xe_hpg;
    if (!strcmp(str, "xe_hpc")) return gpu_arch_t::xe_hpc;
    return gpu_arch_t::unknown;
}
60 | |
// Bit mask of device capabilities; each enumerator mirrors the name of an
// OpenCL extension (without the "cl_" prefix), so values can be translated
// with ext2cl_str() below.
enum class device_ext_t : uint64_t {
    // clang-format off
    // OpenCL data types
    khr_fp16 = 1ull << 0,
    khr_fp64 = 1ull << 1,
    // OpenCL atomics
    khr_global_int32_base_atomics = 1ull << 2,
    khr_global_int32_extended_atomics = 1ull << 3,
    khr_int64_base_atomics = 1ull << 4,
    khr_int64_extended_atomics = 1ull << 5,
    khr_local_int32_base_atomics = 1ull << 6,
    khr_local_int32_extended_atomics = 1ull << 7,
    // Intel specific Gen9+
    intel_subgroups = 1ull << 16,
    intel_required_subgroup_size = 1ull << 17,
    intel_subgroups_char = 1ull << 18,
    intel_subgroups_short = 1ull << 19,
    intel_subgroups_long = 1ull << 20,
    // Intel specific Xe_LP+
    intel_subgroup_local_block_io = 1ull << 21,
    intel_dot_accumulate = 1ull << 22,
    // Intel specific Xe_HP+
    intel_global_float_atomics = 1ull << 23,
    intel_subgroup_matrix_multiply_accumulate = 1ull << 24,
    intel_subgroup_split_matrix_multiply_accumulate = 1ull << 25,
    intel_variable_eu_thread_count = 1ull << 26,
    // Future extensions
    future_bf16_cvt = 1ull << 31,
    last
    // clang-format on
};

// Returns the OpenCL extension string for a single extension bit
// ("cl_" + enumerator name), or nullptr when the value does not name
// exactly one known extension. The literals below must stay in sync
// with the enumerator spellings above.
static inline const char *ext2cl_str(device_ext_t ext) {
    switch (ext) {
        case device_ext_t::khr_fp16: return "cl_khr_fp16";
        case device_ext_t::khr_fp64: return "cl_khr_fp64";

        case device_ext_t::khr_global_int32_base_atomics:
            return "cl_khr_global_int32_base_atomics";
        case device_ext_t::khr_global_int32_extended_atomics:
            return "cl_khr_global_int32_extended_atomics";
        case device_ext_t::khr_int64_base_atomics:
            return "cl_khr_int64_base_atomics";
        case device_ext_t::khr_int64_extended_atomics:
            return "cl_khr_int64_extended_atomics";
        case device_ext_t::khr_local_int32_base_atomics:
            return "cl_khr_local_int32_base_atomics";
        case device_ext_t::khr_local_int32_extended_atomics:
            return "cl_khr_local_int32_extended_atomics";

        case device_ext_t::intel_subgroups: return "cl_intel_subgroups";
        case device_ext_t::intel_required_subgroup_size:
            return "cl_intel_required_subgroup_size";
        case device_ext_t::intel_subgroups_char:
            return "cl_intel_subgroups_char";
        case device_ext_t::intel_subgroups_short:
            return "cl_intel_subgroups_short";
        case device_ext_t::intel_subgroups_long:
            return "cl_intel_subgroups_long";

        case device_ext_t::intel_subgroup_local_block_io:
            return "cl_intel_subgroup_local_block_io";
        case device_ext_t::intel_dot_accumulate:
            return "cl_intel_dot_accumulate";

        case device_ext_t::intel_global_float_atomics:
            return "cl_intel_global_float_atomics";
        case device_ext_t::intel_subgroup_matrix_multiply_accumulate:
            return "cl_intel_subgroup_matrix_multiply_accumulate";
        case device_ext_t::intel_subgroup_split_matrix_multiply_accumulate:
            return "cl_intel_subgroup_split_matrix_multiply_accumulate";
        case device_ext_t::intel_variable_eu_thread_count:
            return "cl_intel_variable_eu_thread_count";
        case device_ext_t::future_bf16_cvt: return "cl_future_bf16_cvt";
        default: return nullptr;
    }
}
125 | |
126 | struct runtime_version_t { |
127 | int major; |
128 | int minor; |
129 | int build; |
130 | |
131 | runtime_version_t(int major = 0, int minor = 0, int build = 0) |
132 | : major {major}, minor {minor}, build {build} {} |
133 | |
134 | bool operator==(const runtime_version_t &other) const { |
135 | return (major == other.major) && (minor == other.minor) |
136 | && (build == other.build); |
137 | } |
138 | |
139 | bool operator!=(const runtime_version_t &other) const { |
140 | return !(*this == other); |
141 | } |
142 | |
143 | bool operator<(const runtime_version_t &other) const { |
144 | if (major < other.major) return true; |
145 | if (major > other.major) return false; |
146 | if (minor < other.minor) return true; |
147 | if (minor > other.minor) return false; |
148 | return (build < other.build); |
149 | } |
150 | |
151 | bool operator>(const runtime_version_t &other) const { |
152 | return (other < *this); |
153 | } |
154 | |
155 | bool operator<=(const runtime_version_t &other) const { |
156 | return !(*this > other); |
157 | } |
158 | |
159 | bool operator>=(const runtime_version_t &other) const { |
160 | return !(*this < other); |
161 | } |
162 | |
163 | status_t set_from_string(const char *s) { |
164 | int i_major = 0, i = 0; |
165 | |
166 | for (; s[i] != '.'; i++) |
167 | if (!s[i]) return status::invalid_arguments; |
168 | |
169 | auto i_minor = ++i; |
170 | |
171 | for (; s[i] != '.'; i++) |
172 | if (!s[i]) return status::invalid_arguments; |
173 | |
174 | auto i_build = ++i; |
175 | |
176 | major = atoi(&s[i_major]); |
177 | minor = atoi(&s[i_minor]); |
178 | build = atoi(&s[i_build]); |
179 | |
180 | return status::success; |
181 | } |
182 | |
183 | std::string str() const { |
184 | return utils::format("%d.%d.%d" , major, minor, build); |
185 | } |
186 | }; |
187 | |
// Workaround for future HW extensions: reports the extension bits to assume
// for the given GPU architecture (e.g. future_bf16_cvt above).
189 | uint64_t get_future_extensions(compute::gpu_arch_t gpu_arch); |
190 | |
// Abstract description of a GPU device: architecture, extension bits, EU /
// thread counts, work-group limits, and the runtime (driver) version.
// Runtime-specific subclasses implement the init_* virtuals to fill the
// fields; the filled state can be serialized into a cache blob and later
// restored from it.
struct device_info_t {
public:
    virtual ~device_info_t() = default;

    // Initializes the device info. With a non-empty cache_blob the fields are
    // restored via init_from_cache_blob() instead of querying the engine;
    // otherwise each property is queried through the init_* virtuals.
    status_t init(
            engine_t *engine, const std::vector<uint8_t> &cache_blob = {}) {
        if (!cache_blob.empty()) {
            CHECK(init_from_cache_blob(cache_blob));
            // Keep a serialized copy so get_cache_blob() can return it later.
            return init_serialized_device_info(cache_blob);
        }

        CHECK(init_device_name(engine));
        CHECK(init_arch(engine));
        CHECK(init_runtime_version(engine));
        CHECK(init_extensions(engine));
        CHECK(init_attributes(engine));

        CHECK(init_attributes_common(engine));

        // Serialization is only performed for the OpenCL runtime; other
        // runtimes skip it (presumably cache blobs are OCL-only — see the
        // guard below).
        if (dnnl_version()->gpu_runtime == DNNL_RUNTIME_OCL) {
            CHECK(init_serialized_device_info());
        }

        return status::success;
    }

    // True if the given extension bit is set in extensions_.
    bool has(device_ext_t ext) const { return extensions_ & (uint64_t)ext; }
    gpu_arch_t gpu_arch() const { return gpu_arch_; }
    int stepping_id() const { return stepping_id_; }
    int max_eus_per_wg() const { return max_eus_per_wg_; }
    static int max_eus_per_wg(gpu_arch_t gpu_arch);
    int max_subgroup_size() const { return max_subgroup_size_; }
    static int max_subgroup_size(gpu_arch_t gpu_arch);
    size_t max_wg_size() const { return max_wg_size_; }
    int eu_count() const { return eu_count_; }
    // Total hardware thread count in the default GRF mode.
    int hw_threads() const { return hw_threads_[0]; }
    // Total hardware thread count; index [1] holds the large-GRF-mode value.
    int hw_threads(bool large_grf_mode) const {
        return hw_threads_[large_grf_mode ? 1 : 0];
    }
    static int threads_per_eu(gpu_arch_t gpu_arch, bool large_grf_mode = false);
    static int max_slm_size_per_tg(
            gpu_arch_t gpu_arch, bool large_grf_mode = false);
    static int slm_memory_bank_count(gpu_arch_t gpu_arch);
    static int slm_memory_bank_granularity(gpu_arch_t gpu_arch);
    size_t llc_cache_size() const { return llc_cache_size_; }

    const runtime_version_t &runtime_version() const {
        return runtime_version_;
    }
    const std::string &name() const { return name_; }

    bool mayiuse_ngen_kernels() const { return mayiuse_ngen_kernels_; }

    bool mayiuse_non_uniform_work_groups() const {
        return mayiuse_non_uniform_work_groups_;
    }

    bool mayiuse_sub_group(int size) const;

    // Returns the serialized device info built by init().
    const std::vector<uint8_t> &get_cache_blob() const {
        return serialized_device_info_.get_data();
    }

    status_t get_cache_blob_size(size_t *size) const {
        (*size) = serialized_device_info_.get_data().size();
        return status::success;
    }

    // Copies the serialized device info into the caller-provided buffer.
    // `size` must equal the value reported by get_cache_blob_size().
    status_t get_cache_blob(size_t size, uint8_t *cache_blob) const {
        const auto &cb = serialized_device_info_.get_data();
        if (size != cb.size()) return status::invalid_arguments;
        std::memcpy(cache_blob, cb.data(), size);
        return status::success;
    }

protected:
    // Runtime-specific queries; each fills the corresponding field(s) below.
    virtual status_t init_device_name(engine_t *engine) = 0;
    virtual status_t init_arch(engine_t *engine) = 0;
    virtual status_t init_runtime_version(engine_t *engine) = 0;
    virtual status_t init_extensions(engine_t *engine) = 0;
    virtual status_t init_attributes(engine_t *engine) = 0;

    compute::gpu_arch_t gpu_arch_ = compute::gpu_arch_t::unknown;
    int stepping_id_ = 0;
    bool mayiuse_ngen_kernels_ = false;

    std::string name_;
    runtime_version_t runtime_version_;

    // total number of hardware threads:
    // [0] - default mode
    // [1] - large GRF mode
    int32_t hw_threads_[2] = {0, 0};
    int32_t eu_count_ = 0;
    int32_t max_eus_per_wg_ = 0;
    int32_t max_subgroup_size_ = 0;
    size_t max_wg_size_ = 0;
    size_t llc_cache_size_ = 0;

    // extensions_ and gpu_arch_ describe effective extensions and GPU architecture.
    uint64_t extensions_ = 0;

private:
    status_t init_attributes_common(engine_t *engine);
    // Builds (or stores, when a blob is given) the serialized form of this
    // object into serialized_device_info_.
    status_t init_serialized_device_info(
            const std::vector<uint8_t> &cache_blob = {});
    status_t init_from_cache_blob(const std::vector<uint8_t> &cache_blob);

    bool mayiuse_non_uniform_work_groups_ = false;

    serialization_stream_t serialized_device_info_;
};
303 | |
304 | } // namespace compute |
305 | } // namespace gpu |
306 | } // namespace impl |
307 | } // namespace dnnl |
308 | |
309 | #endif |
310 | |