/*******************************************************************************
* Copyright 2019-2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef GPU_COMPUTE_DEVICE_INFO_HPP
#define GPU_COMPUTE_DEVICE_INFO_HPP

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include <cstring>
#include <string>
#include <vector>

#include "common/c_types_map.hpp"
#include "common/serialization_stream.hpp"
#include "common/utils.hpp"
#include "common/z_magic.hpp"

#include "cpu/platform.hpp"
#include "oneapi/dnnl/dnnl_config.h"

namespace dnnl {
namespace impl {
namespace gpu {
namespace compute {

enum class gpu_arch_t {
    unknown,
    gen9,
    gen11,
    xe_lp,
    xe_hp,
    xe_hpg,
    xe_hpc,
};

static inline gpu_arch_t str2gpu_arch(const char *str) {
#define CASE(_case) \
    if (!strcmp(STRINGIFY(_case), str)) return gpu_arch_t::_case

    CASE(gen9);
    CASE(gen11);
    CASE(xe_lp);
    CASE(xe_hp);
    CASE(xe_hpg);
    CASE(xe_hpc);
    return gpu_arch_t::unknown;
#undef CASE
}
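
// For example, str2gpu_arch("xe_hpc") returns gpu_arch_t::xe_hpc, while an
// unrecognized name such as "gen12" falls through to gpu_arch_t::unknown.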

enum class device_ext_t : uint64_t {
    // clang-format off
    // OpenCL data types
    khr_fp16 = 1ull << 0,
    khr_fp64 = 1ull << 1,
    // OpenCL atomics
    khr_global_int32_base_atomics = 1ull << 2,
    khr_global_int32_extended_atomics = 1ull << 3,
    khr_int64_base_atomics = 1ull << 4,
    khr_int64_extended_atomics = 1ull << 5,
    khr_local_int32_base_atomics = 1ull << 6,
    khr_local_int32_extended_atomics = 1ull << 7,
    // Intel specific Gen9+
    intel_subgroups = 1ull << 16,
    intel_required_subgroup_size = 1ull << 17,
    intel_subgroups_char = 1ull << 18,
    intel_subgroups_short = 1ull << 19,
    intel_subgroups_long = 1ull << 20,
    // Intel specific Xe_LP+
    intel_subgroup_local_block_io = 1ull << 21,
    intel_dot_accumulate = 1ull << 22,
    // Intel specific Xe_HP+
    intel_global_float_atomics = 1ull << 23,
    intel_subgroup_matrix_multiply_accumulate = 1ull << 24,
    intel_subgroup_split_matrix_multiply_accumulate = 1ull << 25,
    intel_variable_eu_thread_count = 1ull << 26,
    // Future extensions
    future_bf16_cvt = 1ull << 31,
    last
    // clang-format on
};
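
// Extension support is tracked as a bitmask of device_ext_t bits, so a
// capability check is a single AND; sketch (device_info_t::has() below does
// exactly this):
//     uint64_t exts = ...; // as queried from the device
//     bool has_fp16 = exts & (uint64_t)device_ext_t::khr_fp16;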

static inline const char *ext2cl_str(device_ext_t ext) {
#define CASE(x) \
    case device_ext_t::x: return STRINGIFY(CONCAT2(cl_, x));
    switch (ext) {
        CASE(khr_fp16)
        CASE(khr_fp64)

        CASE(khr_global_int32_base_atomics)
        CASE(khr_global_int32_extended_atomics)
        CASE(khr_int64_base_atomics)
        CASE(khr_int64_extended_atomics)
        CASE(khr_local_int32_base_atomics)
        CASE(khr_local_int32_extended_atomics)

        CASE(intel_subgroups)
        CASE(intel_required_subgroup_size)
        CASE(intel_subgroups_char)
        CASE(intel_subgroups_short)
        CASE(intel_subgroups_long)

        CASE(intel_subgroup_local_block_io)
        CASE(intel_dot_accumulate)

        CASE(intel_global_float_atomics)
        CASE(intel_subgroup_matrix_multiply_accumulate)
        CASE(intel_subgroup_split_matrix_multiply_accumulate)
        CASE(intel_variable_eu_thread_count)
        CASE(future_bf16_cvt)
        default: return nullptr;
    }
#undef CASE
}
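
// The mapping simply prepends the "cl_" prefix, e.g.
// ext2cl_str(device_ext_t::intel_subgroups) returns "cl_intel_subgroups";
// values without a CASE above (such as device_ext_t::last) return nullptr.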

struct runtime_version_t {
    int major;
    int minor;
    int build;

    runtime_version_t(int major = 0, int minor = 0, int build = 0)
        : major {major}, minor {minor}, build {build} {}

    bool operator==(const runtime_version_t &other) const {
        return (major == other.major) && (minor == other.minor)
                && (build == other.build);
    }

    bool operator!=(const runtime_version_t &other) const {
        return !(*this == other);
    }

    bool operator<(const runtime_version_t &other) const {
        if (major < other.major) return true;
        if (major > other.major) return false;
        if (minor < other.minor) return true;
        if (minor > other.minor) return false;
        return (build < other.build);
    }

    bool operator>(const runtime_version_t &other) const {
        return (other < *this);
    }

    bool operator<=(const runtime_version_t &other) const {
        return !(*this > other);
    }

    bool operator>=(const runtime_version_t &other) const {
        return !(*this < other);
    }

    // Parses a version string of the form "major.minor.build"; returns
    // status::invalid_arguments if two '.' separators cannot be found.
    status_t set_from_string(const char *s) {
        int i_major = 0, i = 0;

        // Scan to the first '.', which terminates the major component.
        for (; s[i] != '.'; i++)
            if (!s[i]) return status::invalid_arguments;

        auto i_minor = ++i;

        // Scan to the second '.', which terminates the minor component.
        for (; s[i] != '.'; i++)
            if (!s[i]) return status::invalid_arguments;

        auto i_build = ++i;

        major = atoi(&s[i_major]);
        minor = atoi(&s[i_minor]);
        build = atoi(&s[i_build]);

        return status::success;
    }

    std::string str() const {
        return utils::format("%d.%d.%d", major, minor, build);
    }
};
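
// Usage sketch (the version string below is hypothetical):
//     runtime_version_t v;
//     if (v.set_from_string("21.35.12345") == status::success
//             && v >= runtime_version_t(21, 0, 0)) { /* new enough */ }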

// Workaround for future HW extensions: returns the extension bits (e.g.
// future_bf16_cvt) to assume for the given architecture before the runtime
// advertises them.
uint64_t get_future_extensions(compute::gpu_arch_t gpu_arch);

struct device_info_t {
public:
    virtual ~device_info_t() = default;

    status_t init(
            engine_t *engine, const std::vector<uint8_t> &cache_blob = {}) {
        if (!cache_blob.empty()) {
            CHECK(init_from_cache_blob(cache_blob));
            return init_serialized_device_info(cache_blob);
        }

        CHECK(init_device_name(engine));
        CHECK(init_arch(engine));
        CHECK(init_runtime_version(engine));
        CHECK(init_extensions(engine));
        CHECK(init_attributes(engine));

        CHECK(init_attributes_common(engine));

        if (dnnl_version()->gpu_runtime == DNNL_RUNTIME_OCL) {
            CHECK(init_serialized_device_info());
        }

        return status::success;
    }
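
    // Note: init() has two paths. Given a cache blob, the device queries are
    // skipped and state is deserialized from the blob; otherwise each
    // init_*() virtual probes the engine, and under the OpenCL runtime the
    // collected state is serialized so it can later serve as a cache blob.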

    bool has(device_ext_t ext) const { return extensions_ & (uint64_t)ext; }
    gpu_arch_t gpu_arch() const { return gpu_arch_; }
    int stepping_id() const { return stepping_id_; }
    int max_eus_per_wg() const { return max_eus_per_wg_; }
    static int max_eus_per_wg(gpu_arch_t gpu_arch);
    int max_subgroup_size() const { return max_subgroup_size_; }
    static int max_subgroup_size(gpu_arch_t gpu_arch);
    size_t max_wg_size() const { return max_wg_size_; }
    int eu_count() const { return eu_count_; }
    int hw_threads() const { return hw_threads_[0]; }
    int hw_threads(bool large_grf_mode) const {
        return hw_threads_[large_grf_mode ? 1 : 0];
    }
    static int threads_per_eu(gpu_arch_t gpu_arch, bool large_grf_mode = false);
    static int max_slm_size_per_tg(
            gpu_arch_t gpu_arch, bool large_grf_mode = false);
    static int slm_memory_bank_count(gpu_arch_t gpu_arch);
    static int slm_memory_bank_granularity(gpu_arch_t gpu_arch);
    size_t llc_cache_size() const { return llc_cache_size_; }

    const runtime_version_t &runtime_version() const {
        return runtime_version_;
    }
    const std::string &name() const { return name_; }

    bool mayiuse_ngen_kernels() const { return mayiuse_ngen_kernels_; }

    bool mayiuse_non_uniform_work_groups() const {
        return mayiuse_non_uniform_work_groups_;
    }

    bool mayiuse_sub_group(int size) const;

    const std::vector<uint8_t> &get_cache_blob() const {
        return serialized_device_info_.get_data();
    }

    status_t get_cache_blob_size(size_t *size) const {
        (*size) = serialized_device_info_.get_data().size();
        return status::success;
    }

    status_t get_cache_blob(size_t size, uint8_t *cache_blob) const {
        const auto &cb = serialized_device_info_.get_data();
        if (size != cb.size()) return status::invalid_arguments;
        std::memcpy(cache_blob, cb.data(), size);
        return status::success;
    }
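
    // Typical retrieval is a two-step query (sketch; `info` is any
    // device_info_t instance):
    //     size_t sz = 0;
    //     CHECK(info.get_cache_blob_size(&sz));
    //     std::vector<uint8_t> blob(sz);
    //     CHECK(info.get_cache_blob(sz, blob.data()));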

protected:
    virtual status_t init_device_name(engine_t *engine) = 0;
    virtual status_t init_arch(engine_t *engine) = 0;
    virtual status_t init_runtime_version(engine_t *engine) = 0;
    virtual status_t init_extensions(engine_t *engine) = 0;
    virtual status_t init_attributes(engine_t *engine) = 0;

    compute::gpu_arch_t gpu_arch_ = compute::gpu_arch_t::unknown;
    int stepping_id_ = 0;
    bool mayiuse_ngen_kernels_ = false;

    std::string name_;
    runtime_version_t runtime_version_;

    // Total number of hardware threads:
    // [0] - default mode
    // [1] - large GRF mode
    int32_t hw_threads_[2] = {0, 0};
    int32_t eu_count_ = 0;
    int32_t max_eus_per_wg_ = 0;
    int32_t max_subgroup_size_ = 0;
    size_t max_wg_size_ = 0;
    size_t llc_cache_size_ = 0;

    // extensions_ and gpu_arch_ describe effective extensions and GPU
    // architecture.
    uint64_t extensions_ = 0;

private:
    status_t init_attributes_common(engine_t *engine);
    status_t init_serialized_device_info(
            const std::vector<uint8_t> &cache_blob = {});
    status_t init_from_cache_blob(const std::vector<uint8_t> &cache_blob);

    bool mayiuse_non_uniform_work_groups_ = false;

    serialization_stream_t serialized_device_info_;
};
303
304} // namespace compute
305} // namespace gpu
306} // namespace impl
307} // namespace dnnl
308
309#endif
310