1 | /******************************************************************************* |
2 | * Copyright 2018-2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
17 | #ifndef CPU_X64_CPU_ISA_TRAITS_HPP |
18 | #define CPU_X64_CPU_ISA_TRAITS_HPP |
19 | |
20 | #include <functional> |
21 | #include <type_traits> |
22 | #include <unordered_map> |
23 | |
24 | #include "oneapi/dnnl/dnnl_types.h" |
25 | |
26 | #include "common/type_helpers.hpp" |
27 | #include "common/utils.hpp" |
28 | |
29 | #define XBYAK64 |
30 | #define XBYAK_NO_OP_NAMES |
31 | /* in order to make selinux happy memory that would be marked with X-bit should |
32 | * be obtained with mmap */ |
33 | #define XBYAK_USE_MMAP_ALLOCATOR |
34 | /* Use Xbyak's memfd-based allocation, if available */ |
35 | #define XBYAK_USE_MEMFD |
36 | #define XBYAK_NO_EXCEPTION |
37 | #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) |
38 | /* turn off `size_t to other-type implicit casting` warning |
39 | * currently we have a lot of jit-generated instructions that |
40 | * take uint32_t, but we pass size_t (e.g. due to using sizeof). |
41 | * FIXME: replace size_t parameters with the appropriate ones */ |
42 | #pragma warning(disable : 4267) |
43 | #endif |
44 | #include "common/compiler_workarounds.hpp" |
45 | #include "cpu/x64/xbyak/xbyak.h" |
46 | #include "cpu/x64/xbyak/xbyak_util.h" |
47 | |
48 | namespace dnnl { |
49 | namespace impl { |
50 | namespace cpu { |
51 | namespace x64 { |
52 | |
// Width, in bits, of the mask that encodes CPU features and hints. This caps
// the combined number of feature bits and hint bits that can be declared.
static constexpr int cpu_isa_total_bits
        = static_cast<int>(8 * sizeof(unsigned));

// One-hot bit for every individual CPU feature and every ISA hint.
enum cpu_isa_bit_t : unsigned {
    // Feature bits are allocated upwards, starting at the least significant
    // bit.
    sse41_bit = 1u << 0,
    avx_bit = 1u << 1,
    avx2_bit = 1u << 2,
    avx_vnni_bit = 1u << 3,
    avx_vnni_2_bit = 1u << 4,
    avx512_core_bit = 1u << 5,
    avx512_core_vnni_bit = 1u << 6,
    avx512_core_bf16_bit = 1u << 7,
    avx512_core_fp16_bit = 1u << 8,
    amx_tile_bit = 1u << 9,
    amx_int8_bit = 1u << 10,
    amx_bf16_bit = 1u << 11,
    amx_fp16_bit = 1u << 12,
    // Hint bits are allocated downwards, starting at the most significant
    // bit, so that they never collide with the feature bits above.
    prefer_ymm_bit = 1u << (cpu_isa_total_bits - 1),
};
74 | |
// Getter for the CPU ISA hints; only the declaration lives in this header.
// NOTE(review): the exact meaning of `soft` is not visible here -- confirm
// against the definition before relying on it.
dnnl_cpu_isa_hints_t DNNL_API get_cpu_isa_hints(bool soft = false);
// Setter for the CPU ISA hints; the outcome is reported through the returned
// status_t (conditions for failure are decided by the out-of-view definition).
status_t set_cpu_isa_hints(dnnl_cpu_isa_hints_t isa_hints);
77 | |
78 | namespace cpu_isa_hints_utils { |
79 | /* hints_1 | hints_2 | ... | hints_n where hints_i are hint specific |
80 | bits declared inside the cpu_isa_bit_t */ |
81 | static constexpr unsigned hints_mask = prefer_ymm_bit; |
82 | |
83 | static unsigned cvt2mask(dnnl_cpu_isa_hints_t hints) { |
84 | static const std::unordered_map<dnnl_cpu_isa_hints_t, unsigned, |
85 | std::hash<int>> |
86 | hints_map = {{dnnl_cpu_isa_no_hints, 0}, |
87 | {dnnl_cpu_isa_prefer_ymm, prefer_ymm_bit}}; |
88 | |
89 | auto iter = hints_map.find(hints); |
90 | if (iter != hints_map.end()) |
91 | return iter->second; |
92 | else { |
93 | assert(!"unexpected CPU ISA hint" ); |
94 | return 0; |
95 | } |
96 | } |
97 | |
98 | static bool is_hints_bit_set(cpu_isa_bit_t hint_bit, bool soft) { |
99 | const dnnl_cpu_isa_hints_t hints = get_cpu_isa_hints(soft); |
100 | const unsigned cur_hints_mask = cpu_isa_hints_utils::cvt2mask(hints); |
101 | return (cur_hints_mask & hint_bit) == hint_bit; |
102 | } |
103 | } // namespace cpu_isa_hints_utils |
104 | |
105 | enum cpu_isa_t : unsigned { |
106 | isa_undef = 0u, |
107 | sse41 = sse41_bit, |
108 | avx = avx_bit | sse41, |
109 | avx2 = avx2_bit | avx, |
110 | avx2_vnni = avx_vnni_bit | avx2, |
111 | avx2_vnni_2 = avx2_vnni | avx_vnni_2_bit, |
112 | avx512_core = avx512_core_bit | avx2, |
113 | avx512_core_vnni = avx512_core_vnni_bit | avx512_core, |
114 | avx512_core_bf16 = avx512_core_bf16_bit | avx512_core_vnni, |
115 | avx512_core_bf16_ymm = prefer_ymm_bit | avx512_core_bf16, |
116 | amx_tile = amx_tile_bit, |
117 | amx_int8 = amx_int8_bit | amx_tile, |
118 | amx_bf16 = amx_bf16_bit | amx_tile, |
119 | amx_fp16 = amx_fp16_bit | amx_tile, |
120 | avx512_core_fp16 = avx512_core_fp16_bit | avx512_core_bf16 | avx_vnni_bit, |
121 | avx512_core_amx = avx512_core_fp16 | amx_int8 | amx_bf16, |
122 | avx512_core_amx_fp16 = avx512_core_amx | amx_fp16, |
123 | // NOTES: 1. isa_all by default has no isa specific hints |
124 | isa_all = ~0u & ~cpu_isa_hints_utils::hints_mask, |
125 | }; |
126 | |
// Selects which set relation compare_isa() evaluates between two ISAs.
enum class cpu_isa_cmp_t {
    // List of infix comparison relations between two cpu_isa_t
    // where we take isa_1 and isa_2 to be two cpu_isa_t instances.

    // isa_1 SUBSET isa_2 if all feature flags supported by isa_1
    // are supported by isa_2 as well (equality allowed)
    SUBSET,

    // isa_1 SUPERSET isa_2 if all feature flags supported by isa_2
    // are supported by isa_1 as well (equality allowed)
    SUPERSET,

    // A few more options that (depending upon need) can be enabled in future

    // 1. PROPER_SUBSET: isa_1 SUBSET isa_2 and isa_1 != isa_2
    // 2. PROPER_SUPERSET: isa_1 SUPERSET isa_2 and isa_1 != isa_2
};
144 | |
// Declarations only; all of the functions below are defined elsewhere.

// Returns a C string with ISA information.
// NOTE(review): exact contents/lifetime of the string are not visible here.
const char *get_isa_info();

// Returns the maximal ISA as a single cpu_isa_t value.
cpu_isa_t get_max_cpu_isa();
// Returns the maximal ISA as a bit mask; mayiuse() tests requested feature
// bits against this mask.
// NOTE(review): the meaning of `soft` is not visible here -- confirm.
cpu_isa_t DNNL_API get_max_cpu_isa_mask(bool soft = false);
// Sets the maximal ISA from the public dnnl_cpu_isa_t value and reports the
// outcome as a status_t.
status_t set_max_cpu_isa(dnnl_cpu_isa_t isa);
// Returns the effective ISA expressed in the public dnnl_cpu_isa_t type.
dnnl_cpu_isa_t get_effective_cpu_isa();
151 | |
152 | static inline bool compare_isa( |
153 | cpu_isa_t isa_1, cpu_isa_cmp_t cmp, cpu_isa_t isa_2) { |
154 | assert(isa_1 != isa_all); |
155 | assert(isa_2 != isa_all); |
156 | // Comparison with `isa_all` is illegal. |
157 | if (utils::one_of(isa_all, isa_1, isa_2)) return false; |
158 | |
159 | // By default, comparison between ISA ignores ISA specific hints |
160 | unsigned mask_1 |
161 | = static_cast<unsigned>(isa_1) & ~cpu_isa_hints_utils::hints_mask; |
162 | unsigned mask_2 |
163 | = static_cast<unsigned>(isa_2) & ~cpu_isa_hints_utils::hints_mask; |
164 | unsigned mask_common = mask_1 & mask_2; |
165 | |
166 | switch (cmp) { |
167 | case cpu_isa_cmp_t::SUBSET: return mask_1 == mask_common; |
168 | case cpu_isa_cmp_t::SUPERSET: return mask_2 == mask_common; |
169 | default: assert(!"unsupported comparison of isa" ); return false; |
170 | } |
171 | } |
172 | |
173 | static inline bool is_subset(cpu_isa_t isa_1, cpu_isa_t isa_2) { |
174 | return compare_isa(isa_1, cpu_isa_cmp_t::SUBSET, isa_2); |
175 | } |
176 | |
177 | static inline bool is_superset(cpu_isa_t isa_1, cpu_isa_t isa_2) { |
178 | return compare_isa(isa_1, cpu_isa_cmp_t::SUPERSET, isa_2); |
179 | } |
180 | |
181 | template <typename Vmm> |
182 | struct vreg_traits {}; |
183 | |
184 | template <> |
185 | struct vreg_traits<Xbyak::Zmm> { |
186 | typedef Xbyak::Ymm Vmm_lower_t; |
187 | static constexpr size_t vlen = 64; |
188 | }; |
189 | |
190 | template <> |
191 | struct vreg_traits<Xbyak::Ymm> { |
192 | typedef Xbyak::Xmm Vmm_lower_t; |
193 | static constexpr size_t vlen = 32; |
194 | }; |
195 | |
196 | template <> |
197 | struct vreg_traits<Xbyak::Xmm> { |
198 | typedef Xbyak::Xmm Vmm_lower_t; |
199 | static constexpr size_t vlen = 16; |
200 | }; |
201 | |
// Compile-time properties of an ISA, keyed by the cpu_isa_t value. The
// primary template is deliberately empty: members exist only for the ISAs
// that provide an explicit specialization, so using an unspecialized ISA
// fails at compile time.
template <cpu_isa_t>
struct cpu_isa_traits {}; /* ::vlen -> 32 (for avx2) */
204 | |
// Palette configuration record.
// NOTE(review): presumably mirrors the memory layout consumed by the AMX tile
// configuration instructions -- confirm against the users of this struct.
//
// The struct is packed (no padding between members) so it can fit into a
// single 64-byte cache line: 1 + 1 + 14 + 16 * 2 + 16 * 1 = 64 bytes.
#pragma pack(push, 1)
struct palette_config_t {
    uint8_t palette_id;
    uint8_t startRow;
    uint8_t reserved[14];
    uint16_t cols[16];
    uint8_t rows[16];
};
#pragma pack(pop)

// Lock the layout in: a member change or a compiler that ignores the packing
// pragma must fail the build instead of silently producing a wrong layout.
static_assert(sizeof(palette_config_t) == 64,
        "palette_config_t must occupy exactly 64 bytes");
215 | |
216 | template <> |
217 | struct cpu_isa_traits<isa_all> { |
218 | static constexpr dnnl_cpu_isa_t user_option_val = dnnl_cpu_isa_default; |
219 | static constexpr const char *user_option_env = "default" ; |
220 | }; |
221 | |
222 | template <> |
223 | struct cpu_isa_traits<sse41> { |
224 | typedef Xbyak::Xmm Vmm; |
225 | static constexpr int vlen_shift = 4; |
226 | static constexpr int vlen = vreg_traits<Vmm>::vlen; |
227 | static constexpr int n_vregs = 16; |
228 | static constexpr dnnl_cpu_isa_t user_option_val = dnnl_cpu_isa_sse41; |
229 | static constexpr const char *user_option_env = "sse41" ; |
230 | }; |
231 | |
232 | template <> |
233 | struct cpu_isa_traits<avx> { |
234 | typedef Xbyak::Ymm Vmm; |
235 | static constexpr int vlen_shift = 5; |
236 | static constexpr int vlen = vreg_traits<Vmm>::vlen; |
237 | static constexpr int n_vregs = 16; |
238 | static constexpr dnnl_cpu_isa_t user_option_val = dnnl_cpu_isa_avx; |
239 | static constexpr const char *user_option_env = "avx" ; |
240 | }; |
241 | |
242 | template <> |
243 | struct cpu_isa_traits<avx2> : public cpu_isa_traits<avx> { |
244 | static constexpr dnnl_cpu_isa_t user_option_val = dnnl_cpu_isa_avx2; |
245 | static constexpr const char *user_option_env = "avx2" ; |
246 | }; |
247 | |
248 | template <> |
249 | struct cpu_isa_traits<avx2_vnni> : public cpu_isa_traits<avx2> { |
250 | static constexpr dnnl_cpu_isa_t user_option_val = dnnl_cpu_isa_avx2_vnni; |
251 | static constexpr const char *user_option_env = "avx2_vnni" ; |
252 | }; |
253 | |
254 | template <> |
255 | struct cpu_isa_traits<avx2_vnni_2> : public cpu_isa_traits<avx2> { |
256 | static constexpr dnnl_cpu_isa_t user_option_val = dnnl_cpu_isa_avx2_vnni_2; |
257 | static constexpr const char *user_option_env = "avx2_vnni_2" ; |
258 | }; |
259 | |
260 | template <> |
261 | struct cpu_isa_traits<avx512_core> { |
262 | typedef Xbyak::Zmm Vmm; |
263 | static constexpr int vlen_shift = 6; |
264 | static constexpr int vlen = vreg_traits<Vmm>::vlen; |
265 | static constexpr int n_vregs = 32; |
266 | static constexpr dnnl_cpu_isa_t user_option_val = dnnl_cpu_isa_avx512_core; |
267 | static constexpr const char *user_option_env = "avx512_core" ; |
268 | }; |
269 | |
270 | template <> |
271 | struct cpu_isa_traits<avx512_core_vnni> : public cpu_isa_traits<avx512_core> { |
272 | static constexpr dnnl_cpu_isa_t user_option_val |
273 | = dnnl_cpu_isa_avx512_core_vnni; |
274 | static constexpr const char *user_option_env = "avx512_core_vnni" ; |
275 | }; |
276 | |
277 | template <> |
278 | struct cpu_isa_traits<avx512_core_bf16> : public cpu_isa_traits<avx512_core> { |
279 | static constexpr dnnl_cpu_isa_t user_option_val |
280 | = dnnl_cpu_isa_avx512_core_bf16; |
281 | static constexpr const char *user_option_env = "avx512_core_bf16" ; |
282 | }; |
283 | |
284 | template <> |
285 | struct cpu_isa_traits<avx512_core_amx> { |
286 | static constexpr dnnl_cpu_isa_t user_option_val |
287 | = dnnl_cpu_isa_avx512_core_amx; |
288 | static constexpr const char *user_option_env = "avx512_core_amx" ; |
289 | }; |
290 | |
291 | template <> |
292 | struct cpu_isa_traits<avx512_core_fp16> : public cpu_isa_traits<avx512_core> { |
293 | static constexpr dnnl_cpu_isa_t user_option_val |
294 | = dnnl_cpu_isa_avx512_core_fp16; |
295 | static constexpr const char *user_option_env = "avx512_core_fp16" ; |
296 | }; |
297 | |
298 | template <> |
299 | struct cpu_isa_traits<avx512_core_amx_fp16> { |
300 | static constexpr dnnl_cpu_isa_t user_option_val |
301 | = dnnl_cpu_isa_avx512_core_amx_fp16; |
302 | static constexpr const char *user_option_env = "avx512_core_amx_fp16" ; |
303 | }; |
304 | |
305 | inline const Xbyak::util::Cpu &cpu() { |
306 | const static Xbyak::util::Cpu cpu_; |
307 | return cpu_; |
308 | } |
309 | |
// AMX related queries. Only declarations live here; the definitions are
// elsewhere.
namespace amx {

// Return the target palette for AMX instructions. Currently this is `0` if AMX
// instructions are not supported, and `1` if they are.
int get_target_palette();

// Limits for the given palette: number of tiles, bytes per tile column and
// number of tile rows.
// NOTE(review): behavior for an unsupported `palette` value is not visible
// here -- confirm against the definitions.
int get_max_tiles(int palette);
int get_max_column_bytes(int palette);
int get_max_rows(int palette);
// Availability check used by mayiuse() in addition to the CPU feature flag
// when `amx_tile` is queried.
bool DNNL_API is_available();

} // namespace amx
322 | |
323 | namespace { |
324 | |
// Tells whether `cpu_isa` may be used.
//
// Two conditions must hold:
//   1. every feature bit of `cpu_isa` (hint bits ignored) is present in the
//      mask returned by get_max_cpu_isa_mask(soft);
//   2. the CPU reports the matching feature flags through Xbyak's Cpu::has().
//
// Composite ISAs are resolved recursively through the ISAs they build on.
// `soft` is forwarded unchanged to get_max_cpu_isa_mask(), to the recursive
// calls and to the hint query.
// NOTE(review): the exact meaning of `soft` is not visible in this header.
//
// Returns true for `isa_undef`, false for `isa_all` and for any value not
// listed in the switch.
static inline bool mayiuse(const cpu_isa_t cpu_isa, bool soft = false) {
    using namespace Xbyak::util;

    unsigned cpu_isa_mask = x64::get_max_cpu_isa_mask(soft);
    // Hints are not features, so they must not take part in the mask check.
    unsigned cpu_isa_no_hints = cpu_isa & ~cpu_isa_hints_utils::hints_mask;

    // Reject early when a requested feature bit is missing from the mask.
    if ((cpu_isa_mask & cpu_isa_no_hints) != cpu_isa_no_hints) return false;

    switch (cpu_isa) {
        case sse41: return cpu().has(Cpu::tSSE41);
        case avx: return cpu().has(Cpu::tAVX);
        case avx2: return cpu().has(Cpu::tAVX2);
        case avx2_vnni: return mayiuse(avx2, soft) && cpu().has(Cpu::tAVX_VNNI);
        case avx2_vnni_2:
            return mayiuse(avx2_vnni, soft) && cpu().has(Cpu::tAVX_VNNI_INT8)
                    && cpu().has(Cpu::tAVX_NE_CONVERT);
        case avx512_core:
            return cpu().has(Cpu::tAVX512F) && cpu().has(Cpu::tAVX512BW)
                    && cpu().has(Cpu::tAVX512VL) && cpu().has(Cpu::tAVX512DQ);
        case avx512_core_vnni:
            return cpu().has(Cpu::tAVX512F) && cpu().has(Cpu::tAVX512BW)
                    && cpu().has(Cpu::tAVX512VL) && cpu().has(Cpu::tAVX512DQ)
                    && cpu().has(Cpu::tAVX512_VNNI);
        case avx512_core_bf16:
            return mayiuse(avx512_core_vnni, soft)
                    && cpu().has(Cpu::tAVX512_BF16);
        case avx512_core_bf16_ymm:
            // Same features as avx512_core_bf16, but additionally requires
            // the prefer-ymm hint to be set.
            return mayiuse(avx512_core_bf16, soft)
                    && cpu_isa_hints_utils::is_hints_bit_set(
                            prefer_ymm_bit, soft);
        case avx512_core_fp16:
            return cpu().has(Cpu::tAVX512_FP16)
                    && mayiuse(avx512_core_bf16, soft)
                    && mayiuse(avx2_vnni, soft);
        case amx_tile:
            // The CPU flag alone is not enough; amx::is_available() must
            // agree as well.
            return cpu().has(Cpu::tAMX_TILE) && x64::amx::is_available();
        case amx_int8:
            return mayiuse(amx_tile, soft) && cpu().has(Cpu::tAMX_INT8);
        case amx_bf16:
            return mayiuse(amx_tile, soft) && cpu().has(Cpu::tAMX_BF16);
        case amx_fp16:
            return mayiuse(amx_tile, soft) && cpu().has(Cpu::tAMX_FP16);
        case avx512_core_amx:
            return mayiuse(amx_int8, soft) && mayiuse(amx_bf16, soft)
                    && mayiuse(avx512_core_fp16, soft);
        case avx512_core_amx_fp16:
            return mayiuse(avx512_core_amx, soft) && mayiuse(amx_fp16, soft);
        case isa_undef: return true;
        case isa_all: return false;
    }
    // Reached only for values that are not enumerators of cpu_isa_t.
    return false;
}
377 | |
378 | static inline bool isa_has_bf16(cpu_isa_t isa) { |
379 | return is_superset(isa, avx512_core_bf16); |
380 | } |
381 | |
382 | static inline bool isa_has_masks(cpu_isa_t isa) { |
383 | return is_superset(isa, avx512_core); |
384 | } |
385 | |
386 | static inline int isa_max_vlen(cpu_isa_t isa) { |
387 | if (is_superset(isa, avx512_core)) |
388 | return cpu_isa_traits<avx512_core>::vlen; |
389 | else if (is_superset(isa, avx2)) |
390 | return cpu_isa_traits<avx2>::vlen; |
391 | else if (is_superset(isa, sse41)) |
392 | return cpu_isa_traits<sse41>::vlen; |
393 | else |
394 | return 0; |
395 | } |
396 | |
397 | static inline int isa_num_vregs(cpu_isa_t isa) { |
398 | if (is_superset(isa, avx512_core)) |
399 | return cpu_isa_traits<avx512_core>::n_vregs; |
400 | else if (is_superset(isa, avx2)) |
401 | return cpu_isa_traits<avx2>::n_vregs; |
402 | else if (is_superset(isa, sse41)) |
403 | return cpu_isa_traits<sse41>::n_vregs; |
404 | else |
405 | return 0; |
406 | } |
407 | |
408 | } // namespace |
409 | |
410 | /* whatever is required to generate string literals... */ |
411 | #include "common/z_magic.hpp" |
412 | /* clang-format off */ |
// Builds an implementation name as a C string: `prefix` immediately followed
// by the name of `isa` for the ISAs listed below, or by `suffix_if_any` for
// any other `isa` value.
// `prefix` and `suffix_if_any` must be string literals, because the pieces are
// joined by adjacent string literal concatenation.
// NOTE(review): STRINGIFY is presumably supplied by common/z_magic.hpp.
#define JIT_IMPL_NAME_HELPER(prefix, isa, suffix_if_any) \
    ((isa) == isa_undef ? prefix STRINGIFY(undef) : \
    (isa) == sse41 ? prefix STRINGIFY(sse41) : \
    (isa) == avx ? prefix STRINGIFY(avx) : \
    (isa) == avx2 ? prefix STRINGIFY(avx2) : \
    (isa) == avx2_vnni ? prefix STRINGIFY(avx2_vnni) : \
    (isa) == avx2_vnni_2 ? prefix STRINGIFY(avx2_vnni_2) : \
    (isa) == avx512_core ? prefix STRINGIFY(avx512_core) : \
    (isa) == avx512_core_vnni ? prefix STRINGIFY(avx512_core_vnni) : \
    (isa) == avx512_core_bf16 ? prefix STRINGIFY(avx512_core_bf16) : \
    (isa) == avx512_core_fp16 ? prefix STRINGIFY(avx512_core_fp16) : \
    (isa) == avx512_core_amx ? prefix STRINGIFY(avx512_core_amx) : \
    (isa) == avx512_core_amx_fp16 ? prefix STRINGIFY(avx512_core_amx_fp16) : \
    prefix suffix_if_any)
427 | /* clang-format on */ |
428 | |
429 | inline size_t data_type_vnni_granularity(data_type_t data_type) { |
430 | using namespace data_type; |
431 | switch (data_type) { |
432 | case f32: |
433 | case s32: return size_t(1); |
434 | case f16: |
435 | case bf16: return size_t(2); |
436 | case s8: |
437 | case u8: return size_t(4); |
438 | case data_type::undef: |
439 | default: assert(!"unknown data_type" ); |
440 | } |
441 | return size_t(0); /* should not be reachable */ |
442 | } |
443 | |
444 | template <cpu_isa_t isa> |
445 | inline size_t data_type_vnni_simd_elems(data_type_t data_type) { |
446 | const size_t dt_size = types::data_type_size(data_type); |
447 | assert(dt_size > 0); |
448 | return cpu_isa_traits<isa>::vlen / dt_size; |
449 | } |
450 | |
451 | } // namespace x64 |
452 | } // namespace cpu |
453 | } // namespace impl |
454 | } // namespace dnnl |
455 | |
456 | #endif |
457 | |