1/*******************************************************************************
2* Copyright 2018-2022 Intel Corporation
3*
4* Licensed under the Apache License, Version 2.0 (the "License");
5* you may not use this file except in compliance with the License.
6* You may obtain a copy of the License at
7*
8* http://www.apache.org/licenses/LICENSE-2.0
9*
10* Unless required by applicable law or agreed to in writing, software
11* distributed under the License is distributed on an "AS IS" BASIS,
12* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13* See the License for the specific language governing permissions and
14* limitations under the License.
15*******************************************************************************/
16
17#ifndef CPU_X64_CPU_ISA_TRAITS_HPP
18#define CPU_X64_CPU_ISA_TRAITS_HPP
19
20#include <functional>
21#include <type_traits>
22#include <unordered_map>
23
24#include "oneapi/dnnl/dnnl_types.h"
25
26#include "common/type_helpers.hpp"
27#include "common/utils.hpp"
28
29#define XBYAK64
30#define XBYAK_NO_OP_NAMES
/* To keep SELinux happy, memory that will be marked executable (X-bit) must
 * be obtained with mmap */
33#define XBYAK_USE_MMAP_ALLOCATOR
34/* Use Xbyak's memfd-based allocation, if available */
35#define XBYAK_USE_MEMFD
36#define XBYAK_NO_EXCEPTION
37#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
38/* turn off `size_t to other-type implicit casting` warning
39 * currently we have a lot of jit-generated instructions that
40 * take uint32_t, but we pass size_t (e.g. due to using sizeof).
41 * FIXME: replace size_t parameters with the appropriate ones */
42#pragma warning(disable : 4267)
43#endif
44#include "common/compiler_workarounds.hpp"
45#include "cpu/x64/xbyak/xbyak.h"
46#include "cpu/x64/xbyak/xbyak_util.h"
47
48namespace dnnl {
49namespace impl {
50namespace cpu {
51namespace x64 {
52
// Number of bits in `unsigned`, i.e. the upper bound on how many ISA feature
// bits plus hint bits can be encoded in one mask.
static constexpr int cpu_isa_total_bits
        = static_cast<int>(8 * sizeof(unsigned));
55
// Single-bit flags from which `cpu_isa_t` values are composed. Feature bits
// are allocated upwards from bit 0 and hint bits downwards from the top bit
// (cpu_isa_total_bits - 1), so the two groups grow towards each other.
enum cpu_isa_bit_t : unsigned {
    // Fill in features from least significant bit to most significant bit
    sse41_bit = 1u << 0,
    avx_bit = 1u << 1,
    avx2_bit = 1u << 2,
    avx_vnni_bit = 1u << 3,
    avx_vnni_2_bit = 1u << 4,
    avx512_core_bit = 1u << 5,
    avx512_core_vnni_bit = 1u << 6,
    avx512_core_bf16_bit = 1u << 7,
    avx512_core_fp16_bit = 1u << 8,
    amx_tile_bit = 1u << 9,
    amx_int8_bit = 1u << 10,
    amx_bf16_bit = 1u << 11,
    amx_fp16_bit = 1u << 12,
    // Fill in hints from most significant bit to least significant bit
    prefer_ymm_bit = 1u << (cpu_isa_total_bits - 1),
};
74
// Accessors for the CPU ISA hints; both are defined outside this header.
// NOTE(review): the meaning of `soft` is decided by the definition -- this
// header only forwards it (see is_hints_bit_set and mayiuse).
dnnl_cpu_isa_hints_t DNNL_API get_cpu_isa_hints(bool soft = false);
// NOTE(review): presumably installs `isa_hints` as the active hints and
// reports success/failure through status_t -- confirm against the definition.
status_t set_cpu_isa_hints(dnnl_cpu_isa_hints_t isa_hints);
77
78namespace cpu_isa_hints_utils {
/* Union of all hint bits declared inside cpu_isa_bit_t, i.e.
   hints_1 | hints_2 | ... | hints_n. Only prefer_ymm_bit exists so far.
   AND-ing a value with ~hints_mask strips the hints and keeps the ISA
   feature bits. */
static constexpr unsigned hints_mask = prefer_ymm_bit;
82
83static unsigned cvt2mask(dnnl_cpu_isa_hints_t hints) {
84 static const std::unordered_map<dnnl_cpu_isa_hints_t, unsigned,
85 std::hash<int>>
86 hints_map = {{dnnl_cpu_isa_no_hints, 0},
87 {dnnl_cpu_isa_prefer_ymm, prefer_ymm_bit}};
88
89 auto iter = hints_map.find(hints);
90 if (iter != hints_map.end())
91 return iter->second;
92 else {
93 assert(!"unexpected CPU ISA hint");
94 return 0;
95 }
96}
97
98static bool is_hints_bit_set(cpu_isa_bit_t hint_bit, bool soft) {
99 const dnnl_cpu_isa_hints_t hints = get_cpu_isa_hints(soft);
100 const unsigned cur_hints_mask = cpu_isa_hints_utils::cvt2mask(hints);
101 return (cur_hints_mask & hint_bit) == hint_bit;
102}
103} // namespace cpu_isa_hints_utils
104
// ISA identifiers expressed as bit masks over cpu_isa_bit_t. Each value ORs
// its own feature bit(s) with the value of the ISA it builds on, which lets
// compare_isa() decide subset/superset relations with plain mask arithmetic.
enum cpu_isa_t : unsigned {
    isa_undef = 0u,
    sse41 = sse41_bit,
    avx = avx_bit | sse41,
    avx2 = avx2_bit | avx,
    avx2_vnni = avx_vnni_bit | avx2,
    avx2_vnni_2 = avx2_vnni | avx_vnni_2_bit,
    avx512_core = avx512_core_bit | avx2,
    avx512_core_vnni = avx512_core_vnni_bit | avx512_core,
    avx512_core_bf16 = avx512_core_bf16_bit | avx512_core_vnni,
    // Same feature bits as avx512_core_bf16 plus the prefer_ymm hint bit.
    avx512_core_bf16_ymm = prefer_ymm_bit | avx512_core_bf16,
    // The amx_* values carry AMX bits only; they contain no SSE/AVX bits.
    amx_tile = amx_tile_bit,
    amx_int8 = amx_int8_bit | amx_tile,
    amx_bf16 = amx_bf16_bit | amx_tile,
    amx_fp16 = amx_fp16_bit | amx_tile,
    // Also includes avx_vnni_bit, which is not part of avx512_core_bf16.
    avx512_core_fp16 = avx512_core_fp16_bit | avx512_core_bf16 | avx_vnni_bit,
    avx512_core_amx = avx512_core_fp16 | amx_int8 | amx_bf16,
    avx512_core_amx_fp16 = avx512_core_amx | amx_fp16,
    // NOTES: 1. isa_all by default has no isa specific hints
    // (every bit set except the ones in hints_mask).
    isa_all = ~0u & ~cpu_isa_hints_utils::hints_mask,
};
126
// Selects the relation evaluated by compare_isa().
enum class cpu_isa_cmp_t {
    // List of infix comparison relations between two cpu_isa_t
    // where we take isa_1 and isa_2 to be two cpu_isa_t instances.

    // isa_1 SUBSET isa_2 if all feature flags supported by isa_1
    // are supported by isa_2 as well (equality allowed)
    SUBSET,

    // isa_1 SUPERSET isa_2 if all feature flags supported by isa_2
    // are supported by isa_1 as well (equality allowed)
    SUPERSET,

    // A few more options that (depending upon need) can be enabled in future

    // 1. PROPER_SUBSET: isa_1 SUBSET isa_2 and isa_1 != isa_2
    // 2. PROPER_SUPERSET: isa_1 SUPERSET isa_2 and isa_1 != isa_2
};
144
// ISA query/configuration entry points; all are defined outside this header.

// NOTE(review): presumably a human-readable description of the ISA in use --
// confirm against the definition.
const char *get_isa_info();

// NOTE(review): presumably the most capable ISA that may be used -- confirm.
cpu_isa_t get_max_cpu_isa();
// Mask of ISA bits that may be used. mayiuse() rejects any ISA whose feature
// bits are not all contained in this mask; `soft` is forwarded unchanged.
cpu_isa_t DNNL_API get_max_cpu_isa_mask(bool soft = false);
// NOTE(review): presumably caps the usable ISA at `isa` and reports the
// outcome through status_t -- confirm against the definition.
status_t set_max_cpu_isa(dnnl_cpu_isa_t isa);
// NOTE(review): presumably the public (dnnl_cpu_isa_t) form of the ISA that
// is effectively in use -- confirm against the definition.
dnnl_cpu_isa_t get_effective_cpu_isa();
151
152static inline bool compare_isa(
153 cpu_isa_t isa_1, cpu_isa_cmp_t cmp, cpu_isa_t isa_2) {
154 assert(isa_1 != isa_all);
155 assert(isa_2 != isa_all);
156 // Comparison with `isa_all` is illegal.
157 if (utils::one_of(isa_all, isa_1, isa_2)) return false;
158
159 // By default, comparison between ISA ignores ISA specific hints
160 unsigned mask_1
161 = static_cast<unsigned>(isa_1) & ~cpu_isa_hints_utils::hints_mask;
162 unsigned mask_2
163 = static_cast<unsigned>(isa_2) & ~cpu_isa_hints_utils::hints_mask;
164 unsigned mask_common = mask_1 & mask_2;
165
166 switch (cmp) {
167 case cpu_isa_cmp_t::SUBSET: return mask_1 == mask_common;
168 case cpu_isa_cmp_t::SUPERSET: return mask_2 == mask_common;
169 default: assert(!"unsupported comparison of isa"); return false;
170 }
171}
172
173static inline bool is_subset(cpu_isa_t isa_1, cpu_isa_t isa_2) {
174 return compare_isa(isa_1, cpu_isa_cmp_t::SUBSET, isa_2);
175}
176
177static inline bool is_superset(cpu_isa_t isa_1, cpu_isa_t isa_2) {
178 return compare_isa(isa_1, cpu_isa_cmp_t::SUPERSET, isa_2);
179}
180
181template <typename Vmm>
182struct vreg_traits {};
183
184template <>
185struct vreg_traits<Xbyak::Zmm> {
186 typedef Xbyak::Ymm Vmm_lower_t;
187 static constexpr size_t vlen = 64;
188};
189
190template <>
191struct vreg_traits<Xbyak::Ymm> {
192 typedef Xbyak::Xmm Vmm_lower_t;
193 static constexpr size_t vlen = 32;
194};
195
196template <>
197struct vreg_traits<Xbyak::Xmm> {
198 typedef Xbyak::Xmm Vmm_lower_t;
199 static constexpr size_t vlen = 16;
200};
201
// Compile-time properties of an ISA. The primary template is deliberately
// empty; the members come from the explicit specializations.
template <cpu_isa_t>
struct cpu_isa_traits {}; /* ::vlen -> 32 (for avx2) */
204
// Tile palette configuration record.
// NOTE(review): the field layout looks like the 64-byte memory operand of the
// AMX LDTILECFG instruction -- confirm against the users of this struct.
//
// pack struct so it can fit into a single 64-byte cache line:
// 1 + 1 + 14 + 16 * 2 + 16 * 1 == 64 bytes, with no padding.
#pragma pack(push, 1)
struct palette_config_t {
    uint8_t palette_id;
    uint8_t startRow;
    uint8_t reserved[14];
    uint16_t cols[16];
    uint8_t rows[16];
};
#pragma pack(pop)

// Fail the build if a compiler or a future edit changes the packed size.
static_assert(sizeof(palette_config_t) == 64,
        "palette_config_t must occupy exactly 64 bytes");
215
216template <>
217struct cpu_isa_traits<isa_all> {
218 static constexpr dnnl_cpu_isa_t user_option_val = dnnl_cpu_isa_default;
219 static constexpr const char *user_option_env = "default";
220};
221
222template <>
223struct cpu_isa_traits<sse41> {
224 typedef Xbyak::Xmm Vmm;
225 static constexpr int vlen_shift = 4;
226 static constexpr int vlen = vreg_traits<Vmm>::vlen;
227 static constexpr int n_vregs = 16;
228 static constexpr dnnl_cpu_isa_t user_option_val = dnnl_cpu_isa_sse41;
229 static constexpr const char *user_option_env = "sse41";
230};
231
232template <>
233struct cpu_isa_traits<avx> {
234 typedef Xbyak::Ymm Vmm;
235 static constexpr int vlen_shift = 5;
236 static constexpr int vlen = vreg_traits<Vmm>::vlen;
237 static constexpr int n_vregs = 16;
238 static constexpr dnnl_cpu_isa_t user_option_val = dnnl_cpu_isa_avx;
239 static constexpr const char *user_option_env = "avx";
240};
241
242template <>
243struct cpu_isa_traits<avx2> : public cpu_isa_traits<avx> {
244 static constexpr dnnl_cpu_isa_t user_option_val = dnnl_cpu_isa_avx2;
245 static constexpr const char *user_option_env = "avx2";
246};
247
248template <>
249struct cpu_isa_traits<avx2_vnni> : public cpu_isa_traits<avx2> {
250 static constexpr dnnl_cpu_isa_t user_option_val = dnnl_cpu_isa_avx2_vnni;
251 static constexpr const char *user_option_env = "avx2_vnni";
252};
253
254template <>
255struct cpu_isa_traits<avx2_vnni_2> : public cpu_isa_traits<avx2> {
256 static constexpr dnnl_cpu_isa_t user_option_val = dnnl_cpu_isa_avx2_vnni_2;
257 static constexpr const char *user_option_env = "avx2_vnni_2";
258};
259
260template <>
261struct cpu_isa_traits<avx512_core> {
262 typedef Xbyak::Zmm Vmm;
263 static constexpr int vlen_shift = 6;
264 static constexpr int vlen = vreg_traits<Vmm>::vlen;
265 static constexpr int n_vregs = 32;
266 static constexpr dnnl_cpu_isa_t user_option_val = dnnl_cpu_isa_avx512_core;
267 static constexpr const char *user_option_env = "avx512_core";
268};
269
270template <>
271struct cpu_isa_traits<avx512_core_vnni> : public cpu_isa_traits<avx512_core> {
272 static constexpr dnnl_cpu_isa_t user_option_val
273 = dnnl_cpu_isa_avx512_core_vnni;
274 static constexpr const char *user_option_env = "avx512_core_vnni";
275};
276
277template <>
278struct cpu_isa_traits<avx512_core_bf16> : public cpu_isa_traits<avx512_core> {
279 static constexpr dnnl_cpu_isa_t user_option_val
280 = dnnl_cpu_isa_avx512_core_bf16;
281 static constexpr const char *user_option_env = "avx512_core_bf16";
282};
283
284template <>
285struct cpu_isa_traits<avx512_core_amx> {
286 static constexpr dnnl_cpu_isa_t user_option_val
287 = dnnl_cpu_isa_avx512_core_amx;
288 static constexpr const char *user_option_env = "avx512_core_amx";
289};
290
291template <>
292struct cpu_isa_traits<avx512_core_fp16> : public cpu_isa_traits<avx512_core> {
293 static constexpr dnnl_cpu_isa_t user_option_val
294 = dnnl_cpu_isa_avx512_core_fp16;
295 static constexpr const char *user_option_env = "avx512_core_fp16";
296};
297
298template <>
299struct cpu_isa_traits<avx512_core_amx_fp16> {
300 static constexpr dnnl_cpu_isa_t user_option_val
301 = dnnl_cpu_isa_avx512_core_amx_fp16;
302 static constexpr const char *user_option_env = "avx512_core_amx_fp16";
303};
304
305inline const Xbyak::util::Cpu &cpu() {
306 const static Xbyak::util::Cpu cpu_;
307 return cpu_;
308}
309
// AMX-specific queries; all are defined outside this header.
namespace amx {

// Return the target palette for AMX instructions. Currently this is `0` if AMX
// instructions are not supported, and `1` if they are.
int get_target_palette();

// Limits for the given palette.
// NOTE(review): presumably the tile count, bytes per tile row and rows per
// tile of that palette -- confirm against the definitions.
int get_max_tiles(int palette);
int get_max_column_bytes(int palette);
int get_max_rows(int palette);
// Checked by mayiuse(amx_tile) in addition to the CPU feature flag.
bool DNNL_API is_available();

} // namespace amx
322
323namespace {
324
/* Reports whether `cpu_isa` may be used.
 *
 * Two conditions must hold:
 *  1. every feature bit of `cpu_isa` (hint bits excluded) is present in
 *     get_max_cpu_isa_mask(soft);
 *  2. the per-ISA check in the switch below succeeds. These checks query the
 *     CPU feature flags through cpu().has() and, for composite ISAs, recurse
 *     into mayiuse() for the ISAs they build on.
 *
 * `soft` is forwarded unchanged to get_max_cpu_isa_mask(), to the recursive
 * calls and to is_hints_bit_set(); its meaning is defined by those functions.
 *
 * isa_undef yields true; isa_all (and any value without a case) yields
 * false. */
static inline bool mayiuse(const cpu_isa_t cpu_isa, bool soft = false) {
    using namespace Xbyak::util;

    unsigned cpu_isa_mask = x64::get_max_cpu_isa_mask(soft);
    // Hints are not features: strip them before testing against the mask.
    unsigned cpu_isa_no_hints = cpu_isa & ~cpu_isa_hints_utils::hints_mask;

    // Reject ISAs that ask for a feature bit outside the allowed mask.
    if ((cpu_isa_mask & cpu_isa_no_hints) != cpu_isa_no_hints) return false;

    switch (cpu_isa) {
        case sse41: return cpu().has(Cpu::tSSE41);
        case avx: return cpu().has(Cpu::tAVX);
        case avx2: return cpu().has(Cpu::tAVX2);
        case avx2_vnni: return mayiuse(avx2, soft) && cpu().has(Cpu::tAVX_VNNI);
        case avx2_vnni_2:
            return mayiuse(avx2_vnni, soft) && cpu().has(Cpu::tAVX_VNNI_INT8)
                    && cpu().has(Cpu::tAVX_NE_CONVERT);
        case avx512_core:
            return cpu().has(Cpu::tAVX512F) && cpu().has(Cpu::tAVX512BW)
                    && cpu().has(Cpu::tAVX512VL) && cpu().has(Cpu::tAVX512DQ);
        case avx512_core_vnni:
            return cpu().has(Cpu::tAVX512F) && cpu().has(Cpu::tAVX512BW)
                    && cpu().has(Cpu::tAVX512VL) && cpu().has(Cpu::tAVX512DQ)
                    && cpu().has(Cpu::tAVX512_VNNI);
        case avx512_core_bf16:
            return mayiuse(avx512_core_vnni, soft)
                    && cpu().has(Cpu::tAVX512_BF16);
        // Same features as avx512_core_bf16, but additionally requires the
        // prefer_ymm hint to be set.
        case avx512_core_bf16_ymm:
            return mayiuse(avx512_core_bf16, soft)
                    && cpu_isa_hints_utils::is_hints_bit_set(
                            prefer_ymm_bit, soft);
        case avx512_core_fp16:
            return cpu().has(Cpu::tAVX512_FP16)
                    && mayiuse(avx512_core_bf16, soft)
                    && mayiuse(avx2_vnni, soft);
        // The CPU flag alone is not sufficient for AMX: amx::is_available()
        // must agree as well.
        case amx_tile:
            return cpu().has(Cpu::tAMX_TILE) && x64::amx::is_available();
        case amx_int8:
            return mayiuse(amx_tile, soft) && cpu().has(Cpu::tAMX_INT8);
        case amx_bf16:
            return mayiuse(amx_tile, soft) && cpu().has(Cpu::tAMX_BF16);
        case amx_fp16:
            return mayiuse(amx_tile, soft) && cpu().has(Cpu::tAMX_FP16);
        case avx512_core_amx:
            return mayiuse(amx_int8, soft) && mayiuse(amx_bf16, soft)
                    && mayiuse(avx512_core_fp16, soft);
        case avx512_core_amx_fp16:
            return mayiuse(avx512_core_amx, soft) && mayiuse(amx_fp16, soft);
        case isa_undef: return true;
        case isa_all: return false;
    }
    return false;
}
377
378static inline bool isa_has_bf16(cpu_isa_t isa) {
379 return is_superset(isa, avx512_core_bf16);
380}
381
382static inline bool isa_has_masks(cpu_isa_t isa) {
383 return is_superset(isa, avx512_core);
384}
385
386static inline int isa_max_vlen(cpu_isa_t isa) {
387 if (is_superset(isa, avx512_core))
388 return cpu_isa_traits<avx512_core>::vlen;
389 else if (is_superset(isa, avx2))
390 return cpu_isa_traits<avx2>::vlen;
391 else if (is_superset(isa, sse41))
392 return cpu_isa_traits<sse41>::vlen;
393 else
394 return 0;
395}
396
397static inline int isa_num_vregs(cpu_isa_t isa) {
398 if (is_superset(isa, avx512_core))
399 return cpu_isa_traits<avx512_core>::n_vregs;
400 else if (is_superset(isa, avx2))
401 return cpu_isa_traits<avx2>::n_vregs;
402 else if (is_superset(isa, sse41))
403 return cpu_isa_traits<sse41>::n_vregs;
404 else
405 return 0;
406}
407
408} // namespace
409
410/* whatever is required to generate string literals... */
411#include "common/z_magic.hpp"
/* Builds an implementation-name string from `prefix` and `isa`.
 * For the ISAs listed below the result is `prefix` followed by the
 * STRINGIFY-ed ISA name (STRINGIFY comes from z_magic.hpp; presumably it
 * turns its argument into a string literal). Any ISA that is not listed
 * (e.g. avx512_core_bf16_ymm, the amx_* values, isa_all) yields
 * `prefix suffix_if_any`. `prefix` and `suffix_if_any` are placed next to
 * other string literals, so they are expected to be string literals
 * themselves. */
/* clang-format off */
#define JIT_IMPL_NAME_HELPER(prefix, isa, suffix_if_any) \
    ((isa) == isa_undef ? prefix STRINGIFY(undef) : \
    (isa) == sse41 ? prefix STRINGIFY(sse41) : \
    (isa) == avx ? prefix STRINGIFY(avx) : \
    (isa) == avx2 ? prefix STRINGIFY(avx2) : \
    (isa) == avx2_vnni ? prefix STRINGIFY(avx2_vnni) : \
    (isa) == avx2_vnni_2 ? prefix STRINGIFY(avx2_vnni_2) : \
    (isa) == avx512_core ? prefix STRINGIFY(avx512_core) : \
    (isa) == avx512_core_vnni ? prefix STRINGIFY(avx512_core_vnni) : \
    (isa) == avx512_core_bf16 ? prefix STRINGIFY(avx512_core_bf16) : \
    (isa) == avx512_core_fp16 ? prefix STRINGIFY(avx512_core_fp16) : \
    (isa) == avx512_core_amx ? prefix STRINGIFY(avx512_core_amx) : \
    (isa) == avx512_core_amx_fp16 ? prefix STRINGIFY(avx512_core_amx_fp16) : \
    prefix suffix_if_any)
/* clang-format on */
428
429inline size_t data_type_vnni_granularity(data_type_t data_type) {
430 using namespace data_type;
431 switch (data_type) {
432 case f32:
433 case s32: return size_t(1);
434 case f16:
435 case bf16: return size_t(2);
436 case s8:
437 case u8: return size_t(4);
438 case data_type::undef:
439 default: assert(!"unknown data_type");
440 }
441 return size_t(0); /* should not be reachable */
442}
443
444template <cpu_isa_t isa>
445inline size_t data_type_vnni_simd_elems(data_type_t data_type) {
446 const size_t dt_size = types::data_type_size(data_type);
447 assert(dt_size > 0);
448 return cpu_isa_traits<isa>::vlen / dt_size;
449}
450
451} // namespace x64
452} // namespace cpu
453} // namespace impl
454} // namespace dnnl
455
456#endif
457