1/*******************************************************************************
2* Copyright 2020-2022 Intel Corporation
3* Copyright 2020 FUJITSU LIMITED
4* Copyright 2022 Arm Ltd. and affiliates
5*
6* Licensed under the Apache License, Version 2.0 (the "License");
7* you may not use this file except in compliance with the License.
8* You may obtain a copy of the License at
9*
10* http://www.apache.org/licenses/LICENSE-2.0
11*
12* Unless required by applicable law or agreed to in writing, software
13* distributed under the License is distributed on an "AS IS" BASIS,
14* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15* See the License for the specific language governing permissions and
16* limitations under the License.
17*******************************************************************************/
18
19#include <thread>
20
21#include "cpu/platform.hpp"
22
23#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
24#include <algorithm>
25
26#if defined(_WIN32)
27#include <windows.h>
28#elif defined(__GLIBC__)
29#include <sched.h>
30#endif
31#endif
32
33#if DNNL_X64
34#include "cpu/x64/cpu_isa_traits.hpp"
35#elif DNNL_AARCH64
36#include "cpu/aarch64/cpu_isa_traits.hpp"
37#if DNNL_AARCH64_USE_ACL
38// For checking if fp16 isa is supported on the platform
39#include "arm_compute/core/CPP/CPPTypes.h"
40// For setting the number of threads for ACL
41#include "src/common/cpuinfo/CpuInfo.h"
42#endif
43#endif
44
45// For DNNL_X64 build we compute the timestamp using rdtsc. Use std::chrono for
46// other builds.
47#if !DNNL_X64
48#include <chrono>
49#endif
50
51namespace dnnl {
52namespace impl {
53namespace cpu {
54namespace platform {
55
// Returns a human-readable description of the ISA in use. The string is
// supplied by the architecture-specific backend on x64 and AArch64 builds; all
// other builds report "Generic".
const char *get_isa_info() {
#if DNNL_X64
    const char *isa_description = x64::get_isa_info();
#elif DNNL_AARCH64
    const char *isa_description = aarch64::get_isa_info();
#else
    const char *isa_description = "Generic";
#endif
    return isa_description;
}
65
66dnnl_cpu_isa_t get_effective_cpu_isa() {
67#if DNNL_X64
68 return x64::get_effective_cpu_isa();
69#elif DNNL_AARCH64
70 return aarch64::get_effective_cpu_isa();
71#else
72 return dnnl_cpu_isa_default;
73#endif
74}
75
76status_t set_max_cpu_isa(dnnl_cpu_isa_t isa) {
77#if DNNL_X64
78 return x64::set_max_cpu_isa(isa);
79#else
80 return status::unimplemented;
81#endif
82}
83
84status_t set_cpu_isa_hints(dnnl_cpu_isa_hints_t isa_hints) {
85#if DNNL_X64
86 return x64::set_cpu_isa_hints(isa_hints);
87#else
88 return status::unimplemented;
89#endif
90}
91
92dnnl_cpu_isa_hints_t get_cpu_isa_hints() {
93#if DNNL_X64
94 return x64::get_cpu_isa_hints();
95#else
96 return dnnl_cpu_isa_no_hints;
97#endif
98}
99
// Tells whether the x64 ISA hints are set to dnnl_cpu_isa_prefer_ymm.
// Always false on non-x64 builds.
bool prefer_ymm_requested() {
#if DNNL_X64
    return x64::get_cpu_isa_hints() == dnnl_cpu_isa_prefer_ymm;
#else
    return false;
#endif
}
108
109bool has_data_type_support(data_type_t data_type) {
110 // Notice: see notes in header
111 switch (data_type) {
112 case data_type::bf16:
113#if DNNL_X64
114 return x64::mayiuse(x64::avx512_core)
115 || x64::mayiuse(x64::avx2_vnni_2);
116#elif DNNL_PPC64
117#if defined(USE_CBLAS) && defined(BLAS_HAS_SBGEMM) && defined(__MMA__)
118 return true;
119#endif
120#else
121 return false;
122#endif
123 case data_type::f16:
124#if DNNL_X64
125 return x64::mayiuse(x64::avx512_core_fp16)
126 || x64::mayiuse(x64::avx2_vnni_2);
127#elif DNNL_AARCH64_USE_ACL
128 return arm_compute::CPUInfo::get().has_fp16();
129#else
130 return false;
131#endif
132 default: return true;
133 }
134}
135
136bool has_training_support(data_type_t data_type) {
137 // TODO: maybe return false for int8, but some primitives like prelu
138 // have training support
139 switch (data_type) {
140 case data_type::bf16:
141#if DNNL_X64
142 return x64::mayiuse(x64::avx512_core);
143#elif DNNL_PPC64
144#if defined(USE_CBLAS) && defined(BLAS_HAS_SBGEMM) && defined(__MMA__)
145 return true;
146#endif
147#else
148 return false;
149#endif
150 case data_type::f16:
151#if DNNL_X64
152 return x64::mayiuse(x64::avx512_core_fp16);
153#else
154 return false;
155#endif
156 default: return true;
157 }
158}
159
// Returns the scale factor applied to s8s8 weights: 1.0 when avx512_core_vnni
// is usable on x64 and 0.5 when it is not. Non-x64 builds always use 1.0.
float s8s8_weights_scale_factor() {
#if DNNL_X64
    if (x64::mayiuse(x64::avx512_core_vnni)) return 1.0f;
    return 0.5f;
#else
    return 1.0f;
#endif
}
167
// Returns the data cache size in bytes available to a single core at the given
// cache level (1-based).
//
// On x64 the size is the cache size reported by the CPU divided by the number
// of cores sharing that cache; levels outside the reported range yield 0. When
// the CPU reports no data cache levels at all, or on non-x64 builds, built-in
// estimates are used instead (32 KiB / 512 KiB / 1 MiB for levels 1-3, and 0
// for any other level).
unsigned get_per_core_cache_size(int level) {
    const auto fallback_size = [](int cache_level) -> unsigned {
        if (cache_level == 1) return 32U * 1024;
        if (cache_level == 2) return 512U * 1024;
        if (cache_level == 3) return 1024U * 1024;
        return 0U;
    };

#if DNNL_X64
    using namespace x64;
    const auto reported_levels = cpu().getDataCacheLevels();
    if (reported_levels == 0) return fallback_size(level);

    const bool level_is_reported
            = level > 0 && (unsigned)level <= reported_levels;
    if (!level_is_reported) return 0;

    // Cache queries are 0-based while `level` is 1-based.
    const unsigned cache_idx = level - 1;
    return cpu().getDataCacheSize(cache_idx)
            / cpu().getCoresSharingDataCache(cache_idx);
#else
    return fallback_size(level);
#endif
}
191
// Returns the number of cores: the Xbyak core-level count on x64, ACL's
// thread-count hint on AArch64 builds with ACL, and 1 on every other build.
unsigned get_num_cores() {
#if DNNL_X64
    const unsigned core_count = x64::cpu().getNumCores(Xbyak::util::CoreLevel);
#elif DNNL_AARCH64_USE_ACL
    const unsigned core_count = arm_compute::cpuinfo::num_threads_hint();
#else
    const unsigned core_count = 1;
#endif
    return core_count;
}
201
202#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
203// The purpose of this function is to return the potential maximum number of
204// threads in user's threadpool. It is assumed that the number of threads in an
205// actual threadpool will not exceed the number cores in a socket reported by
206// the OS, which may or may not be equal to the number of total physical cores
207// in a socket depending on the OS configuration (read -- VM environment). In
208// order to simulate the number of cores available in such environment, this
209// function supports process affinity.
210unsigned get_max_threads_to_use() {
211 // TODO: the logic below should involve number of sockets to provide exact
212 // number of cores on 2+ socket systems.
213 int num_cores_per_socket = (int)dnnl::impl::cpu::platform::get_num_cores();
214 // It may happen that XByak doesn't get num of threads identified, e.g. for
215 // AMD. In order to make threadpool working, we supply an additional
216 // condition to have some reasonable number of threads available at
217 // primitive descriptor creation time.
218 if (num_cores_per_socket == 0)
219 num_cores_per_socket = std::thread::hardware_concurrency();
220
221#if defined(_WIN32)
222 DWORD_PTR proc_affinity_mask;
223 DWORD_PTR sys_affinity_mask;
224 if (GetProcessAffinityMask(
225 GetCurrentProcess(), &proc_affinity_mask, &sys_affinity_mask)) {
226 int masked_nthr = 0;
227 for (int i = 0; i < CHAR_BIT * sizeof(proc_affinity_mask);
228 i++, proc_affinity_mask >>= 1)
229 masked_nthr += proc_affinity_mask & 1;
230 return std::min(masked_nthr, num_cores_per_socket);
231 }
232#elif defined(__GLIBC__)
233 cpu_set_t cpu_set;
234 // Check if the affinity of the process has been set using, e.g.,
235 // numactl.
236 if (::sched_getaffinity(0, sizeof(cpu_set_t), &cpu_set) == 0)
237 return std::min(CPU_COUNT(&cpu_set), num_cores_per_socket);
238#endif
239 return num_cores_per_socket;
240}
241#endif
242
// Returns the vector length (`cpu_isa_traits<isa>::vlen`) of the widest vector
// ISA that may be used on this machine, or 0 when none of the probed ISAs is
// usable (including builds that are neither x64 nor AArch64).
//
// ISAs are probed from widest to narrowest so that the first match is the
// largest register size available.
int get_vector_register_size() {
#if DNNL_X64
    using namespace x64;
    if (mayiuse(avx512_core)) return cpu_isa_traits<avx512_core>::vlen;
    if (mayiuse(avx)) return cpu_isa_traits<avx>::vlen;
    if (mayiuse(sse41)) return cpu_isa_traits<sse41>::vlen;
#elif DNNL_AARCH64
    using namespace aarch64;
    // SVE must be probed before ASIMD: SVE-capable hardware also provides
    // ASIMD, so checking ASIMD first would make the SVE check unreachable.
    if (mayiuse(sve_512)) return cpu_isa_traits<sve_512>::vlen;
    if (mayiuse(asimd)) return cpu_isa_traits<asimd>::vlen;
#endif
    return 0;
}
256
/* Provides a very cheap timestamp (used primarily by the primitive cache).
 * On DNNL_X64 builds the value comes from *rdtsc*, which gives a timestamp
 * that (i) is independent for each core, and (ii) is synchronized across
 * cores in multiple sockets.
 * TODO: other builds currently rely on std::chrono::steady_clock; a more
 * optimized function may be called here instead.
 */
size_t get_timestamp() {
#if DNNL_X64
    const auto ticks = Xbyak::util::Clock::getRdtsc();
#else
    const auto ticks
            = std::chrono::steady_clock::now().time_since_epoch().count();
#endif
    return static_cast<size_t>(ticks);
}
273
274} // namespace platform
275} // namespace cpu
276} // namespace impl
277} // namespace dnnl
278