1 | /******************************************************************************* |
2 | * Copyright 2020-2022 Intel Corporation |
3 | * Copyright 2020 FUJITSU LIMITED |
4 | * Copyright 2022 Arm Ltd. and affiliates |
5 | * |
6 | * Licensed under the Apache License, Version 2.0 (the "License"); |
7 | * you may not use this file except in compliance with the License. |
8 | * You may obtain a copy of the License at |
9 | * |
10 | * http://www.apache.org/licenses/LICENSE-2.0 |
11 | * |
12 | * Unless required by applicable law or agreed to in writing, software |
13 | * distributed under the License is distributed on an "AS IS" BASIS, |
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
15 | * See the License for the specific language governing permissions and |
16 | * limitations under the License. |
17 | *******************************************************************************/ |
18 | |
19 | #include <thread> |
20 | |
21 | #include "cpu/platform.hpp" |
22 | |
23 | #if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL |
24 | #include <algorithm> |
25 | |
26 | #if defined(_WIN32) |
27 | #include <windows.h> |
28 | #elif defined(__GLIBC__) |
29 | #include <sched.h> |
30 | #endif |
31 | #endif |
32 | |
33 | #if DNNL_X64 |
34 | #include "cpu/x64/cpu_isa_traits.hpp" |
35 | #elif DNNL_AARCH64 |
36 | #include "cpu/aarch64/cpu_isa_traits.hpp" |
37 | #if DNNL_AARCH64_USE_ACL |
38 | // For checking if fp16 isa is supported on the platform |
39 | #include "arm_compute/core/CPP/CPPTypes.h" |
40 | // For setting the number of threads for ACL |
41 | #include "src/common/cpuinfo/CpuInfo.h" |
42 | #endif |
43 | #endif |
44 | |
45 | // For DNNL_X64 build we compute the timestamp using rdtsc. Use std::chrono for |
46 | // other builds. |
47 | #if !DNNL_X64 |
48 | #include <chrono> |
49 | #endif |
50 | |
51 | namespace dnnl { |
52 | namespace impl { |
53 | namespace cpu { |
54 | namespace platform { |
55 | |
// Returns the ISA description string provided by the architecture-specific
// backend (x64 or AArch64). Builds for any other architecture report
// "Generic".
const char *get_isa_info() {
#if DNNL_X64
    const char *isa_name = x64::get_isa_info();
#elif DNNL_AARCH64
    const char *isa_name = aarch64::get_isa_info();
#else
    const char *isa_name = "Generic";
#endif
    return isa_name;
}
65 | |
66 | dnnl_cpu_isa_t get_effective_cpu_isa() { |
67 | #if DNNL_X64 |
68 | return x64::get_effective_cpu_isa(); |
69 | #elif DNNL_AARCH64 |
70 | return aarch64::get_effective_cpu_isa(); |
71 | #else |
72 | return dnnl_cpu_isa_default; |
73 | #endif |
74 | } |
75 | |
76 | status_t set_max_cpu_isa(dnnl_cpu_isa_t isa) { |
77 | #if DNNL_X64 |
78 | return x64::set_max_cpu_isa(isa); |
79 | #else |
80 | return status::unimplemented; |
81 | #endif |
82 | } |
83 | |
84 | status_t set_cpu_isa_hints(dnnl_cpu_isa_hints_t isa_hints) { |
85 | #if DNNL_X64 |
86 | return x64::set_cpu_isa_hints(isa_hints); |
87 | #else |
88 | return status::unimplemented; |
89 | #endif |
90 | } |
91 | |
92 | dnnl_cpu_isa_hints_t get_cpu_isa_hints() { |
93 | #if DNNL_X64 |
94 | return x64::get_cpu_isa_hints(); |
95 | #else |
96 | return dnnl_cpu_isa_no_hints; |
97 | #endif |
98 | } |
99 | |
// Tells whether the x64 ISA hints are exactly dnnl_cpu_isa_prefer_ymm.
// Always false on non-x64 builds.
bool prefer_ymm_requested() {
#if !DNNL_X64
    return false;
#else
    return x64::get_cpu_isa_hints() == dnnl_cpu_isa_prefer_ymm;
#endif
}
108 | |
109 | bool has_data_type_support(data_type_t data_type) { |
110 | // Notice: see notes in header |
111 | switch (data_type) { |
112 | case data_type::bf16: |
113 | #if DNNL_X64 |
114 | return x64::mayiuse(x64::avx512_core) |
115 | || x64::mayiuse(x64::avx2_vnni_2); |
116 | #elif DNNL_PPC64 |
117 | #if defined(USE_CBLAS) && defined(BLAS_HAS_SBGEMM) && defined(__MMA__) |
118 | return true; |
119 | #endif |
120 | #else |
121 | return false; |
122 | #endif |
123 | case data_type::f16: |
124 | #if DNNL_X64 |
125 | return x64::mayiuse(x64::avx512_core_fp16) |
126 | || x64::mayiuse(x64::avx2_vnni_2); |
127 | #elif DNNL_AARCH64_USE_ACL |
128 | return arm_compute::CPUInfo::get().has_fp16(); |
129 | #else |
130 | return false; |
131 | #endif |
132 | default: return true; |
133 | } |
134 | } |
135 | |
136 | bool has_training_support(data_type_t data_type) { |
137 | // TODO: maybe return false for int8, but some primitives like prelu |
138 | // have training support |
139 | switch (data_type) { |
140 | case data_type::bf16: |
141 | #if DNNL_X64 |
142 | return x64::mayiuse(x64::avx512_core); |
143 | #elif DNNL_PPC64 |
144 | #if defined(USE_CBLAS) && defined(BLAS_HAS_SBGEMM) && defined(__MMA__) |
145 | return true; |
146 | #endif |
147 | #else |
148 | return false; |
149 | #endif |
150 | case data_type::f16: |
151 | #if DNNL_X64 |
152 | return x64::mayiuse(x64::avx512_core_fp16); |
153 | #else |
154 | return false; |
155 | #endif |
156 | default: return true; |
157 | } |
158 | } |
159 | |
// Returns the scale factor for s8s8 weights: 1.0f when avx512_core_vnni is
// usable on x64, 0.5f on x64 without it, and 1.0f on every non-x64 build.
float s8s8_weights_scale_factor() {
#if DNNL_X64
    if (x64::mayiuse(x64::avx512_core_vnni)) return 1.0f;
    return 0.5f;
#else
    return 1.0f;
#endif
}
167 | |
// Returns the per-core data cache size, in bytes, for the given 1-based cache
// `level`.
//
// On x64 the size reported for that level is divided by the number of cores
// sharing it; a level outside [1, number of data cache levels] yields 0. When
// no cache levels are reported, or on non-x64 builds, a fixed estimate is
// used instead: 32 KiB (L1), 512 KiB (L2), 1 MiB (L3), and 0 for any other
// level.
unsigned get_per_core_cache_size(int level) {
    const auto fallback_size = [](int lvl) -> unsigned {
        constexpr unsigned kib = 1024U;
        if (lvl == 1) return 32U * kib;
        if (lvl == 2) return 512U * kib;
        if (lvl == 3) return 1024U * kib;
        return 0U;
    };

#if DNNL_X64
    using namespace x64;
    const unsigned n_levels = cpu().getDataCacheLevels();
    // No cache topology detected: rely on the estimate.
    if (n_levels == 0) return fallback_size(level);

    // Reject levels the CPU does not report.
    if (level <= 0 || (unsigned)level > n_levels) return 0;

    const unsigned idx = level - 1; // cache queries are 0-based
    return cpu().getDataCacheSize(idx) / cpu().getCoresSharingDataCache(idx);
#else
    return fallback_size(level);
#endif
}
191 | |
// Returns the number of cores: the core-level count from Xbyak on x64, the
// ACL thread-count hint on AArch64 builds with ACL, and 1 on every other
// build.
unsigned get_num_cores() {
#if DNNL_X64
    const unsigned n_cores = x64::cpu().getNumCores(Xbyak::util::CoreLevel);
#elif DNNL_AARCH64_USE_ACL
    const unsigned n_cores = arm_compute::cpuinfo::num_threads_hint();
#else
    const unsigned n_cores = 1;
#endif
    return n_cores;
}
201 | |
202 | #if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL |
203 | // The purpose of this function is to return the potential maximum number of |
204 | // threads in user's threadpool. It is assumed that the number of threads in an |
205 | // actual threadpool will not exceed the number cores in a socket reported by |
206 | // the OS, which may or may not be equal to the number of total physical cores |
207 | // in a socket depending on the OS configuration (read -- VM environment). In |
208 | // order to simulate the number of cores available in such environment, this |
209 | // function supports process affinity. |
210 | unsigned get_max_threads_to_use() { |
211 | // TODO: the logic below should involve number of sockets to provide exact |
212 | // number of cores on 2+ socket systems. |
213 | int num_cores_per_socket = (int)dnnl::impl::cpu::platform::get_num_cores(); |
214 | // It may happen that XByak doesn't get num of threads identified, e.g. for |
215 | // AMD. In order to make threadpool working, we supply an additional |
216 | // condition to have some reasonable number of threads available at |
217 | // primitive descriptor creation time. |
218 | if (num_cores_per_socket == 0) |
219 | num_cores_per_socket = std::thread::hardware_concurrency(); |
220 | |
221 | #if defined(_WIN32) |
222 | DWORD_PTR proc_affinity_mask; |
223 | DWORD_PTR sys_affinity_mask; |
224 | if (GetProcessAffinityMask( |
225 | GetCurrentProcess(), &proc_affinity_mask, &sys_affinity_mask)) { |
226 | int masked_nthr = 0; |
227 | for (int i = 0; i < CHAR_BIT * sizeof(proc_affinity_mask); |
228 | i++, proc_affinity_mask >>= 1) |
229 | masked_nthr += proc_affinity_mask & 1; |
230 | return std::min(masked_nthr, num_cores_per_socket); |
231 | } |
232 | #elif defined(__GLIBC__) |
233 | cpu_set_t cpu_set; |
234 | // Check if the affinity of the process has been set using, e.g., |
235 | // numactl. |
236 | if (::sched_getaffinity(0, sizeof(cpu_set_t), &cpu_set) == 0) |
237 | return std::min(CPU_COUNT(&cpu_set), num_cores_per_socket); |
238 | #endif |
239 | return num_cores_per_socket; |
240 | } |
241 | #endif |
242 | |
// Returns cpu_isa_traits<isa>::vlen for the first ISA, probed from widest to
// narrowest, that mayiuse() reports as usable:
//   x64:     avx512_core, avx, sse41
//   AArch64: sve_512, asimd
// Returns 0 when none of them is usable or on any other architecture.
int get_vector_register_size() {
#if DNNL_X64
    using namespace x64;
    if (mayiuse(avx512_core)) return cpu_isa_traits<avx512_core>::vlen;
    if (mayiuse(avx)) return cpu_isa_traits<avx>::vlen;
    if (mayiuse(sse41)) return cpu_isa_traits<sse41>::vlen;
#elif DNNL_AARCH64
    using namespace aarch64;
    // Probe the wider ISA first, mirroring the x64 branch. Checking asimd
    // first would make the sve_512 probe unreachable on SVE hardware.
    // NOTE(review): assumes mayiuse(asimd) holds whenever mayiuse(sve_512)
    // does -- confirm against aarch64::mayiuse.
    if (mayiuse(sve_512)) return cpu_isa_traits<sve_512>::vlen;
    if (mayiuse(asimd)) return cpu_isa_traits<asimd>::vlen;
#endif
    return 0;
}
256 | |
/* Provides a very cheap timestamp (used primarily for the primitive cache).
 * On DNNL_X64 the value is read with *rdtsc*, which gives a timestamp that
 * (i) is independent for each core, and (ii) is synchronized across cores in
 * multiple sockets.
 * TODO: other builds currently rely on std::chrono::steady_clock; a more
 * optimized source may be substituted here later.
 */
size_t get_timestamp() {
#if DNNL_X64
    const auto tsc = Xbyak::util::Clock::getRdtsc();
    return static_cast<size_t>(tsc);
#else
    const auto since_epoch
            = std::chrono::steady_clock::now().time_since_epoch();
    return static_cast<size_t>(since_epoch.count());
#endif
}
273 | |
274 | } // namespace platform |
275 | } // namespace cpu |
276 | } // namespace impl |
277 | } // namespace dnnl |
278 | |