1 | /* Copyright 2016 The TensorFlow Authors. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | |
16 | #ifndef TENSORFLOW_TSL_PLATFORM_CPU_INFO_H_ |
17 | #define TENSORFLOW_TSL_PLATFORM_CPU_INFO_H_ |
18 | |
19 | #include <string> |
20 | |
21 | // TODO(ahentz): This is not strictly required here but, for historical |
22 | // reasons, many people depend on cpu_info.h in order to use kLittleEndian. |
23 | #include "tensorflow/tsl/platform/byte_order.h" |
24 | |
25 | #if defined(_MSC_VER) |
26 | // Included so that the __cpuidex function is available for GETCPUID on Windows. |
27 | #include <intrin.h> |
28 | #endif |
29 | |
30 | namespace tsl { |
31 | namespace port { |
32 | |
33 | // Returns an estimate of the number of schedulable CPUs for this |
34 | // process. The value is usually constant throughout the lifetime of the |
35 | // process, but it may change if the underlying cluster management |
36 | // software adjusts it dynamically. If the underlying call fails, a default |
37 | // value (e.g. `4`) may be returned, so a failure cannot be detected from the result. |
38 | int NumSchedulableCPUs(); |
39 | |
40 | // Returns an estimate of the maximum parallelism for this process. |
41 | // Applications should avoid running more than this number of threads with |
42 | // intensive workloads concurrently; exceeding it risks performance |
43 | // degradation and contention. |
44 | // The value is either the number of schedulable CPUs or a value specific to |
45 | // the underlying cluster management. Callers should assume that it can |
46 | // change throughout the lifetime of the process. This function must not be |
47 | // called during initialization, i.e., before main() has started. |
48 | int MaxParallelism(); |
49 | |
50 | // Overload for a specific NUMA node: returns an estimate of the maximum |
51 | // parallelism for this process on `numa_node`, or on any NUMA node if |
52 | // `numa_node` is kNUMANoAffinity. See MaxParallelism() above for more information. |
53 | int MaxParallelism(int numa_node); |
54 | |
55 | // NumTotalCPUs() returns the total number of CPUs on the system. This number |
56 | // should not change even if the underlying cluster management software |
57 | // changes the number of schedulable CPUs. Unlike `NumSchedulableCPUs`, if the |
58 | // underlying call fails, an invalid value of -1 is returned, so callers must |
59 | // check the result for validity. NOTE(review): kUnknownCPU below is presumably the named form of that -1 -- confirm. |
60 | static constexpr int kUnknownCPU = -1; |
61 | int NumTotalCPUs(); |
62 | |
63 | // Returns the id of the current CPU, or -1 if the current CPU cannot be |
64 | // identified. On success, the id lies in the half-open range [0, NumTotalCPUs()). |
65 | int GetCurrentCPU(); |
66 | |
67 | // Returns an estimate of the number of hyperthreads per physical core on the |
68 | // CPU. See also CPUIDNumSMT(), declared later in this header. |
69 | int NumHyperthreadsPerCore(); |
70 | |
71 | // CPU capabilities (mostly ISA-related features) that can be passed to TestCPUFeature(). |
72 | enum CPUFeature { |
73 | // Do not change numeric assignments. The value 24 is unassigned: the list jumps from BMI2 = 23 to HYPERVISOR = 25. |
74 | MMX = 0, |
75 | SSE = 1, |
76 | SSE2 = 2, |
77 | SSE3 = 3, |
78 | SSSE3 = 4, |
79 | SSE4_1 = 5, |
80 | SSE4_2 = 6, |
81 | CMOV = 7, |
82 | CMPXCHG8B = 8, |
83 | CMPXCHG16B = 9, |
84 | POPCNT = 10, |
85 | AES = 11, |
86 | AVX = 12, |
87 | RDRAND = 13, |
88 | AVX2 = 14, |
89 | FMA = 15, |
90 | F16C = 16, |
91 | PCLMULQDQ = 17, |
92 | RDSEED = 18, |
93 | ADX = 19, |
94 | SMAP = 20, |
95 | |
96 | // Prefetch Vector Data Into Caches with Intent to Write and T1 Hint |
97 | // http://www.felixcloutier.com/x86/PREFETCHWT1.html. |
98 | // You probably want PREFETCHW (declared below) instead. |
99 | PREFETCHWT1 = 21, |
100 | |
101 | BMI1 = 22, |
102 | BMI2 = 23, |
103 | HYPERVISOR = 25, // 0 when on a real CPU, 1 when on a (well-behaved) hypervisor. |
104 | |
105 | // Prefetch Data into Caches in Anticipation of a Write (3D Now!). |
106 | // http://www.felixcloutier.com/x86/PREFETCHW.html |
107 | PREFETCHW = 26, |
108 | |
109 | // AVX-512: 512-bit vectors (plus masking, etc.) in Knights Landing, |
110 | // Skylake, Xeon, etc. Each of these entries is a different subset of |
111 | // instructions; various combinations of them occur on various CPU types. |
112 | AVX512F = 27, // Foundation |
113 | AVX512CD = 28, // Conflict detection |
114 | AVX512ER = 29, // Exponential and reciprocal |
115 | AVX512PF = 30, // Prefetching |
116 | AVX512VL = 31, // Shorter vector lengths |
117 | AVX512BW = 32, // Byte and word |
118 | AVX512DQ = 33, // Dword and qword |
119 | AVX512VBMI = 34, // Bit manipulation. NOTE(review): VBMI is usually expanded as "Vector Byte Manipulation Instructions" -- confirm wording. |
120 | AVX512IFMA = 35, // Integer multiply-add |
121 | AVX512_4VNNIW = 36, // Integer neural network (Intel Xeon Phi only) |
122 | AVX512_4FMAPS = 37, // Floating point neural network (Intel Xeon Phi only) |
123 | AVX512_VNNI = 38, // Integer neural network |
124 | AVX512_BF16 = 39, // Bfloat16 neural network |
125 | |
126 | // AVX version of AVX512_VNNI in CPUs such as Alder Lake and Sapphire Rapids. |
127 | AVX_VNNI = 40, // Integer neural network |
128 | |
129 | // AMX: Advanced Matrix Extension in Sapphire Rapids. |
130 | // Performs matrix multiplication on the Tile Matrix Multiply (TMUL) unit, |
131 | // supporting two popular data types in neural networks, int8 and bfloat16. |
132 | AMX_TILE = 41, // Tile configuration and load/store |
133 | AMX_INT8 = 42, // Int8 tile matrix multiplication |
134 | AMX_BF16 = 43, // Bfloat16 tile matrix multiplication |
135 | }; |
136 | |
137 | // Checks whether the current processor supports `feature`, one of the CPUFeature values above. |
138 | // Checks CPU registers to determine the hardware capabilities. |
139 | bool TestCPUFeature(CPUFeature feature); |
140 | |
141 | // Returns the CPU vendor ID string (e.g. 'GenuineIntel', 'AuthenticAMD', etc.) |
142 | std::string CPUVendorIDString(); |
143 | |
144 | // Returns the CPU family. NOTE(review): presumably the family number reported by CPUID on x86 -- confirm against the implementation. |
145 | int CPUFamily(); |
146 | |
147 | // Returns the model number of the CPU. |
148 | int CPUModelNum(); |
149 | |
150 | // Returns the nominal core clock rate of each processor, in processor cycles per second. |
151 | double NominalCPUFrequency(); |
152 | |
153 | // Returns the number of hyperthreads per physical core. NOTE(review): the name suggests this is read via CPUID, whereas NumHyperthreadsPerCore() is documented as an estimate -- confirm how the two relate. |
154 | int CPUIDNumSMT(); |
155 | |
156 | } // namespace port |
157 | } // namespace tsl |
158 | |
159 | #endif // TENSORFLOW_TSL_PLATFORM_CPU_INFO_H_ |
160 | |