1/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2
3Licensed under the Apache License, Version 2.0 (the "License");
4you may not use this file except in compliance with the License.
5You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9Unless required by applicable law or agreed to in writing, software
10distributed under the License is distributed on an "AS IS" BASIS,
11WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12See the License for the specific language governing permissions and
13limitations under the License.
14==============================================================================*/
15
16#ifndef TENSORFLOW_TSL_PLATFORM_CPU_INFO_H_
17#define TENSORFLOW_TSL_PLATFORM_CPU_INFO_H_
18
19#include <string>
20
21// TODO(ahentz): This is not strictly required here but, for historical
22// reasons, many people depend on cpu_info.h in order to use kLittleEndian.
23#include "tensorflow/tsl/platform/byte_order.h"
24
25#if defined(_MSC_VER)
26// included so __cpuidex function is available for GETCPUID on Windows
27#include <intrin.h>
28#endif
29
namespace tsl {
namespace port {

// Returns an estimate of the number of schedulable CPUs for this process.
// The value is usually constant throughout the lifetime of a process, but it
// might change if the underlying cluster management software adjusts it
// dynamically. If the underlying call fails, a default value (e.g. `4`) may be
// returned, so callers cannot distinguish failure from a genuine result.
int NumSchedulableCPUs();

// Returns an estimate for the maximum parallelism for this process.
// Applications should avoid running more than this number of threads with
// intensive workloads concurrently to avoid performance degradation and
// contention.
// This value is either the number of schedulable CPUs, or a value specific to
// the underlying cluster management. Applications should assume this value can
// change throughout the lifetime of the process. This function must not be
// called during initialization, i.e., before main() has started.
int MaxParallelism();

// Returns an estimate for the maximum parallelism for this process on the
// provided numa node, or on any numa node if `numa_node` is kNUMANoAffinity
// (that constant is not declared in this header).
// See MaxParallelism() for more information, including the restriction on
// calling it before main() has started.
int MaxParallelism(int numa_node);

// Sentinel value (-1) denoting an unknown CPU. It is the invalid value that
// NumTotalCPUs() is documented to return on failure.
static constexpr int kUnknownCPU = -1;

// Returns the total number of CPUs on the system. This number should not
// change even if the underlying cluster management software changes the
// number of schedulable CPUs. Unlike `NumSchedulableCPUs`, if the underlying
// call fails, the invalid value -1 (`kUnknownCPU`) is returned; the caller
// must check the result for validity before using it.
int NumTotalCPUs();

// Returns the id of the current CPU. Returns -1 (the same value as
// `kUnknownCPU`) if the current CPU cannot be identified. If successful, the
// return value will be in [0, NumTotalCPUs()).
int GetCurrentCPU();

// Returns an estimate of the number of hyperthreads per physical core on the
// CPU.
int NumHyperthreadsPerCore();

// Mostly ISA related features that we care about.
enum CPUFeature {
  // Do not change numeric assignments.
  MMX = 0,
  SSE = 1,
  SSE2 = 2,
  SSE3 = 3,
  SSSE3 = 4,
  SSE4_1 = 5,
  SSE4_2 = 6,
  CMOV = 7,
  CMPXCHG8B = 8,
  CMPXCHG16B = 9,
  POPCNT = 10,
  AES = 11,
  AVX = 12,
  RDRAND = 13,
  AVX2 = 14,
  FMA = 15,
  F16C = 16,
  PCLMULQDQ = 17,
  RDSEED = 18,
  ADX = 19,
  SMAP = 20,

  // Prefetch Vector Data Into Caches with Intent to Write and T1 Hint
  // http://www.felixcloutier.com/x86/PREFETCHWT1.html.
  // You probably want PREFETCHW instead.
  PREFETCHWT1 = 21,

  BMI1 = 22,
  BMI2 = 23,
  // NOTE(review): value 24 is not assigned to any feature in this enum. The
  // reason is not visible here; because numeric assignments must not change,
  // leave the gap as it is.
  HYPERVISOR = 25,  // 0 when on a real CPU, 1 on (well-behaved) hypervisor.

  // Prefetch Data into Caches in Anticipation of a Write (3D Now!).
  // http://www.felixcloutier.com/x86/PREFETCHW.html
  PREFETCHW = 26,

  // AVX-512: 512-bit vectors (plus masking, etc.) in Knights Landing,
  // Skylake, Xeon, etc. Each of these entries is a different subset of
  // instructions, various combinations of which occur on various CPU types.
  AVX512F = 27,        // Foundation
  AVX512CD = 28,       // Conflict detection
  AVX512ER = 29,       // Exponential and reciprocal
  AVX512PF = 30,       // Prefetching
  AVX512VL = 31,       // Shorter vector lengths
  AVX512BW = 32,       // Byte and word
  AVX512DQ = 33,       // Dword and qword
  AVX512VBMI = 34,     // Vector byte manipulation
  AVX512IFMA = 35,     // Integer multiply-add
  AVX512_4VNNIW = 36,  // Integer neural network (Intel Xeon Phi only)
  AVX512_4FMAPS = 37,  // Floating point neural network (Intel Xeon Phi only)
  AVX512_VNNI = 38,    // Integer neural network
  AVX512_BF16 = 39,    // Bfloat16 neural network

  // AVX version of AVX512_VNNI in CPUs such as Alder Lake and Sapphire Rapids.
  AVX_VNNI = 40,  // Integer neural network

  // AMX: Advanced Matrix Extension in Sapphire Rapids.
  // Perform matrix multiplication on the Tile Matrix Multiply (TMUL) unit,
  // supporting two popular data types in neural networks, int8 and bfloat16.
  AMX_TILE = 41,  // Tile configuration and load/store
  AMX_INT8 = 42,  // Int8 tile matrix multiplication
  AMX_BF16 = 43,  // Bfloat16 tile matrix multiplication
};

// Checks whether the current processor supports `feature`, which is one of
// the CPUFeature values above. Checks CPU registers to return hardware
// capabilities.
bool TestCPUFeature(CPUFeature feature);

// Returns the CPU vendor string (e.g. 'GenuineIntel', 'AuthenticAMD', etc.).
std::string CPUVendorIDString();

// Returns the CPU family.
int CPUFamily();

// Returns the CPU model number.
int CPUModelNum();

// Returns the nominal core processor cycles per second of each processor.
double NominalCPUFrequency();

// Returns the number of hyperthreads per physical core.
// NOTE(review): how this differs from NumHyperthreadsPerCore() is not visible
// from this header; presumably this variant is derived from CPUID -- confirm
// against the implementation.
int CPUIDNumSMT();

}  // namespace port
}  // namespace tsl
158
159#endif // TENSORFLOW_TSL_PLATFORM_CPU_INFO_H_
160