1 | /* Copyright 2020 The TensorFlow Authors. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | |
16 | #ifndef RUY_RUY_CPU_CACHE_PARAMS_H_ |
17 | #define RUY_RUY_CPU_CACHE_PARAMS_H_ |
18 | |
19 | namespace ruy { |
20 | |
21 | // Holds some information about a CPU's data caches. |
22 | // |
23 | // Meaning of 'local': a 'local' cache means a cache that is used by only one |
24 | // CPU core, not shared with other cores. It might still be used by multiple |
25 | // 'processors' in case of SMT as in Intel HyperThreading. CPUs often have |
26 | // multiple levels of local cache, e.g. L1 and L2. We typically return the |
27 | // larger one, the assumption being that even the larger one has substantially |
28 | // lower latency than any higher (non-local) cache, however as noted below (*) |
29 | // the implementation may choose to ignore a cache level. |
30 | // |
31 | // Meaning of 'last level': this refers to some higher cache level, typically |
32 | // shared among multiple CPU cores, so we considered using the terminology |
33 | // 'shared' instead of 'last_level'. However that created some confusion of its |
34 | // own, as the meaning of 'shared' varies between CPUs, with some CPUs not |
35 | // having any level of cache shared among all cores. That is why we stick with |
36 | // the 'last_level' terminology, however with the following caveats: |
37 | // 1. As noted below (*) the implementation may choose to ignore a cache |
38 | // level, which could cause the 'last level' cache according to ruy not to be |
39 | // the actual last level. |
40 | // 2. On some systems-on-chip there is a 'last level' cache outside of the |
41 | // last level cache in the CPU complex. Ruy is not currently doing anything |
42 | // specific regarding such caches. |
43 | // 3. We haven't figured out how to amend our terminology to be meaningful |
44 | // on NUMA architectures. NUMA hasn't been part of ruy's scope so far. |
45 | // |
46 | // (*) Note on ignoring certain cache levels: |
47 | // The implementation may choose to ignore a cache if it's suspected not to |
48 | // have compelling performance. This is true about all cache levels, but more |
49 | // likely regarding the 'last level' cache. For example, a L4 cache may be |
50 | // ignored if we believe that it's not the right latency/size compromise for us, |
51 | // so on such a CPU, the L3 cache may be used as the 'last level' cache instead. |
52 | // |
53 | // (**) Note on CPUs with heterogeneous cores: |
54 | // Some CPUs have multiple cores with different local caches. For example, some |
55 | // ARM big.LITTLE CPUs have some CPU cores with L1=32k and L2=128k, and some |
56 | // other CPU cores with L1=64k and L2=256k or even 512k. On such CPUs, the |
57 | // fields in this struct refer to the minimum value over all cores. In other |
58 | // words, we use conservative values that do not risk over-estimating local |
59 | // cache sizes in case of a migration of our threads to smaller cores. |
60 | // |
61 | // Example: |
62 | // On a Qualcomm S855 SoC, there are 8 CPU cores. Each core has L1 and L2 data |
63 | // caches local to it: |
64 | // - 4 cores have L1=32k, L2=128k. |
65 | // - 3 cores have L1=64k, L2=256k. |
66 | // - 1 core has L1=64k, L2=512k. |
67 | // All 8 cores share a L3 cache of size 2M, and there is beyond that a SoC-level |
68 | // cache of size 3M. |
69 | // On such a system, we should have: |
// - local_cache_size=128k, the smallest L2 size.
71 | // - last_level_cache_size=2M, the L3 cache size, ignoring the SoC-level cache. |
struct CpuCacheParams final {
  // Size in bytes of a core's local cache (see "Meaning of 'local'"). When
  // cores have different local caches, this is the smallest size found over
  // all cores (see (**)). Defaults to 0.
  int local_cache_size{0};
  // Size in bytes of the last level cache (see "Meaning of 'last level'").
  // As with the local cache, this is the smallest size found over all cores
  // (see (**)). Defaults to 0.
  int last_level_cache_size{0};
};
80 | |
81 | } // namespace ruy |
82 | |
83 | #endif // RUY_RUY_CPU_CACHE_PARAMS_H_ |
84 | |