1 | /* Copyright 2019 Google LLC. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | |
16 | // Library doing minimal CPU detection to decide what to tune asm code for. |
17 | // |
18 | // # Tuning vs Path |
19 | // |
20 | // Tunings are merely local variations of optimized code paths, that are |
21 | // drop-in replacements for each other --- the input and output data layouts |
22 | // are identical. By contrast, what ruy calls a Path dictates its own |
23 | // data layouts. For example, Path::kNeonDotprod will use different |
24 | // layouts compared to Path::kNeon; but within each, different tunings |
25 | // will share that same layout. |
26 | // |
27 | // # Tuning is for now only based on 1 bit: Generic / A55ish |
28 | // |
// In practice, each of our asm code paths only needs one bit of information to
// decide on tuning: whether the CPU is out-of-order or in-order.
31 | // That is because out-of-order CPUs are by definition relatively insensitive |
32 | // to small-scale asm details (which is what "tuning" is about); and for each |
33 | // asm code path, there tends to be one main in-order CPU architecture that |
34 | // we focus our tuning effort on. Examples: |
35 | // * For Path::kNeon, the main in-order CPU is Cortex-A53/A55 (pre-dotprod) |
36 | // * For Path::kNeonDotprod, the main in-order CPU is Cortex-A55r1 (dotprod) |
37 | // |
38 | // Because having tuned code paths is a compromise of efficiency gains |
39 | // versus implementation effort and code size, we are happy to stop at just this |
40 | // single bit of information, Generic / A55ish, at least in the current CPU |
41 | // landscape. This could change in the future. |
42 | #ifndef RUY_RUY_TUNE_H_ |
43 | #define RUY_RUY_TUNE_H_ |
44 | |
45 | #include "ruy/cpuinfo.h" |
46 | #include "ruy/opt_set.h" |
47 | #include "ruy/platform.h" |
48 | #include "ruy/time.h" |
49 | |
50 | namespace ruy { |
51 | |
// Identifies which CPU-specific variant of an optimized code path to use.
// Tunings are interchangeable variations of the same code path: they share
// identical input and output data layouts.
enum class Tuning {
  // Request auto-detection. This is the default in the user-visible parts
  // (see Context), and is expected to be turned into one of the concrete
  // values below by TuningResolver at some point.
  kAuto,

  // Code that is not tuned for any particular CPU. It typically performs well
  // on out-of-order cores, which do not require as much tuning.
  kGeneric,

  // Code tuned for "Cortex-A55-ish" CPUs, meaning mostly: A53, A55r0
  // (pre-dotprod) and A55r1 (with dotprod). What these have in common is that
  // they are in-order cores with largely similar code tuning requirements,
  // the most important one being to use only 64-bit loads so as to maximize
  // dual-issuing.
  //
  // A55r1 dual-issues 64-bit NEON loads, whereas A55r0 and A53 need non-NEON
  // ARM 64-bit loads combined with INS instructions to insert 64-bit lanes
  // into NEON registers. Since A55r1 supports dotprod while A55r0 and A53 do
  // not, they end up running different kernels in practice anyway, so no
  // separate Tuning values were needed to tell them apart.
  kA55ish,

  // Code tuned for Cortex-X1 CPUs. Currently, the reason to distinguish this
  // CPU is to get maximum performance on the dotprod kernels, where high
  // performance is attained simply by avoiding any manual loop unrolling.
  // As this is a purely performance-oriented microarchitecture, there will
  // likely be additional reasons to distinguish the X1 from other CPUs.
  kX1,
};
80 | |
81 | // Why a TuningResolver class? |
82 | // |
// Ideally, this library would offer a single function,
//   Tuning GetCurrentCPUTuning();
85 | // |
86 | // However, determining information about the current CPU is not necessarily |
87 | // cheap, so we currently cache that and only invalidate/reevaluate after |
88 | // a fixed amount of time. This need to store state is why this library |
89 | // has to expose a class, TuningResolver, not just a function. |
90 | class TuningResolver { |
91 | public: |
92 | TuningResolver(); |
93 | |
94 | // Allows the user to specify an explicit Tuning value, bypassing auto |
95 | // detection; or to specify Tuning::kAuto, reverting to auto detection. |
96 | void SetTuning(Tuning tuning) { unresolved_tuning_ = tuning; } |
97 | |
98 | // Get an actual tuning --- that is the function that this class wanted to be. |
99 | Tuning Resolve(CpuInfo* cpuinfo); |
100 | |
101 | private: |
102 | TuningResolver(const TuningResolver&) = delete; |
103 | |
104 | // Perform the tuning resolution now. That may typically use EvalRatio and |
105 | // ThresholdRatio, but an implementation may use a different approach instead. |
106 | Tuning ResolveNow(CpuInfo* cpuinfo); |
107 | |
108 | // The tuning as specified by the user, before actual resolution happens |
109 | // i.e. before querying any specifics of the current CPU. |
110 | // The default value kAuto means try to auto-detect. Other values mean |
111 | // bypass auto-detect, use explicit value instead. See SetTuning(). |
112 | Tuning unresolved_tuning_ = Tuning::kAuto; |
113 | // Cached last resolved tuning. |
114 | Tuning last_resolved_tuning_ = Tuning::kAuto; |
115 | // Timepoint of cached last resolved tuning, for invalidation purposes. |
116 | TimePoint last_resolved_timepoint_; |
117 | // Cached last resolved tunings that are older than this age are invalid. |
118 | const Duration expiry_duration_; |
119 | }; |
120 | |
121 | } // namespace ruy |
122 | |
123 | #endif // RUY_RUY_TUNE_H_ |
124 | |