1/* Copyright 2019 Google LLC. All Rights Reserved.
2
3Licensed under the Apache License, Version 2.0 (the "License");
4you may not use this file except in compliance with the License.
5You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9Unless required by applicable law or agreed to in writing, software
10distributed under the License is distributed on an "AS IS" BASIS,
11WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12See the License for the specific language governing permissions and
13limitations under the License.
14==============================================================================*/
15
16// Library doing minimal CPU detection to decide what to tune asm code for.
17//
18// # Tuning vs Path
19//
20// Tunings are merely local variations of optimized code paths, that are
21// drop-in replacements for each other --- the input and output data layouts
22// are identical. By contrast, what ruy calls a Path dictates its own
23// data layouts. For example, Path::kNeonDotprod will use different
24// layouts compared to Path::kNeon; but within each, different tunings
25// will share that same layout.
26//
27// # Tuning is for now only based on 1 bit: Generic / A55ish
28//
// In practice, each of our asm code paths only needs one bit of information
// to decide on tuning: whether the CPU is out-of-order or in-order.
31// That is because out-of-order CPUs are by definition relatively insensitive
32// to small-scale asm details (which is what "tuning" is about); and for each
33// asm code path, there tends to be one main in-order CPU architecture that
34// we focus our tuning effort on. Examples:
35// * For Path::kNeon, the main in-order CPU is Cortex-A53/A55 (pre-dotprod)
36// * For Path::kNeonDotprod, the main in-order CPU is Cortex-A55r1 (dotprod)
37//
// Because having tuned code paths is a compromise of efficiency gains
// versus implementation effort and code size, we are happy to stop at just this
// single bit of information, Generic / A55ish, at least in the current CPU
// landscape. This could change in the future. Note that the Tuning enum below
// additionally defines kX1, which goes beyond this Generic / A55ish
// distinction.
42#ifndef RUY_RUY_TUNE_H_
43#define RUY_RUY_TUNE_H_
44
45#include "ruy/cpuinfo.h"
46#include "ruy/opt_set.h"
47#include "ruy/platform.h"
48#include "ruy/time.h"
49
50namespace ruy {
51
// Selects which variant of an optimized code path to run. All values share
// the same data layouts; they only differ in small-scale asm details.
enum class Tuning {
  // kAuto means please use auto-detection. It's the default in the
  // user-visible parts (see Context). It's meant to be resolved to an
  // actual tuning at some point by means of TuningResolver.
  kAuto,
  // Use code not tuned for any particular CPU, typically performing well
  // on out-of-order cores that don't require as much tuning.
  kGeneric,
  // Use code tuned for "Cortex-A55-ish" CPUs, by which we mean mostly:
  // A53, A55r0 (pre-dotprod), A55r1 (with dotprod). These CPUs have in common
  // that they are in-order CPU cores with largely similar requirements of code
  // tuning. The most important such requirement is to use only 64-bit loads
  // to maximize dual-issuing.
  //
  // A55r1 differs from A55r0 and A53 in that it dual-issues 64-bit NEON loads
  // whereas A55r0 and A53 require using non-NEON ARM 64-bit loads together with
  // INS instructions to insert 64-bit lanes into NEON registers. However, since
  // A55r1 supports dotprod unlike A55r0 and A53, they are not using the same
  // kernels in practice anyway, so there was no need to distinguish them with
  // separate Tuning values.
  kA55ish,
  // Use code tuned for Cortex-X1 CPUs. Currently, the driver to distinguish
  // this CPU is to get maximum performance on the dotprod kernels, where we
  // attain high performance simply by avoiding any manual loop unrolling. As a
  // purely performance oriented microarchitecture, there will likely be
  // additional reasons to distinguish the X1 from other CPUs.
  kX1
};
80
81// Why a TuningResolver class?
82//
// Ideally, this library would offer a single function,
84// Tuning GetCurrentCPUTuning();
85//
86// However, determining information about the current CPU is not necessarily
87// cheap, so we currently cache that and only invalidate/reevaluate after
88// a fixed amount of time. This need to store state is why this library
89// has to expose a class, TuningResolver, not just a function.
90class TuningResolver {
91 public:
92 TuningResolver();
93
94 // Allows the user to specify an explicit Tuning value, bypassing auto
95 // detection; or to specify Tuning::kAuto, reverting to auto detection.
96 void SetTuning(Tuning tuning) { unresolved_tuning_ = tuning; }
97
98 // Get an actual tuning --- that is the function that this class wanted to be.
99 Tuning Resolve(CpuInfo* cpuinfo);
100
101 private:
102 TuningResolver(const TuningResolver&) = delete;
103
104 // Perform the tuning resolution now. That may typically use EvalRatio and
105 // ThresholdRatio, but an implementation may use a different approach instead.
106 Tuning ResolveNow(CpuInfo* cpuinfo);
107
108 // The tuning as specified by the user, before actual resolution happens
109 // i.e. before querying any specifics of the current CPU.
110 // The default value kAuto means try to auto-detect. Other values mean
111 // bypass auto-detect, use explicit value instead. See SetTuning().
112 Tuning unresolved_tuning_ = Tuning::kAuto;
113 // Cached last resolved tuning.
114 Tuning last_resolved_tuning_ = Tuning::kAuto;
115 // Timepoint of cached last resolved tuning, for invalidation purposes.
116 TimePoint last_resolved_timepoint_;
117 // Cached last resolved tunings that are older than this age are invalid.
118 const Duration expiry_duration_;
119};
120
121} // namespace ruy
122
123#endif // RUY_RUY_TUNE_H_
124