/* Copyright 2019 Google LLC. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "ruy/ctx.h"

#include <cstdlib>
#include <functional>
#include <string>

#include "ruy/check_macros.h"
#include "ruy/cpuinfo.h"
#include "ruy/ctx_impl.h"
#include "ruy/have_built_path_for.h"
#include "ruy/path.h"
#include "ruy/performance_advisory.h"
#include "ruy/platform.h"
#include "ruy/prepacked_cache.h"
#include "ruy/trace.h"

namespace ruy {

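// Ctx stores its state in the derived class CtxImpl (see ctx_impl.h). These
// accessors downcast to reach that state; the static_casts are presumed safe
// because CtxImpl is the concrete type that ruy instantiates.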
const CtxImpl& Ctx::impl() const { return static_cast<const CtxImpl&>(*this); }
CtxImpl* Ctx::mutable_impl() { return static_cast<CtxImpl*>(this); }

Path Ctx::last_used_path() const { return impl().last_used_path_; }
Tuning Ctx::explicit_tuning() const { return impl().explicit_tuning_; }
void Ctx::set_explicit_tuning(Tuning value) {
  mutable_impl()->explicit_tuning_ = value;
}
const ThreadPool& Ctx::thread_pool() const { return impl().thread_pool_; }
ThreadPool* Ctx::mutable_thread_pool() { return &mutable_impl()->thread_pool_; }
int Ctx::max_num_threads() const { return impl().max_num_threads_; }
void Ctx::set_max_num_threads(int value) {
  mutable_impl()->max_num_threads_ = value;
}
void Ctx::clear_performance_advisories() {
  mutable_impl()->performance_advisory_ = PerformanceAdvisory::kNone;
}
void Ctx::set_performance_advisory(PerformanceAdvisory advisory) {
  mutable_impl()->performance_advisory_ =
      mutable_impl()->performance_advisory_ | advisory;
}
bool Ctx::performance_advisory(PerformanceAdvisory advisory) const {
  return (impl().performance_advisory_ & advisory) !=
         PerformanceAdvisory::kNone;
}

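// Overrides runtime path detection. Passing Path::kNone reverts to the
// default runtime-detection behavior; any other value is taken as an
// explicit bitfield of paths to enable, to which the non-arch paths are
// always added so that fallbacks remain available.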
void Ctx::SetRuntimeEnabledPaths(Path paths) {
  if (paths == Path::kNone) {
    // Revert to the default behavior of using runtime detection.
    mutable_impl()->runtime_enabled_paths_ = Path::kNone;
  } else {
    // Explicitly set the enabled paths. Ensure that the non-arch paths are
    // always enabled, as they are needed for fallbacks.
    mutable_impl()->runtime_enabled_paths_ = paths | kNonArchPaths;
  }
}

CpuInfo* Ctx::mutable_cpuinfo() { return &mutable_impl()->cpuinfo_; }

namespace {

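// Returns the value of the environment variable `name` parsed as a
// hexadecimal integer, or 0 if the variable is unset. Note that std::stoi
// throws std::invalid_argument on malformed input rather than returning 0.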
int GetHexIntEnvVarOrZero(const char* name) {
  const char* val = getenv(name);
  if (!val) {
    return 0;
  }
  return std::stoi(val, nullptr, 16);
}

// For each Path bit set in `paths_to_detect`, performs runtime detection and
// sets the corresponding bit in the return value if and only if it is
// supported. Path bits that are not set in `paths_to_detect` are left unset
// in the return value, except for the always-enabled non-arch paths.
Path DetectRuntimeSupportedPaths(Path paths_to_detect, CpuInfo* cpuinfo) {
  // Paths in kNonArchPathsIncludingInternalVariants are always implicitly
  // supported. Further logic below may add more bits to `result`.
  Path result = kNonArchPathsIncludingInternalVariants;

  // Conditionally sets the `path` bit in `result` if the `is_supported`
  // predicate reports it as supported.
  auto maybe_add = [&](Path path, std::function<bool(void)> is_supported) {
    if ((paths_to_detect & path) != Path::kNone) {
      if (is_supported()) {
        result = result | path;
      }
    }
  };

#if RUY_PLATFORM_ARM
  // NEON is unconditionally available on ARM64.
  // On ARM32 it's technically possible for NEON to be unavailable, but we've
  // always chosen to just crash on such devices. We could reevaluate that;
  // however, to actually support non-NEON devices, we would also have to
  // address compiler-generated NEON code. That would mean removing
  // -mfpu=neon from ruy_copts, using that flag only in select NEON
  // translation units, and implementing have_built_path_for_neon, similar to
  // the x86 SIMD paths.
  maybe_add(Path::kNeon, []() { return true; });

  // The NEON dotprod path requires runtime detection; however, unlike the
  // x86 SIMD paths, it does not also require a have_built_path_for check,
  // because we unconditionally build it at the moment. That is largely
  // because we have had to machine-encode the dotprod instructions, so we
  // don't actually rely on toolchain support for them.
  maybe_add(Path::kNeonDotprod, [=]() { return cpuinfo->NeonDotprod(); });
#elif RUY_PLATFORM_X86
  // The x86 SIMD paths currently require both runtime detection and a check,
  // via HaveBuiltPathFor*, that the path was built at all.
  maybe_add(Path::kAvx,
            [=]() { return HaveBuiltPathForAvx() && cpuinfo->Avx(); });
  maybe_add(Path::kAvx2Fma,
            [=]() { return HaveBuiltPathForAvx2Fma() && cpuinfo->Avx2Fma(); });
  maybe_add(Path::kAvx512,
            [=]() { return HaveBuiltPathForAvx512() && cpuinfo->Avx512(); });
#else
  (void)maybe_add;
  (void)cpuinfo;
#endif

  // Sanity checks.
  RUY_DCHECK_EQ(kNonArchPaths & ~result, Path::kNone);
  RUY_DCHECK_EQ(
      result & ~(kNonArchPathsIncludingInternalVariants | paths_to_detect),
      Path::kNone);
  return result;
}

}  // namespace

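// Resolves the set of runtime-enabled paths, in decreasing order of
// precedence: a value already set (by SetRuntimeEnabledPaths or by a prior
// call to this function), then the RUY_PATHS environment variable (parsed as
// a hexadecimal bitfield of Path values), then runtime CPU detection. The
// result is cached in runtime_enabled_paths_.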
Path Ctx::GetRuntimeEnabledPaths() {
  RUY_TRACE_SCOPE;
  // Just a shorthand alias. Using a pointer to make it clear we're mutating
  // this value in-place.
  Path* paths = &mutable_impl()->runtime_enabled_paths_;

  // The value Path::kNone indicates the initial state before detection has
  // been performed.
  if (*paths != Path::kNone) {
    RUY_TRACE_INFO(GET_RUNTIME_ENABLED_PATHS_USING_SET_VALUE);
    return *paths;
  }
  // The user may have set paths explicitly via the RUY_PATHS environment
  // variable.
  Path paths_bitfield = static_cast<Path>(GetHexIntEnvVarOrZero("RUY_PATHS"));
  if (paths_bitfield != Path::kNone) {
    *paths = paths_bitfield;
    RUY_TRACE_INFO(GET_RUNTIME_ENABLED_PATHS_USING_ENV_VAR);
    return *paths;
  }
  // Finally, fall back to runtime detection.
  *paths = DetectRuntimeSupportedPaths(kAllPaths, mutable_cpuinfo());
  RUY_TRACE_INFO(GET_RUNTIME_ENABLED_PATHS_USING_DETECTION);
  return *paths;
}

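// Selects the path to use for the current operation: the most significant
// bit of the intersection of the caller's compiled-in paths and the
// runtime-enabled paths. The selection is recorded in last_used_path_ as a
// side effect.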
Path Ctx::SelectPath(Path compiled_paths) {
  return mutable_impl()->last_used_path_ =
             GetMostSignificantPath(compiled_paths & GetRuntimeEnabledPaths());
}

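// Grows thread_specific_resources_ to hold at least `thread_count` entries.
// Entries are heap-allocated and never destroyed here, so pointers handed
// out by the per-thread accessors below remain valid as the vector grows.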
void Ctx::EnsureThreadSpecificResources(int thread_count) {
  auto& resources = mutable_impl()->thread_specific_resources_;
  while (thread_count > static_cast<int>(resources.size())) {
    resources.emplace_back(new ThreadSpecificResource);
  }
  RUY_DCHECK_LE(thread_count, static_cast<int>(resources.size()));
}

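// Per-thread accessors. Callers must first have sized
// thread_specific_resources_ via EnsureThreadSpecificResources with a
// thread_count greater than thread_index.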
TuningResolver* Ctx::GetThreadSpecificTuningResolver(int thread_index) const {
  const auto& resources = impl().thread_specific_resources_;
  RUY_DCHECK_LT(thread_index, static_cast<int>(resources.size()));
  return &resources[thread_index]->tuning_resolver;
}

Allocator* Ctx::GetThreadSpecificAllocator(int thread_index) const {
  const auto& resources = impl().thread_specific_resources_;
  RUY_DCHECK_LT(thread_index, static_cast<int>(resources.size()));
  return &resources[thread_index]->allocator;
}

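// Lazily creates the main allocator on first use.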
Allocator* Ctx::GetMainAllocator() {
  if (!impl().main_allocator_) {
    mutable_impl()->main_allocator_.reset(new Allocator);
  }
  return impl().main_allocator_.get();
}

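// Lazily creates the cache of prepacked matrices on first use. See
// ClearPrepackedCache below for releasing it.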
PrepackedCache* Ctx::GetPrepackedCache() {
  if (!impl().prepacked_cache_) {
    mutable_impl()->prepacked_cache_.reset(new PrepackedCache);
  }
  return impl().prepacked_cache_.get();
}

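// Resolves the tuning to be used on the main thread (thread index 0),
// honoring any explicit_tuning() override.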
Tuning Ctx::GetMainThreadTuning() {
  EnsureThreadSpecificResources(1);
  TuningResolver* tuning_resolver = GetThreadSpecificTuningResolver(0);
  tuning_resolver->SetTuning(explicit_tuning());
  return tuning_resolver->Resolve(mutable_cpuinfo());
}

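// Destroys the prepacked cache, releasing the memory of any cached prepacked
// matrices. A subsequent GetPrepackedCache call will recreate it.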
void Ctx::ClearPrepackedCache() { mutable_impl()->prepacked_cache_ = nullptr; }

}  // namespace ruy