/* Copyright 2019 Google LLC. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "ruy/allocator.h"

#include "ruy/opt_set.h"
#include "ruy/size_util.h"
#include "ruy/system_aligned_alloc.h"

namespace ruy {

Allocator::~Allocator() {
  FreeAll();
  detail::SystemAlignedFree(ptr_);
}

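// Fast path: bump-pointer allocation out of the main buffer. Returns nullptr
// when the buffer cannot accommodate the request, in which case the caller
// falls back to AllocateSlow().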
void* Allocator::AllocateFast(std::ptrdiff_t num_bytes) {
  if (current_ + num_bytes > size_) {
    return nullptr;
  }
  void* ret = static_cast<char*>(ptr_) + current_;
  current_ += num_bytes;
  return ret;
}

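// Slow path: allocate a dedicated system block. Such fallback blocks are
// released by the next FreeAll() call, which also folds their total size into
// the main buffer so that subsequent identical runs take the fast path.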
void* Allocator::AllocateSlow(std::ptrdiff_t num_bytes) {
  void* p = detail::SystemAlignedAlloc(num_bytes);
  fallback_blocks_total_size_ += num_bytes;
  fallback_blocks_.push_back(p);
  return p;
}

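// Rounds the request up to the minimum block alignment (a power of two), then
// tries the fast path before falling back to a dedicated system allocation.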
void* Allocator::AllocateBytes(std::ptrdiff_t num_bytes) {
  if (num_bytes == 0) {
    return nullptr;
  }
  const std::ptrdiff_t rounded_num_bytes =
      round_up_pot(num_bytes, detail::kMinimumBlockAlignment);
  if (void* p = AllocateFast(rounded_num_bytes)) {
    return p;
  }
  return AllocateSlow(rounded_num_bytes);
}

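// Illustrative usage sketch (hypothetical caller, not part of this file):
//
//   ruy::Allocator allocator;
//   for (int run = 0; run < kNumRuns; ++run) {
//     void* lhs_packed = allocator.AllocateBytes(lhs_packed_size);
//     void* rhs_packed = allocator.AllocateBytes(rhs_packed_size);
//     // ... pack and multiply using these buffers ...
//     allocator.FreeAll();  // Invalidates all pointers from AllocateBytes.
//   }
//
// Once an early iteration has grown the main buffer to cover all requests,
// subsequent iterations are served entirely by AllocateFast().
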
void* Allocator::AllocateBytesAvoidingAliasingWith(std::ptrdiff_t num_bytes,
                                                   const void* to_avoid) {
#if RUY_OPT(AVOID_ALIASING)
  if (num_bytes == 0) {
    return nullptr;
  }
  // The minimum L1D cache aliasing periodicity in bytes that we expect to
  // encounter on any device. This does not seem to be documented, but
  // empirically we observe the following:
  //   Cortex-A53:   1024
  //   Cortex-A55r1: 2048
  //   Cortex-A76:   not as easily observable.
  // Over-estimating this makes the AVOID_ALIASING optimization useless on
  // devices with lower periodicity.
  // Under-estimating it by 2x should be harmless.
  // Under-estimating it by a larger factor should gradually degrade
  // performance, as cache aliasing causes mutual eviction between the packed
  // matrix data and the source matrix data being prefetched by the CPU ahead
  // of the packing code's execution.
  static constexpr std::uint32_t kMinPeriod = 1024;
  static_assert(is_pot(kMinPeriod), "");
  void* p = AllocateBytes(num_bytes + kMinPeriod);
  auto unsigned_low_bits = [](const void* p) {
    return static_cast<std::uint32_t>(reinterpret_cast<std::uintptr_t>(p));
  };
  // This relies on unsigned integer overflow wrapping around.
  std::uint32_t diff_modulus =
      (unsigned_low_bits(p) - unsigned_low_bits(to_avoid)) % kMinPeriod;
  // diff_modulus is in [0, kMinPeriod). We want it as close as possible to
  // the middle of that interval, kMinPeriod/2. The bad 'aliasing' case that
  // we are working to avoid is when diff_modulus is close to either end of
  // the interval, 0 or kMinPeriod. So we add an offset of kMinPeriod/2
  // whenever diff_modulus falls in the first or the last quarter of the
  // interval.
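  // Worked example with hypothetical addresses, assuming kMinPeriod == 1024:
  // if the low bits of p are 0x4180 and those of to_avoid are 0x4100, then
  // diff_modulus = (0x4180 - 0x4100) % 1024 = 128, which falls in the first
  // quarter [0, 256), so we return p + 512. The returned pointer then differs
  // from to_avoid by 640 modulo 1024, comfortably away from the aliasing
  // distances 0 and 1024.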
  bool need_offset =
      diff_modulus < kMinPeriod / 4 || diff_modulus > 3 * kMinPeriod / 4;
  return static_cast<char*>(p) + (need_offset ? (kMinPeriod / 2) : 0);
#else
  (void)to_avoid;
  return AllocateBytes(num_bytes);
#endif
}

void Allocator::FreeAll() {
  current_ = 0;
  if (fallback_blocks_.empty()) {
    return;
  }

  // Not rounding up the new size means a linear instead of a logarithmic
  // bound on the number of allocations in some worst-case calling patterns.
  // This is considered worth it because minimizing memory usage is important,
  // and the actual calling patterns in applications that we care about still
  // reach the no-further-allocations steady state in a small finite number
  // of iterations.
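  // For example, with hypothetical sizes: if the main buffer holds 4096 bytes
  // and a run spilled into fallback blocks of 512 and 256 bytes, the buffer
  // is reallocated below at exactly 4096 + 512 + 256 = 4864 bytes, so an
  // identical subsequent run is served entirely by AllocateFast().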
  std::ptrdiff_t new_size = size_ + fallback_blocks_total_size_;
  detail::SystemAlignedFree(ptr_);
  ptr_ = detail::SystemAlignedAlloc(new_size);
  size_ = new_size;

  for (void* p : fallback_blocks_) {
    detail::SystemAlignedFree(p);
  }
  fallback_blocks_.clear();
  fallback_blocks_total_size_ = 0;
}

}  // namespace ruy