1 | /* Copyright 2019 Google LLC. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | |
16 | #include "ruy/prepacked_cache.h" |
17 | |
18 | #include "ruy/mat.h" |
19 | #include "ruy/profiler/instrumentation.h" |
20 | #include "ruy/system_aligned_alloc.h" |
21 | |
22 | namespace ruy { |
23 | |
24 | namespace { |
25 | |
26 | // Allocates the `data` and `sums` buffers, and sets the corresponding |
27 | // pointer fields, in a PEMat whose other fields, particularly `layout` |
28 | // and the runtime data types, are already populated. |
29 | std::ptrdiff_t AllocateBuffers(PEMat* packed_matrix) { |
30 | const std::ptrdiff_t data_bytes = DataBytes(*packed_matrix); |
31 | packed_matrix->data = detail::SystemAlignedAlloc(data_bytes); |
32 | std::ptrdiff_t sums_bytes = 0; |
33 | if (!packed_matrix->sums_type.is_floating_point) { |
34 | // Integer quantized matrices also need the `sums` buffer. |
35 | sums_bytes = SumsBytes(*packed_matrix); |
36 | packed_matrix->sums = detail::SystemAlignedAlloc(sums_bytes); |
37 | } |
38 | return data_bytes + sums_bytes; |
39 | } |
40 | |
41 | // Frees the `data` and `sums` buffers held by a PEMat. |
42 | void FreeBuffers(const PEMat& packed_matrix) { |
43 | detail::SystemAlignedFree(packed_matrix.data); |
44 | detail::SystemAlignedFree(packed_matrix.sums); |
45 | } |
46 | |
47 | } // end anonymous namespace |
48 | |
49 | std::size_t PrepackedCache::KeyHash::operator()( |
50 | const PrepackedCache::Key& key) const { |
51 | std::size_t src_data_hash = reinterpret_cast<std::size_t>(key.src_data); |
52 | // Naive hash of the layout. Based on some heuristic reasoning, not any |
53 | // benchmarking. |
54 | // A choice of hash function here is just an optimization matter |
55 | // anyway, since a hash collision only results in some Key::operator== calls |
56 | // to disambiguate, and even just returning src_data_hash, ignoring the layout |
57 | // altogether, would probably be good enough, as the case of multiple entries |
58 | // with the same data pointer will be uncommon. |
59 | // Here we multiply-add the layout fields using some small constant prime |
60 | // numbers as multipliers. The conventional approach of xor-ing bit-rotations |
61 | // would result in some hash collisions because these values are typically |
62 | // small positive integers, so bit-rotations are essentially bit-shifts, |
63 | // and powers of two are common. |
64 | std::size_t packed_layout_hash = |
65 | static_cast<int>(key.packed_layout.order) + |
66 | static_cast<int>(key.packed_layout.kernel.order) * 2 + |
67 | key.packed_layout.stride * 3 + key.packed_layout.kernel.rows * 5 + |
68 | key.packed_layout.kernel.cols * 7 + key.packed_layout.rows * 11 + |
69 | key.packed_layout.cols * 13; |
70 | return src_data_hash ^ packed_layout_hash; |
71 | } |
72 | |
73 | PrepackedCache::~PrepackedCache() { |
74 | for (const auto& pair : cache_) { |
75 | FreeBuffers(pair.second.packed_matrix); |
76 | } |
77 | } |
78 | |
79 | PrepackedCache::Action PrepackedCache::Get(const void* src_data, |
80 | PEMat* packed_matrix) { |
81 | // Construct a Key and look up the cache. |
82 | Key key; |
83 | key.src_data = src_data; |
84 | key.packed_layout = packed_matrix->layout; |
85 | key.zero_point = packed_matrix->zero_point; |
86 | const auto& itr = cache_.find(key); |
87 | |
88 | if (itr != cache_.end()) { |
89 | // Found existing entry. Update its timestamp and return it. |
90 | itr->second.timestamp = timestamp_++; |
91 | *packed_matrix = itr->second.packed_matrix; |
92 | return Action::kGotExistingEntry; |
93 | } |
94 | |
95 | // No existing entry found. Allocate new buffers now and insert in the cache. |
96 | const std::ptrdiff_t new_bytes = AllocateBuffers(packed_matrix); |
97 | EjectUntilRoomFor(new_bytes); |
98 | Entry entry{*packed_matrix, timestamp_++}; |
99 | cache_.emplace(key, entry); |
100 | buffers_bytes_ += new_bytes; |
101 | return Action::kInsertedNewEntry; |
102 | } |
103 | |
104 | void PrepackedCache::EjectUntilRoomFor(std::ptrdiff_t new_bytes) { |
105 | profiler::ScopeLabel label("PrepackedCacheEjection" ); |
106 | // While we are above the threshold of ejection, eject the LRU entry. |
107 | while (!cache_.empty() && buffers_bytes_ + new_bytes > max_buffers_bytes_) { |
108 | EjectOne(); |
109 | } |
110 | } |
111 | |
112 | void PrepackedCache::EjectOne() { |
113 | auto oldest = cache_.begin(); |
114 | Timestamp oldest_timestamp = oldest->second.timestamp; |
115 | { |
116 | for (auto itr = cache_.begin(); itr != cache_.end(); ++itr) { |
117 | if (itr->second.timestamp < oldest_timestamp) { |
118 | oldest = itr; |
119 | oldest_timestamp = itr->second.timestamp; |
120 | } |
121 | } |
122 | } |
123 | const PEMat& packed_matrix = oldest->second.packed_matrix; |
124 | buffers_bytes_ -= DataBytes(packed_matrix) + SumsBytes(packed_matrix); |
125 | FreeBuffers(packed_matrix); |
126 | cache_.erase(oldest); |
127 | } |
128 | |
129 | } // namespace ruy |
130 | |