1 | /* Copyright 2019 Google LLC. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | |
16 | #ifndef RUY_RUY_PREPACKED_CACHE_H_ |
17 | #define RUY_RUY_PREPACKED_CACHE_H_ |
18 | |
19 | #include <cstddef> |
20 | #include <cstdint> |
21 | #include <unordered_map> |
22 | |
23 | #include "ruy/mat.h" |
24 | |
25 | namespace ruy { |
26 | |
27 | // "Low effort" Least Recently Used Cache for Prepacked Matrices |
28 | // A cache mechanism for prepacked matrices that ejects oldest entries. |
29 | // The implementation is "low effort" in the following ways: |
30 | // - we just linearly search for the oldest entry when doing an ejection |
31 | // - the ejection policy is very simple: if the new size would be above the |
32 | // . threshold, we will eject entries until the size is below the threshold. |
33 | // Current use cases (RNNs with GEMV operations) indicate that ejection is rare |
34 | // and memory constraints are tight, so we devote no additional storage to the |
35 | // LRU mechanism and accept O(n) search to eject oldest entry. In practice, |
36 | // the number of total entries has not been shown to be large. |
37 | // |
38 | // An instance of PrepackedCache is always owned by a Context. Just like |
39 | // Context, no "thread safety" consideration is applicable to this class: |
40 | // no two threads may simulaneously be accessing the same instance. |
41 | class PrepackedCache final { |
42 | public: |
43 | enum class Action { kGotExistingEntry, kInsertedNewEntry }; |
44 | |
45 | static constexpr int kDefaultMaxBuffersBytes = 1 << 28; |
46 | |
47 | // A key in this key-value cache. Equality of keys implies interchangeable |
48 | // packed matrices, so we must be careful to make this Key type specific |
49 | // enough, and its equality comparison operator strict enough. |
50 | // |
51 | // These keys need to be used before the packed matrix buffers are allocated |
52 | // (since they are used to determine whether to allocate a new buffer). |
53 | // So they instead use the source matrix's data pointer. On the other hand, |
54 | // the packed matrix's layout structure is already available by the time we |
55 | // need to handle Keys, and that's fortunate because it is more specific |
56 | // than the source matrix's layout: it also encodes details about the kernel's |
57 | // small-scale block layout. In the past, we made the kernel function pointer |
58 | // part of the cache key, but all that is relevant here is the packed layout. |
59 | // |
60 | // The only other field that needs to be involved is the zero_point, for |
61 | // quantized matrices, although it seems far-fetched that the same matrix |
62 | // data would be reused with different zero_point values. |
63 | // |
64 | // The data types (PEMat::data_type and PEMat::sums_type) are omitted based on |
65 | // the "strict aliasing" model: each memory location should contain data of |
66 | // only one type. This could be relaxed in the future, by adding data types |
67 | // to this Key type, if a use case arises. |
68 | struct Key { |
69 | // The source matrix's data pointer. |
70 | const void* src_data; |
71 | // The packed matrix's layout, see PEMat::layout. |
72 | PMatLayout packed_layout; |
73 | // The packed matrix's zero point (for integer-quantized matrices only). |
74 | std::int32_t zero_point; |
75 | }; |
76 | |
77 | friend bool operator==(const Key& a, const Key& b) { |
78 | return a.src_data == b.src_data && a.packed_layout == b.packed_layout && |
79 | a.zero_point == b.zero_point; |
80 | } |
81 | |
82 | struct KeyHash { |
83 | std::size_t operator()(const Key&) const; |
84 | }; |
85 | |
86 | // A dummy timestamp to associate to each entry (see struct Entry) to |
87 | // determine which entry is "least recently used" when ejecting entries. |
88 | // This is just an integer counter, not related to physical time. |
89 | // It does not need to be atomic because only one thread uses an instance |
90 | // of PrepackedCache at a time (see class comment). |
91 | using Timestamp = std::uint64_t; |
92 | |
93 | struct Entry { |
94 | PEMat packed_matrix; |
95 | Timestamp timestamp; |
96 | }; |
97 | |
98 | explicit PrepackedCache(int max_buffers_bytes = kDefaultMaxBuffersBytes) |
99 | : max_buffers_bytes_(max_buffers_bytes) {} |
100 | |
101 | ~PrepackedCache(); |
102 | |
103 | // Returns the total size in bytes of buffers held in this cache. |
104 | std::ptrdiff_t BuffersBytes() const { return buffers_bytes_; } |
105 | |
106 | // Returns the number of packed matrices held in this cache. |
107 | int MatrixCount() const { return cache_.size(); } |
108 | |
109 | // This is the method by which new matrices are cached, and existing cache |
110 | // entries are queried. |
111 | // `src_data` is the source matrix data pointer. |
112 | // `packed_matrix` is a packed matrix structure where all fields have already |
113 | // been populated, except for the `data` and `sums` pointers which have not |
114 | // yet been allocated. |
115 | // |
116 | // This method: |
117 | // 1. Queries the cache for an entry matching the given `src_data` pointer and |
118 | // the relevant fields of `packed_matrix`, particularly its `layout`. |
119 | // 2. If a matching cache entry does not exist, it is created and inserted |
120 | // into the cache, and its `data` and `sums` buffers are allocated. |
121 | // 3. The `packed_matrix` has its `data` and `sums` pointers set to point |
122 | // to the allocated buffers. |
123 | // 4. The cache entry's timestamp is updated so it's the most recently used |
124 | // entry. |
125 | // 5. The return value is Action::kInsertedNewEntry if at step 2 a new |
126 | // entry was created. Otherwise it is Action::kGotExistingEntry. |
127 | Action Get(const void* src_data, PEMat* packed_matrix); |
128 | |
129 | private: |
130 | void EjectOne(); |
131 | void EjectUntilRoomFor(std::ptrdiff_t new_bytes); |
132 | |
133 | std::unordered_map<Key, Entry, KeyHash> cache_; |
134 | const std::ptrdiff_t max_buffers_bytes_; |
135 | std::ptrdiff_t buffers_bytes_ = 0; |
136 | Timestamp timestamp_ = 0; |
137 | }; |
138 | |
139 | } // namespace ruy |
140 | |
141 | #endif // RUY_RUY_PREPACKED_CACHE_H_ |
142 | |