1 | /* Copyright 2016 The TensorFlow Authors. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | |
16 | #ifndef TENSORFLOW_CORE_UTIL_PRESIZED_CUCKOO_MAP_H_ |
17 | #define TENSORFLOW_CORE_UTIL_PRESIZED_CUCKOO_MAP_H_ |
18 | |
#include <algorithm>
#include <array>
#include <cstdint>
#include <memory>
#include <vector>

#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/prefetch.h"
24 | |
25 | namespace tensorflow { |
26 | |
27 | // Class for efficiently storing key->value mappings when the size is |
28 | // known in advance and the keys are pre-hashed into uint64s. |
29 | // Keys should have "good enough" randomness (be spread across the |
30 | // entire 64 bit space). |
31 | // |
32 | // Important: Clients wishing to use deterministic keys must |
33 | // ensure that their keys fall in the range 0 .. (uint64max-1); |
34 | // the table uses 2^64-1 as the "not occupied" flag. |
35 | // |
36 | // Inserted keys must be unique, and there are no update |
37 | // or delete functions (until some subsequent use of this table |
38 | // requires them). |
39 | // |
40 | // Threads must synchronize their access to a PresizedCuckooMap. |
41 | // |
42 | // The cuckoo hash table is 4-way associative (each "bucket" has 4 |
43 | // "slots" for key/value entries). Uses breadth-first-search to find |
44 | // a good cuckoo path with less data movement (see |
45 | // http://www.cs.cmu.edu/~dga/papers/cuckoo-eurosys14.pdf ) |
46 | |
47 | namespace presized_cuckoo_map { |
48 | // Utility function to compute (x * y) >> 64, or "multiply high". |
49 | // On x86-64, this is a single instruction, but not all platforms |
50 | // support the __uint128_t type, so we provide a generic |
51 | // implementation as well. |
52 | inline uint64 multiply_high_u64(uint64 x, uint64 y) { |
53 | #if defined(__SIZEOF_INT128__) |
54 | return (uint64)(((__uint128_t)x * (__uint128_t)y) >> 64); |
55 | #else |
56 | // For platforms without int128 support, do it the long way. |
57 | uint64 x_lo = x & 0xffffffff; |
58 | uint64 x_hi = x >> 32; |
59 | uint64 buckets_lo = y & 0xffffffff; |
60 | uint64 buckets_hi = y >> 32; |
61 | uint64 prod_hi = x_hi * buckets_hi; |
62 | uint64 prod_lo = x_lo * buckets_lo; |
63 | uint64 prod_mid1 = x_hi * buckets_lo; |
64 | uint64 prod_mid2 = x_lo * buckets_hi; |
65 | uint64 carry = |
66 | ((prod_mid1 & 0xffffffff) + (prod_mid2 & 0xffffffff) + (prod_lo >> 32)) >> |
67 | 32; |
68 | return prod_hi + (prod_mid1 >> 32) + (prod_mid2 >> 32) + carry; |
69 | #endif |
70 | } |
71 | } // namespace presized_cuckoo_map |
72 | |
73 | template <class value> |
74 | class PresizedCuckooMap { |
75 | public: |
76 | // The key type is fixed as a pre-hashed key for this specialized use. |
77 | typedef uint64 key_type; |
78 | |
79 | explicit PresizedCuckooMap(uint64 num_entries) { Clear(num_entries); } |
80 | |
81 | void Clear(uint64 num_entries) { |
82 | cpq_.reset(new CuckooPathQueue()); |
83 | double n(num_entries); |
84 | n /= kLoadFactor; |
85 | num_buckets_ = (static_cast<uint64>(n) / kSlotsPerBucket); |
86 | // Very small cuckoo tables don't work, because the probability |
87 | // of having same-bucket hashes is large. We compromise for those |
88 | // uses by having a larger static starting size. |
89 | num_buckets_ += 32; |
90 | Bucket empty_bucket; |
91 | for (int i = 0; i < kSlotsPerBucket; i++) { |
92 | empty_bucket.keys[i] = kUnusedSlot; |
93 | } |
94 | buckets_.clear(); |
95 | buckets_.resize(num_buckets_, empty_bucket); |
96 | } |
97 | |
98 | // Returns false if k is already in table or if the table |
99 | // is full; true otherwise. |
100 | bool InsertUnique(const key_type k, const value& v) { |
101 | uint64 tk = key_transform(k); |
102 | uint64 b1 = fast_map_to_buckets(tk); |
103 | uint64 b2 = fast_map_to_buckets(h2(tk)); |
104 | |
105 | // Merged find and duplicate checking. |
106 | uint64 target_bucket = 0; |
107 | int target_slot = kNoSpace; |
108 | |
109 | for (auto bucket : {b1, b2}) { |
110 | Bucket* bptr = &buckets_[bucket]; |
111 | for (int slot = 0; slot < kSlotsPerBucket; slot++) { |
112 | if (bptr->keys[slot] == k) { // Duplicates are not allowed. |
113 | return false; |
114 | } else if (target_slot == kNoSpace && bptr->keys[slot] == kUnusedSlot) { |
115 | target_bucket = bucket; |
116 | target_slot = slot; |
117 | } |
118 | } |
119 | } |
120 | |
121 | if (target_slot != kNoSpace) { |
122 | InsertInternal(tk, v, target_bucket, target_slot); |
123 | return true; |
124 | } |
125 | |
126 | return CuckooInsert(tk, v, b1, b2); |
127 | } |
128 | |
129 | // Returns true if found. Sets *out = value. |
130 | bool Find(const key_type k, value* out) const { |
131 | uint64 tk = key_transform(k); |
132 | return FindInBucket(k, fast_map_to_buckets(tk), out) || |
133 | FindInBucket(k, fast_map_to_buckets(h2(tk)), out); |
134 | } |
135 | |
136 | // Prefetch memory associated with the key k into cache levels specified by |
137 | // hint. |
138 | template <port::PrefetchHint hint = port::PREFETCH_HINT_T0> |
139 | void PrefetchKey(const key_type k) const { |
140 | const uint64 tk = key_transform(k); |
141 | port::prefetch<hint>(&buckets_[fast_map_to_buckets(tk)].keys); |
142 | port::prefetch<hint>(&buckets_[fast_map_to_buckets(h2(tk))].keys); |
143 | } |
144 | |
145 | int64_t MemoryUsed() const { |
146 | return sizeof(PresizedCuckooMap<value>) + sizeof(CuckooPathQueue); |
147 | } |
148 | |
149 | private: |
150 | static constexpr int kSlotsPerBucket = 4; |
151 | |
152 | // The load factor is chosen slightly conservatively for speed and |
153 | // to avoid the need for a table rebuild on insertion failure. |
154 | // 0.94 is achievable, but 0.85 is faster and keeps the code simple |
155 | // at the cost of a small amount of memory. |
156 | // NOTE: 0 < kLoadFactor <= 1.0 |
157 | static constexpr double kLoadFactor = 0.85; |
158 | |
159 | // Cuckoo insert: The maximum number of entries to scan should be ~400 |
160 | // (Source: Personal communication with Michael Mitzenmacher; empirical |
161 | // experiments validate.). After trying 400 candidate locations, declare |
162 | // the table full - it's probably full of unresolvable cycles. Less than |
163 | // 400 reduces max occupancy; much more results in very poor performance |
164 | // around the full point. For (2,4) a max BFS path len of 5 results in ~682 |
165 | // nodes to visit, calculated below, and is a good value. |
166 | |
167 | static constexpr uint8 kMaxBFSPathLen = 5; |
168 | |
169 | // Constants for BFS cuckoo path search: |
170 | // The visited list must be maintained for all but the last level of search |
171 | // in order to trace back the path. The BFS search has two roots |
172 | // and each can go to a total depth (including the root) of 5. |
173 | // The queue must be sized for 2 * \sum_{k=0...4}{kSlotsPerBucket^k} = 682. |
174 | // The visited queue, however, does not need to hold the deepest level, |
175 | // and so it is sized 2 * \sum{k=0...3}{kSlotsPerBucket^k} = 170 |
176 | static constexpr int kMaxQueueSize = 682; |
177 | static constexpr int kVisitedListSize = 170; |
178 | |
179 | static constexpr int kNoSpace = -1; // SpaceAvailable return |
180 | static constexpr uint64 kUnusedSlot = ~(0ULL); |
181 | |
182 | // Buckets are organized with key_types clustered for access speed |
183 | // and for compactness while remaining aligned. |
184 | struct Bucket { |
185 | key_type keys[kSlotsPerBucket]; |
186 | value values[kSlotsPerBucket]; |
187 | }; |
188 | |
189 | // Insert uses the BFS optimization (search before moving) to reduce |
190 | // the number of cache lines dirtied during search. |
191 | |
192 | struct CuckooPathEntry { |
193 | uint64 bucket; |
194 | int depth; |
195 | int parent; // To index in the visited array. |
196 | int parent_slot; // Which slot in our parent did we come from? -1 == root. |
197 | }; |
198 | |
199 | // CuckooPathQueue is a trivial circular queue for path entries. |
200 | // The caller is responsible for not inserting more than kMaxQueueSize |
201 | // entries. Each PresizedCuckooMap has one (heap-allocated) CuckooPathQueue |
202 | // that it reuses across inserts. |
203 | class CuckooPathQueue { |
204 | public: |
205 | CuckooPathQueue() : head_(0), tail_(0) {} |
206 | |
207 | void push_back(CuckooPathEntry e) { |
208 | queue_[tail_] = e; |
209 | tail_ = (tail_ + 1) % kMaxQueueSize; |
210 | } |
211 | |
212 | CuckooPathEntry pop_front() { |
213 | CuckooPathEntry& e = queue_[head_]; |
214 | head_ = (head_ + 1) % kMaxQueueSize; |
215 | return e; |
216 | } |
217 | |
218 | bool empty() const { return head_ == tail_; } |
219 | |
220 | bool full() const { return ((tail_ + 1) % kMaxQueueSize) == head_; } |
221 | |
222 | void reset() { head_ = tail_ = 0; } |
223 | |
224 | private: |
225 | CuckooPathEntry queue_[kMaxQueueSize]; |
226 | int head_; |
227 | int tail_; |
228 | }; |
229 | |
230 | typedef std::array<CuckooPathEntry, kMaxBFSPathLen> CuckooPath; |
231 | |
232 | // Callers are expected to have pre-hashed the keys into a uint64 |
233 | // and are expected to be able to handle (a very low rate) of |
234 | // collisions, OR must ensure that their keys are always in |
235 | // the range 0 - (uint64max - 1). This transforms 'not found flag' |
236 | // keys into something else. |
237 | inline uint64 key_transform(const key_type k) const { |
238 | return k + (k == kUnusedSlot); |
239 | } |
240 | |
241 | // h2 performs a very quick mix of h to generate the second bucket hash. |
242 | // Assumes there is plenty of remaining entropy in the initial h. |
243 | inline uint64 h2(uint64 h) const { |
244 | const uint64 m = 0xc6a4a7935bd1e995; |
245 | return m * ((h >> 32) | (h << 32)); |
246 | } |
247 | |
248 | // alt_bucket identifies the "other" bucket for key k, where |
249 | // other is "the one that isn't bucket b" |
250 | inline uint64 alt_bucket(key_type k, uint64 b) const { |
251 | if (fast_map_to_buckets(k) != b) { |
252 | return fast_map_to_buckets(k); |
253 | } |
254 | return fast_map_to_buckets(h2(k)); |
255 | } |
256 | |
257 | inline void InsertInternal(key_type k, const value& v, uint64 b, int slot) { |
258 | Bucket* bptr = &buckets_[b]; |
259 | bptr->keys[slot] = k; |
260 | bptr->values[slot] = v; |
261 | } |
262 | |
263 | // For the associative cuckoo table, check all of the slots in |
264 | // the bucket to see if the key is present. |
265 | bool FindInBucket(key_type k, uint64 b, value* out) const { |
266 | const Bucket& bref = buckets_[b]; |
267 | for (int i = 0; i < kSlotsPerBucket; i++) { |
268 | if (bref.keys[i] == k) { |
269 | *out = bref.values[i]; |
270 | return true; |
271 | } |
272 | } |
273 | return false; |
274 | } |
275 | |
276 | // returns either kNoSpace or the index of an |
277 | // available slot (0 <= slot < kSlotsPerBucket) |
278 | inline int SpaceAvailable(uint64 bucket) const { |
279 | const Bucket& bref = buckets_[bucket]; |
280 | for (int i = 0; i < kSlotsPerBucket; i++) { |
281 | if (bref.keys[i] == kUnusedSlot) { |
282 | return i; |
283 | } |
284 | } |
285 | return kNoSpace; |
286 | } |
287 | |
288 | inline void CopyItem(uint64 src_bucket, int src_slot, uint64 dst_bucket, |
289 | int dst_slot) { |
290 | Bucket& src_ref = buckets_[src_bucket]; |
291 | Bucket& dst_ref = buckets_[dst_bucket]; |
292 | dst_ref.keys[dst_slot] = src_ref.keys[src_slot]; |
293 | dst_ref.values[dst_slot] = src_ref.values[src_slot]; |
294 | } |
295 | |
296 | bool CuckooInsert(key_type k, const value& v, uint64 b1, uint64 b2) { |
297 | int visited_end = 0; |
298 | cpq_->reset(); |
299 | |
300 | cpq_->push_back({b1, 1, 0, 0}); // Note depth starts at 1. |
301 | cpq_->push_back({b2, 1, 0, 0}); |
302 | |
303 | while (!cpq_->empty()) { |
304 | CuckooPathEntry e = cpq_->pop_front(); |
305 | int free_slot; |
306 | free_slot = SpaceAvailable(e.bucket); |
307 | if (free_slot != kNoSpace) { |
308 | while (e.depth > 1) { |
309 | // "copy" instead of "swap" because one entry is always zero. |
310 | // After, write target key/value over top of last copied entry. |
311 | CuckooPathEntry parent = visited_[e.parent]; |
312 | CopyItem(parent.bucket, e.parent_slot, e.bucket, free_slot); |
313 | free_slot = e.parent_slot; |
314 | e = parent; |
315 | } |
316 | InsertInternal(k, v, e.bucket, free_slot); |
317 | return true; |
318 | } else { |
319 | if (e.depth < (kMaxBFSPathLen)) { |
320 | auto parent_index = visited_end; |
321 | visited_[visited_end] = e; |
322 | visited_end++; |
323 | // Don't always start with the same slot, to even out the path depth. |
324 | int start_slot = (k + e.bucket) % kSlotsPerBucket; |
325 | const Bucket& bref = buckets_[e.bucket]; |
326 | for (int i = 0; i < kSlotsPerBucket; i++) { |
327 | int slot = (start_slot + i) % kSlotsPerBucket; |
328 | uint64 next_bucket = alt_bucket(bref.keys[slot], e.bucket); |
329 | // Optimization: Avoid single-step cycles (from e, don't |
330 | // add a child node that is actually e's parent). |
331 | uint64 e_parent_bucket = visited_[e.parent].bucket; |
332 | if (next_bucket != e_parent_bucket) { |
333 | cpq_->push_back({next_bucket, e.depth + 1, parent_index, slot}); |
334 | } |
335 | } |
336 | } |
337 | } |
338 | } |
339 | |
340 | LOG(WARNING) << "Cuckoo path finding failed: Table too small?" ; |
341 | return false; |
342 | } |
343 | |
344 | inline uint64 fast_map_to_buckets(uint64 x) const { |
345 | // Map x (uniform in 2^64) to the range [0, num_buckets_ -1] |
346 | // using Lemire's alternative to modulo reduction: |
347 | // http://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ |
348 | // Instead of x % N, use (x * N) >> 64. |
349 | return presized_cuckoo_map::multiply_high_u64(x, num_buckets_); |
350 | } |
351 | |
352 | // Set upon initialization: num_entries / kLoadFactor / kSlotsPerBucket. |
353 | uint64 num_buckets_; |
354 | std::vector<Bucket> buckets_; |
355 | |
356 | std::unique_ptr<CuckooPathQueue> cpq_; |
357 | CuckooPathEntry visited_[kVisitedListSize]; |
358 | |
359 | TF_DISALLOW_COPY_AND_ASSIGN(PresizedCuckooMap); |
360 | }; |
361 | |
362 | } // namespace tensorflow |
363 | |
364 | #endif // TENSORFLOW_CORE_UTIL_PRESIZED_CUCKOO_MAP_H_ |
365 | |