1 | /* |
2 | * Copyright (c) Facebook, Inc. and its affiliates. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
16 | |
17 | #pragma once |
18 | |
19 | #include <assert.h> |
20 | #include <errno.h> |
21 | #include <stdint.h> |
22 | |
23 | #include <type_traits> |
24 | |
25 | #include <folly/Portability.h> |
26 | #include <folly/concurrency/CacheLocality.h> |
27 | #include <folly/portability/SysMman.h> |
28 | #include <folly/portability/Unistd.h> |
29 | #include <folly/synchronization/AtomicStruct.h> |
30 | |
31 | // Ignore shadowing warnings within this file, so includers can use -Wshadow. |
32 | FOLLY_PUSH_WARNING |
33 | FOLLY_GNU_DISABLE_WARNING("-Wshadow" ) |
34 | |
35 | namespace folly { |
36 | |
37 | namespace detail { |
38 | template <typename Pool> |
39 | struct IndexedMemPoolRecycler; |
40 | } // namespace detail |
41 | |
/// Element-lifecycle policy for IndexedMemPool. The two boolean template
/// parameters choose, for trivial and non-trivial T respectively, between
/// the "eager" strategy (construct on allocate, destroy on recycle) and the
/// "lazy" strategy (default-construct on first allocation, destroy with the
/// pool).
template <
    typename T,
    bool EagerRecycleWhenTrivial = false,
    bool EagerRecycleWhenNotTrivial = true>
struct IndexedMemPoolTraits {
  /// True iff elements are constructed on each allocation and destroyed on
  /// each recycle, as selected by the template flags for this T.
  static constexpr bool eagerRecycle() {
    if (std::is_trivial<T>::value) {
      return EagerRecycleWhenTrivial;
    }
    return EagerRecycleWhenNotTrivial;
  }

  /// Called when the element pointed to by ptr is allocated for the
  /// first time.
  static void initialize(T* ptr) {
    if (eagerRecycle()) {
      return; // eager mode constructs in onAllocate instead
    }
    new (ptr) T();
  }

  /// Called when the element pointed to by ptr is freed at the pool
  /// destruction time.
  static void cleanup(T* ptr) {
    if (eagerRecycle()) {
      return; // eager mode already destroyed the element in onRecycle
    }
    ptr->~T();
  }

  /// Called when the element is allocated with the arguments forwarded from
  /// IndexedMemPool::allocElem.
  template <typename... Args>
  static void onAllocate(T* ptr, Args&&... args) {
    static_assert(
        sizeof...(Args) == 0 || eagerRecycle(),
        "emplace-style allocation requires eager recycle, "
        "which is defaulted only for non-trivial types");
    if (!eagerRecycle()) {
      return; // lazy mode: the element was constructed in initialize
    }
    new (ptr) T(std::forward<Args>(args)...);
  }

  /// Called when the element is recycled.
  static void onRecycle(T* ptr) {
    if (!eagerRecycle()) {
      return; // lazy mode: destruction is deferred to cleanup
    }
    ptr->~T();
  }
};
88 | |
/// IndexedMemPool traits that implements the lazy lifecycle strategy. In this
/// strategy elements are default-constructed the first time they are allocated,
/// and destroyed when the pool itself is destroyed. (Both eager flags false,
/// regardless of whether T is trivial.)
template <typename T>
using IndexedMemPoolTraitsLazyRecycle = IndexedMemPoolTraits<T, false, false>;

/// IndexedMemPool traits that implements the eager lifecycle strategy. In this
/// strategy elements are constructed when they are allocated from the pool and
/// destroyed when recycled. (Both eager flags true, regardless of whether T is
/// trivial.)
template <typename T>
using IndexedMemPoolTraitsEagerRecycle = IndexedMemPoolTraits<T, true, true>;
100 | |
101 | /// Instances of IndexedMemPool dynamically allocate and then pool their |
102 | /// element type (T), returning 4-byte integer indices that can be passed |
103 | /// to the pool's operator[] method to access or obtain pointers to the |
104 | /// actual elements. The memory backing items returned from the pool |
105 | /// will always be readable, even if items have been returned to the pool. |
106 | /// These two features are useful for lock-free algorithms. The indexing |
107 | /// behavior makes it easy to build tagged pointer-like-things, since |
108 | /// a large number of elements can be managed using fewer bits than a |
109 | /// full pointer. The access-after-free behavior makes it safe to read |
110 | /// from T-s even after they have been recycled, since it is guaranteed |
111 | /// that the memory won't have been returned to the OS and unmapped |
112 | /// (the algorithm must still use a mechanism to validate that the read |
113 | /// was correct, but it doesn't have to worry about page faults), and if |
114 | /// the elements use internal sequence numbers it can be guaranteed that |
115 | /// there won't be an ABA match due to the element being overwritten with |
116 | /// a different type that has the same bit pattern. |
117 | /// |
118 | /// The object lifecycle strategy is controlled by the Traits parameter. |
119 | /// One strategy, implemented by IndexedMemPoolTraitsEagerRecycle, is to |
120 | /// construct objects when they are allocated from the pool and destroy |
121 | /// them when they are recycled. In this mode allocIndex and allocElem |
122 | /// have emplace-like semantics. In another strategy, implemented by |
123 | /// IndexedMemPoolTraitsLazyRecycle, objects are default-constructed the |
124 | /// first time they are removed from the pool, and deleted when the pool |
125 | /// itself is deleted. By default the first mode is used for non-trivial |
126 | /// T, and the second is used for trivial T. Clients can customize the |
127 | /// object lifecycle by providing their own Traits implementation. |
128 | /// See IndexedMemPoolTraits for a Traits example. |
129 | /// |
130 | /// IMPORTANT: Space for extra elements is allocated to account for those |
131 | /// that are inaccessible because they are in other local lists, so the |
132 | /// actual number of items that can be allocated ranges from capacity to |
133 | /// capacity + (NumLocalLists_-1)*LocalListLimit_. This is important if |
134 | /// you are trying to maximize the capacity of the pool while constraining |
135 | /// the bit size of the resulting pointers, because the pointers will |
136 | /// actually range up to the boosted capacity. See maxIndexForCapacity |
137 | /// and capacityForMaxIndex. |
138 | /// |
139 | /// To avoid contention, NumLocalLists_ free lists of limited (less than |
140 | /// or equal to LocalListLimit_) size are maintained, and each thread |
141 | /// retrieves and returns entries from its associated local list. If the |
142 | /// local list becomes too large then elements are placed in bulk in a |
143 | /// global free list. This allows items to be efficiently recirculated |
144 | /// from consumers to producers. AccessSpreader is used to access the |
145 | /// local lists, so there is no performance advantage to having more |
146 | /// local lists than L1 caches. |
147 | /// |
148 | /// The pool mmap-s the entire necessary address space when the pool is |
149 | /// constructed, but delays element construction. This means that only |
150 | /// elements that are actually returned to the caller get paged into the |
151 | /// process's resident set (RSS). |
template <
    typename T,
    uint32_t NumLocalLists_ = 32,
    uint32_t LocalListLimit_ = 200,
    template <typename> class Atom = std::atomic,
    typename Traits = IndexedMemPoolTraits<T>>
struct IndexedMemPool {
  typedef T value_type;

  /// unique_ptr whose deleter returns the element to this pool; see
  /// allocElem()
  typedef std::unique_ptr<T, detail::IndexedMemPoolRecycler<IndexedMemPool>>
      UniquePtr;

  // non-copyable: the pool hands out stable indices and pointers into its
  // mmap-ed storage
  IndexedMemPool(const IndexedMemPool&) = delete;
  IndexedMemPool& operator=(const IndexedMemPool&) = delete;

  // the local list size is stored in the bottom 8 bits of
  // TaggedPtr::tagAndSize, hence this limit
  static_assert(LocalListLimit_ <= 255, "LocalListLimit must fit in 8 bits");
  enum {
    NumLocalLists = NumLocalLists_,
    LocalListLimit = LocalListLimit_,
  };

  // localPop() placement-news Atom<uint32_t> members with
  // default-initialization, which must not be able to throw
  static_assert(
      std::is_nothrow_default_constructible<Atom<uint32_t>>::value,
      "Atom must be nothrow default constructible");

  // these are public because clients may need to reason about the number
  // of bits required to hold indices from a pool, given its capacity

  /// Returns the largest index the pool may hand out when constructed with
  /// the given capacity, accounting for the extra slots that may be trapped
  /// in other threads' local free lists
  static constexpr uint32_t maxIndexForCapacity(uint32_t capacity) {
    // index of std::numeric_limits<uint32_t>::max() is reserved for isAllocated
    // tracking
    return uint32_t(std::min(
        uint64_t(capacity) + (NumLocalLists - 1) * LocalListLimit,
        uint64_t(std::numeric_limits<uint32_t>::max() - 1)));
  }

  /// Inverse of maxIndexForCapacity: the capacity that can be requested
  /// without producing indices larger than maxIndex
  static constexpr uint32_t capacityForMaxIndex(uint32_t maxIndex) {
    return maxIndex - (NumLocalLists - 1) * LocalListLimit;
  }

  /// Constructs a pool that can allocate at least _capacity_ elements,
  /// even if all the local lists are full
  explicit IndexedMemPool(uint32_t capacity)
      : actualCapacity_(maxIndexForCapacity(capacity)),
        size_(0),
        globalHead_(TaggedPtr{}) {
    // +1 because index 0 is reserved as the "null" index; slots_[0] is
    // never used
    const size_t needed = sizeof(Slot) * (actualCapacity_ + 1);
    size_t pagesize = size_t(sysconf(_SC_PAGESIZE));
    // round needed up to a whole number of pages (pagesize is a power of 2)
    mmapLength_ = ((needed - 1) & ~(pagesize - 1)) + pagesize;
    assert(needed <= mmapLength_ && mmapLength_ < needed + pagesize);
    assert((mmapLength_ % pagesize) == 0);

    // reserve the whole address range up front; pages are faulted in (and
    // slots constructed) lazily as elements are first allocated
    slots_ = static_cast<Slot*>(mmap(
        nullptr,
        mmapLength_,
        PROT_READ | PROT_WRITE,
        MAP_PRIVATE | MAP_ANONYMOUS,
        -1,
        0));
    if (slots_ == MAP_FAILED) {
      assert(errno == ENOMEM);
      throw std::bad_alloc();
    }
  }

  /// Destroys all of the contained elements
  ~IndexedMemPool() {
    using A = Atom<uint32_t>;
    // only slots 1..maxAllocatedIndex() were ever constructed (see localPop)
    for (uint32_t i = maxAllocatedIndex(); i > 0; --i) {
      Traits::cleanup(&slots_[i].elem);
      slots_[i].localNext.~A();
      slots_[i].globalNext.~A();
    }
    munmap(slots_, mmapLength_);
  }

  /// Returns a lower bound on the number of elements that may be
  /// simultaneously allocated and not yet recycled. Because of the
  /// local lists it is possible that more elements than this are returned
  /// successfully
  uint32_t capacity() {
    return capacityForMaxIndex(actualCapacity_);
  }

  /// Returns the maximum index of elements ever allocated in this pool
  /// including elements that have been recycled.
  uint32_t maxAllocatedIndex() const {
    // Take the minimum since it is possible that size_ > actualCapacity_.
    // This can happen if there are multiple concurrent requests
    // when size_ == actualCapacity_ - 1.
    return std::min(uint32_t(size_), uint32_t(actualCapacity_));
  }

  /// Finds a slot with a non-zero index, emplaces a T there if we're
  /// using the eager recycle lifecycle mode, and returns the index,
  /// or returns 0 if no elements are available. Passes a pointer to
  /// the element to Traits::onAllocate before the slot is marked as
  /// allocated.
  template <typename... Args>
  uint32_t allocIndex(Args&&... args) {
    auto idx = localPop(localHead());
    if (idx != 0) {
      Slot& s = slot(idx);
      Traits::onAllocate(&s.elem, std::forward<Args>(args)...);
      markAllocated(s);
    }
    return idx;
  }

  /// If an element is available, returns a std::unique_ptr to it that will
  /// recycle the element to the pool when it is reclaimed, otherwise returns
  /// a null (falsy) std::unique_ptr. Passes a pointer to the element to
  /// Traits::onAllocate before the slot is marked as allocated.
  template <typename... Args>
  UniquePtr allocElem(Args&&... args) {
    auto idx = allocIndex(std::forward<Args>(args)...);
    T* ptr = idx == 0 ? nullptr : &slot(idx).elem;
    return UniquePtr(ptr, typename UniquePtr::deleter_type(this));
  }

  /// Gives up ownership previously granted by alloc()
  void recycleIndex(uint32_t idx) {
    assert(isAllocated(idx));
    localPush(localHead(), idx);
  }

  /// Provides access to the pooled element referenced by idx
  T& operator[](uint32_t idx) {
    return slot(idx).elem;
  }

  /// Provides access to the pooled element referenced by idx
  const T& operator[](uint32_t idx) const {
    return slot(idx).elem;
  }

  /// If elem == &pool[idx], then pool.locateElem(elem) == idx. Also,
  /// pool.locateElem(nullptr) == 0
  uint32_t locateElem(const T* elem) const {
    if (!elem) {
      return 0;
    }

    static_assert(std::is_standard_layout<Slot>::value, "offsetof needs POD");

    // recover the enclosing Slot from the address of its elem member, then
    // convert the slot pointer to a 1-based index by pointer arithmetic
    auto slot = reinterpret_cast<const Slot*>(
        reinterpret_cast<const char*>(elem) - offsetof(Slot, elem));
    auto rv = uint32_t(slot - slots_);

    // this assert also tests that rv is in range
    assert(elem == &(*this)[rv]);
    return rv;
  }

  /// Returns true iff idx has been alloc()ed and not recycleIndex()ed
  bool isAllocated(uint32_t idx) const {
    // markAllocated stores the uint32_t(-1) sentinel into localNext; any
    // other value means the slot is linked into a free list (localNext
    // doubles as the free-list link field)
    return slot(idx).localNext.load(std::memory_order_acquire) == uint32_t(-1);
  }

 private:
  ///////////// types

  struct Slot {
    T elem;
    /// index of the next free slot in this slot's local list, or
    /// uint32_t(-1) while the slot is allocated (see markAllocated)
    Atom<uint32_t> localNext;
    /// chains local-list heads together in the global free list
    Atom<uint32_t> globalNext;

    // NOTE(review): slot storage comes from mmap and the members are
    // placement-new-ed individually in localPop, so this constructor
    // appears to be unused -- confirm before relying on it
    Slot() : localNext{}, globalNext{} {}
  };

  /// Head of a free list: a slot index plus a tag-and-size word, CAS-ed
  /// atomically as a single unit via AtomicStruct. withIdx() bumps the tag
  /// on every head change, so a stale copy of the head never compares
  /// equal in a CAS (ABA avoidance).
  struct TaggedPtr {
    uint32_t idx;

    // size is bottom 8 bits, tag in top 24. g++'s code generation for
    // bitfields seems to depend on the phase of the moon, plus we can
    // do better because we can rely on other checks to avoid masking
    uint32_t tagAndSize;

    enum : uint32_t {
      SizeBits = 8,
      SizeMask = (1U << SizeBits) - 1,
      TagIncr = 1U << SizeBits,
    };

    /// Number of elements in the list headed by this pointer
    uint32_t size() const {
      return tagAndSize & SizeMask;
    }

    /// Copy with the size field replaced; tag unchanged
    TaggedPtr withSize(uint32_t repl) const {
      assert(repl <= LocalListLimit);
      return TaggedPtr{idx, (tagAndSize & ~SizeMask) | repl};
    }

    TaggedPtr withSizeIncr() const {
      assert(size() < LocalListLimit);
      return TaggedPtr{idx, tagAndSize + 1};
    }

    TaggedPtr withSizeDecr() const {
      assert(size() > 0);
      return TaggedPtr{idx, tagAndSize - 1};
    }

    /// Copy with a new head index; increments the tag so stale CAS
    /// attempts fail
    TaggedPtr withIdx(uint32_t repl) const {
      return TaggedPtr{repl, tagAndSize + TagIncr};
    }

    TaggedPtr withEmpty() const {
      return withIdx(0).withSize(0);
    }
  };

  /// Per-stripe list head, aligned to its own cache line to avoid false
  /// sharing between stripes
  struct alignas(hardware_destructive_interference_size) LocalList {
    AtomicStruct<TaggedPtr, Atom> head;

    LocalList() : head(TaggedPtr{}) {}
  };

  ////////// fields

  /// the number of bytes allocated from mmap, which is a multiple of
  /// the page size of the machine
  size_t mmapLength_;

  /// the actual number of slots that we will allocate, to guarantee
  /// that we will satisfy the capacity requested at construction time.
  /// They will be numbered 1..actualCapacity_ (note the 1-based counting),
  /// and occupy slots_[1..actualCapacity_].
  uint32_t actualCapacity_;

  /// this records the number of slots that have actually been constructed.
  /// To allow use of atomic ++ instead of CAS, we let this overflow.
  /// The actual number of constructed elements is min(actualCapacity_,
  /// size_)
  Atom<uint32_t> size_;

  /// raw storage, only 1..min(size_,actualCapacity_) (inclusive) are
  /// actually constructed. Note that slots_[0] is not constructed or used
  alignas(hardware_destructive_interference_size) Slot* slots_;

  /// use AccessSpreader to find your list. We use stripes instead of
  /// thread-local to avoid the need to grow or shrink on thread start
  /// or join. These are heads of lists chained with localNext
  LocalList local_[NumLocalLists];

  /// this is the head of a list of node chained by globalNext, that are
  /// themselves each the head of a list chained by localNext
  alignas(hardware_destructive_interference_size)
      AtomicStruct<TaggedPtr, Atom> globalHead_;

  ///////////// private methods

  /// Debug-checks and returns idx; slot indices are 1-based and must not
  /// exceed the number of constructed slots
  uint32_t slotIndex(uint32_t idx) const {
    assert(
        0 < idx && idx <= actualCapacity_ &&
        idx <= size_.load(std::memory_order_acquire));
    return idx;
  }

  Slot& slot(uint32_t idx) {
    return slots_[slotIndex(idx)];
  }

  const Slot& slot(uint32_t idx) const {
    return slots_[slotIndex(idx)];
  }

  // localHead references a full list chained by localNext. s should
  // reference slot(localHead), it is passed as a micro-optimization
  void globalPush(Slot& s, uint32_t localHead) {
    while (true) {
      TaggedPtr gh = globalHead_.load(std::memory_order_acquire);
      s.globalNext.store(gh.idx, std::memory_order_relaxed);
      // withIdx bumps the tag, protecting this CAS against ABA
      if (globalHead_.compare_exchange_strong(gh, gh.withIdx(localHead))) {
        // success
        return;
      }
    }
  }

  // idx references a single node
  void localPush(AtomicStruct<TaggedPtr, Atom>& head, uint32_t idx) {
    Slot& s = slot(idx);
    TaggedPtr h = head.load(std::memory_order_acquire);
    bool recycled = false;
    while (true) {
      s.localNext.store(h.idx, std::memory_order_release);
      // onRecycle must run exactly once even if the CAS below fails and
      // the loop retries, hence the flag
      if (!recycled) {
        Traits::onRecycle(&slot(idx).elem);
        recycled = true;
      }

      if (h.size() == LocalListLimit) {
        // push will overflow local list, steal it instead
        if (head.compare_exchange_strong(h, h.withEmpty())) {
          // steal was successful, put everything in the global list
          globalPush(s, idx);
          return;
        }
      } else {
        // local list has space
        if (head.compare_exchange_strong(h, h.withIdx(idx).withSizeIncr())) {
          // success
          return;
        }
      }
      // h was updated by failing CAS
    }
  }

  // returns 0 if empty
  uint32_t globalPop() {
    while (true) {
      TaggedPtr gh = globalHead_.load(std::memory_order_acquire);
      if (gh.idx == 0 ||
          globalHead_.compare_exchange_strong(
              gh,
              gh.withIdx(
                  slot(gh.idx).globalNext.load(std::memory_order_relaxed)))) {
        // global list is empty, or pop was successful
        return gh.idx;
      }
    }
  }

  // returns 0 if allocation failed
  uint32_t localPop(AtomicStruct<TaggedPtr, Atom>& head) {
    while (true) {
      TaggedPtr h = head.load(std::memory_order_acquire);
      if (h.idx != 0) {
        // local list is non-empty, try to pop
        Slot& s = slot(h.idx);
        auto next = s.localNext.load(std::memory_order_relaxed);
        if (head.compare_exchange_strong(h, h.withIdx(next).withSizeDecr())) {
          // success
          return h.idx;
        }
        continue;
      }

      // local list is empty: try to refill from the global list
      uint32_t idx = globalPop();
      if (idx == 0) {
        // global list is empty, allocate and construct new slot
        if (size_.load(std::memory_order_relaxed) >= actualCapacity_ ||
            (idx = ++size_) > actualCapacity_) {
          // allocation failed
          return 0;
        }
        Slot& s = slot(idx);
        // Atom is enforced above to be nothrow-default-constructible
        // As an optimization, use default-initialization (no parens) rather
        // than direct-initialization (with parens): these locations are
        // stored-to before they are loaded-from
        new (&s.localNext) Atom<uint32_t>;
        new (&s.globalNext) Atom<uint32_t>;
        Traits::initialize(&s.elem);
        return idx;
      }

      // got an entire list from the global pool: install its tail as our
      // local list and keep the head node for the caller
      Slot& s = slot(idx);
      auto next = s.localNext.load(std::memory_order_relaxed);
      if (head.compare_exchange_strong(
              h, h.withIdx(next).withSize(LocalListLimit))) {
        // global list moved to local list, keep head for us
        return idx;
      }
      // local bulk push failed, return idx to the global list and try again
      globalPush(s, idx);
    }
  }

  /// Returns the free-list head for the current thread's stripe
  AtomicStruct<TaggedPtr, Atom>& localHead() {
    auto stripe = AccessSpreader<Atom>::current(NumLocalLists);
    return local_[stripe].head;
  }

  /// Stores the uint32_t(-1) sentinel that isAllocated() reads back
  void markAllocated(Slot& slot) {
    slot.localNext.store(uint32_t(-1), std::memory_order_release);
  }

 public:
  /// Size in bytes of one slot (the element plus its two list links)
  static constexpr std::size_t kSlotSize = sizeof(Slot);
};
535 | |
536 | namespace detail { |
537 | |
538 | /// This is a stateful Deleter functor, which allows std::unique_ptr |
539 | /// to track elements allocated from an IndexedMemPool by tracking the |
540 | /// associated pool. See IndexedMemPool::allocElem. |
/// Stateful deleter for std::unique_ptr that returns an element to the
/// IndexedMemPool it came from. It carries a pointer to the owning pool so
/// that the raw element pointer can be mapped back to its pool index when
/// the unique_ptr is destroyed. See IndexedMemPool::allocElem.
template <typename Pool>
struct IndexedMemPoolRecycler {
  Pool* pool;

  explicit IndexedMemPoolRecycler(Pool* owner) : pool(owner) {}

  IndexedMemPoolRecycler(const IndexedMemPoolRecycler<Pool>& rhs) = default;
  IndexedMemPoolRecycler& operator=(const IndexedMemPoolRecycler<Pool>& rhs) =
      default;

  /// Maps the element back to its index and hands it to the pool.
  void operator()(typename Pool::value_type* elem) const {
    const auto index = pool->locateElem(elem);
    pool->recycleIndex(index);
  }
};
555 | |
556 | } // namespace detail |
557 | |
558 | } // namespace folly |
559 | |
560 | FOLLY_POP_WARNING |
561 | |