/*
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <assert.h>
#include <errno.h>
#include <stdint.h>

#include <algorithm>
#include <atomic>
#include <limits>
#include <memory>
#include <new>
#include <type_traits>
#include <utility>

#include <folly/Portability.h>
#include <folly/concurrency/CacheLocality.h>
#include <folly/portability/SysMman.h>
#include <folly/portability/Unistd.h>
#include <folly/synchronization/AtomicStruct.h>

// Ignore shadowing warnings within this file, so includers can use -Wshadow.
FOLLY_PUSH_WARNING
FOLLY_GNU_DISABLE_WARNING("-Wshadow")

namespace folly {

namespace detail {
template <typename Pool>
struct IndexedMemPoolRecycler;
} // namespace detail

template <
    typename T,
    bool EagerRecycleWhenTrivial = false,
    bool EagerRecycleWhenNotTrivial = true>
struct IndexedMemPoolTraits {
  static constexpr bool eagerRecycle() {
    return std::is_trivial<T>::value ? EagerRecycleWhenTrivial
                                     : EagerRecycleWhenNotTrivial;
  }

  /// Called when the element pointed to by ptr is allocated for the
  /// first time.
  static void initialize(T* ptr) {
    if (!eagerRecycle()) {
      new (ptr) T();
    }
  }

  /// Called when the element pointed to by ptr is freed at pool
  /// destruction time.
  static void cleanup(T* ptr) {
    if (!eagerRecycle()) {
      ptr->~T();
    }
  }

  /// Called when the element is allocated with the arguments forwarded from
  /// IndexedMemPool::allocElem.
  template <typename... Args>
  static void onAllocate(T* ptr, Args&&... args) {
    static_assert(
        sizeof...(Args) == 0 || eagerRecycle(),
        "emplace-style allocation requires eager recycle, "
        "which is defaulted only for non-trivial types");
    if (eagerRecycle()) {
      new (ptr) T(std::forward<Args>(args)...);
    }
  }

  /// Called when the element is recycled.
  static void onRecycle(T* ptr) {
    if (eagerRecycle()) {
      ptr->~T();
    }
  }
};

/// IndexedMemPool traits that implement the lazy lifecycle strategy. In this
/// strategy elements are default-constructed the first time they are
/// allocated, and destroyed when the pool itself is destroyed.
template <typename T>
using IndexedMemPoolTraitsLazyRecycle = IndexedMemPoolTraits<T, false, false>;

/// IndexedMemPool traits that implement the eager lifecycle strategy. In this
/// strategy elements are constructed when they are allocated from the pool
/// and destroyed when recycled.
template <typename T>
using IndexedMemPoolTraitsEagerRecycle = IndexedMemPoolTraits<T, true, true>;
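
// Example of a custom lifecycle (an illustrative sketch, not something the
// library provides): a Traits type for a hypothetical Connection class that
// keeps elements constructed for the lifetime of the pool but clears their
// state on each recycle. A Traits type only needs to supply these four hooks.
//
//   struct ConnectionPoolTraits {
//     static void initialize(Connection* ptr) { new (ptr) Connection(); }
//     static void cleanup(Connection* ptr) { ptr->~Connection(); }
//     static void onAllocate(Connection* ptr) {}
//     static void onRecycle(Connection* ptr) { ptr->reset(); }
//   };
//   using ConnectionPool =
//       IndexedMemPool<Connection, 32, 200, std::atomic, ConnectionPoolTraits>;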

/// Instances of IndexedMemPool dynamically allocate and then pool their
/// element type (T), returning 4-byte integer indices that can be passed
/// to the pool's operator[] method to access or obtain pointers to the
/// actual elements. The memory backing items returned from the pool
/// will always be readable, even if items have been returned to the pool.
/// These two features are useful for lock-free algorithms. The indexing
/// behavior makes it easy to build tagged pointer-like-things, since
/// a large number of elements can be managed using fewer bits than a
/// full pointer. The access-after-free behavior makes it safe to read
/// from T-s even after they have been recycled, since it is guaranteed
/// that the memory won't have been returned to the OS and unmapped
/// (the algorithm must still use a mechanism to validate that the read
/// was correct, but it doesn't have to worry about page faults), and if
/// the elements use internal sequence numbers it can be guaranteed that
/// there won't be an ABA match due to the element being overwritten with
/// a different type that has the same bit pattern.
///
/// The object lifecycle strategy is controlled by the Traits parameter.
/// One strategy, implemented by IndexedMemPoolTraitsEagerRecycle, is to
/// construct objects when they are allocated from the pool and destroy
/// them when they are recycled. In this mode allocIndex and allocElem
/// have emplace-like semantics. In another strategy, implemented by
/// IndexedMemPoolTraitsLazyRecycle, objects are default-constructed the
/// first time they are removed from the pool, and deleted when the pool
/// itself is deleted. By default the first mode is used for non-trivial
/// T, and the second is used for trivial T. Clients can customize the
/// object lifecycle by providing their own Traits implementation.
/// See IndexedMemPoolTraits for a Traits example.
///
/// IMPORTANT: Space for extra elements is allocated to account for those
/// that are inaccessible because they are in other local lists, so the
/// actual number of items that can be allocated ranges from capacity to
/// capacity + (NumLocalLists_-1)*LocalListLimit_. This is important if
/// you are trying to maximize the capacity of the pool while constraining
/// the bit size of the resulting pointers, because the pointers will
/// actually range up to the boosted capacity. See maxIndexForCapacity
/// and capacityForMaxIndex.
///
/// To avoid contention, NumLocalLists_ free lists of limited (less than
/// or equal to LocalListLimit_) size are maintained, and each thread
/// retrieves and returns entries from its associated local list. If the
/// local list becomes too large then elements are placed in bulk in a
/// global free list. This allows items to be efficiently recirculated
/// from consumers to producers. AccessSpreader is used to access the
/// local lists, so there is no performance advantage to having more
/// local lists than L1 caches.
///
/// The pool mmap-s the entire necessary address space when the pool is
/// constructed, but delays element construction. This means that only
/// elements that are actually returned to the caller get paged into the
/// process's resident set (RSS).
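///
/// Example usage (a minimal sketch; Widget stands in for any caller-defined
/// element type):
///
///   folly::IndexedMemPool<Widget> pool(1000);
///   uint32_t idx = pool.allocIndex(); // 0 means no element was available
///   if (idx != 0) {
///     Widget& w = pool[idx];
///     // ... use w; idx can be stored in far fewer bits than a pointer ...
///     pool.recycleIndex(idx);
///   }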
template <
    typename T,
    uint32_t NumLocalLists_ = 32,
    uint32_t LocalListLimit_ = 200,
    template <typename> class Atom = std::atomic,
    typename Traits = IndexedMemPoolTraits<T>>
struct IndexedMemPool {
  typedef T value_type;

  typedef std::unique_ptr<T, detail::IndexedMemPoolRecycler<IndexedMemPool>>
      UniquePtr;

  IndexedMemPool(const IndexedMemPool&) = delete;
  IndexedMemPool& operator=(const IndexedMemPool&) = delete;

  static_assert(LocalListLimit_ <= 255, "LocalListLimit must fit in 8 bits");
  enum {
    NumLocalLists = NumLocalLists_,
    LocalListLimit = LocalListLimit_,
  };

  static_assert(
      std::is_nothrow_default_constructible<Atom<uint32_t>>::value,
      "Atom must be nothrow default constructible");

  // these are public because clients may need to reason about the number
  // of bits required to hold indices from a pool, given its capacity

  static constexpr uint32_t maxIndexForCapacity(uint32_t capacity) {
    // index of std::numeric_limits<uint32_t>::max() is reserved for
    // isAllocated tracking
    return uint32_t(std::min(
        uint64_t(capacity) + (NumLocalLists - 1) * LocalListLimit,
        uint64_t(std::numeric_limits<uint32_t>::max() - 1)));
  }

  static constexpr uint32_t capacityForMaxIndex(uint32_t maxIndex) {
    return maxIndex - (NumLocalLists - 1) * LocalListLimit;
  }
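
  // Worked example, assuming the default NumLocalLists_ = 32 and
  // LocalListLimit_ = 200: maxIndexForCapacity(1000) == 1000 + 31 * 200 ==
  // 7200, so indices from such a pool need 13 bits even though the requested
  // capacity alone would fit in 10.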

  /// Constructs a pool that can allocate at least _capacity_ elements,
  /// even if all the local lists are full
  explicit IndexedMemPool(uint32_t capacity)
      : actualCapacity_(maxIndexForCapacity(capacity)),
        size_(0),
        globalHead_(TaggedPtr{}) {
    const size_t needed = sizeof(Slot) * (actualCapacity_ + 1);
    size_t pagesize = size_t(sysconf(_SC_PAGESIZE));
    mmapLength_ = ((needed - 1) & ~(pagesize - 1)) + pagesize;
    assert(needed <= mmapLength_ && mmapLength_ < needed + pagesize);
    assert((mmapLength_ % pagesize) == 0);

    slots_ = static_cast<Slot*>(mmap(
        nullptr,
        mmapLength_,
        PROT_READ | PROT_WRITE,
        MAP_PRIVATE | MAP_ANONYMOUS,
        -1,
        0));
    if (slots_ == MAP_FAILED) {
      assert(errno == ENOMEM);
      throw std::bad_alloc();
    }
  }

  /// Destroys all of the contained elements
  ~IndexedMemPool() {
    using A = Atom<uint32_t>;
    for (uint32_t i = maxAllocatedIndex(); i > 0; --i) {
      Traits::cleanup(&slots_[i].elem);
      slots_[i].localNext.~A();
      slots_[i].globalNext.~A();
    }
    munmap(slots_, mmapLength_);
  }

  /// Returns a lower bound on the number of elements that may be
  /// simultaneously allocated and not yet recycled. Because of the
  /// local lists it is possible that more elements than this are returned
  /// successfully
  uint32_t capacity() {
    return capacityForMaxIndex(actualCapacity_);
  }

  /// Returns the maximum index of elements ever allocated in this pool
  /// including elements that have been recycled.
  uint32_t maxAllocatedIndex() const {
    // Take the minimum since it is possible that size_ > actualCapacity_.
    // This can happen if there are multiple concurrent requests
    // when size_ == actualCapacity_ - 1.
    return std::min(uint32_t(size_), uint32_t(actualCapacity_));
  }

  /// Finds a slot with a non-zero index, emplaces a T there if we're
  /// using the eager recycle lifecycle mode, and returns the index,
  /// or returns 0 if no elements are available. Passes a pointer to
  /// the element to Traits::onAllocate before the slot is marked as
  /// allocated.
  template <typename... Args>
  uint32_t allocIndex(Args&&... args) {
    auto idx = localPop(localHead());
    if (idx != 0) {
      Slot& s = slot(idx);
      Traits::onAllocate(&s.elem, std::forward<Args>(args)...);
      markAllocated(s);
    }
    return idx;
  }

  /// If an element is available, returns a std::unique_ptr to it that will
  /// recycle the element to the pool when it is reclaimed, otherwise returns
  /// a null (falsy) std::unique_ptr. Passes a pointer to the element to
  /// Traits::onAllocate before the slot is marked as allocated.
  template <typename... Args>
  UniquePtr allocElem(Args&&... args) {
    auto idx = allocIndex(std::forward<Args>(args)...);
    T* ptr = idx == 0 ? nullptr : &slot(idx).elem;
    return UniquePtr(ptr, typename UniquePtr::deleter_type(this));
  }
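
  // For example (an illustrative sketch): with a non-trivial T such as
  // std::string the default Traits are eager-recycle, so the arguments are
  // forwarded to T's constructor and the element is destroyed again when the
  // returned UniquePtr recycles it.
  //
  //   IndexedMemPool<std::string> pool(100);
  //   if (auto p = pool.allocElem("hello")) {
  //     // *p == "hello"; the slot is recycled when p goes out of scope
  //   }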

  /// Gives up ownership previously granted by allocIndex()
  void recycleIndex(uint32_t idx) {
    assert(isAllocated(idx));
    localPush(localHead(), idx);
  }

  /// Provides access to the pooled element referenced by idx
  T& operator[](uint32_t idx) {
    return slot(idx).elem;
  }

  /// Provides access to the pooled element referenced by idx
  const T& operator[](uint32_t idx) const {
    return slot(idx).elem;
  }

  /// If elem == &pool[idx], then pool.locateElem(elem) == idx. Also,
  /// pool.locateElem(nullptr) == 0
  uint32_t locateElem(const T* elem) const {
    if (!elem) {
      return 0;
    }

    static_assert(
        std::is_standard_layout<Slot>::value,
        "offsetof needs a standard-layout type");

    auto slot = reinterpret_cast<const Slot*>(
        reinterpret_cast<const char*>(elem) - offsetof(Slot, elem));
    auto rv = uint32_t(slot - slots_);

    // this assert also tests that rv is in range
    assert(elem == &(*this)[rv]);
    return rv;
  }

  /// Returns true iff idx has been allocIndex()ed and not recycleIndex()ed
  bool isAllocated(uint32_t idx) const {
    return slot(idx).localNext.load(std::memory_order_acquire) == uint32_t(-1);
  }

 private:
  ///////////// types

  struct Slot {
    T elem;
    Atom<uint32_t> localNext;
    Atom<uint32_t> globalNext;

    Slot() : localNext{}, globalNext{} {}
  };

  struct TaggedPtr {
    uint32_t idx;

    // size is bottom 8 bits, tag in top 24. g++'s code generation for
    // bitfields seems to depend on the phase of the moon, plus we can
    // do better because we can rely on other checks to avoid masking
    uint32_t tagAndSize;

    enum : uint32_t {
      SizeBits = 8,
      SizeMask = (1U << SizeBits) - 1,
      TagIncr = 1U << SizeBits,
    };

    uint32_t size() const {
      return tagAndSize & SizeMask;
    }

    TaggedPtr withSize(uint32_t repl) const {
      assert(repl <= LocalListLimit);
      return TaggedPtr{idx, (tagAndSize & ~SizeMask) | repl};
    }

    TaggedPtr withSizeIncr() const {
      assert(size() < LocalListLimit);
      return TaggedPtr{idx, tagAndSize + 1};
    }

    TaggedPtr withSizeDecr() const {
      assert(size() > 0);
      return TaggedPtr{idx, tagAndSize - 1};
    }

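    // Bumping the tag on every index change means that a compare_exchange on
    // a list head will fail if the same index was popped and pushed back
    // (ABA) between the load and the CAS, even though idx compares equal.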
    TaggedPtr withIdx(uint32_t repl) const {
      return TaggedPtr{repl, tagAndSize + TagIncr};
    }

    TaggedPtr withEmpty() const {
      return withIdx(0).withSize(0);
    }
  };

  struct alignas(hardware_destructive_interference_size) LocalList {
    AtomicStruct<TaggedPtr, Atom> head;

    LocalList() : head(TaggedPtr{}) {}
  };

  ////////// fields

  /// the number of bytes allocated from mmap, which is a multiple of
  /// the page size of the machine
  size_t mmapLength_;

  /// the actual number of slots that we will allocate, to guarantee
  /// that we will satisfy the capacity requested at construction time.
  /// They will be numbered 1..actualCapacity_ (note the 1-based counting),
  /// and occupy slots_[1..actualCapacity_].
  uint32_t actualCapacity_;

  /// this records the number of slots that have actually been constructed.
  /// To allow use of atomic ++ instead of CAS, we let this overflow.
  /// The actual number of constructed elements is min(actualCapacity_,
  /// size_)
  Atom<uint32_t> size_;

  /// raw storage, only 1..min(size_,actualCapacity_) (inclusive) are
  /// actually constructed. Note that slots_[0] is not constructed or used
  alignas(hardware_destructive_interference_size) Slot* slots_;

  /// use AccessSpreader to find your list. We use stripes instead of
  /// thread-local to avoid the need to grow or shrink on thread start
  /// or join. These are heads of lists chained with localNext
  LocalList local_[NumLocalLists];

  /// this is the head of a list of nodes chained by globalNext, that are
  /// themselves each the head of a list chained by localNext
  alignas(hardware_destructive_interference_size)
      AtomicStruct<TaggedPtr, Atom> globalHead_;

  ///////////// private methods

  uint32_t slotIndex(uint32_t idx) const {
    assert(
        0 < idx && idx <= actualCapacity_ &&
        idx <= size_.load(std::memory_order_acquire));
    return idx;
  }

  Slot& slot(uint32_t idx) {
    return slots_[slotIndex(idx)];
  }

  const Slot& slot(uint32_t idx) const {
    return slots_[slotIndex(idx)];
  }

  // localHead references a full list chained by localNext. s should
  // reference slot(localHead); it is passed as a micro-optimization
  void globalPush(Slot& s, uint32_t localHead) {
    while (true) {
      TaggedPtr gh = globalHead_.load(std::memory_order_acquire);
      s.globalNext.store(gh.idx, std::memory_order_relaxed);
      if (globalHead_.compare_exchange_strong(gh, gh.withIdx(localHead))) {
        // success
        return;
      }
    }
  }

  // idx references a single node
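  // The element's onRecycle hook runs exactly once, before the node is
  // published to other threads. If the local list is already at
  // LocalListLimit, the whole local list (now chained behind idx) is moved
  // to the global list in a single step instead of growing past the limit.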
  void localPush(AtomicStruct<TaggedPtr, Atom>& head, uint32_t idx) {
    Slot& s = slot(idx);
    TaggedPtr h = head.load(std::memory_order_acquire);
    bool recycled = false;
    while (true) {
      s.localNext.store(h.idx, std::memory_order_release);
      if (!recycled) {
        Traits::onRecycle(&slot(idx).elem);
        recycled = true;
      }

      if (h.size() == LocalListLimit) {
        // push will overflow local list, steal it instead
        if (head.compare_exchange_strong(h, h.withEmpty())) {
          // steal was successful, put everything in the global list
          globalPush(s, idx);
          return;
        }
      } else {
        // local list has space
        if (head.compare_exchange_strong(h, h.withIdx(idx).withSizeIncr())) {
          // success
          return;
        }
      }
      // h was updated by failing CAS
    }
  }

  // returns 0 if empty
  uint32_t globalPop() {
    while (true) {
      TaggedPtr gh = globalHead_.load(std::memory_order_acquire);
      if (gh.idx == 0 ||
          globalHead_.compare_exchange_strong(
              gh,
              gh.withIdx(
                  slot(gh.idx).globalNext.load(std::memory_order_relaxed)))) {
        // global list is empty, or pop was successful
        return gh.idx;
      }
    }
  }

  // returns 0 if allocation failed
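  // Elements come, in order of preference, from (1) the caller's local list,
  // (2) the global list, in which case the popped node's localNext chain
  // becomes the new local list, or (3) a freshly constructed slot, as long
  // as size_ has not yet reached actualCapacity_.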
  uint32_t localPop(AtomicStruct<TaggedPtr, Atom>& head) {
    while (true) {
      TaggedPtr h = head.load(std::memory_order_acquire);
      if (h.idx != 0) {
        // local list is non-empty, try to pop
        Slot& s = slot(h.idx);
        auto next = s.localNext.load(std::memory_order_relaxed);
        if (head.compare_exchange_strong(h, h.withIdx(next).withSizeDecr())) {
          // success
          return h.idx;
        }
        continue;
      }

      uint32_t idx = globalPop();
      if (idx == 0) {
        // global list is empty, allocate and construct new slot
        if (size_.load(std::memory_order_relaxed) >= actualCapacity_ ||
            (idx = ++size_) > actualCapacity_) {
          // allocation failed
          return 0;
        }
        Slot& s = slot(idx);
        // Atom is enforced above to be nothrow-default-constructible
        // As an optimization, use default-initialization (no parens) rather
        // than direct-initialization (with parens): these locations are
        // stored-to before they are loaded-from
        new (&s.localNext) Atom<uint32_t>;
        new (&s.globalNext) Atom<uint32_t>;
        Traits::initialize(&s.elem);
        return idx;
      }

      Slot& s = slot(idx);
      auto next = s.localNext.load(std::memory_order_relaxed);
      if (head.compare_exchange_strong(
              h, h.withIdx(next).withSize(LocalListLimit))) {
        // global list moved to local list, keep head for us
        return idx;
      }
      // local bulk push failed, return idx to the global list and try again
      globalPush(s, idx);
    }
  }

  AtomicStruct<TaggedPtr, Atom>& localHead() {
    auto stripe = AccessSpreader<Atom>::current(NumLocalLists);
    return local_[stripe].head;
  }

  void markAllocated(Slot& slot) {
    slot.localNext.store(uint32_t(-1), std::memory_order_release);
  }

 public:
  static constexpr std::size_t kSlotSize = sizeof(Slot);
};

namespace detail {

/// This is a stateful Deleter functor, which allows std::unique_ptr
/// to track elements allocated from an IndexedMemPool by tracking the
/// associated pool. See IndexedMemPool::allocElem.
template <typename Pool>
struct IndexedMemPoolRecycler {
  Pool* pool;

  explicit IndexedMemPoolRecycler(Pool* pool) : pool(pool) {}

  IndexedMemPoolRecycler(const IndexedMemPoolRecycler<Pool>& rhs) = default;
  IndexedMemPoolRecycler& operator=(const IndexedMemPoolRecycler<Pool>& rhs) =
      default;

  void operator()(typename Pool::value_type* elem) const {
    pool->recycleIndex(pool->locateElem(elem));
  }
};

} // namespace detail

} // namespace folly

FOLLY_POP_WARNING