/*
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <assert.h>
#include <errno.h>
#include <stdint.h>

#include <algorithm>
#include <atomic>
#include <limits>
#include <memory>
#include <new>
#include <type_traits>
#include <utility>

#include <folly/Portability.h>
#include <folly/concurrency/CacheLocality.h>
#include <folly/portability/SysMman.h>
#include <folly/portability/Unistd.h>
#include <folly/synchronization/AtomicStruct.h>

// Ignore shadowing warnings within this file, so includers can use -Wshadow.
FOLLY_PUSH_WARNING
FOLLY_GNU_DISABLE_WARNING("-Wshadow")

namespace folly {

namespace detail {
template <typename Pool>
struct IndexedMemPoolRecycler;
} // namespace detail

template <
    typename T,
    bool EagerRecycleWhenTrivial = false,
    bool EagerRecycleWhenNotTrivial = true>
struct IndexedMemPoolTraits {
  static constexpr bool eagerRecycle() {
    return std::is_trivial<T>::value ? EagerRecycleWhenTrivial
                                     : EagerRecycleWhenNotTrivial;
  }

  /// Called when the element pointed to by ptr is allocated for the
  /// first time.
  static void initialize(T* ptr) {
    if (!eagerRecycle()) {
      new (ptr) T();
    }
  }

  /// Called when the element pointed to by ptr is freed at pool
  /// destruction time.
  static void cleanup(T* ptr) {
    if (!eagerRecycle()) {
      ptr->~T();
    }
  }

  /// Called when the element is allocated with the arguments forwarded from
  /// IndexedMemPool::allocElem.
  template <typename... Args>
  static void onAllocate(T* ptr, Args&&... args) {
    static_assert(
        sizeof...(Args) == 0 || eagerRecycle(),
        "emplace-style allocation requires eager recycle, "
        "which is defaulted only for non-trivial types");
    if (eagerRecycle()) {
      new (ptr) T(std::forward<Args>(args)...);
    }
  }

  /// Called when the element is recycled.
  static void onRecycle(T* ptr) {
    if (eagerRecycle()) {
      ptr->~T();
    }
  }
};

/// IndexedMemPool traits that implement the lazy lifecycle strategy. In this
/// strategy elements are default-constructed the first time they are
/// allocated, and destroyed when the pool itself is destroyed.
template <typename T>
using IndexedMemPoolTraitsLazyRecycle = IndexedMemPoolTraits<T, false, false>;

/// IndexedMemPool traits that implement the eager lifecycle strategy. In this
/// strategy elements are constructed when they are allocated from the pool
/// and destroyed when recycled.
template <typename T>
using IndexedMemPoolTraitsEagerRecycle = IndexedMemPoolTraits<T, true, true>;
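
// Example of a custom lifecycle (an illustrative sketch, not something the
// library provides): a Traits type for a hypothetical Connection class that
// keeps elements constructed for the lifetime of the pool but clears their
// state on each recycle. A Traits type only needs to supply these four hooks.
//
//   struct ConnectionPoolTraits {
//     static void initialize(Connection* ptr) { new (ptr) Connection(); }
//     static void cleanup(Connection* ptr) { ptr->~Connection(); }
//     static void onAllocate(Connection* ptr) {}
//     static void onRecycle(Connection* ptr) { ptr->reset(); }
//   };
//   using ConnectionPool =
//       IndexedMemPool<Connection, 32, 200, std::atomic, ConnectionPoolTraits>;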

/// Instances of IndexedMemPool dynamically allocate and then pool their
/// element type (T), returning 4-byte integer indices that can be passed
/// to the pool's operator[] method to access or obtain pointers to the
/// actual elements. The memory backing items returned from the pool
/// will always be readable, even if items have been returned to the pool.
/// These two features are useful for lock-free algorithms. The indexing
/// behavior makes it easy to build tagged pointer-like-things, since
/// a large number of elements can be managed using fewer bits than a
/// full pointer. The access-after-free behavior makes it safe to read
/// from T-s even after they have been recycled, since it is guaranteed
/// that the memory won't have been returned to the OS and unmapped
/// (the algorithm must still use a mechanism to validate that the read
/// was correct, but it doesn't have to worry about page faults), and if
/// the elements use internal sequence numbers it can be guaranteed that
/// there won't be an ABA match due to the element being overwritten with
/// a different type that has the same bit pattern.
///
/// The object lifecycle strategy is controlled by the Traits parameter.
/// One strategy, implemented by IndexedMemPoolTraitsEagerRecycle, is to
/// construct objects when they are allocated from the pool and destroy
/// them when they are recycled. In this mode allocIndex and allocElem
/// have emplace-like semantics. In another strategy, implemented by
/// IndexedMemPoolTraitsLazyRecycle, objects are default-constructed the
/// first time they are removed from the pool, and deleted when the pool
/// itself is deleted. By default the first mode is used for non-trivial
/// T, and the second is used for trivial T. Clients can customize the
/// object lifecycle by providing their own Traits implementation.
/// See IndexedMemPoolTraits for a Traits example.
///
/// IMPORTANT: Space for extra elements is allocated to account for those
/// that are inaccessible because they are in other local lists, so the
/// actual number of items that can be allocated ranges from capacity to
/// capacity + (NumLocalLists_-1)*LocalListLimit_. This is important if
/// you are trying to maximize the capacity of the pool while constraining
/// the bit size of the resulting pointers, because the pointers will
/// actually range up to the boosted capacity. See maxIndexForCapacity
/// and capacityForMaxIndex.
///
/// To avoid contention, NumLocalLists_ free lists of limited (less than
/// or equal to LocalListLimit_) size are maintained, and each thread
/// retrieves and returns entries from its associated local list. If the
/// local list becomes too large then elements are placed in bulk in a
/// global free list. This allows items to be efficiently recirculated
/// from consumers to producers. AccessSpreader is used to access the
/// local lists, so there is no performance advantage to having more
/// local lists than L1 caches.
///
/// The pool mmap-s the entire necessary address space when the pool is
/// constructed, but delays element construction. This means that only
/// elements that are actually returned to the caller get paged into the
/// process's resident set (RSS).
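///
/// Example usage (a minimal sketch; Widget stands in for any caller-defined
/// element type):
///
///   folly::IndexedMemPool<Widget> pool(1000);
///   uint32_t idx = pool.allocIndex(); // 0 means no element was available
///   if (idx != 0) {
///     Widget& w = pool[idx];
///     // ... use w; idx can be stored in far fewer bits than a pointer ...
///     pool.recycleIndex(idx);
///   }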
template <
    typename T,
    uint32_t NumLocalLists_ = 32,
    uint32_t LocalListLimit_ = 200,
    template <typename> class Atom = std::atomic,
    typename Traits = IndexedMemPoolTraits<T>>
struct IndexedMemPool {
  typedef T value_type;

  typedef std::unique_ptr<T, detail::IndexedMemPoolRecycler<IndexedMemPool>>
      UniquePtr;

  IndexedMemPool(const IndexedMemPool&) = delete;
  IndexedMemPool& operator=(const IndexedMemPool&) = delete;

  static_assert(LocalListLimit_ <= 255, "LocalListLimit must fit in 8 bits");
  enum {
    NumLocalLists = NumLocalLists_,
    LocalListLimit = LocalListLimit_,
  };

  static_assert(
      std::is_nothrow_default_constructible<Atom<uint32_t>>::value,
      "Atom must be nothrow default constructible");

  // these are public because clients may need to reason about the number
  // of bits required to hold indices from a pool, given its capacity

  static constexpr uint32_t maxIndexForCapacity(uint32_t capacity) {
    // index of std::numeric_limits<uint32_t>::max() is reserved for
    // isAllocated tracking
    return uint32_t(std::min(
        uint64_t(capacity) + (NumLocalLists - 1) * LocalListLimit,
        uint64_t(std::numeric_limits<uint32_t>::max() - 1)));
  }

  static constexpr uint32_t capacityForMaxIndex(uint32_t maxIndex) {
    return maxIndex - (NumLocalLists - 1) * LocalListLimit;
  }
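
  // Worked example, assuming the default NumLocalLists_ = 32 and
  // LocalListLimit_ = 200: maxIndexForCapacity(1000) == 1000 + 31 * 200 ==
  // 7200, so indices from such a pool need 13 bits even though the requested
  // capacity alone would fit in 10.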

  /// Constructs a pool that can allocate at least _capacity_ elements,
  /// even if all the local lists are full
  explicit IndexedMemPool(uint32_t capacity)
      : actualCapacity_(maxIndexForCapacity(capacity)),
        size_(0),
        globalHead_(TaggedPtr{}) {
    const size_t needed = sizeof(Slot) * (actualCapacity_ + 1);
    size_t pagesize = size_t(sysconf(_SC_PAGESIZE));
    mmapLength_ = ((needed - 1) & ~(pagesize - 1)) + pagesize;
    assert(needed <= mmapLength_ && mmapLength_ < needed + pagesize);
    assert((mmapLength_ % pagesize) == 0);

    slots_ = static_cast<Slot*>(mmap(
        nullptr,
        mmapLength_,
        PROT_READ | PROT_WRITE,
        MAP_PRIVATE | MAP_ANONYMOUS,
        -1,
        0));
    if (slots_ == MAP_FAILED) {
      assert(errno == ENOMEM);
      throw std::bad_alloc();
    }
  }

  /// Destroys all of the contained elements
  ~IndexedMemPool() {
    using A = Atom<uint32_t>;
    for (uint32_t i = maxAllocatedIndex(); i > 0; --i) {
      Traits::cleanup(&slots_[i].elem);
      slots_[i].localNext.~A();
      slots_[i].globalNext.~A();
    }
    munmap(slots_, mmapLength_);
  }

  /// Returns a lower bound on the number of elements that may be
  /// simultaneously allocated and not yet recycled. Because of the
  /// local lists it is possible that more elements than this are returned
  /// successfully
  uint32_t capacity() {
    return capacityForMaxIndex(actualCapacity_);
  }

  /// Returns the maximum index of elements ever allocated in this pool
  /// including elements that have been recycled.
  uint32_t maxAllocatedIndex() const {
    // Take the minimum since it is possible that size_ > actualCapacity_.
    // This can happen if there are multiple concurrent requests
    // when size_ == actualCapacity_ - 1.
    return std::min(uint32_t(size_), uint32_t(actualCapacity_));
  }

  /// Finds a slot with a non-zero index, emplaces a T there if we're
  /// using the eager recycle lifecycle mode, and returns the index,
  /// or returns 0 if no elements are available. Passes a pointer to
  /// the element to Traits::onAllocate before the slot is marked as
  /// allocated.
  template <typename... Args>
  uint32_t allocIndex(Args&&... args) {
    auto idx = localPop(localHead());
    if (idx != 0) {
      Slot& s = slot(idx);
      Traits::onAllocate(&s.elem, std::forward<Args>(args)...);
      markAllocated(s);
    }
    return idx;
  }

  /// If an element is available, returns a std::unique_ptr to it that will
  /// recycle the element to the pool when it is reclaimed, otherwise returns
  /// a null (falsy) std::unique_ptr. Passes a pointer to the element to
  /// Traits::onAllocate before the slot is marked as allocated.
  template <typename... Args>
  UniquePtr allocElem(Args&&... args) {
    auto idx = allocIndex(std::forward<Args>(args)...);
    T* ptr = idx == 0 ? nullptr : &slot(idx).elem;
    return UniquePtr(ptr, typename UniquePtr::deleter_type(this));
  }
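
  // For example (an illustrative sketch): with a non-trivial T such as
  // std::string the default Traits are eager-recycle, so the arguments are
  // forwarded to T's constructor and the element is destroyed again when the
  // returned UniquePtr recycles it.
  //
  //   IndexedMemPool<std::string> pool(100);
  //   if (auto p = pool.allocElem("hello")) {
  //     // *p == "hello"; the slot is recycled when p goes out of scope
  //   }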

  /// Gives up ownership previously granted by allocIndex()
  void recycleIndex(uint32_t idx) {
    assert(isAllocated(idx));
    localPush(localHead(), idx);
  }

  /// Provides access to the pooled element referenced by idx
  T& operator[](uint32_t idx) {
    return slot(idx).elem;
  }

  /// Provides access to the pooled element referenced by idx
  const T& operator[](uint32_t idx) const {
    return slot(idx).elem;
  }

  /// If elem == &pool[idx], then pool.locateElem(elem) == idx. Also,
  /// pool.locateElem(nullptr) == 0
  uint32_t locateElem(const T* elem) const {
    if (!elem) {
      return 0;
    }

    static_assert(
        std::is_standard_layout<Slot>::value,
        "offsetof needs a standard-layout type");

    auto slot = reinterpret_cast<const Slot*>(
        reinterpret_cast<const char*>(elem) - offsetof(Slot, elem));
    auto rv = uint32_t(slot - slots_);

    // this assert also tests that rv is in range
    assert(elem == &(*this)[rv]);
    return rv;
  }

  /// Returns true iff idx has been allocIndex()ed and not recycleIndex()ed
  bool isAllocated(uint32_t idx) const {
    return slot(idx).localNext.load(std::memory_order_acquire) == uint32_t(-1);
  }

 private:
  ///////////// types

  struct Slot {
    T elem;
    Atom<uint32_t> localNext;
    Atom<uint32_t> globalNext;

    Slot() : localNext{}, globalNext{} {}
  };

  struct TaggedPtr {
    uint32_t idx;

    // size is bottom 8 bits, tag in top 24. g++'s code generation for
    // bitfields seems to depend on the phase of the moon, plus we can
    // do better because we can rely on other checks to avoid masking
    uint32_t tagAndSize;

    enum : uint32_t {
      SizeBits = 8,
      SizeMask = (1U << SizeBits) - 1,
      TagIncr = 1U << SizeBits,
    };

    uint32_t size() const {
      return tagAndSize & SizeMask;
    }

    TaggedPtr withSize(uint32_t repl) const {
      assert(repl <= LocalListLimit);
      return TaggedPtr{idx, (tagAndSize & ~SizeMask) | repl};
    }

    TaggedPtr withSizeIncr() const {
      assert(size() < LocalListLimit);
      return TaggedPtr{idx, tagAndSize + 1};
    }

    TaggedPtr withSizeDecr() const {
      assert(size() > 0);
      return TaggedPtr{idx, tagAndSize - 1};
    }

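    // Bumping the tag on every index change means that a compare_exchange on
    // a list head will fail if the same index was popped and pushed back
    // (ABA) between the load and the CAS, even though idx compares equal.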
    TaggedPtr withIdx(uint32_t repl) const {
      return TaggedPtr{repl, tagAndSize + TagIncr};
    }

    TaggedPtr withEmpty() const {
      return withIdx(0).withSize(0);
    }
  };

  struct alignas(hardware_destructive_interference_size) LocalList {
    AtomicStruct<TaggedPtr, Atom> head;

    LocalList() : head(TaggedPtr{}) {}
  };

  ////////// fields

  /// the number of bytes allocated from mmap, which is a multiple of
  /// the page size of the machine
  size_t mmapLength_;

  /// the actual number of slots that we will allocate, to guarantee
  /// that we will satisfy the capacity requested at construction time.
  /// They will be numbered 1..actualCapacity_ (note the 1-based counting),
  /// and occupy slots_[1..actualCapacity_].
  uint32_t actualCapacity_;

  /// this records the number of slots that have actually been constructed.
  /// To allow use of atomic ++ instead of CAS, we let this overflow.
  /// The actual number of constructed elements is min(actualCapacity_,
  /// size_)
  Atom<uint32_t> size_;

  /// raw storage, only 1..min(size_,actualCapacity_) (inclusive) are
  /// actually constructed. Note that slots_[0] is not constructed or used
  alignas(hardware_destructive_interference_size) Slot* slots_;

  /// use AccessSpreader to find your list. We use stripes instead of
  /// thread-local to avoid the need to grow or shrink on thread start
  /// or join. These are heads of lists chained with localNext
  LocalList local_[NumLocalLists];

  /// this is the head of a list of nodes chained by globalNext, that are
  /// themselves each the head of a list chained by localNext
  alignas(hardware_destructive_interference_size)
      AtomicStruct<TaggedPtr, Atom> globalHead_;

  ///////////// private methods

  uint32_t slotIndex(uint32_t idx) const {
    assert(
        0 < idx && idx <= actualCapacity_ &&
        idx <= size_.load(std::memory_order_acquire));
    return idx;
  }

  Slot& slot(uint32_t idx) {
    return slots_[slotIndex(idx)];
  }

  const Slot& slot(uint32_t idx) const {
    return slots_[slotIndex(idx)];
  }

  // localHead references a full list chained by localNext. s should
  // reference slot(localHead); it is passed as a micro-optimization
  void globalPush(Slot& s, uint32_t localHead) {
    while (true) {
      TaggedPtr gh = globalHead_.load(std::memory_order_acquire);
      s.globalNext.store(gh.idx, std::memory_order_relaxed);
      if (globalHead_.compare_exchange_strong(gh, gh.withIdx(localHead))) {
        // success
        return;
      }
    }
  }

  // idx references a single node
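  // The element's onRecycle hook runs exactly once, before the node is
  // published to other threads. If the local list is already at
  // LocalListLimit, the whole local list (now chained behind idx) is moved
  // to the global list in a single step instead of growing past the limit.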
  void localPush(AtomicStruct<TaggedPtr, Atom>& head, uint32_t idx) {
    Slot& s = slot(idx);
    TaggedPtr h = head.load(std::memory_order_acquire);
    bool recycled = false;
    while (true) {
      s.localNext.store(h.idx, std::memory_order_release);
      if (!recycled) {
        Traits::onRecycle(&slot(idx).elem);
        recycled = true;
      }

      if (h.size() == LocalListLimit) {
        // push will overflow local list, steal it instead
        if (head.compare_exchange_strong(h, h.withEmpty())) {
          // steal was successful, put everything in the global list
          globalPush(s, idx);
          return;
        }
      } else {
        // local list has space
        if (head.compare_exchange_strong(h, h.withIdx(idx).withSizeIncr())) {
          // success
          return;
        }
      }
      // h was updated by failing CAS
    }
  }

  // returns 0 if empty
  uint32_t globalPop() {
    while (true) {
      TaggedPtr gh = globalHead_.load(std::memory_order_acquire);
      if (gh.idx == 0 ||
          globalHead_.compare_exchange_strong(
              gh,
              gh.withIdx(
                  slot(gh.idx).globalNext.load(std::memory_order_relaxed)))) {
        // global list is empty, or pop was successful
        return gh.idx;
      }
    }
  }

  // returns 0 if allocation failed
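  // Elements come, in order of preference, from (1) the caller's local list,
  // (2) the global list, in which case the popped node's localNext chain
  // becomes the new local list, or (3) a freshly constructed slot, as long
  // as size_ has not yet reached actualCapacity_.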
  uint32_t localPop(AtomicStruct<TaggedPtr, Atom>& head) {
    while (true) {
      TaggedPtr h = head.load(std::memory_order_acquire);
      if (h.idx != 0) {
        // local list is non-empty, try to pop
        Slot& s = slot(h.idx);
        auto next = s.localNext.load(std::memory_order_relaxed);
        if (head.compare_exchange_strong(h, h.withIdx(next).withSizeDecr())) {
          // success
          return h.idx;
        }
        continue;
      }

      uint32_t idx = globalPop();
      if (idx == 0) {
        // global list is empty, allocate and construct new slot
        if (size_.load(std::memory_order_relaxed) >= actualCapacity_ ||
            (idx = ++size_) > actualCapacity_) {
          // allocation failed
          return 0;
        }
        Slot& s = slot(idx);
        // Atom is enforced above to be nothrow-default-constructible
        // As an optimization, use default-initialization (no parens) rather
        // than direct-initialization (with parens): these locations are
        // stored-to before they are loaded-from
        new (&s.localNext) Atom<uint32_t>;
        new (&s.globalNext) Atom<uint32_t>;
        Traits::initialize(&s.elem);
        return idx;
      }

      Slot& s = slot(idx);
      auto next = s.localNext.load(std::memory_order_relaxed);
      if (head.compare_exchange_strong(
              h, h.withIdx(next).withSize(LocalListLimit))) {
        // global list moved to local list, keep head for us
        return idx;
      }
      // local bulk push failed, return idx to the global list and try again
      globalPush(s, idx);
    }
  }

  AtomicStruct<TaggedPtr, Atom>& localHead() {
    auto stripe = AccessSpreader<Atom>::current(NumLocalLists);
    return local_[stripe].head;
  }

  void markAllocated(Slot& slot) {
    slot.localNext.store(uint32_t(-1), std::memory_order_release);
  }

 public:
  static constexpr std::size_t kSlotSize = sizeof(Slot);
};

namespace detail {

/// This is a stateful Deleter functor, which allows std::unique_ptr
/// to track elements allocated from an IndexedMemPool by tracking the
/// associated pool. See IndexedMemPool::allocElem.
template <typename Pool>
struct IndexedMemPoolRecycler {
  Pool* pool;

  explicit IndexedMemPoolRecycler(Pool* pool) : pool(pool) {}

  IndexedMemPoolRecycler(const IndexedMemPoolRecycler<Pool>& rhs) = default;
  IndexedMemPoolRecycler& operator=(const IndexedMemPoolRecycler<Pool>& rhs) =
      default;

  void operator()(typename Pool::value_type* elem) const {
    pool->recycleIndex(pool->locateElem(elem));
  }
};

} // namespace detail

} // namespace folly

FOLLY_POP_WARNING