1/*
2 * Copyright (c) Facebook, Inc. and its affiliates.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17// @author Nathan Bronson ([email protected])
18
19#pragma once
20
21#include <stdint.h>
22
23#include <atomic>
24#include <thread>
25#include <type_traits>
26
27#include <folly/CPortability.h>
28#include <folly/Likely.h>
29#include <folly/concurrency/CacheLocality.h>
30#include <folly/detail/Futex.h>
31#include <folly/portability/Asm.h>
32#include <folly/portability/SysResource.h>
33#include <folly/synchronization/SanitizeThread.h>
34
35// SharedMutex is a reader-writer lock. It is small, very fast, scalable
36// on multi-core, and suitable for use when readers or writers may block.
37// Unlike most other reader-writer locks, its throughput with concurrent
38// readers scales linearly; it is able to acquire and release the lock
39// in shared mode without cache line ping-ponging. It is suitable for
40// a wide range of lock hold times because it starts with spinning,
41// proceeds to using sched_yield with a preemption heuristic, and then
42// waits using futex and precise wakeups.
43//
44// SharedMutex provides all of the methods of folly::RWSpinLock,
45// boost::shared_mutex, boost::upgrade_mutex, and C++14's
46// std::shared_timed_mutex. All operations that can block are available
47// in try, try-for, and try-until (system_clock or steady_clock) versions.
48//
49// SharedMutexReadPriority gives priority to readers,
50// SharedMutexWritePriority gives priority to writers. SharedMutex is an
51// alias for SharedMutexWritePriority, because writer starvation is more
52// likely than reader starvation for the read-heavy workloads targeted
53// by SharedMutex.
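//
// For example (a sketch):
//
//   folly::SharedMutex m;                // write-priority (the default)
//   folly::SharedMutexReadPriority mrp;  // reader-priority variant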
54//
55// In my tests SharedMutex is as good or better than the other
56// reader-writer locks in use at Facebook for almost all use cases,
57// sometimes by a wide margin. (If it is rare that there are actually
58// concurrent readers then RWSpinLock can be a few nanoseconds faster.)
59// I compared it to folly::RWSpinLock, folly::RWTicketSpinLock64,
60// boost::shared_mutex, pthread_rwlock_t, and a RWLock that internally uses
61// spinlocks to guard state and pthread_mutex_t+pthread_cond_t to block.
62// (Thrift's ReadWriteMutex is based underneath on pthread_rwlock_t.)
63// It is generally as good or better than the rest when evaluating size,
64// speed, scalability, or latency outliers. In the corner cases where
65// it is not the fastest (such as single-threaded use or heavy write
66// contention) it is never very much worse than the best. See the bottom
67// of folly/test/SharedMutexTest.cpp for lots of microbenchmark results.
68//
69// Comparison to folly::RWSpinLock:
70//
71// * SharedMutex is faster than RWSpinLock when there are actually
72// concurrent read accesses (sometimes much faster), and ~5 nanoseconds
73// slower when there is not actually any contention. SharedMutex is
74// faster in every (benchmarked) scenario where the shared mode of
75// the lock is actually useful.
76//
77// * Concurrent shared access to SharedMutex scales linearly, while total
78// RWSpinLock throughput drops as more threads try to access the lock
79// in shared mode. Under very heavy read contention SharedMutex can
80// be two orders of magnitude faster than RWSpinLock (or any reader
81// writer lock that doesn't use striping or deferral).
82//
83// * SharedMutex can safely protect blocking calls, because after an
84// initial period of spinning it waits using futex().
85//
// * RWSpinLock prioritizes readers, while SharedMutex has both reader-
//   and writer-priority variants but defaults to write priority.
88//
89// * RWSpinLock's upgradeable mode blocks new readers, while SharedMutex's
90// doesn't. Both semantics are reasonable. The boost documentation
91// doesn't explicitly talk about this behavior (except by omitting
92// any statement that those lock modes conflict), but the boost
93// implementations do allow new readers while the upgradeable mode
94// is held. See https://github.com/boostorg/thread/blob/master/
95// include/boost/thread/pthread/shared_mutex.hpp
96//
97// * RWSpinLock::UpgradedHolder maps to SharedMutex::UpgradeHolder
98// (UpgradeableHolder would be even more pedantically correct).
99// SharedMutex's holders have fewer methods (no reset) and are less
100// tolerant (promotion and downgrade crash if the donor doesn't own
101// the lock, and you must use the default constructor rather than
102// passing a nullptr to the pointer constructor).
103//
104// Both SharedMutex and RWSpinLock provide "exclusive", "upgrade",
105// and "shared" modes. At all times num_threads_holding_exclusive +
106// num_threads_holding_upgrade <= 1, and num_threads_holding_exclusive ==
107// 0 || num_threads_holding_shared == 0. RWSpinLock has the additional
108// constraint that num_threads_holding_shared cannot increase while
109// num_threads_holding_upgrade is non-zero.
110//
111// Comparison to the internal RWLock:
112//
113// * SharedMutex doesn't allow a maximum reader count to be configured,
114// so it can't be used as a semaphore in the same way as RWLock.
115//
116// * SharedMutex is 4 bytes, RWLock is 256.
117//
118// * SharedMutex is as fast or faster than RWLock in all of my
119// microbenchmarks, and has positive rather than negative scalability.
120//
121// * RWLock and SharedMutex are both writer priority locks.
122//
// * SharedMutex avoids latency outliers as well as RWLock does.
124//
125// * SharedMutex uses different names (t != 0 below):
126//
127// RWLock::lock(0) => SharedMutex::lock()
128//
129// RWLock::lock(t) => SharedMutex::try_lock_for(milliseconds(t))
130//
131// RWLock::tryLock() => SharedMutex::try_lock()
132//
133// RWLock::unlock() => SharedMutex::unlock()
134//
135// RWLock::enter(0) => SharedMutex::lock_shared()
136//
137// RWLock::enter(t) =>
138// SharedMutex::try_lock_shared_for(milliseconds(t))
139//
140// RWLock::tryEnter() => SharedMutex::try_lock_shared()
141//
142// RWLock::leave() => SharedMutex::unlock_shared()
143//
144// * RWLock allows the reader count to be adjusted by a value other
145// than 1 during enter() or leave(). SharedMutex doesn't currently
146// implement this feature.
147//
148// * RWLock's methods are marked const, SharedMutex's aren't.
149//
150// Reader-writer locks have the potential to allow concurrent access
151// to shared read-mostly data, but in practice they often provide no
152// improvement over a mutex. The problem is the cache coherence protocol
153// of modern CPUs. Coherence is provided by making sure that when a cache
154// line is written it is present in only one core's cache. Since a memory
155// write is required to acquire a reader-writer lock in shared mode, the
156// cache line holding the lock is invalidated in all of the other caches.
157// This leads to cache misses when another thread wants to acquire or
158// release the lock concurrently. When the RWLock is colocated with the
// data it protects (common), cache misses can also continue to occur when
160// a thread that already holds the lock tries to read the protected data.
161//
162// Ideally, a reader-writer lock would allow multiple cores to acquire
163// and release the lock in shared mode without incurring any cache misses.
164// This requires that each core records its shared access in a cache line
165// that isn't read or written by other read-locking cores. (Writers will
166// have to check all of the cache lines.) Typical server hardware when
167// this comment was written has 16 L1 caches and cache lines of 64 bytes,
168// so a lock striped over all L1 caches would occupy a prohibitive 1024
169// bytes. Nothing says that we need a separate set of per-core memory
170// locations for each lock, however. Each SharedMutex instance is only
171// 4 bytes, but all locks together share a 2K area in which they make a
172// core-local record of lock acquisitions.
173//
174// SharedMutex's strategy of using a shared set of core-local stripes has
175// a potential downside, because it means that acquisition of any lock in
176// write mode can conflict with acquisition of any lock in shared mode.
177// If a lock instance doesn't actually experience concurrency then this
// downside will outweigh the upside of improved scalability for readers.
179// To avoid this problem we dynamically detect concurrent accesses to
180// SharedMutex, and don't start using the deferred mode unless we actually
181// observe concurrency. See kNumSharedToStartDeferring.
182//
183// It is explicitly allowed to call unlock_shared() from a different
184// thread than lock_shared(), so long as they are properly paired.
185// unlock_shared() needs to find the location at which lock_shared()
186// recorded the lock, which might be in the lock itself or in any of
187// the shared slots. If you can conveniently pass state from lock
188// acquisition to release then the fastest mechanism is to std::move
// the SharedMutex::ReadHolder instance or a SharedMutex::Token (using
190// lock_shared(Token&) and unlock_shared(Token&)). The guard or token
191// will tell unlock_shared where in deferredReaders[] to look for the
192// deferred lock. The Token-less version of unlock_shared() works in all
193// cases, but is optimized for the common (no inter-thread handoff) case.
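//
// A minimal sketch of the token-managed form (the `gLock` name is
// hypothetical; Token, lock_shared(Token&) and unlock_shared(Token&) are
// declared below):
//
//   folly::SharedMutex gLock;
//   folly::SharedMutex::Token token;
//   gLock.lock_shared(token);    // token remembers where the lock landed
//   // ... read the protected data, possibly handing off to another thread
//   gLock.unlock_shared(token);  // goes straight to the recorded slot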
194//
195// In both read- and write-priority mode, a waiting lock() (exclusive mode)
196// only blocks readers after it has waited for an active upgrade lock to be
197// released; until the upgrade lock is released (or upgraded or downgraded)
198// readers will still be able to enter. Preferences about lock acquisition
199// are not guaranteed to be enforced perfectly (even if they were, there
200// is theoretically the chance that a thread could be arbitrarily suspended
201// between calling lock() and SharedMutex code actually getting executed).
202//
203// try_*_for methods always try at least once, even if the duration
204// is zero or negative. The duration type must be compatible with
205// std::chrono::steady_clock. try_*_until methods also always try at
206// least once. std::chrono::system_clock and std::chrono::steady_clock
207// are supported.
208//
// If you have observed by profiling that your SharedMutex instances are
// getting cache misses on deferredReaders[] due to another SharedMutex
// user, then you can use the tag type to create your own instantiation
// of the type.
212// The contention threshold (see kNumSharedToStartDeferring) should make
213// this unnecessary in all but the most extreme cases. Make sure to check
214// that the increased icache and dcache footprint of the tagged result is
215// worth it.
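//
// A minimal usage sketch built from the methods declared below (the
// CachedData class and its members are hypothetical; assumes <map>,
// <string>, and <chrono>):
//
//   class CachedData {
//    public:
//     std::string get(const std::string& key) const {
//       folly::SharedMutex::ReadHolder guard(mutex_);    // shared
//       auto it = map_.find(key);
//       return it == map_.end() ? std::string() : it->second;
//     }
//
//     void put(const std::string& key, const std::string& value) {
//       folly::SharedMutex::WriteHolder guard(mutex_);   // exclusive
//       map_[key] = value;
//     }
//
//     bool tryPut(const std::string& key, const std::string& value) {
//       if (!mutex_.try_lock_for(std::chrono::milliseconds(10))) {
//         return false;                                  // timed out
//       }
//       map_[key] = value;
//       mutex_.unlock();
//       return true;
//     }
//
//    private:
//     mutable folly::SharedMutex mutex_;
//     std::map<std::string, std::string> map_;
//   };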
216
217// SharedMutex's use of thread local storage is an optimization, so
218// for the case where thread local storage is not supported, define it
219// away.
220
221// Note about TSAN (ThreadSanitizer): the SharedMutexWritePriority version
222// (the default) of this mutex is annotated appropriately so that TSAN can
223// perform lock inversion analysis. However, the SharedMutexReadPriority version
224// is not annotated. This is because TSAN's lock order heuristic
225// assumes that two calls to lock_shared must be ordered, which leads
226// to too many false positives for the reader-priority case.
227//
228// Suppose thread A holds a SharedMutexWritePriority lock in shared mode and an
229// independent thread B is waiting for exclusive access. Then a thread C's
230// lock_shared can't proceed until A has released the lock. Discounting
231// situations that never use exclusive mode (so no lock is necessary at all)
232// this means that without higher-level reasoning it is not safe to ignore
233// reader <-> reader interactions.
234//
235// This reasoning does not apply to SharedMutexReadPriority, because there are
236// no actions by a thread B that can make C need to wait for A. Since the
237// overwhelming majority of SharedMutex instances use write priority, we
238// restrict the TSAN annotations to only SharedMutexWritePriority.
239
240#ifndef FOLLY_SHAREDMUTEX_TLS
241#if !FOLLY_MOBILE
242#define FOLLY_SHAREDMUTEX_TLS FOLLY_TLS
243#else
244#define FOLLY_SHAREDMUTEX_TLS
245#endif
246#endif
247
248namespace folly {
249
250struct SharedMutexToken {
251 enum class Type : uint16_t {
252 INVALID = 0,
253 INLINE_SHARED,
254 DEFERRED_SHARED,
255 };
256
257 Type type_;
258 uint16_t slot_;
259};
260
261namespace detail {
262// Returns a guard that gives permission for the current thread to
263// annotate, and adjust the annotation bits in, the SharedMutex at ptr.
264std::unique_lock<std::mutex> sharedMutexAnnotationGuard(void* ptr);
265} // namespace detail
266
267template <
268 bool ReaderPriority,
269 typename Tag_ = void,
270 template <typename> class Atom = std::atomic,
271 bool BlockImmediately = false,
272 bool AnnotateForThreadSanitizer = kIsSanitizeThread && !ReaderPriority>
273class SharedMutexImpl {
274 public:
275 static constexpr bool kReaderPriority = ReaderPriority;
276
277 typedef Tag_ Tag;
278
279 typedef SharedMutexToken Token;
280
281 class FOLLY_NODISCARD ReadHolder;
282 class FOLLY_NODISCARD UpgradeHolder;
283 class FOLLY_NODISCARD WriteHolder;
284
285 constexpr SharedMutexImpl() noexcept : state_(0) {}
286
287 SharedMutexImpl(const SharedMutexImpl&) = delete;
288 SharedMutexImpl(SharedMutexImpl&&) = delete;
289 SharedMutexImpl& operator=(const SharedMutexImpl&) = delete;
290 SharedMutexImpl& operator=(SharedMutexImpl&&) = delete;
291
  // It is an error to destroy a SharedMutex that still has
293 // any outstanding locks. This is checked if NDEBUG isn't defined.
294 // SharedMutex's exclusive mode can be safely used to guard the lock's
295 // own destruction. If, for example, you acquire the lock in exclusive
296 // mode and then observe that the object containing the lock is no longer
297 // needed, you can unlock() and then immediately destroy the lock.
298 // See https://sourceware.org/bugzilla/show_bug.cgi?id=13690 for a
  // description of why this property needs to be explicitly mentioned.
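  //
  // A sketch of that pattern (the refcounted Node type and its members are
  // hypothetical):
  //
  //   void release(Node* n) {
  //     n->mutex_.lock();                 // exclusive access
  //     bool lastRef = --n->refcount_ == 0;
  //     n->mutex_.unlock();
  //     if (lastRef) {
  //       delete n;                       // mutex_ is destroyed while unlocked
  //     }
  //   }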
300 ~SharedMutexImpl() {
301 auto state = state_.load(std::memory_order_relaxed);
302 if (UNLIKELY((state & kHasS) != 0)) {
303 cleanupTokenlessSharedDeferred(state);
304 }
305
306 if (folly::kIsDebug) {
307 // These asserts check that everybody has released the lock before it
308 // is destroyed. If you arrive here while debugging that is likely
309 // the problem. (You could also have general heap corruption.)
310
311 // if a futexWait fails to go to sleep because the value has been
312 // changed, we don't necessarily clean up the wait bits, so it is
313 // possible they will be set here in a correct system
314 assert((state & ~(kWaitingAny | kMayDefer | kAnnotationCreated)) == 0);
315 if ((state & kMayDefer) != 0) {
316 for (uint32_t slot = 0; slot < kMaxDeferredReaders; ++slot) {
317 auto slotValue =
318 deferredReader(slot)->load(std::memory_order_relaxed);
319 assert(!slotValueIsThis(slotValue));
320 (void)slotValue;
321 }
322 }
323 }
324 annotateDestroy();
325 }
326
327 // Checks if an exclusive lock could succeed so that lock elision could be
  // enabled. Unlike the two eligible_for_lock_{upgrade|shared}_elision
  // functions, this is a conservative check, since kMayDefer only indicates
  // that deferred readers may exist.
331 bool eligible_for_lock_elision() const {
332 // We rely on the transaction for linearization. Wait bits are
333 // irrelevant because a successful transaction will be in and out
334 // without affecting the wakeup. kBegunE is also okay for a similar
335 // reason.
336 auto state = state_.load(std::memory_order_relaxed);
337 return (state & (kHasS | kMayDefer | kHasE | kHasU)) == 0;
338 }
339
340 // Checks if an upgrade lock could succeed so that lock elision could be
341 // enabled.
342 bool eligible_for_lock_upgrade_elision() const {
343 auto state = state_.load(std::memory_order_relaxed);
344 return (state & (kHasE | kHasU)) == 0;
345 }
346
347 // Checks if a shared lock could succeed so that lock elision could be
348 // enabled.
349 bool eligible_for_lock_shared_elision() const {
350 // No need to honor kBegunE because a transaction doesn't block anybody
351 auto state = state_.load(std::memory_order_relaxed);
352 return (state & kHasE) == 0;
353 }
354
355 void lock() {
356 WaitForever ctx;
357 (void)lockExclusiveImpl(kHasSolo, ctx);
358 annotateAcquired(annotate_rwlock_level::wrlock);
359 }
360
361 bool try_lock() {
362 WaitNever ctx;
363 auto result = lockExclusiveImpl(kHasSolo, ctx);
364 annotateTryAcquired(result, annotate_rwlock_level::wrlock);
365 return result;
366 }
367
368 template <class Rep, class Period>
369 bool try_lock_for(const std::chrono::duration<Rep, Period>& duration) {
370 WaitForDuration<Rep, Period> ctx(duration);
371 auto result = lockExclusiveImpl(kHasSolo, ctx);
372 annotateTryAcquired(result, annotate_rwlock_level::wrlock);
373 return result;
374 }
375
376 template <class Clock, class Duration>
377 bool try_lock_until(
378 const std::chrono::time_point<Clock, Duration>& absDeadline) {
379 WaitUntilDeadline<Clock, Duration> ctx{absDeadline};
380 auto result = lockExclusiveImpl(kHasSolo, ctx);
381 annotateTryAcquired(result, annotate_rwlock_level::wrlock);
382 return result;
383 }
384
385 void unlock() {
386 annotateReleased(annotate_rwlock_level::wrlock);
387 // It is possible that we have a left-over kWaitingNotS if the last
388 // unlock_shared() that let our matching lock() complete finished
389 // releasing before lock()'s futexWait went to sleep. Clean it up now
390 auto state = (state_ &= ~(kWaitingNotS | kPrevDefer | kHasE));
391 assert((state & ~(kWaitingAny | kAnnotationCreated)) == 0);
392 wakeRegisteredWaiters(state, kWaitingE | kWaitingU | kWaitingS);
393 }
394
395 // Managing the token yourself makes unlock_shared a bit faster
396
397 void lock_shared() {
398 WaitForever ctx;
399 (void)lockSharedImpl(nullptr, ctx);
400 annotateAcquired(annotate_rwlock_level::rdlock);
401 }
402
403 void lock_shared(Token& token) {
404 WaitForever ctx;
405 (void)lockSharedImpl(&token, ctx);
406 annotateAcquired(annotate_rwlock_level::rdlock);
407 }
408
409 bool try_lock_shared() {
410 WaitNever ctx;
411 auto result = lockSharedImpl(nullptr, ctx);
412 annotateTryAcquired(result, annotate_rwlock_level::rdlock);
413 return result;
414 }
415
416 bool try_lock_shared(Token& token) {
417 WaitNever ctx;
418 auto result = lockSharedImpl(&token, ctx);
419 annotateTryAcquired(result, annotate_rwlock_level::rdlock);
420 return result;
421 }
422
423 template <class Rep, class Period>
424 bool try_lock_shared_for(const std::chrono::duration<Rep, Period>& duration) {
425 WaitForDuration<Rep, Period> ctx(duration);
426 auto result = lockSharedImpl(nullptr, ctx);
427 annotateTryAcquired(result, annotate_rwlock_level::rdlock);
428 return result;
429 }
430
431 template <class Rep, class Period>
432 bool try_lock_shared_for(
433 const std::chrono::duration<Rep, Period>& duration,
434 Token& token) {
435 WaitForDuration<Rep, Period> ctx(duration);
436 auto result = lockSharedImpl(&token, ctx);
437 annotateTryAcquired(result, annotate_rwlock_level::rdlock);
438 return result;
439 }
440
441 template <class Clock, class Duration>
442 bool try_lock_shared_until(
443 const std::chrono::time_point<Clock, Duration>& absDeadline) {
444 WaitUntilDeadline<Clock, Duration> ctx{absDeadline};
445 auto result = lockSharedImpl(nullptr, ctx);
446 annotateTryAcquired(result, annotate_rwlock_level::rdlock);
447 return result;
448 }
449
450 template <class Clock, class Duration>
451 bool try_lock_shared_until(
452 const std::chrono::time_point<Clock, Duration>& absDeadline,
453 Token& token) {
454 WaitUntilDeadline<Clock, Duration> ctx{absDeadline};
455 auto result = lockSharedImpl(&token, ctx);
456 annotateTryAcquired(result, annotate_rwlock_level::rdlock);
457 return result;
458 }
459
460 void unlock_shared() {
461 annotateReleased(annotate_rwlock_level::rdlock);
462
463 auto state = state_.load(std::memory_order_acquire);
464
    // kPrevDefer can only be set if kHasE or kBegunE is set
466 assert((state & (kPrevDefer | kHasE | kBegunE)) != kPrevDefer);
467
468 // lock() strips kMayDefer immediately, but then copies it to
469 // kPrevDefer so we can tell if the pre-lock() lock_shared() might
470 // have deferred
471 if ((state & (kMayDefer | kPrevDefer)) == 0 ||
472 !tryUnlockTokenlessSharedDeferred()) {
473 // Matching lock_shared() couldn't have deferred, or the deferred
474 // lock has already been inlined by applyDeferredReaders()
475 unlockSharedInline();
476 }
477 }
478
479 void unlock_shared(Token& token) {
480 annotateReleased(annotate_rwlock_level::rdlock);
481
482 assert(
483 token.type_ == Token::Type::INLINE_SHARED ||
484 token.type_ == Token::Type::DEFERRED_SHARED);
485
486 if (token.type_ != Token::Type::DEFERRED_SHARED ||
487 !tryUnlockSharedDeferred(token.slot_)) {
488 unlockSharedInline();
489 }
490 if (folly::kIsDebug) {
491 token.type_ = Token::Type::INVALID;
492 }
493 }
494
495 void unlock_and_lock_shared() {
496 annotateReleased(annotate_rwlock_level::wrlock);
497 annotateAcquired(annotate_rwlock_level::rdlock);
498 // We can't use state_ -=, because we need to clear 2 bits (1 of which
499 // has an uncertain initial state) and set 1 other. We might as well
500 // clear the relevant wake bits at the same time. Note that since S
501 // doesn't block the beginning of a transition to E (writer priority
502 // can cut off new S, reader priority grabs BegunE and blocks deferred
503 // S) we need to wake E as well.
504 auto state = state_.load(std::memory_order_acquire);
505 do {
506 assert(
507 (state & ~(kWaitingAny | kPrevDefer | kAnnotationCreated)) == kHasE);
508 } while (!state_.compare_exchange_strong(
509 state, (state & ~(kWaitingAny | kPrevDefer | kHasE)) + kIncrHasS));
510 if ((state & (kWaitingE | kWaitingU | kWaitingS)) != 0) {
511 futexWakeAll(kWaitingE | kWaitingU | kWaitingS);
512 }
513 }
514
515 void unlock_and_lock_shared(Token& token) {
516 unlock_and_lock_shared();
517 token.type_ = Token::Type::INLINE_SHARED;
518 }
519
520 void lock_upgrade() {
521 WaitForever ctx;
522 (void)lockUpgradeImpl(ctx);
523 // For TSAN: treat upgrade locks as equivalent to read locks
524 annotateAcquired(annotate_rwlock_level::rdlock);
525 }
526
527 bool try_lock_upgrade() {
528 WaitNever ctx;
529 auto result = lockUpgradeImpl(ctx);
530 annotateTryAcquired(result, annotate_rwlock_level::rdlock);
531 return result;
532 }
533
534 template <class Rep, class Period>
535 bool try_lock_upgrade_for(
536 const std::chrono::duration<Rep, Period>& duration) {
537 WaitForDuration<Rep, Period> ctx(duration);
538 auto result = lockUpgradeImpl(ctx);
539 annotateTryAcquired(result, annotate_rwlock_level::rdlock);
540 return result;
541 }
542
543 template <class Clock, class Duration>
544 bool try_lock_upgrade_until(
545 const std::chrono::time_point<Clock, Duration>& absDeadline) {
546 WaitUntilDeadline<Clock, Duration> ctx{absDeadline};
547 auto result = lockUpgradeImpl(ctx);
548 annotateTryAcquired(result, annotate_rwlock_level::rdlock);
549 return result;
550 }
551
552 void unlock_upgrade() {
553 annotateReleased(annotate_rwlock_level::rdlock);
554 auto state = (state_ -= kHasU);
555 assert((state & (kWaitingNotS | kHasSolo)) == 0);
556 wakeRegisteredWaiters(state, kWaitingE | kWaitingU);
557 }
558
559 void unlock_upgrade_and_lock() {
560 // no waiting necessary, so waitMask is empty
561 WaitForever ctx;
562 (void)lockExclusiveImpl(0, ctx);
563 annotateReleased(annotate_rwlock_level::rdlock);
564 annotateAcquired(annotate_rwlock_level::wrlock);
565 }
566
567 void unlock_upgrade_and_lock_shared() {
568 // No need to annotate for TSAN here because we model upgrade and shared
569 // locks as the same.
570 auto state = (state_ -= kHasU - kIncrHasS);
571 assert((state & (kWaitingNotS | kHasSolo)) == 0);
572 wakeRegisteredWaiters(state, kWaitingE | kWaitingU);
573 }
574
575 void unlock_upgrade_and_lock_shared(Token& token) {
576 unlock_upgrade_and_lock_shared();
577 token.type_ = Token::Type::INLINE_SHARED;
578 }
579
580 void unlock_and_lock_upgrade() {
581 annotateReleased(annotate_rwlock_level::wrlock);
582 annotateAcquired(annotate_rwlock_level::rdlock);
583 // We can't use state_ -=, because we need to clear 2 bits (1 of
584 // which has an uncertain initial state) and set 1 other. We might
585 // as well clear the relevant wake bits at the same time.
586 auto state = state_.load(std::memory_order_acquire);
587 while (true) {
588 assert(
589 (state & ~(kWaitingAny | kPrevDefer | kAnnotationCreated)) == kHasE);
590 auto after =
591 (state & ~(kWaitingNotS | kWaitingS | kPrevDefer | kHasE)) + kHasU;
592 if (state_.compare_exchange_strong(state, after)) {
593 if ((state & kWaitingS) != 0) {
594 futexWakeAll(kWaitingS);
595 }
596 return;
597 }
598 }
599 }
600
601 private:
602 typedef typename folly::detail::Futex<Atom> Futex;
603
604 // Internally we use four kinds of wait contexts. These are structs
605 // that provide a doWait method that returns true if a futex wake
606 // was issued that intersects with the waitMask, false if there was a
607 // timeout and no more waiting should be performed. Spinning occurs
608 // before the wait context is invoked.
609
610 struct WaitForever {
611 bool canBlock() {
612 return true;
613 }
614 bool canTimeOut() {
615 return false;
616 }
617 bool shouldTimeOut() {
618 return false;
619 }
620
621 bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) {
622 detail::futexWait(&futex, expected, waitMask);
623 return true;
624 }
625 };
626
627 struct WaitNever {
628 bool canBlock() {
629 return false;
630 }
631 bool canTimeOut() {
632 return true;
633 }
634 bool shouldTimeOut() {
635 return true;
636 }
637
638 bool doWait(
639 Futex& /* futex */,
640 uint32_t /* expected */,
641 uint32_t /* waitMask */) {
642 return false;
643 }
644 };
645
646 template <class Rep, class Period>
647 struct WaitForDuration {
648 std::chrono::duration<Rep, Period> duration_;
649 bool deadlineComputed_;
650 std::chrono::steady_clock::time_point deadline_;
651
652 explicit WaitForDuration(const std::chrono::duration<Rep, Period>& duration)
653 : duration_(duration), deadlineComputed_(false) {}
654
655 std::chrono::steady_clock::time_point deadline() {
656 if (!deadlineComputed_) {
657 deadline_ = std::chrono::steady_clock::now() + duration_;
658 deadlineComputed_ = true;
659 }
660 return deadline_;
661 }
662
663 bool canBlock() {
664 return duration_.count() > 0;
665 }
666 bool canTimeOut() {
667 return true;
668 }
669
670 bool shouldTimeOut() {
671 return std::chrono::steady_clock::now() > deadline();
672 }
673
674 bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) {
675 auto result =
676 detail::futexWaitUntil(&futex, expected, deadline(), waitMask);
677 return result != folly::detail::FutexResult::TIMEDOUT;
678 }
679 };
680
681 template <class Clock, class Duration>
682 struct WaitUntilDeadline {
683 std::chrono::time_point<Clock, Duration> absDeadline_;
684
685 bool canBlock() {
686 return true;
687 }
688 bool canTimeOut() {
689 return true;
690 }
691 bool shouldTimeOut() {
692 return Clock::now() > absDeadline_;
693 }
694
695 bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) {
696 auto result =
697 detail::futexWaitUntil(&futex, expected, absDeadline_, waitMask);
698 return result != folly::detail::FutexResult::TIMEDOUT;
699 }
700 };
701
702 void annotateLazyCreate() {
703 if (AnnotateForThreadSanitizer &&
704 (state_.load() & kAnnotationCreated) == 0) {
705 auto guard = detail::sharedMutexAnnotationGuard(this);
706 // check again
707 if ((state_.load() & kAnnotationCreated) == 0) {
708 state_.fetch_or(kAnnotationCreated);
709 annotate_benign_race_sized(
710 &state_, sizeof(state_), "init TSAN", __FILE__, __LINE__);
711 annotate_rwlock_create(this, __FILE__, __LINE__);
712 }
713 }
714 }
715
716 void annotateDestroy() {
717 if (AnnotateForThreadSanitizer) {
718 annotateLazyCreate();
719 annotate_rwlock_destroy(this, __FILE__, __LINE__);
720 }
721 }
722
723 void annotateAcquired(annotate_rwlock_level w) {
724 if (AnnotateForThreadSanitizer) {
725 annotateLazyCreate();
726 annotate_rwlock_acquired(this, w, __FILE__, __LINE__);
727 }
728 }
729
730 void annotateTryAcquired(bool result, annotate_rwlock_level w) {
731 if (AnnotateForThreadSanitizer) {
732 annotateLazyCreate();
733 annotate_rwlock_try_acquired(this, w, result, __FILE__, __LINE__);
734 }
735 }
736
737 void annotateReleased(annotate_rwlock_level w) {
738 if (AnnotateForThreadSanitizer) {
739 assert((state_.load() & kAnnotationCreated) != 0);
740 annotate_rwlock_released(this, w, __FILE__, __LINE__);
741 }
742 }
743
744 // 32 bits of state
745 Futex state_{};
746
747 // S count needs to be on the end, because we explicitly allow it to
748 // underflow. This can occur while we are in the middle of applying
749 // deferred locks (we remove them from deferredReaders[] before
750 // inlining them), or during token-less unlock_shared() if a racing
751 // lock_shared();unlock_shared() moves the deferredReaders slot while
752 // the first unlock_shared() is scanning. The former case is cleaned
753 // up before we finish applying the locks. The latter case can persist
754 // until destruction, when it is cleaned up.
755 static constexpr uint32_t kIncrHasS = 1 << 11;
756 static constexpr uint32_t kHasS = ~(kIncrHasS - 1);
757
758 // Set if annotation has been completed for this instance. That annotation
759 // (and setting this bit afterward) must be guarded by one of the mutexes in
760 // annotationCreationGuards.
761 static constexpr uint32_t kAnnotationCreated = 1 << 10;
762
763 // If false, then there are definitely no deferred read locks for this
764 // instance. Cleared after initialization and when exclusively locked.
765 static constexpr uint32_t kMayDefer = 1 << 9;
766
  // lock() clears kMayDefer as soon as it starts draining readers (so
768 // that it doesn't have to do a second CAS once drain completes), but
769 // unlock_shared() still needs to know whether to scan deferredReaders[]
770 // or not. We copy kMayDefer to kPrevDefer when setting kHasE or
771 // kBegunE, and clear it when clearing those bits.
772 static constexpr uint32_t kPrevDefer = 1 << 8;
773
774 // Exclusive-locked blocks all read locks and write locks. This bit
775 // may be set before all readers have finished, but in that case the
776 // thread that sets it won't return to the caller until all read locks
777 // have been released.
778 static constexpr uint32_t kHasE = 1 << 7;
779
780 // Exclusive-draining means that lock() is waiting for existing readers
781 // to leave, but that new readers may still acquire shared access.
782 // This is only used in reader priority mode. New readers during
783 // drain must be inline. The difference between this and kHasU is that
784 // kBegunE prevents kMayDefer from being set.
785 static constexpr uint32_t kBegunE = 1 << 6;
786
787 // At most one thread may have either exclusive or upgrade lock
788 // ownership. Unlike exclusive mode, ownership of the lock in upgrade
789 // mode doesn't preclude other threads holding the lock in shared mode.
790 // boost's concept for this doesn't explicitly say whether new shared
  // locks can be acquired once lock_upgrade has succeeded, but doesn't
792 // list that as disallowed. RWSpinLock disallows new read locks after
793 // lock_upgrade has been acquired, but the boost implementation doesn't.
794 // We choose the latter.
795 static constexpr uint32_t kHasU = 1 << 5;
796
797 // There are three states that we consider to be "solo", in that they
798 // cannot coexist with other solo states. These are kHasE, kBegunE,
799 // and kHasU. Note that S doesn't conflict with any of these, because
  // setting kHasE is only one of the two steps needed to actually
801 // acquire the lock in exclusive mode (the other is draining the existing
802 // S holders).
803 static constexpr uint32_t kHasSolo = kHasE | kBegunE | kHasU;
804
805 // Once a thread sets kHasE it needs to wait for the current readers
806 // to exit the lock. We give this a separate wait identity from the
807 // waiting to set kHasE so that we can perform partial wakeups (wake
808 // one instead of wake all).
809 static constexpr uint32_t kWaitingNotS = 1 << 4;
810
811 // When waking writers we can either wake them all, in which case we
812 // can clear kWaitingE, or we can call futexWake(1). futexWake tells
813 // us if anybody woke up, but even if we detect that nobody woke up we
814 // can't clear the bit after the fact without issuing another wakeup.
815 // To avoid thundering herds when there are lots of pending lock()
816 // without needing to call futexWake twice when there is only one
817 // waiter, kWaitingE actually encodes if we have observed multiple
818 // concurrent waiters. Tricky: ABA issues on futexWait mean that when
819 // we see kWaitingESingle we can't assume that there is only one.
820 static constexpr uint32_t kWaitingESingle = 1 << 2;
821 static constexpr uint32_t kWaitingEMultiple = 1 << 3;
822 static constexpr uint32_t kWaitingE = kWaitingESingle | kWaitingEMultiple;
823
824 // kWaitingU is essentially a 1 bit saturating counter. It always
825 // requires a wakeAll.
826 static constexpr uint32_t kWaitingU = 1 << 1;
827
828 // All blocked lock_shared() should be awoken, so it is correct (not
829 // suboptimal) to wakeAll if there are any shared readers.
830 static constexpr uint32_t kWaitingS = 1 << 0;
831
832 // kWaitingAny is a mask of all of the bits that record the state of
833 // threads, rather than the state of the lock. It is convenient to be
834 // able to mask them off during asserts.
835 static constexpr uint32_t kWaitingAny =
836 kWaitingNotS | kWaitingE | kWaitingU | kWaitingS;
837
838 // The reader count at which a reader will attempt to use the lock
839 // in deferred mode. If this value is 2, then the second concurrent
840 // reader will set kMayDefer and use deferredReaders[]. kMayDefer is
841 // cleared during exclusive access, so this threshold must be reached
842 // each time a lock is held in exclusive mode.
843 static constexpr uint32_t kNumSharedToStartDeferring = 2;
844
845 // The typical number of spins that a thread will wait for a state
846 // transition. There is no bound on the number of threads that can wait
847 // for a writer, so we are pretty conservative here to limit the chance
848 // that we are starving the writer of CPU. Each spin is 6 or 7 nanos,
849 // almost all of which is in the pause instruction.
850 static constexpr uint32_t kMaxSpinCount = !BlockImmediately ? 1000 : 2;
851
852 // The maximum number of soft yields before falling back to futex.
853 // If the preemption heuristic is activated we will fall back before
854 // this. A soft yield takes ~900 nanos (two sched_yield plus a call
855 // to getrusage, with checks of the goal at each step). Soft yields
856 // aren't compatible with deterministic execution under test (unlike
857 // futexWaitUntil, which has a capricious but deterministic back end).
858 static constexpr uint32_t kMaxSoftYieldCount = !BlockImmediately ? 1000 : 0;
859
860 // If AccessSpreader assigns indexes from 0..k*n-1 on a system where some
861 // level of the memory hierarchy is symmetrically divided into k pieces
862 // (NUMA nodes, last-level caches, L1 caches, ...), then slot indexes
863 // that are the same after integer division by k share that resource.
864 // Our strategy for deferred readers is to probe up to numSlots/4 slots,
865 // using the full granularity of AccessSpreader for the start slot
866 // and then search outward. We can use AccessSpreader::current(n)
867 // without managing our own spreader if kMaxDeferredReaders <=
868 // AccessSpreader::kMaxCpus, which is currently 128.
869 //
870 // Our 2-socket E5-2660 machines have 8 L1 caches on each chip,
871 // with 64 byte cache lines. That means we need 64*16 bytes of
872 // deferredReaders[] to give each L1 its own playground. On x86_64
873 // each DeferredReaderSlot is 8 bytes, so we need kMaxDeferredReaders
874 // * kDeferredSeparationFactor >= 64 * 16 / 8 == 128. If
875 // kDeferredSearchDistance * kDeferredSeparationFactor <=
876 // 64 / 8 then we will search only within a single cache line, which
877 // guarantees we won't have inter-L1 contention. We give ourselves
878 // a factor of 2 on the core count, which should hold us for a couple
879 // processor generations. deferredReaders[] is 2048 bytes currently.
880 public:
881 static constexpr uint32_t kMaxDeferredReaders = 64;
882 static constexpr uint32_t kDeferredSearchDistance = 2;
883 static constexpr uint32_t kDeferredSeparationFactor = 4;
884
885 private:
886 static_assert(
887 !(kMaxDeferredReaders & (kMaxDeferredReaders - 1)),
888 "kMaxDeferredReaders must be a power of 2");
889 static_assert(
890 !(kDeferredSearchDistance & (kDeferredSearchDistance - 1)),
891 "kDeferredSearchDistance must be a power of 2");
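
  // An illustrative compile-time check of the sizing arithmetic in the
  // comment above, under its assumption of 16 L1 caches with 64-byte cache
  // lines and 8-byte slots: we need at least 64 * 16 / 8 == 128 slots.
  static_assert(
      kMaxDeferredReaders * kDeferredSeparationFactor >= 64 * 16 / 8,
      "deferredReaders[] should provide at least 64 * 16 / 8 slots");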
892
893 // The number of deferred locks that can be simultaneously acquired
894 // by a thread via the token-less methods without performing any heap
895 // allocations. Each of these costs 3 pointers (24 bytes, probably)
896 // per thread. There's not much point in making this larger than
897 // kDeferredSearchDistance.
898 static constexpr uint32_t kTokenStackTLSCapacity = 2;
899
900 // We need to make sure that if there is a lock_shared()
901 // and lock_shared(token) followed by unlock_shared() and
902 // unlock_shared(token), the token-less unlock doesn't null
903 // out deferredReaders[token.slot_]. If we allowed that, then
904 // unlock_shared(token) wouldn't be able to assume that its lock
905 // had been inlined by applyDeferredReaders when it finds that
906 // deferredReaders[token.slot_] no longer points to this. We accomplish
907 // this by stealing bit 0 from the pointer to record that the slot's
908 // element has no token, hence our use of uintptr_t in deferredReaders[].
909 static constexpr uintptr_t kTokenless = 0x1;
910
911 // This is the starting location for Token-less unlock_shared().
912 static FOLLY_SHAREDMUTEX_TLS uint32_t tls_lastTokenlessSlot;
913
914 // Last deferred reader slot used.
915 static FOLLY_SHAREDMUTEX_TLS uint32_t tls_lastDeferredReaderSlot;
916
917 // Only indexes divisible by kDeferredSeparationFactor are used.
918 // If any of those elements points to a SharedMutexImpl, then it
919 // should be considered that there is a shared lock on that instance.
920 // See kTokenless.
921 public:
922 typedef Atom<uintptr_t> DeferredReaderSlot;
923
924 private:
925 alignas(hardware_destructive_interference_size) static DeferredReaderSlot
926 deferredReaders[kMaxDeferredReaders * kDeferredSeparationFactor];
927
  // Performs an exclusive lock, waiting for state_ & preconditionGoalMask
  // to be zero first
930 template <class WaitContext>
931 bool lockExclusiveImpl(uint32_t preconditionGoalMask, WaitContext& ctx) {
932 uint32_t state = state_.load(std::memory_order_acquire);
933 if (LIKELY(
934 (state & (preconditionGoalMask | kMayDefer | kHasS)) == 0 &&
935 state_.compare_exchange_strong(state, (state | kHasE) & ~kHasU))) {
936 return true;
937 } else {
938 return lockExclusiveImpl(state, preconditionGoalMask, ctx);
939 }
940 }
941
942 template <class WaitContext>
943 bool lockExclusiveImpl(
944 uint32_t& state,
945 uint32_t preconditionGoalMask,
946 WaitContext& ctx) {
947 while (true) {
948 if (UNLIKELY((state & preconditionGoalMask) != 0) &&
949 !waitForZeroBits(state, preconditionGoalMask, kWaitingE, ctx) &&
950 ctx.canTimeOut()) {
951 return false;
952 }
953
954 uint32_t after = (state & kMayDefer) == 0 ? 0 : kPrevDefer;
955 if (!kReaderPriority || (state & (kMayDefer | kHasS)) == 0) {
956 // Block readers immediately, either because we are in write
957 // priority mode or because we can acquire the lock in one
958 // step. Note that if state has kHasU, then we are doing an
959 // unlock_upgrade_and_lock() and we should clear it (reader
960 // priority branch also does this).
961 after |= (state | kHasE) & ~(kHasU | kMayDefer);
962 } else {
963 after |= (state | kBegunE) & ~(kHasU | kMayDefer);
964 }
965 if (state_.compare_exchange_strong(state, after)) {
966 auto before = state;
967 state = after;
968
969 // If we set kHasE (writer priority) then no new readers can
970 // arrive. If we set kBegunE then they can still enter, but
971 // they must be inline. Either way we need to either spin on
972 // deferredReaders[] slots, or inline them so that we can wait on
973 // kHasS to zero itself. deferredReaders[] is pointers, which on
974 // x86_64 are bigger than futex() can handle, so we inline the
975 // deferred locks instead of trying to futexWait on each slot.
976 // Readers are responsible for rechecking state_ after recording
977 // a deferred read to avoid atomicity problems between the state_
978 // CAS and applyDeferredReader's reads of deferredReaders[].
979 if (UNLIKELY((before & kMayDefer) != 0)) {
980 applyDeferredReaders(state, ctx);
981 }
982 while (true) {
983 assert((state & (kHasE | kBegunE)) != 0 && (state & kHasU) == 0);
984 if (UNLIKELY((state & kHasS) != 0) &&
985 !waitForZeroBits(state, kHasS, kWaitingNotS, ctx) &&
986 ctx.canTimeOut()) {
987 // Ugh. We blocked new readers and other writers for a while,
988 // but were unable to complete. Move on. On the plus side
989 // we can clear kWaitingNotS because nobody else can piggyback
990 // on it.
991 state = (state_ &= ~(kPrevDefer | kHasE | kBegunE | kWaitingNotS));
992 wakeRegisteredWaiters(state, kWaitingE | kWaitingU | kWaitingS);
993 return false;
994 }
995
996 if (kReaderPriority && (state & kHasE) == 0) {
997 assert((state & kBegunE) != 0);
998 if (!state_.compare_exchange_strong(
999 state, (state & ~kBegunE) | kHasE)) {
1000 continue;
1001 }
1002 }
1003
1004 return true;
1005 }
1006 }
1007 }
1008 }
1009
1010 template <class WaitContext>
1011 bool waitForZeroBits(
1012 uint32_t& state,
1013 uint32_t goal,
1014 uint32_t waitMask,
1015 WaitContext& ctx) {
1016 uint32_t spinCount = 0;
1017 while (true) {
1018 state = state_.load(std::memory_order_acquire);
1019 if ((state & goal) == 0) {
1020 return true;
1021 }
1022 asm_volatile_pause();
1023 ++spinCount;
1024 if (UNLIKELY(spinCount >= kMaxSpinCount)) {
1025 return ctx.canBlock() &&
1026 yieldWaitForZeroBits(state, goal, waitMask, ctx);
1027 }
1028 }
1029 }
1030
1031 template <class WaitContext>
1032 bool yieldWaitForZeroBits(
1033 uint32_t& state,
1034 uint32_t goal,
1035 uint32_t waitMask,
1036 WaitContext& ctx) {
1037#ifdef RUSAGE_THREAD
1038 struct rusage usage;
1039 std::memset(&usage, 0, sizeof(usage));
1040 long before = -1;
1041#endif
1042 for (uint32_t yieldCount = 0; yieldCount < kMaxSoftYieldCount;
1043 ++yieldCount) {
1044 for (int softState = 0; softState < 3; ++softState) {
1045 if (softState < 2) {
1046 std::this_thread::yield();
1047 } else {
1048#ifdef RUSAGE_THREAD
1049 getrusage(RUSAGE_THREAD, &usage);
1050#endif
1051 }
1052 if (((state = state_.load(std::memory_order_acquire)) & goal) == 0) {
1053 return true;
1054 }
1055 if (ctx.shouldTimeOut()) {
1056 return false;
1057 }
1058 }
1059#ifdef RUSAGE_THREAD
1060 if (before >= 0 && usage.ru_nivcsw >= before + 2) {
1061 // One involuntary csw might just be occasional background work,
1062 // but if we get two in a row then we guess that there is someone
1063 // else who can profitably use this CPU. Fall back to futex
1064 break;
1065 }
1066 before = usage.ru_nivcsw;
1067#endif
1068 }
1069 return futexWaitForZeroBits(state, goal, waitMask, ctx);
1070 }
1071
1072 template <class WaitContext>
1073 bool futexWaitForZeroBits(
1074 uint32_t& state,
1075 uint32_t goal,
1076 uint32_t waitMask,
1077 WaitContext& ctx) {
1078 assert(
1079 waitMask == kWaitingNotS || waitMask == kWaitingE ||
1080 waitMask == kWaitingU || waitMask == kWaitingS);
1081
1082 while (true) {
1083 state = state_.load(std::memory_order_acquire);
1084 if ((state & goal) == 0) {
1085 return true;
1086 }
1087
1088 auto after = state;
1089 if (waitMask == kWaitingE) {
1090 if ((state & kWaitingESingle) != 0) {
1091 after |= kWaitingEMultiple;
1092 } else {
1093 after |= kWaitingESingle;
1094 }
1095 } else {
1096 after |= waitMask;
1097 }
1098
1099 // CAS is better than atomic |= here, because it lets us avoid
1100 // setting the wait flag when the goal is concurrently achieved
1101 if (after != state && !state_.compare_exchange_strong(state, after)) {
1102 continue;
1103 }
1104
1105 if (!ctx.doWait(state_, after, waitMask)) {
1106 // timed out
1107 return false;
1108 }
1109 }
1110 }
1111
1112 // Wakes up waiters registered in state_ as appropriate, clearing the
1113 // awaiting bits for anybody that was awoken. Tries to perform direct
1114 // single wakeup of an exclusive waiter if appropriate
1115 void wakeRegisteredWaiters(uint32_t& state, uint32_t wakeMask) {
1116 if (UNLIKELY((state & wakeMask) != 0)) {
1117 wakeRegisteredWaitersImpl(state, wakeMask);
1118 }
1119 }
1120
1121 void wakeRegisteredWaitersImpl(uint32_t& state, uint32_t wakeMask) {
1122 // If there are multiple lock() pending only one of them will actually
1123 // get to wake up, so issuing futexWakeAll will make a thundering herd.
1124 // There's nothing stopping us from issuing futexWake(1) instead,
1125 // so long as the wait bits are still an accurate reflection of
1126 // the waiters. If we notice (via futexWake's return value) that
1127 // nobody woke up then we can try again with the normal wake-all path.
1128 // Note that we can't just clear the bits at that point; we need to
1129 // clear the bits and then issue another wakeup.
1130 //
1131 // It is possible that we wake an E waiter but an outside S grabs the
1132 // lock instead, at which point we should wake pending U and S waiters.
1133 // Rather than tracking state to make the failing E regenerate the
1134 // wakeup, we just disable the optimization in the case that there
1135 // are waiting U or S that we are eligible to wake.
1136 if ((wakeMask & kWaitingE) == kWaitingE &&
1137 (state & wakeMask) == kWaitingE &&
1138 detail::futexWake(&state_, 1, kWaitingE) > 0) {
1139 // somebody woke up, so leave state_ as is and clear it later
1140 return;
1141 }
1142
1143 if ((state & wakeMask) != 0) {
1144 auto prev = state_.fetch_and(~wakeMask);
1145 if ((prev & wakeMask) != 0) {
1146 futexWakeAll(wakeMask);
1147 }
1148 state = prev & ~wakeMask;
1149 }
1150 }
1151
1152 void futexWakeAll(uint32_t wakeMask) {
1153 detail::futexWake(&state_, std::numeric_limits<int>::max(), wakeMask);
1154 }
1155
1156 DeferredReaderSlot* deferredReader(uint32_t slot) {
1157 return &deferredReaders[slot * kDeferredSeparationFactor];
1158 }
1159
1160 uintptr_t tokenfulSlotValue() {
1161 return reinterpret_cast<uintptr_t>(this);
1162 }
1163
1164 uintptr_t tokenlessSlotValue() {
1165 return tokenfulSlotValue() | kTokenless;
1166 }
1167
1168 bool slotValueIsThis(uintptr_t slotValue) {
1169 return (slotValue & ~kTokenless) == tokenfulSlotValue();
1170 }
1171
1172 // Clears any deferredReaders[] that point to this, adjusting the inline
1173 // shared lock count to compensate. Does some spinning and yielding
1174 // to avoid the work. Always finishes the application, even if ctx
1175 // times out.
1176 template <class WaitContext>
1177 void applyDeferredReaders(uint32_t& state, WaitContext& ctx) {
1178 uint32_t slot = 0;
1179
1180 uint32_t spinCount = 0;
1181 while (true) {
1182 while (!slotValueIsThis(
1183 deferredReader(slot)->load(std::memory_order_acquire))) {
1184 if (++slot == kMaxDeferredReaders) {
1185 return;
1186 }
1187 }
1188 asm_volatile_pause();
1189 if (UNLIKELY(++spinCount >= kMaxSpinCount)) {
1190 applyDeferredReaders(state, ctx, slot);
1191 return;
1192 }
1193 }
1194 }
1195
1196 template <class WaitContext>
1197 void applyDeferredReaders(uint32_t& state, WaitContext& ctx, uint32_t slot) {
1198#ifdef RUSAGE_THREAD
1199 struct rusage usage;
1200 std::memset(&usage, 0, sizeof(usage));
1201 long before = -1;
1202#endif
1203 for (uint32_t yieldCount = 0; yieldCount < kMaxSoftYieldCount;
1204 ++yieldCount) {
1205 for (int softState = 0; softState < 3; ++softState) {
1206 if (softState < 2) {
1207 std::this_thread::yield();
1208 } else {
1209#ifdef RUSAGE_THREAD
1210 getrusage(RUSAGE_THREAD, &usage);
1211#endif
1212 }
1213 while (!slotValueIsThis(
1214 deferredReader(slot)->load(std::memory_order_acquire))) {
1215 if (++slot == kMaxDeferredReaders) {
1216 return;
1217 }
1218 }
1219 if (ctx.shouldTimeOut()) {
1220 // finish applying immediately on timeout
1221 break;
1222 }
1223 }
1224#ifdef RUSAGE_THREAD
1225 if (before >= 0 && usage.ru_nivcsw >= before + 2) {
1226 // heuristic says run queue is not empty
1227 break;
1228 }
1229 before = usage.ru_nivcsw;
1230#endif
1231 }
1232
1233 uint32_t movedSlotCount = 0;
1234 for (; slot < kMaxDeferredReaders; ++slot) {
1235 auto slotPtr = deferredReader(slot);
1236 auto slotValue = slotPtr->load(std::memory_order_acquire);
1237 if (slotValueIsThis(slotValue) &&
1238 slotPtr->compare_exchange_strong(slotValue, 0)) {
1239 ++movedSlotCount;
1240 }
1241 }
1242
1243 if (movedSlotCount > 0) {
1244 state = (state_ += movedSlotCount * kIncrHasS);
1245 }
1246 assert((state & (kHasE | kBegunE)) != 0);
1247
1248 // if state + kIncrHasS overflows (off the end of state) then either
    // we have 2^(32-11) readers (almost certainly an application bug)
1250 // or we had an underflow (also a bug)
1251 assert(state < state + kIncrHasS);
1252 }
1253
  // It is straightforward to make a token-less lock_shared() and
1255 // unlock_shared() either by making the token-less version always use
1256 // INLINE_SHARED mode or by removing the token version. Supporting
1257 // deferred operation for both types is trickier than it appears, because
  // the purpose of the token is so that unlock_shared doesn't have to
1259 // look in other slots for its deferred lock. Token-less unlock_shared
1260 // might place a deferred lock in one place and then release a different
1261 // slot that was originally used by the token-ful version. If this was
1262 // important we could solve the problem by differentiating the deferred
1263 // locks so that cross-variety release wouldn't occur. The best way
1264 // is probably to steal a bit from the pointer, making deferredLocks[]
1265 // an array of Atom<uintptr_t>.
1266
1267 template <class WaitContext>
1268 bool lockSharedImpl(Token* token, WaitContext& ctx) {
1269 uint32_t state = state_.load(std::memory_order_relaxed);
1270 if ((state & (kHasS | kMayDefer | kHasE)) == 0 &&
1271 state_.compare_exchange_strong(state, state + kIncrHasS)) {
1272 if (token != nullptr) {
1273 token->type_ = Token::Type::INLINE_SHARED;
1274 }
1275 return true;
1276 }
1277 return lockSharedImpl(state, token, ctx);
1278 }
1279
1280 template <class WaitContext>
1281 bool lockSharedImpl(uint32_t& state, Token* token, WaitContext& ctx);
1282
1283 // Updates the state in/out argument as if the locks were made inline,
1284 // but does not update state_
1285 void cleanupTokenlessSharedDeferred(uint32_t& state) {
1286 for (uint32_t i = 0; i < kMaxDeferredReaders; ++i) {
1287 auto slotPtr = deferredReader(i);
1288 auto slotValue = slotPtr->load(std::memory_order_relaxed);
1289 if (slotValue == tokenlessSlotValue()) {
1290 slotPtr->store(0, std::memory_order_relaxed);
1291 state += kIncrHasS;
1292 if ((state & kHasS) == 0) {
1293 break;
1294 }
1295 }
1296 }
1297 }
1298
1299 bool tryUnlockTokenlessSharedDeferred();
1300
1301 bool tryUnlockSharedDeferred(uint32_t slot) {
1302 assert(slot < kMaxDeferredReaders);
1303 auto slotValue = tokenfulSlotValue();
1304 return deferredReader(slot)->compare_exchange_strong(slotValue, 0);
1305 }
1306
1307 uint32_t unlockSharedInline() {
1308 uint32_t state = (state_ -= kIncrHasS);
1309 assert(
1310 (state & (kHasE | kBegunE | kMayDefer)) != 0 ||
1311 state < state + kIncrHasS);
1312 if ((state & kHasS) == 0) {
1313 // Only the second half of lock() can be blocked by a non-zero
1314 // reader count, so that's the only thing we need to wake
1315 wakeRegisteredWaiters(state, kWaitingNotS);
1316 }
1317 return state;
1318 }
1319
1320 template <class WaitContext>
1321 bool lockUpgradeImpl(WaitContext& ctx) {
1322 uint32_t state;
1323 do {
1324 if (!waitForZeroBits(state, kHasSolo, kWaitingU, ctx)) {
1325 return false;
1326 }
1327 } while (!state_.compare_exchange_strong(state, state | kHasU));
1328 return true;
1329 }
1330
1331 public:
1332 class FOLLY_NODISCARD ReadHolder {
1333 ReadHolder() : lock_(nullptr) {}
1334
1335 public:
1336 explicit ReadHolder(const SharedMutexImpl* lock)
1337 : lock_(const_cast<SharedMutexImpl*>(lock)) {
1338 if (lock_) {
1339 lock_->lock_shared(token_);
1340 }
1341 }
1342
1343 explicit ReadHolder(const SharedMutexImpl& lock)
1344 : lock_(const_cast<SharedMutexImpl*>(&lock)) {
1345 lock_->lock_shared(token_);
1346 }
1347
1348 ReadHolder(ReadHolder&& rhs) noexcept
1349 : lock_(rhs.lock_), token_(rhs.token_) {
1350 rhs.lock_ = nullptr;
1351 }
1352
1353 // Downgrade from upgrade mode
1354 explicit ReadHolder(UpgradeHolder&& upgraded) : lock_(upgraded.lock_) {
1355 assert(upgraded.lock_ != nullptr);
1356 upgraded.lock_ = nullptr;
1357 lock_->unlock_upgrade_and_lock_shared(token_);
1358 }
1359
1360 // Downgrade from exclusive mode
1361 explicit ReadHolder(WriteHolder&& writer) : lock_(writer.lock_) {
1362 assert(writer.lock_ != nullptr);
1363 writer.lock_ = nullptr;
1364 lock_->unlock_and_lock_shared(token_);
1365 }
1366
1367 ReadHolder& operator=(ReadHolder&& rhs) noexcept {
1368 std::swap(lock_, rhs.lock_);
1369 std::swap(token_, rhs.token_);
1370 return *this;
1371 }
1372
1373 ReadHolder(const ReadHolder& rhs) = delete;
1374 ReadHolder& operator=(const ReadHolder& rhs) = delete;
1375
1376 ~ReadHolder() {
1377 unlock();
1378 }
1379
1380 void unlock() {
1381 if (lock_) {
1382 lock_->unlock_shared(token_);
1383 lock_ = nullptr;
1384 }
1385 }
1386
1387 private:
1388 friend class UpgradeHolder;
1389 friend class WriteHolder;
1390 SharedMutexImpl* lock_;
1391 SharedMutexToken token_;
1392 };
1393
1394 class FOLLY_NODISCARD UpgradeHolder {
1395 UpgradeHolder() : lock_(nullptr) {}
1396
1397 public:
1398 explicit UpgradeHolder(SharedMutexImpl* lock) : lock_(lock) {
1399 if (lock_) {
1400 lock_->lock_upgrade();
1401 }
1402 }
1403
1404 explicit UpgradeHolder(SharedMutexImpl& lock) : lock_(&lock) {
1405 lock_->lock_upgrade();
1406 }
1407
1408 // Downgrade from exclusive mode
1409 explicit UpgradeHolder(WriteHolder&& writer) : lock_(writer.lock_) {
1410 assert(writer.lock_ != nullptr);
1411 writer.lock_ = nullptr;
1412 lock_->unlock_and_lock_upgrade();
1413 }
1414
1415 UpgradeHolder(UpgradeHolder&& rhs) noexcept : lock_(rhs.lock_) {
1416 rhs.lock_ = nullptr;
1417 }
1418
1419 UpgradeHolder& operator=(UpgradeHolder&& rhs) noexcept {
1420 std::swap(lock_, rhs.lock_);
1421 return *this;
1422 }
1423
1424 UpgradeHolder(const UpgradeHolder& rhs) = delete;
1425 UpgradeHolder& operator=(const UpgradeHolder& rhs) = delete;
1426
1427 ~UpgradeHolder() {
1428 unlock();
1429 }
1430
1431 void unlock() {
1432 if (lock_) {
1433 lock_->unlock_upgrade();
1434 lock_ = nullptr;
1435 }
1436 }
1437
1438 private:
1439 friend class WriteHolder;
1440 friend class ReadHolder;
1441 SharedMutexImpl* lock_;
1442 };
1443
1444 class FOLLY_NODISCARD WriteHolder {
1445 WriteHolder() : lock_(nullptr) {}
1446
1447 public:
1448 explicit WriteHolder(SharedMutexImpl* lock) : lock_(lock) {
1449 if (lock_) {
1450 lock_->lock();
1451 }
1452 }
1453
1454 explicit WriteHolder(SharedMutexImpl& lock) : lock_(&lock) {
1455 lock_->lock();
1456 }
1457
1458 // Promotion from upgrade mode
1459 explicit WriteHolder(UpgradeHolder&& upgrade) : lock_(upgrade.lock_) {
1460 assert(upgrade.lock_ != nullptr);
1461 upgrade.lock_ = nullptr;
1462 lock_->unlock_upgrade_and_lock();
1463 }
1464
1465 // README:
1466 //
    // It is intended that WriteHolder(ReadHolder&& rhs) not exist.
1468 //
    // Shared locks (read) cannot safely be upgraded to unique locks (write).
1470 // That upgrade path is a well-known recipe for deadlock, so we explicitly
1471 // disallow it.
1472 //
1473 // If you need to do a conditional mutation, you have a few options:
1474 // 1. Check the condition under a shared lock and release it.
1475 // Then maybe check the condition again under a unique lock and maybe do
1476 // the mutation.
1477 // 2. Check the condition once under an upgradeable lock.
1478 // Then maybe upgrade the lock to a unique lock and do the mutation.
1479 // 3. Check the condition and maybe perform the mutation under a unique
1480 // lock.
1481 //
1482 // Relevant upgradeable lock notes:
1483 // * At most one upgradeable lock can be held at a time for a given shared
1484 // mutex, just like a unique lock.
1485 // * An upgradeable lock may be held concurrently with any number of shared
1486 // locks.
1487 // * An upgradeable lock may be upgraded atomically to a unique lock.
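    //
    // A sketch of option 2 (illustrative only; the condition, the container,
    // and maybeRebuild() are hypothetical):
    //
    //   void maybeRebuild(folly::SharedMutex& mutex, std::vector<int>& data) {
    //     // Upgrade mode excludes other upgraders and writers but not
    //     // readers, so the check does not block the read path.
    //     folly::SharedMutex::UpgradeHolder upgrade(mutex);
    //     if (data.empty()) {
    //       // Atomically promote to exclusive mode, then mutate.
    //       folly::SharedMutex::WriteHolder writer(std::move(upgrade));
    //       data.push_back(42);
    //     }
    //   }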

    WriteHolder(WriteHolder&& rhs) noexcept : lock_(rhs.lock_) {
      rhs.lock_ = nullptr;
    }

    WriteHolder& operator=(WriteHolder&& rhs) noexcept {
      std::swap(lock_, rhs.lock_);
      return *this;
    }

    WriteHolder(const WriteHolder& rhs) = delete;
    WriteHolder& operator=(const WriteHolder& rhs) = delete;

    ~WriteHolder() {
      unlock();
    }

    void unlock() {
      if (lock_) {
        lock_->unlock();
        lock_ = nullptr;
      }
    }

   private:
    friend class ReadHolder;
    friend class UpgradeHolder;
    SharedMutexImpl* lock_;
  };

  // Adapters for Synchronized<>
  friend void acquireRead(SharedMutexImpl& lock) {
    lock.lock_shared();
  }
  friend void acquireReadWrite(SharedMutexImpl& lock) {
    lock.lock();
  }
  friend void releaseRead(SharedMutexImpl& lock) {
    lock.unlock_shared();
  }
  friend void releaseReadWrite(SharedMutexImpl& lock) {
    lock.unlock();
  }
  friend bool acquireRead(SharedMutexImpl& lock, unsigned int ms) {
    return lock.try_lock_shared_for(std::chrono::milliseconds(ms));
  }
  friend bool acquireReadWrite(SharedMutexImpl& lock, unsigned int ms) {
    return lock.try_lock_for(std::chrono::milliseconds(ms));
  }
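
  // These adapters exist so that SharedMutex can be used as the mutex type
  // of a folly::Synchronized<>.  A minimal sketch (illustrative only,
  // assuming folly/Synchronized.h is included; the data layout is
  // hypothetical):
  //
  //   folly::Synchronized<std::vector<int>, folly::SharedMutex> items;
  //
  //   items.wlock()->push_back(1);       // exclusive access
  //   size_t n = items.rlock()->size();  // shared access; concurrent
  //                                      // readers proceed in parallel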
};

typedef SharedMutexImpl<true> SharedMutexReadPriority;
typedef SharedMutexImpl<false> SharedMutexWritePriority;
typedef SharedMutexWritePriority SharedMutex;
typedef SharedMutexImpl<false, void, std::atomic, false, false>
    SharedMutexSuppressTSAN;
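
// A minimal usage sketch for the aliases above (illustrative only; most
// code can simply use folly::SharedMutex).  The standard guards shown here
// are one of several ways to hold the lock:
//
//   #include <mutex>
//   #include <shared_mutex>
//
//   folly::SharedMutex mutex;
//
//   void reader() {
//     std::shared_lock<folly::SharedMutex> guard(mutex); // lock_shared()
//     // ... read shared state ...
//   }
//
//   void writer() {
//     std::unique_lock<folly::SharedMutex> guard(mutex); // lock()
//     // ... mutate shared state ...
//   }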

// Prevent the compiler from instantiating these in other translation units.
// They are instantiated once in SharedMutex.cpp
extern template class SharedMutexImpl<true>;
extern template class SharedMutexImpl<false>;
1549
template <
    bool ReaderPriority,
    typename Tag_,
    template <typename> class Atom,
    bool BlockImmediately,
    bool AnnotateForThreadSanitizer>
alignas(hardware_destructive_interference_size) typename SharedMutexImpl<
    ReaderPriority,
    Tag_,
    Atom,
    BlockImmediately,
    AnnotateForThreadSanitizer>::DeferredReaderSlot
    SharedMutexImpl<
        ReaderPriority,
        Tag_,
        Atom,
        BlockImmediately,
        AnnotateForThreadSanitizer>::deferredReaders
        [kMaxDeferredReaders * kDeferredSeparationFactor] = {};

template <
    bool ReaderPriority,
    typename Tag_,
    template <typename> class Atom,
    bool BlockImmediately,
    bool AnnotateForThreadSanitizer>
FOLLY_SHAREDMUTEX_TLS uint32_t SharedMutexImpl<
    ReaderPriority,
    Tag_,
    Atom,
    BlockImmediately,
    AnnotateForThreadSanitizer>::tls_lastTokenlessSlot = 0;

template <
    bool ReaderPriority,
    typename Tag_,
    template <typename> class Atom,
    bool BlockImmediately,
    bool AnnotateForThreadSanitizer>
FOLLY_SHAREDMUTEX_TLS uint32_t SharedMutexImpl<
    ReaderPriority,
    Tag_,
    Atom,
    BlockImmediately,
    AnnotateForThreadSanitizer>::tls_lastDeferredReaderSlot = 0;

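// Releases one token-less deferred shared lock: scans the deferred-reader
// slots, starting from the slot this thread most recently used for a
// token-less acquisition, and clears the first slot found holding the
// token-less marker value.  Returns false if no such slot remains, in which
// case the caller releases the inline reader count instead (see the fallback
// to unlockSharedInline() in lockSharedImpl below).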
template <
    bool ReaderPriority,
    typename Tag_,
    template <typename> class Atom,
    bool BlockImmediately,
    bool AnnotateForThreadSanitizer>
bool SharedMutexImpl<
    ReaderPriority,
    Tag_,
    Atom,
    BlockImmediately,
    AnnotateForThreadSanitizer>::tryUnlockTokenlessSharedDeferred() {
  auto bestSlot = tls_lastTokenlessSlot;
  for (uint32_t i = 0; i < kMaxDeferredReaders; ++i) {
    auto slotPtr = deferredReader(bestSlot ^ i);
    auto slotValue = slotPtr->load(std::memory_order_relaxed);
    if (slotValue == tokenlessSlotValue() &&
        slotPtr->compare_exchange_strong(slotValue, 0)) {
      tls_lastTokenlessSlot = bestSlot ^ i;
      return true;
    }
  }
  return false;
}

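// Slow path for shared acquisition, used by the lock_shared() family of
// operations.  Each pass of the loop first waits (subject to the
// WaitContext's blocking and timeout policy) for the exclusive bit to
// clear, then either records the reader inline in state_ or parks it in a
// deferred-reader slot chosen via AccessSpreader, so that concurrent
// readers tend to land on different slots.  Returns false only when the
// WaitContext permits timing out and the wait fails.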
template <
    bool ReaderPriority,
    typename Tag_,
    template <typename> class Atom,
    bool BlockImmediately,
    bool AnnotateForThreadSanitizer>
template <class WaitContext>
bool SharedMutexImpl<
    ReaderPriority,
    Tag_,
    Atom,
    BlockImmediately,
    AnnotateForThreadSanitizer>::
    lockSharedImpl(uint32_t& state, Token* token, WaitContext& ctx) {
  while (true) {
    if (UNLIKELY((state & kHasE) != 0) &&
        !waitForZeroBits(state, kHasE, kWaitingS, ctx) && ctx.canTimeOut()) {
      return false;
    }

    uint32_t slot = tls_lastDeferredReaderSlot;
    uintptr_t slotValue = 1; // any non-zero value will do

    bool canAlreadyDefer = (state & kMayDefer) != 0;
    bool aboveDeferThreshold =
        (state & kHasS) >= (kNumSharedToStartDeferring - 1) * kIncrHasS;
    bool drainInProgress = ReaderPriority && (state & kBegunE) != 0;
    if (canAlreadyDefer || (aboveDeferThreshold && !drainInProgress)) {
      // Try using the most recent slot first.
      slotValue = deferredReader(slot)->load(std::memory_order_relaxed);
      if (slotValue != 0) {
        // starting point for our empty-slot search, can change after
        // calling waitForZeroBits
        uint32_t bestSlot =
            (uint32_t)folly::AccessSpreader<Atom>::current(kMaxDeferredReaders);

        // deferred readers are already enabled, or it is time to
        // enable them if we can find a slot
        for (uint32_t i = 0; i < kDeferredSearchDistance; ++i) {
          slot = bestSlot ^ i;
          assert(slot < kMaxDeferredReaders);
          slotValue = deferredReader(slot)->load(std::memory_order_relaxed);
          if (slotValue == 0) {
            // found empty slot
            tls_lastDeferredReaderSlot = slot;
            break;
          }
        }
      }
    }

    if (slotValue != 0) {
      // not yet deferred, or no empty slots
      if (state_.compare_exchange_strong(state, state + kIncrHasS)) {
        // successfully recorded the read lock inline
        if (token != nullptr) {
          token->type_ = Token::Type::INLINE_SHARED;
        }
        return true;
      }
      // state is updated, try again
      continue;
    }

    // record that deferred readers might be in use if necessary
    if ((state & kMayDefer) == 0) {
      if (!state_.compare_exchange_strong(state, state | kMayDefer)) {
        // keep going if CAS failed because somebody else set the bit
        // for us
        if ((state & (kHasE | kMayDefer)) != kMayDefer) {
          continue;
        }
      }
      // Either our CAS succeeded or somebody else set kMayDefer for us, so
      // conceptually state |= kMayDefer at this point; we skip updating the
      // local copy because state is reloaded below after the slot CAS.
    }

    // try to use the slot
    bool gotSlot = deferredReader(slot)->compare_exchange_strong(
        slotValue,
        token == nullptr ? tokenlessSlotValue() : tokenfulSlotValue());

    // If we got the slot, we need to verify that an exclusive lock
    // didn't happen since we last checked.  If we didn't get the slot we
    // need to recheck state_ anyway to make sure we don't waste too much
    // work.  It is also possible that since we checked state_ someone
    // has acquired and released the write lock, clearing kMayDefer.
    // Both cases are covered by looking for the readers-possible bit,
    // because it is off when the exclusive lock bit is set.
    state = state_.load(std::memory_order_acquire);

    if (!gotSlot) {
      continue;
    }

    if (token == nullptr) {
      tls_lastTokenlessSlot = slot;
    }

    if ((state & kMayDefer) != 0) {
      assert((state & kHasE) == 0);
      // success
      if (token != nullptr) {
        token->type_ = Token::Type::DEFERRED_SHARED;
        token->slot_ = (uint16_t)slot;
      }
      return true;
    }

    // release the slot before retrying
    if (token == nullptr) {
      // We can't rely on slot.  Token-less slot values can be freed by
      // any unlock_shared(), so we need to do the full deferredReader
      // search during unlock.  Unlike unlock_shared(), we can't trust
      // kPrevDefer here.  This deferred lock isn't visible to lock()
      // (that's the whole reason we're undoing it) so there might have
      // subsequently been an unlock() and lock() with no intervening
      // transition to deferred mode.
      if (!tryUnlockTokenlessSharedDeferred()) {
        unlockSharedInline();
      }
    } else {
      if (!tryUnlockSharedDeferred(slot)) {
        unlockSharedInline();
      }
    }

    // We got here not because the lock was unavailable, but because
    // we lost a compare-and-swap.  Try-lock is typically allowed to
    // have spurious failures, but there is no lock efficiency gain
    // from exploiting that freedom here.
  }
}

} // namespace folly