1/*
2 * Copyright (c) Facebook, Inc. and its affiliates.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17// @author Nathan Bronson ([email protected])
18
19#pragma once
20
21#include <stdint.h>
22
23#include <atomic>
24#include <thread>
25#include <type_traits>
26
27#include <folly/CPortability.h>
28#include <folly/Likely.h>
29#include <folly/concurrency/CacheLocality.h>
30#include <folly/detail/Futex.h>
31#include <folly/portability/Asm.h>
32#include <folly/portability/SysResource.h>
33#include <folly/synchronization/SanitizeThread.h>
34
35// SharedMutex is a reader-writer lock. It is small, very fast, scalable
36// on multi-core, and suitable for use when readers or writers may block.
37// Unlike most other reader-writer locks, its throughput with concurrent
38// readers scales linearly; it is able to acquire and release the lock
39// in shared mode without cache line ping-ponging. It is suitable for
40// a wide range of lock hold times because it starts with spinning,
41// proceeds to using sched_yield with a preemption heuristic, and then
42// waits using futex and precise wakeups.
43//
44// SharedMutex provides all of the methods of folly::RWSpinLock,
45// boost::shared_mutex, boost::upgrade_mutex, and C++14's
46// std::shared_timed_mutex. All operations that can block are available
47// in try, try-for, and try-until (system_clock or steady_clock) versions.
48//
49// SharedMutexReadPriority gives priority to readers,
50// SharedMutexWritePriority gives priority to writers. SharedMutex is an
51// alias for SharedMutexWritePriority, because writer starvation is more
52// likely than reader starvation for the read-heavy workloads targeted
53// by SharedMutex.
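//
// For example (a sketch):
//
//   folly::SharedMutex m;                // write-priority (the default)
//   folly::SharedMutexReadPriority mrp;  // reader-priority variant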
54//
55// In my tests SharedMutex is as good or better than the other
56// reader-writer locks in use at Facebook for almost all use cases,
57// sometimes by a wide margin. (If it is rare that there are actually
58// concurrent readers then RWSpinLock can be a few nanoseconds faster.)
59// I compared it to folly::RWSpinLock, folly::RWTicketSpinLock64,
60// boost::shared_mutex, pthread_rwlock_t, and a RWLock that internally uses
61// spinlocks to guard state and pthread_mutex_t+pthread_cond_t to block.
62// (Thrift's ReadWriteMutex is based underneath on pthread_rwlock_t.)
63// It is generally as good or better than the rest when evaluating size,
64// speed, scalability, or latency outliers. In the corner cases where
65// it is not the fastest (such as single-threaded use or heavy write
66// contention) it is never very much worse than the best. See the bottom
67// of folly/test/SharedMutexTest.cpp for lots of microbenchmark results.
68//
69// Comparison to folly::RWSpinLock:
70//
71// * SharedMutex is faster than RWSpinLock when there are actually
72// concurrent read accesses (sometimes much faster), and ~5 nanoseconds
73// slower when there is not actually any contention. SharedMutex is
74// faster in every (benchmarked) scenario where the shared mode of
75// the lock is actually useful.
76//
77// * Concurrent shared access to SharedMutex scales linearly, while total
78// RWSpinLock throughput drops as more threads try to access the lock
79// in shared mode. Under very heavy read contention SharedMutex can
80// be two orders of magnitude faster than RWSpinLock (or any reader
81// writer lock that doesn't use striping or deferral).
82//
83// * SharedMutex can safely protect blocking calls, because after an
84// initial period of spinning it waits using futex().
85//
// * RWSpinLock prioritizes readers, while SharedMutex has both reader-
//   and writer-priority variants but defaults to write priority.
88//
89// * RWSpinLock's upgradeable mode blocks new readers, while SharedMutex's
90// doesn't. Both semantics are reasonable. The boost documentation
91// doesn't explicitly talk about this behavior (except by omitting
92// any statement that those lock modes conflict), but the boost
93// implementations do allow new readers while the upgradeable mode
94// is held. See https://github.com/boostorg/thread/blob/master/
95// include/boost/thread/pthread/shared_mutex.hpp
96//
97// * RWSpinLock::UpgradedHolder maps to SharedMutex::UpgradeHolder
98// (UpgradeableHolder would be even more pedantically correct).
99// SharedMutex's holders have fewer methods (no reset) and are less
100// tolerant (promotion and downgrade crash if the donor doesn't own
101// the lock, and you must use the default constructor rather than
102// passing a nullptr to the pointer constructor).
103//
104// Both SharedMutex and RWSpinLock provide "exclusive", "upgrade",
105// and "shared" modes. At all times num_threads_holding_exclusive +
106// num_threads_holding_upgrade <= 1, and num_threads_holding_exclusive ==
107// 0 || num_threads_holding_shared == 0. RWSpinLock has the additional
108// constraint that num_threads_holding_shared cannot increase while
109// num_threads_holding_upgrade is non-zero.
110//
111// Comparison to the internal RWLock:
112//
113// * SharedMutex doesn't allow a maximum reader count to be configured,
114// so it can't be used as a semaphore in the same way as RWLock.
115//
116// * SharedMutex is 4 bytes, RWLock is 256.
117//
118// * SharedMutex is as fast or faster than RWLock in all of my
119// microbenchmarks, and has positive rather than negative scalability.
120//
121// * RWLock and SharedMutex are both writer priority locks.
122//
// * SharedMutex avoids latency outliers as well as RWLock does.
124//
125// * SharedMutex uses different names (t != 0 below):
126//
127// RWLock::lock(0) => SharedMutex::lock()
128//
129// RWLock::lock(t) => SharedMutex::try_lock_for(milliseconds(t))
130//
131// RWLock::tryLock() => SharedMutex::try_lock()
132//
133// RWLock::unlock() => SharedMutex::unlock()
134//
135// RWLock::enter(0) => SharedMutex::lock_shared()
136//
137// RWLock::enter(t) =>
138// SharedMutex::try_lock_shared_for(milliseconds(t))
139//
140// RWLock::tryEnter() => SharedMutex::try_lock_shared()
141//
142// RWLock::leave() => SharedMutex::unlock_shared()
143//
144// * RWLock allows the reader count to be adjusted by a value other
145// than 1 during enter() or leave(). SharedMutex doesn't currently
146// implement this feature.
147//
148// * RWLock's methods are marked const, SharedMutex's aren't.
149//
150// Reader-writer locks have the potential to allow concurrent access
151// to shared read-mostly data, but in practice they often provide no
152// improvement over a mutex. The problem is the cache coherence protocol
153// of modern CPUs. Coherence is provided by making sure that when a cache
154// line is written it is present in only one core's cache. Since a memory
155// write is required to acquire a reader-writer lock in shared mode, the
156// cache line holding the lock is invalidated in all of the other caches.
157// This leads to cache misses when another thread wants to acquire or
158// release the lock concurrently. When the RWLock is colocated with the
// data it protects (common), cache misses can also continue to occur when
160// a thread that already holds the lock tries to read the protected data.
161//
162// Ideally, a reader-writer lock would allow multiple cores to acquire
163// and release the lock in shared mode without incurring any cache misses.
164// This requires that each core records its shared access in a cache line
165// that isn't read or written by other read-locking cores. (Writers will
166// have to check all of the cache lines.) Typical server hardware when
167// this comment was written has 16 L1 caches and cache lines of 64 bytes,
168// so a lock striped over all L1 caches would occupy a prohibitive 1024
169// bytes. Nothing says that we need a separate set of per-core memory
170// locations for each lock, however. Each SharedMutex instance is only
171// 4 bytes, but all locks together share a 2K area in which they make a
172// core-local record of lock acquisitions.
173//
174// SharedMutex's strategy of using a shared set of core-local stripes has
175// a potential downside, because it means that acquisition of any lock in
176// write mode can conflict with acquisition of any lock in shared mode.
177// If a lock instance doesn't actually experience concurrency then this
// downside will outweigh the upside of improved scalability for readers.
179// To avoid this problem we dynamically detect concurrent accesses to
180// SharedMutex, and don't start using the deferred mode unless we actually
181// observe concurrency. See kNumSharedToStartDeferring.
182//
183// It is explicitly allowed to call unlock_shared() from a different
184// thread than lock_shared(), so long as they are properly paired.
185// unlock_shared() needs to find the location at which lock_shared()
186// recorded the lock, which might be in the lock itself or in any of
187// the shared slots. If you can conveniently pass state from lock
188// acquisition to release then the fastest mechanism is to std::move
// the SharedMutex::ReadHolder instance or a SharedMutex::Token (using
190// lock_shared(Token&) and unlock_shared(Token&)). The guard or token
191// will tell unlock_shared where in deferredReaders[] to look for the
192// deferred lock. The Token-less version of unlock_shared() works in all
193// cases, but is optimized for the common (no inter-thread handoff) case.
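//
// A minimal sketch of the token-managed form (the `gLock` name is
// hypothetical; Token, lock_shared(Token&) and unlock_shared(Token&) are
// declared below):
//
//   folly::SharedMutex gLock;
//   folly::SharedMutex::Token token;
//   gLock.lock_shared(token);    // token remembers where the lock landed
//   // ... read the protected data, possibly handing off to another thread
//   gLock.unlock_shared(token);  // goes straight to the recorded slot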
194//
195// In both read- and write-priority mode, a waiting lock() (exclusive mode)
196// only blocks readers after it has waited for an active upgrade lock to be
197// released; until the upgrade lock is released (or upgraded or downgraded)
198// readers will still be able to enter. Preferences about lock acquisition
199// are not guaranteed to be enforced perfectly (even if they were, there
200// is theoretically the chance that a thread could be arbitrarily suspended
201// between calling lock() and SharedMutex code actually getting executed).
202//
203// try_*_for methods always try at least once, even if the duration
204// is zero or negative. The duration type must be compatible with
205// std::chrono::steady_clock. try_*_until methods also always try at
206// least once. std::chrono::system_clock and std::chrono::steady_clock
207// are supported.
208//
// If you have observed by profiling that your SharedMutex instances are
// getting cache misses on deferredReaders[] due to another SharedMutex
// user, then you can use the tag type to create your own instantiation
// of the type.
212// The contention threshold (see kNumSharedToStartDeferring) should make
213// this unnecessary in all but the most extreme cases. Make sure to check
214// that the increased icache and dcache footprint of the tagged result is
215// worth it.
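//
// A minimal usage sketch built from the methods declared below (the
// CachedData class and its members are hypothetical; assumes <map>,
// <string>, and <chrono>):
//
//   class CachedData {
//    public:
//     std::string get(const std::string& key) const {
//       folly::SharedMutex::ReadHolder guard(mutex_);    // shared
//       auto it = map_.find(key);
//       return it == map_.end() ? std::string() : it->second;
//     }
//
//     void put(const std::string& key, const std::string& value) {
//       folly::SharedMutex::WriteHolder guard(mutex_);   // exclusive
//       map_[key] = value;
//     }
//
//     bool tryPut(const std::string& key, const std::string& value) {
//       if (!mutex_.try_lock_for(std::chrono::milliseconds(10))) {
//         return false;                                  // timed out
//       }
//       map_[key] = value;
//       mutex_.unlock();
//       return true;
//     }
//
//    private:
//     mutable folly::SharedMutex mutex_;
//     std::map<std::string, std::string> map_;
//   };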
216
217// SharedMutex's use of thread local storage is an optimization, so
218// for the case where thread local storage is not supported, define it
219// away.
220
221// Note about TSAN (ThreadSanitizer): the SharedMutexWritePriority version
222// (the default) of this mutex is annotated appropriately so that TSAN can
223// perform lock inversion analysis. However, the SharedMutexReadPriority version
224// is not annotated. This is because TSAN's lock order heuristic
225// assumes that two calls to lock_shared must be ordered, which leads
226// to too many false positives for the reader-priority case.
227//
228// Suppose thread A holds a SharedMutexWritePriority lock in shared mode and an
229// independent thread B is waiting for exclusive access. Then a thread C's
230// lock_shared can't proceed until A has released the lock. Discounting
231// situations that never use exclusive mode (so no lock is necessary at all)
232// this means that without higher-level reasoning it is not safe to ignore
233// reader <-> reader interactions.
234//
235// This reasoning does not apply to SharedMutexReadPriority, because there are
236// no actions by a thread B that can make C need to wait for A. Since the
237// overwhelming majority of SharedMutex instances use write priority, we
238// restrict the TSAN annotations to only SharedMutexWritePriority.
239
240#ifndef FOLLY_SHAREDMUTEX_TLS
241#if !FOLLY_MOBILE
242#define FOLLY_SHAREDMUTEX_TLS FOLLY_TLS
243#else
244#define FOLLY_SHAREDMUTEX_TLS
245#endif
246#endif
247
248namespace folly {
249
250struct SharedMutexToken {
251 enum class Type : uint16_t {
252 INVALID = 0,
253 INLINE_SHARED,
254 DEFERRED_SHARED,
255 };
256
257 Type type_;
258 uint16_t slot_;
259};
260
261namespace detail {
262// Returns a guard that gives permission for the current thread to
263// annotate, and adjust the annotation bits in, the SharedMutex at ptr.
264std::unique_lock<std::mutex> sharedMutexAnnotationGuard(void* ptr);
265} // namespace detail
266
267template <
268 bool ReaderPriority,
269 typename Tag_ = void,
270 template <typename> class Atom = std::atomic,
271 bool BlockImmediately = false,
272 bool AnnotateForThreadSanitizer = kIsSanitizeThread && !ReaderPriority>
273class SharedMutexImpl {
274 public:
275 static constexpr bool kReaderPriority = ReaderPriority;
276
277 typedef Tag_ Tag;
278
279 typedef SharedMutexToken Token;
280
281 class FOLLY_NODISCARD ReadHolder;
282 class FOLLY_NODISCARD UpgradeHolder;
283 class FOLLY_NODISCARD WriteHolder;
284
285 constexpr SharedMutexImpl() noexcept : state_(0) {}
286
287 SharedMutexImpl(const SharedMutexImpl&) = delete;
288 SharedMutexImpl(SharedMutexImpl&&) = delete;
289 SharedMutexImpl& operator=(const SharedMutexImpl&) = delete;
290 SharedMutexImpl& operator=(SharedMutexImpl&&) = delete;
291
  // It is an error to destroy a SharedMutex that still has
293 // any outstanding locks. This is checked if NDEBUG isn't defined.
294 // SharedMutex's exclusive mode can be safely used to guard the lock's
295 // own destruction. If, for example, you acquire the lock in exclusive
296 // mode and then observe that the object containing the lock is no longer
297 // needed, you can unlock() and then immediately destroy the lock.
298 // See https://sourceware.org/bugzilla/show_bug.cgi?id=13690 for a
  // description of why this property needs to be explicitly mentioned.
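  //
  // A sketch of that pattern (the refcounted Node type and its members are
  // hypothetical):
  //
  //   void release(Node* n) {
  //     n->mutex_.lock();                 // exclusive access
  //     bool lastRef = --n->refcount_ == 0;
  //     n->mutex_.unlock();
  //     if (lastRef) {
  //       delete n;                       // mutex_ is destroyed while unlocked
  //     }
  //   }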
300 ~SharedMutexImpl() {
301 auto state = state_.load(std::memory_order_relaxed);
302 if (UNLIKELY((state & kHasS) != 0)) {
303 cleanupTokenlessSharedDeferred(state);
304 }
305
306 if (folly::kIsDebug) {
307 // These asserts check that everybody has released the lock before it
308 // is destroyed. If you arrive here while debugging that is likely
309 // the problem. (You could also have general heap corruption.)
310
311 // if a futexWait fails to go to sleep because the value has been
312 // changed, we don't necessarily clean up the wait bits, so it is
313 // possible they will be set here in a correct system
314 assert((state & ~(kWaitingAny | kMayDefer | kAnnotationCreated)) == 0);
315 if ((state & kMayDefer) != 0) {
316 for (uint32_t slot = 0; slot < kMaxDeferredReaders; ++slot) {
317 auto slotValue =
318 deferredReader(slot)->load(std::memory_order_relaxed);
319 assert(!slotValueIsThis(slotValue));
320 (void)slotValue;
321 }
322 }
323 }
324 annotateDestroy();
325 }
326
327 // Checks if an exclusive lock could succeed so that lock elision could be
  // enabled. Unlike the two eligible_for_lock_{upgrade|shared}_elision
  // functions, this is a conservative check, since kMayDefer only indicates
  // that deferred readers may exist.
331 bool eligible_for_lock_elision() const {
332 // We rely on the transaction for linearization. Wait bits are
333 // irrelevant because a successful transaction will be in and out
334 // without affecting the wakeup. kBegunE is also okay for a similar
335 // reason.
336 auto state = state_.load(std::memory_order_relaxed);
337 return (state & (kHasS | kMayDefer | kHasE | kHasU)) == 0;
338 }
339
340 // Checks if an upgrade lock could succeed so that lock elision could be
341 // enabled.
342 bool eligible_for_lock_upgrade_elision() const {
343 auto state = state_.load(std::memory_order_relaxed);
344 return (state & (kHasE | kHasU)) == 0;
345 }
346
347 // Checks if a shared lock could succeed so that lock elision could be
348 // enabled.
349 bool eligible_for_lock_shared_elision() const {
350 // No need to honor kBegunE because a transaction doesn't block anybody
351 auto state = state_.load(std::memory_order_relaxed);
352 return (state & kHasE) == 0;
353 }
354
355 void lock() {
356 WaitForever ctx;
357 (void)lockExclusiveImpl(kHasSolo, ctx);
358 annotateAcquired(annotate_rwlock_level::wrlock);
359 }
360
361 bool try_lock() {
362 WaitNever ctx;
363 auto result = lockExclusiveImpl(kHasSolo, ctx);
364 annotateTryAcquired(result, annotate_rwlock_level::wrlock);
365 return result;
366 }
367
368 template <class Rep, class Period>
369 bool try_lock_for(const std::chrono::duration<Rep, Period>& duration) {
370 WaitForDuration<Rep, Period> ctx(duration);
371 auto result = lockExclusiveImpl(kHasSolo, ctx);
372 annotateTryAcquired(result, annotate_rwlock_level::wrlock);
373 return result;
374 }
375
376 template <class Clock, class Duration>
377 bool try_lock_until(
378 const std::chrono::time_point<Clock, Duration>& absDeadline) {
379 WaitUntilDeadline<Clock, Duration> ctx{absDeadline};
380 auto result = lockExclusiveImpl(kHasSolo, ctx);
381 annotateTryAcquired(result, annotate_rwlock_level::wrlock);
382 return result;
383 }
384
385 void unlock() {
386 annotateReleased(annotate_rwlock_level::wrlock);
387 // It is possible that we have a left-over kWaitingNotS if the last
388 // unlock_shared() that let our matching lock() complete finished
389 // releasing before lock()'s futexWait went to sleep. Clean it up now
390 auto state = (state_ &= ~(kWaitingNotS | kPrevDefer | kHasE));
391 assert((state & ~(kWaitingAny | kAnnotationCreated)) == 0);
392 wakeRegisteredWaiters(state, kWaitingE | kWaitingU | kWaitingS);
393 }
394
395 // Managing the token yourself makes unlock_shared a bit faster
396
397 void lock_shared() {
398 WaitForever ctx;
399 (void)lockSharedImpl(nullptr, ctx);
400 annotateAcquired(annotate_rwlock_level::rdlock);
401 }
402
403 void lock_shared(Token& token) {
404 WaitForever ctx;
405 (void)lockSharedImpl(&token, ctx);
406 annotateAcquired(annotate_rwlock_level::rdlock);
407 }
408
409 bool try_lock_shared() {
410 WaitNever ctx;
411 auto result = lockSharedImpl(nullptr, ctx);
412 annotateTryAcquired(result, annotate_rwlock_level::rdlock);
413 return result;
414 }
415
416 bool try_lock_shared(Token& token) {
417 WaitNever ctx;
418 auto result = lockSharedImpl(&token, ctx);
419 annotateTryAcquired(result, annotate_rwlock_level::rdlock);
420 return result;
421 }
422
423 template <class Rep, class Period>
424 bool try_lock_shared_for(const std::chrono::duration<Rep, Period>& duration) {
425 WaitForDuration<Rep, Period> ctx(duration);
426 auto result = lockSharedImpl(nullptr, ctx);
427 annotateTryAcquired(result, annotate_rwlock_level::rdlock);
428 return result;
429 }
430
431 template <class Rep, class Period>
432 bool try_lock_shared_for(
433 const std::chrono::duration<Rep, Period>& duration,
434 Token& token) {
435 WaitForDuration<Rep, Period> ctx(duration);
436 auto result = lockSharedImpl(&token, ctx);
437 annotateTryAcquired(result, annotate_rwlock_level::rdlock);
438 return result;
439 }
440
441 template <class Clock, class Duration>
442 bool try_lock_shared_until(
443 const std::chrono::time_point<Clock, Duration>& absDeadline) {
444 WaitUntilDeadline<Clock, Duration> ctx{absDeadline};
445 auto result = lockSharedImpl(nullptr, ctx);
446 annotateTryAcquired(result, annotate_rwlock_level::rdlock);
447 return result;
448 }
449
450 template <class Clock, class Duration>
451 bool try_lock_shared_until(
452 const std::chrono::time_point<Clock, Duration>& absDeadline,
453 Token& token) {
454 WaitUntilDeadline<Clock, Duration> ctx{absDeadline};
455 auto result = lockSharedImpl(&token, ctx);
456 annotateTryAcquired(result, annotate_rwlock_level::rdlock);
457 return result;
458 }
459
460 void unlock_shared() {
461 annotateReleased(annotate_rwlock_level::rdlock);
462
463 auto state = state_.load(std::memory_order_acquire);
464
    // kPrevDefer can only be set if kHasE or kBegunE is set
466 assert((state & (kPrevDefer | kHasE | kBegunE)) != kPrevDefer);
467
468 // lock() strips kMayDefer immediately, but then copies it to
469 // kPrevDefer so we can tell if the pre-lock() lock_shared() might
470 // have deferred
471 if ((state & (kMayDefer | kPrevDefer)) == 0 ||
472 !tryUnlockTokenlessSharedDeferred()) {
473 // Matching lock_shared() couldn't have deferred, or the deferred
474 // lock has already been inlined by applyDeferredReaders()
475 unlockSharedInline();
476 }
477 }
478
479 void unlock_shared(Token& token) {
480 annotateReleased(annotate_rwlock_level::rdlock);
481
482 assert(
483 token.type_ == Token::Type::INLINE_SHARED ||
484 token.type_ == Token::Type::DEFERRED_SHARED);
485
486 if (token.type_ != Token::Type::DEFERRED_SHARED ||
487 !tryUnlockSharedDeferred(token.slot_)) {
488 unlockSharedInline();
489 }
490 if (folly::kIsDebug) {
491 token.type_ = Token::Type::INVALID;
492 }
493 }
494
495 void unlock_and_lock_shared() {
496 annotateReleased(annotate_rwlock_level::wrlock);
497 annotateAcquired(annotate_rwlock_level::rdlock);
498 // We can't use state_ -=, because we need to clear 2 bits (1 of which
499 // has an uncertain initial state) and set 1 other. We might as well
500 // clear the relevant wake bits at the same time. Note that since S
501 // doesn't block the beginning of a transition to E (writer priority
502 // can cut off new S, reader priority grabs BegunE and blocks deferred
503 // S) we need to wake E as well.
504 auto state = state_.load(std::memory_order_acquire);
505 do {
506 assert(
507 (state & ~(kWaitingAny | kPrevDefer | kAnnotationCreated)) == kHasE);
508 } while (!state_.compare_exchange_strong(
509 state, (state & ~(kWaitingAny | kPrevDefer | kHasE)) + kIncrHasS));
510 if ((state & (kWaitingE | kWaitingU | kWaitingS)) != 0) {
511 futexWakeAll(kWaitingE | kWaitingU | kWaitingS);
512 }
513 }
514
515 void unlock_and_lock_shared(Token& token) {
516 unlock_and_lock_shared();
517 token.type_ = Token::Type::INLINE_SHARED;
518 }
519
520 void lock_upgrade() {
521 WaitForever ctx;
522 (void)lockUpgradeImpl(ctx);
523 // For TSAN: treat upgrade locks as equivalent to read locks
524 annotateAcquired(annotate_rwlock_level::rdlock);
525 }
526
527 bool try_lock_upgrade() {
528 WaitNever ctx;
529 auto result = lockUpgradeImpl(ctx);
530 annotateTryAcquired(result, annotate_rwlock_level::rdlock);
531 return result;
532 }
533
534 template <class Rep, class Period>
535 bool try_lock_upgrade_for(
536 const std::chrono::duration<Rep, Period>& duration) {
537 WaitForDuration<Rep, Period> ctx(duration);
538 auto result = lockUpgradeImpl(ctx);
539 annotateTryAcquired(result, annotate_rwlock_level::rdlock);
540 return result;
541 }
542
543 template <class Clock, class Duration>
544 bool try_lock_upgrade_until(
545 const std::chrono::time_point<Clock, Duration>& absDeadline) {
546 WaitUntilDeadline<Clock, Duration> ctx{absDeadline};
547 auto result = lockUpgradeImpl(ctx);
548 annotateTryAcquired(result, annotate_rwlock_level::rdlock);
549 return result;
550 }
551
552 void unlock_upgrade() {
553 annotateReleased(annotate_rwlock_level::rdlock);
554 auto state = (state_ -= kHasU);
555 assert((state & (kWaitingNotS | kHasSolo)) == 0);
556 wakeRegisteredWaiters(state, kWaitingE | kWaitingU);
557 }
558
559 void unlock_upgrade_and_lock() {
560 // no waiting necessary, so waitMask is empty
561 WaitForever ctx;
562 (void)lockExclusiveImpl(0, ctx);
563 annotateReleased(annotate_rwlock_level::rdlock);
564 annotateAcquired(annotate_rwlock_level::wrlock);
565 }
566
567 void unlock_upgrade_and_lock_shared() {
568 // No need to annotate for TSAN here because we model upgrade and shared
569 // locks as the same.
570 auto state = (state_ -= kHasU - kIncrHasS);
571 assert((state & (kWaitingNotS | kHasSolo)) == 0);
572 wakeRegisteredWaiters(state, kWaitingE | kWaitingU);
573 }
574
575 void unlock_upgrade_and_lock_shared(Token& token) {
576 unlock_upgrade_and_lock_shared();
577 token.type_ = Token::Type::INLINE_SHARED;
578 }
579
580 void unlock_and_lock_upgrade() {
581 annotateReleased(annotate_rwlock_level::wrlock);
582 annotateAcquired(annotate_rwlock_level::rdlock);
583 // We can't use state_ -=, because we need to clear 2 bits (1 of
584 // which has an uncertain initial state) and set 1 other. We might
585 // as well clear the relevant wake bits at the same time.
586 auto state = state_.load(std::memory_order_acquire);
587 while (true) {
588 assert(
589 (state & ~(kWaitingAny | kPrevDefer | kAnnotationCreated)) == kHasE);
590 auto after =
591 (state & ~(kWaitingNotS | kWaitingS | kPrevDefer | kHasE)) + kHasU;
592 if (state_.compare_exchange_strong(state, after)) {
593 if ((state & kWaitingS) != 0) {
594 futexWakeAll(kWaitingS);
595 }
596 return;
597 }
598 }
599 }
600
601 private:
602 typedef typename folly::detail::Futex<Atom> Futex;
603
604 // Internally we use four kinds of wait contexts. These are structs
605 // that provide a doWait method that returns true if a futex wake
606 // was issued that intersects with the waitMask, false if there was a
607 // timeout and no more waiting should be performed. Spinning occurs
608 // before the wait context is invoked.
609
610 struct WaitForever {
611 bool canBlock() {
612 return true;
613 }
614 bool canTimeOut() {
615 return false;
616 }
617 bool shouldTimeOut() {
618 return false;
619 }
620
621 bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) {
622 detail::futexWait(&futex, expected, waitMask);
623 return true;
624 }
625 };
626
627 struct WaitNever {
628 bool canBlock() {
629 return false;
630 }
631 bool canTimeOut() {
632 return true;
633 }
634 bool shouldTimeOut() {
635 return true;
636 }
637
638 bool doWait(
639 Futex& /* futex */,
640 uint32_t /* expected */,
641 uint32_t /* waitMask */) {
642 return false;
643 }
644 };
645
646 template <class Rep, class Period>
647 struct WaitForDuration {
648 std::chrono::duration<Rep, Period> duration_;
649 bool deadlineComputed_;
650 std::chrono::steady_clock::time_point deadline_;
651
652 explicit WaitForDuration(const std::chrono::duration<Rep, Period>& duration)
653 : duration_(duration), deadlineComputed_(false) {}
654
655 std::chrono::steady_clock::time_point deadline() {
656 if (!deadlineComputed_) {
657 deadline_ = std::chrono::steady_clock::now() + duration_;
658 deadlineComputed_ = true;
659 }
660 return deadline_;
661 }
662
663 bool canBlock() {
664 return duration_.count() > 0;
665 }
666 bool canTimeOut() {
667 return true;
668 }
669
670 bool shouldTimeOut() {
671 return std::chrono::steady_clock::now() > deadline();
672 }
673
674 bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) {
675 auto result =
676 detail::futexWaitUntil(&futex, expected, deadline(), waitMask);
677 return result != folly::detail::FutexResult::TIMEDOUT;
678 }
679 };
680
681 template <class Clock, class Duration>
682 struct WaitUntilDeadline {
683 std::chrono::time_point<Clock, Duration> absDeadline_;
684
685 bool canBlock() {
686 return true;
687 }
688 bool canTimeOut() {
689 return true;
690 }
691 bool shouldTimeOut() {
692 return Clock::now() > absDeadline_;
693 }
694
695 bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) {
696 auto result =
697 detail::futexWaitUntil(&futex, expected, absDeadline_, waitMask);
698 return result != folly::detail::FutexResult::TIMEDOUT;
699 }
700 };
701
702 void annotateLazyCreate() {
703 if (AnnotateForThreadSanitizer &&
704 (state_.load() & kAnnotationCreated) == 0) {
705 auto guard = detail::sharedMutexAnnotationGuard(this);
706 // check again
707 if ((state_.load() & kAnnotationCreated) == 0) {
708 state_.fetch_or(kAnnotationCreated);
709 annotate_benign_race_sized(
710 &state_, sizeof(state_), "init TSAN", __FILE__, __LINE__);
711 annotate_rwlock_create(this, __FILE__, __LINE__);
712 }
713 }
714 }
715
716 void annotateDestroy() {
717 if (AnnotateForThreadSanitizer) {
718 annotateLazyCreate();
719 annotate_rwlock_destroy(this, __FILE__, __LINE__);
720 }
721 }
722
723 void annotateAcquired(annotate_rwlock_level w) {
724 if (AnnotateForThreadSanitizer) {
725 annotateLazyCreate();
726 annotate_rwlock_acquired(this, w, __FILE__, __LINE__);
727 }
728 }
729
730 void annotateTryAcquired(bool result, annotate_rwlock_level w) {
731 if (AnnotateForThreadSanitizer) {
732 annotateLazyCreate();
733 annotate_rwlock_try_acquired(this, w, result, __FILE__, __LINE__);
734 }
735 }
736
737 void annotateReleased(annotate_rwlock_level w) {
738 if (AnnotateForThreadSanitizer) {
739 assert((state_.load() & kAnnotationCreated) != 0);
740 annotate_rwlock_released(this, w, __FILE__, __LINE__);
741 }
742 }
743
744 // 32 bits of state
745 Futex state_{};
746
747 // S count needs to be on the end, because we explicitly allow it to
748 // underflow. This can occur while we are in the middle of applying
749 // deferred locks (we remove them from deferredReaders[] before
750 // inlining them), or during token-less unlock_shared() if a racing
751 // lock_shared();unlock_shared() moves the deferredReaders slot while
752 // the first unlock_shared() is scanning. The former case is cleaned
753 // up before we finish applying the locks. The latter case can persist
754 // until destruction, when it is cleaned up.
755 static constexpr uint32_t kIncrHasS = 1 << 11;
756 static constexpr uint32_t kHasS = ~(kIncrHasS - 1);
757
758 // Set if annotation has been completed for this instance. That annotation
759 // (and setting this bit afterward) must be guarded by one of the mutexes in
760 // annotationCreationGuards.
761 static constexpr uint32_t kAnnotationCreated = 1 << 10;
762
763 // If false, then there are definitely no deferred read locks for this
764 // instance. Cleared after initialization and when exclusively locked.
765 static constexpr uint32_t kMayDefer = 1 << 9;
766
  // lock() clears kMayDefer as soon as it starts draining readers (so
768 // that it doesn't have to do a second CAS once drain completes), but
769 // unlock_shared() still needs to know whether to scan deferredReaders[]
770 // or not. We copy kMayDefer to kPrevDefer when setting kHasE or
771 // kBegunE, and clear it when clearing those bits.
772 static constexpr uint32_t kPrevDefer = 1 << 8;
773
774 // Exclusive-locked blocks all read locks and write locks. This bit
775 // may be set before all readers have finished, but in that case the
776 // thread that sets it won't return to the caller until all read locks
777 // have been released.
778 static constexpr uint32_t kHasE = 1 << 7;
779
780 // Exclusive-draining means that lock() is waiting for existing readers
781 // to leave, but that new readers may still acquire shared access.
782 // This is only used in reader priority mode. New readers during
783 // drain must be inline. The difference between this and kHasU is that
784 // kBegunE prevents kMayDefer from being set.
785 static constexpr uint32_t kBegunE = 1 << 6;
786
787 // At most one thread may have either exclusive or upgrade lock
788 // ownership. Unlike exclusive mode, ownership of the lock in upgrade
789 // mode doesn't preclude other threads holding the lock in shared mode.
790 // boost's concept for this doesn't explicitly say whether new shared
  // locks can be acquired once lock_upgrade has succeeded, but doesn't
792 // list that as disallowed. RWSpinLock disallows new read locks after
793 // lock_upgrade has been acquired, but the boost implementation doesn't.
794 // We choose the latter.
795 static constexpr uint32_t kHasU = 1 << 5;
796
797 // There are three states that we consider to be "solo", in that they
798 // cannot coexist with other solo states. These are kHasE, kBegunE,
799 // and kHasU. Note that S doesn't conflict with any of these, because
  // setting kHasE is only one of the two steps needed to actually
801 // acquire the lock in exclusive mode (the other is draining the existing
802 // S holders).
803 static constexpr uint32_t kHasSolo = kHasE | kBegunE | kHasU;
804
805 // Once a thread sets kHasE it needs to wait for the current readers
806 // to exit the lock. We give this a separate wait identity from the
807 // waiting to set kHasE so that we can perform partial wakeups (wake
808 // one instead of wake all).
809 static constexpr uint32_t kWaitingNotS = 1 << 4;
810
811 // When waking writers we can either wake them all, in which case we
812 // can clear kWaitingE, or we can call futexWake(1). futexWake tells
813 // us if anybody woke up, but even if we detect that nobody woke up we
814 // can't clear the bit after the fact without issuing another wakeup.
815 // To avoid thundering herds when there are lots of pending lock()
816 // without needing to call futexWake twice when there is only one
817 // waiter, kWaitingE actually encodes if we have observed multiple
818 // concurrent waiters. Tricky: ABA issues on futexWait mean that when
819 // we see kWaitingESingle we can't assume that there is only one.
820 static constexpr uint32_t kWaitingESingle = 1 << 2;
821 static constexpr uint32_t kWaitingEMultiple = 1 << 3;
822 static constexpr uint32_t kWaitingE = kWaitingESingle | kWaitingEMultiple;
823
824 // kWaitingU is essentially a 1 bit saturating counter. It always
825 // requires a wakeAll.
826 static constexpr uint32_t kWaitingU = 1 << 1;
827
828 // All blocked lock_shared() should be awoken, so it is correct (not
829 // suboptimal) to wakeAll if there are any shared readers.
830 static constexpr uint32_t kWaitingS = 1 << 0;
831
832 // kWaitingAny is a mask of all of the bits that record the state of
833 // threads, rather than the state of the lock. It is convenient to be
834 // able to mask them off during asserts.
835 static constexpr uint32_t kWaitingAny =
836 kWaitingNotS | kWaitingE | kWaitingU | kWaitingS;
837
838 // The reader count at which a reader will attempt to use the lock
839 // in deferred mode. If this value is 2, then the second concurrent
840 // reader will set kMayDefer and use deferredReaders[]. kMayDefer is
841 // cleared during exclusive access, so this threshold must be reached
842 // each time a lock is held in exclusive mode.
843 static constexpr uint32_t kNumSharedToStartDeferring = 2;
844
845 // The typical number of spins that a thread will wait for a state
846 // transition. There is no bound on the number of threads that can wait
847 // for a writer, so we are pretty conservative here to limit the chance
848 // that we are starving the writer of CPU. Each spin is 6 or 7 nanos,
849 // almost all of which is in the pause instruction.
850 static constexpr uint32_t kMaxSpinCount = !BlockImmediately ? 1000 : 2;
851
852 // The maximum number of soft yields before falling back to futex.
853 // If the preemption heuristic is activated we will fall back before
854 // this. A soft yield takes ~900 nanos (two sched_yield plus a call
855 // to getrusage, with checks of the goal at each step). Soft yields
856 // aren't compatible with deterministic execution under test (unlike
857 // futexWaitUntil, which has a capricious but deterministic back end).
858 static constexpr uint32_t kMaxSoftYieldCount = !BlockImmediately ? 1000 : 0;
859
860 // If AccessSpreader assigns indexes from 0..k*n-1 on a system where some
861 // level of the memory hierarchy is symmetrically divided into k pieces
862 // (NUMA nodes, last-level caches, L1 caches, ...), then slot indexes
863 // that are the same after integer division by k share that resource.
864 // Our strategy for deferred readers is to probe up to numSlots/4 slots,
865 // using the full granularity of AccessSpreader for the start slot
866 // and then search outward. We can use AccessSpreader::current(n)
867 // without managing our own spreader if kMaxDeferredReaders <=
868 // AccessSpreader::kMaxCpus, which is currently 128.
869 //
870 // Our 2-socket E5-2660 machines have 8 L1 caches on each chip,
871 // with 64 byte cache lines. That means we need 64*16 bytes of
872 // deferredReaders[] to give each L1 its own playground. On x86_64
873 // each DeferredReaderSlot is 8 bytes, so we need kMaxDeferredReaders
874 // * kDeferredSeparationFactor >= 64 * 16 / 8 == 128. If
875 // kDeferredSearchDistance * kDeferredSeparationFactor <=
876 // 64 / 8 then we will search only within a single cache line, which
877 // guarantees we won't have inter-L1 contention. We give ourselves
878 // a factor of 2 on the core count, which should hold us for a couple
879 // processor generations. deferredReaders[] is 2048 bytes currently.
880 public:
881 static constexpr uint32_t kMaxDeferredReaders = 64;
882 static constexpr uint32_t kDeferredSearchDistance = 2;
883 static constexpr uint32_t kDeferredSeparationFactor = 4;
884
885 private:
886 static_assert(
887 !(kMaxDeferredReaders & (kMaxDeferredReaders - 1)),
888 "kMaxDeferredReaders must be a power of 2");
889 static_assert(
890 !(kDeferredSearchDistance & (kDeferredSearchDistance - 1)),
891 "kDeferredSearchDistance must be a power of 2");
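
  // An illustrative compile-time check of the sizing arithmetic in the
  // comment above, under its assumption of 16 L1 caches with 64-byte cache
  // lines and 8-byte slots: we need at least 64 * 16 / 8 == 128 slots.
  static_assert(
      kMaxDeferredReaders * kDeferredSeparationFactor >= 64 * 16 / 8,
      "deferredReaders[] should provide at least 64 * 16 / 8 slots");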
892
893 // The number of deferred locks that can be simultaneously acquired
894 // by a thread via the token-less methods without performing any heap
895 // allocations. Each of these costs 3 pointers (24 bytes, probably)
896 // per thread. There's not much point in making this larger than
897 // kDeferredSearchDistance.
898 static constexpr uint32_t kTokenStackTLSCapacity = 2;
899
900 // We need to make sure that if there is a lock_shared()
901 // and lock_shared(token) followed by unlock_shared() and
902 // unlock_shared(token), the token-less unlock doesn't null
903 // out deferredReaders[token.slot_]. If we allowed that, then
904 // unlock_shared(token) wouldn't be able to assume that its lock
905 // had been inlined by applyDeferredReaders when it finds that
906 // deferredReaders[token.slot_] no longer points to this. We accomplish
907 // this by stealing bit 0 from the pointer to record that the slot's
908 // element has no token, hence our use of uintptr_t in deferredReaders[].
909 static constexpr uintptr_t kTokenless = 0x1;
910
911 // This is the starting location for Token-less unlock_shared().
912 static FOLLY_SHAREDMUTEX_TLS uint32_t tls_lastTokenlessSlot;
913
914 // Last deferred reader slot used.
915 static FOLLY_SHAREDMUTEX_TLS uint32_t tls_lastDeferredReaderSlot;
916
917 // Only indexes divisible by kDeferredSeparationFactor are used.
918 // If any of those elements points to a SharedMutexImpl, then it
919 // should be considered that there is a shared lock on that instance.
920 // See kTokenless.
921 public:
922 typedef Atom<uintptr_t> DeferredReaderSlot;
923
924 private:
925 alignas(hardware_destructive_interference_size) static DeferredReaderSlot
926 deferredReaders[kMaxDeferredReaders * kDeferredSeparationFactor];
927
  // Performs an exclusive lock, waiting for state_ & preconditionGoalMask
  // to be zero first
930 template <class WaitContext>
931 bool lockExclusiveImpl(uint32_t preconditionGoalMask, WaitContext& ctx) {
932 uint32_t state = state_.load(std::memory_order_acquire);
933 if (LIKELY(
934 (state & (preconditionGoalMask | kMayDefer | kHasS)) == 0 &&
935 state_.compare_exchange_strong(state, (state | kHasE) & ~kHasU))) {
936 return true;
937 } else {
938 return lockExclusiveImpl(state, preconditionGoalMask, ctx);
939 }
940 }
941
942 template <class WaitContext>
943 bool lockExclusiveImpl(
944 uint32_t& state,
945 uint32_t preconditionGoalMask,
946 WaitContext& ctx) {
947 while (true) {
948 if (UNLIKELY((state & preconditionGoalMask) != 0) &&
949 !waitForZeroBits(state, preconditionGoalMask, kWaitingE, ctx) &&
950 ctx.canTimeOut()) {
951 return false;
952 }
953
954 uint32_t after = (state & kMayDefer) == 0 ? 0 : kPrevDefer;
955 if (!kReaderPriority || (state & (kMayDefer | kHasS)) == 0) {
956 // Block readers immediately, either because we are in write
957 // priority mode or because we can acquire the lock in one
958 // step. Note that if state has kHasU, then we are doing an
959 // unlock_upgrade_and_lock() and we should clear it (reader
960 // priority branch also does this).
961 after |= (state | kHasE) & ~(kHasU | kMayDefer);
962 } else {
963 after |= (state | kBegunE) & ~(kHasU | kMayDefer);
964 }
965 if (state_.compare_exchange_strong(state, after)) {
966 auto before = state;
967 state = after;
968
969 // If we set kHasE (writer priority) then no new readers can
970 // arrive. If we set kBegunE then they can still enter, but
971 // they must be inline. Either way we need to either spin on
972 // deferredReaders[] slots, or inline them so that we can wait on
973 // kHasS to zero itself. deferredReaders[] is pointers, which on
974 // x86_64 are bigger than futex() can handle, so we inline the
975 // deferred locks instead of trying to futexWait on each slot.
976 // Readers are responsible for rechecking state_ after recording
977 // a deferred read to avoid atomicity problems between the state_
978 // CAS and applyDeferredReader's reads of deferredReaders[].
979 if (UNLIKELY((before & kMayDefer) != 0)) {
980 applyDeferredReaders(state, ctx);
981 }
982 while (true) {
983 assert((state & (kHasE | kBegunE)) != 0 && (state & kHasU) == 0);
984 if (UNLIKELY((state & kHasS) != 0) &&
985 !waitForZeroBits(state, kHasS, kWaitingNotS, ctx) &&
986 ctx.canTimeOut()) {
987 // Ugh. We blocked new readers and other writers for a while,
988 // but were unable to complete. Move on. On the plus side
989 // we can clear kWaitingNotS because nobody else can piggyback
990 // on it.
991 state = (state_ &= ~(kPrevDefer | kHasE | kBegunE | kWaitingNotS));
992 wakeRegisteredWaiters(state, kWaitingE | kWaitingU | kWaitingS);
993 return false;
994 }
995
996 if (kReaderPriority && (state & kHasE) == 0) {
997 assert((state & kBegunE) != 0);
998 if (!state_.compare_exchange_strong(
999 state, (state & ~kBegunE) | kHasE)) {
1000 continue;
1001 }
1002 }
1003
1004 return true;
1005 }
1006 }
1007 }
1008 }
1009
1010 template <class WaitContext>
1011 bool waitForZeroBits(
1012 uint32_t& state,
1013 uint32_t goal,
1014 uint32_t waitMask,
1015 WaitContext& ctx) {
1016 uint32_t spinCount = 0;
1017 while (true) {
1018 state = state_.load(std::memory_order_acquire);
1019 if ((state & goal) == 0) {
1020 return true;
1021 }
1022 asm_volatile_pause();
1023 ++spinCount;
1024 if (UNLIKELY(spinCount >= kMaxSpinCount)) {
1025 return ctx.canBlock() &&
1026 yieldWaitForZeroBits(state, goal, waitMask, ctx);
1027 }
1028 }
1029 }
1030
1031 template <class WaitContext>
1032 bool yieldWaitForZeroBits(
1033 uint32_t& state,
1034 uint32_t goal,
1035 uint32_t waitMask,
1036 WaitContext& ctx) {
1037#ifdef RUSAGE_THREAD
1038 struct rusage usage;
1039 std::memset(&usage, 0, sizeof(usage));
1040 long before = -1;
1041#endif
1042 for (uint32_t yieldCount = 0; yieldCount < kMaxSoftYieldCount;
1043 ++yieldCount) {
1044 for (int softState = 0; softState < 3; ++softState) {
1045 if (softState < 2) {
1046 std::this_thread::yield();
1047 } else {
1048#ifdef RUSAGE_THREAD
1049 getrusage(RUSAGE_THREAD, &usage);
1050#endif
1051 }
1052 if (((state = state_.load(std::memory_order_acquire)) & goal) == 0) {
1053 return true;
1054 }
1055 if (ctx.shouldTimeOut()) {
1056 return false;
1057 }
1058 }
1059#ifdef RUSAGE_THREAD
1060 if (before >= 0 && usage.ru_nivcsw >= before + 2) {
1061 // One involuntary csw might just be occasional background work,
1062 // but if we get two in a row then we guess that there is someone
1063 // else who can profitably use this CPU. Fall back to futex
1064 break;
1065 }
1066 before = usage.ru_nivcsw;
1067#endif
1068 }
1069 return futexWaitForZeroBits(state, goal, waitMask, ctx);
1070 }
1071
1072 template <class WaitContext>
1073 bool futexWaitForZeroBits(
1074 uint32_t& state,
1075 uint32_t goal,
1076 uint32_t waitMask,
1077 WaitContext& ctx) {
1078 assert(
1079 waitMask == kWaitingNotS || waitMask == kWaitingE ||
1080 waitMask == kWaitingU || waitMask == kWaitingS);
1081
1082 while (true) {
1083 state = state_.load(std::memory_order_acquire);
1084 if ((state & goal) == 0) {
1085 return true;
1086 }
1087
1088 auto after = state;
1089 if (waitMask == kWaitingE) {
1090 if ((state & kWaitingESingle) != 0) {
1091 after |= kWaitingEMultiple;
1092 } else {
1093 after |= kWaitingESingle;
1094 }
1095 } else {
1096 after |= waitMask;
1097 }
1098
1099 // CAS is better than atomic |= here, because it lets us avoid
1100 // setting the wait flag when the goal is concurrently achieved
1101 if (after != state && !state_.compare_exchange_strong(state, after)) {
1102 continue;
1103 }
1104
1105 if (!ctx.doWait(state_, after, waitMask)) {
1106 // timed out
1107 return false;
1108 }
1109 }
1110 }
1111
1112 // Wakes up waiters registered in state_ as appropriate, clearing the
1113 // awaiting bits for anybody that was awoken. Tries to perform direct
1114 // single wakeup of an exclusive waiter if appropriate
1115 void wakeRegisteredWaiters(uint32_t& state, uint32_t wakeMask) {
1116 if (UNLIKELY((state & wakeMask) != 0)) {
1117 wakeRegisteredWaitersImpl(state, wakeMask);
1118 }
1119 }
1120
1121 void wakeRegisteredWaitersImpl(uint32_t& state, uint32_t wakeMask) {
1122 // If there are multiple lock() pending only one of them will actually
1123 // get to wake up, so issuing futexWakeAll will make a thundering herd.
1124 // There's nothing stopping us from issuing futexWake(1) instead,
1125 // so long as the wait bits are still an accurate reflection of
1126 // the waiters. If we notice (via futexWake's return value) that
1127 // nobody woke up then we can try again with the normal wake-all path.
1128 // Note that we can't just clear the bits at that point; we need to
1129 // clear the bits and then issue another wakeup.
1130 //
1131 // It is possible that we wake an E waiter but an outside S grabs the
1132 // lock instead, at which point we should wake pending U and S waiters.
1133 // Rather than tracking state to make the failing E regenerate the
1134 // wakeup, we just disable the optimization in the case that there
1135 // are waiting U or S that we are eligible to wake.
1136 if ((wakeMask & kWaitingE) == kWaitingE &&
1137 (state & wakeMask) == kWaitingE &&
1138 detail::futexWake(&state_, 1, kWaitingE) > 0) {
1139 // somebody woke up, so leave state_ as is and clear it later
1140 return;
1141 }
1142
1143 if ((state & wakeMask) != 0) {
1144 auto prev = state_.fetch_and(~wakeMask);
1145 if ((prev & wakeMask) != 0) {
1146 futexWakeAll(wakeMask);
1147 }
1148 state = prev & ~wakeMask;
1149 }
1150 }
1151
1152 void futexWakeAll(uint32_t wakeMask) {
1153 detail::futexWake(&state_, std::numeric_limits<int>::max(), wakeMask);
1154 }
1155
1156 DeferredReaderSlot* deferredReader(uint32_t slot) {
1157 return &deferredReaders[slot * kDeferredSeparationFactor];
1158 }
1159
1160 uintptr_t tokenfulSlotValue() {
1161 return reinterpret_cast<uintptr_t>(this);
1162 }
1163
1164 uintptr_t tokenlessSlotValue() {
1165 return tokenfulSlotValue() | kTokenless;
1166 }
1167
1168 bool slotValueIsThis(uintptr_t slotValue) {
1169 return (slotValue & ~kTokenless) == tokenfulSlotValue();
1170 }
1171
1172 // Clears any deferredReaders[] that point to this, adjusting the inline
1173 // shared lock count to compensate. Does some spinning and yielding
1174 // to avoid the work. Always finishes the application, even if ctx
1175 // times out.
1176 template <class WaitContext>
1177 void applyDeferredReaders(uint32_t& state, WaitContext& ctx) {
1178 uint32_t slot = 0;
1179
1180 uint32_t spinCount = 0;
1181 while (true) {
1182 while (!slotValueIsThis(
1183 deferredReader(slot)->load(std::memory_order_acquire))) {
1184 if (++slot == kMaxDeferredReaders) {
1185 return;
1186 }
1187 }
1188 asm_volatile_pause();
1189 if (UNLIKELY(++spinCount >= kMaxSpinCount)) {
1190 applyDeferredReaders(state, ctx, slot);
1191 return;
1192 }
1193 }
1194 }
1195
1196 template <class WaitContext>
1197 void applyDeferredReaders(uint32_t& state, WaitContext& ctx, uint32_t slot) {
1198#ifdef RUSAGE_THREAD
1199 struct rusage usage;
1200 std::memset(&usage, 0, sizeof(usage));
1201 long before = -1;
1202#endif
1203 for (uint32_t yieldCount = 0; yieldCount < kMaxSoftYieldCount;
1204 ++yieldCount) {
1205 for (int softState = 0; softState < 3; ++softState) {
1206 if (softState < 2) {
1207 std::this_thread::yield();
1208 } else {
1209#ifdef RUSAGE_THREAD
1210 getrusage(RUSAGE_THREAD, &usage);
1211#endif
1212 }
1213 while (!slotValueIsThis(
1214 deferredReader(slot)->load(std::memory_order_acquire))) {
1215 if (++slot == kMaxDeferredReaders) {
1216 return;
1217 }
1218 }
1219 if (ctx.shouldTimeOut()) {
1220 // finish applying immediately on timeout
1221 break;
1222 }
1223 }
1224#ifdef RUSAGE_THREAD
1225 if (before >= 0 && usage.ru_nivcsw >= before + 2) {
1226 // heuristic says run queue is not empty
1227 break;
1228 }
1229 before = usage.ru_nivcsw;
1230#endif
1231 }
1232
1233 uint32_t movedSlotCount = 0;
1234 for (; slot < kMaxDeferredReaders; ++slot) {
1235 auto slotPtr = deferredReader(slot);
1236 auto slotValue = slotPtr->load(std::memory_order_acquire);
1237 if (slotValueIsThis(slotValue) &&
1238 slotPtr->compare_exchange_strong(slotValue, 0)) {
1239 ++movedSlotCount;
1240 }
1241 }
1242
1243 if (movedSlotCount > 0) {
1244 state = (state_ += movedSlotCount * kIncrHasS);
1245 }
1246 assert((state & (kHasE | kBegunE)) != 0);
1247
1248 // if state + kIncrHasS overflows (off the end of state) then either
    // we have 2^(32-11) readers (almost certainly an application bug)
1250 // or we had an underflow (also a bug)
1251 assert(state < state + kIncrHasS);
1252 }
1253
  // It is straightforward to make a token-less lock_shared() and
1255 // unlock_shared() either by making the token-less version always use
1256 // INLINE_SHARED mode or by removing the token version. Supporting
1257 // deferred operation for both types is trickier than it appears, because
  // the purpose of the token is so that unlock_shared doesn't have to
1259 // look in other slots for its deferred lock. Token-less unlock_shared
1260 // might place a deferred lock in one place and then release a different
1261 // slot that was originally used by the token-ful version. If this was
1262 // important we could solve the problem by differentiating the deferred
1263 // locks so that cross-variety release wouldn't occur. The best way
1264 // is probably to steal a bit from the pointer, making deferredLocks[]
1265 // an array of Atom<uintptr_t>.
1266
1267 template <class WaitContext>
1268 bool lockSharedImpl(Token* token, WaitContext& ctx) {
1269 uint32_t state = state_.load(std::memory_order_relaxed);
1270 if ((state & (kHasS | kMayDefer | kHasE)) == 0 &&
1271 state_.compare_exchange_strong(state, state + kIncrHasS)) {
1272 if (token != nullptr) {
1273 token->type_ = Token::Type::INLINE_SHARED;
1274 }
1275 return true;
1276 }
1277 return lockSharedImpl(state, token, ctx);
1278 }
1279
1280 template <class WaitContext>
1281 bool lockSharedImpl(uint32_t& state, Token* token, WaitContext& ctx);
1282
1283 // Updates the state in/out argument as if the locks were made inline,
1284 // but does not update state_
1285 void cleanupTokenlessSharedDeferred(uint32_t& state) {
1286 for (uint32_t i = 0; i < kMaxDeferredReaders; ++i) {
1287 auto slotPtr = deferredReader(i);
1288 auto slotValue = slotPtr->load(std::memory_order_relaxed);
1289 if (slotValue == tokenlessSlotValue()) {
1290 slotPtr->store(0, std::memory_order_relaxed);
1291 state += kIncrHasS;
1292 if ((state & kHasS) == 0) {
1293 break;
1294 }
1295 }
1296 }
1297 }
1298
1299 bool tryUnlockTokenlessSharedDeferred();
1300
1301 bool tryUnlockSharedDeferred(uint32_t slot) {
1302 assert(slot < kMaxDeferredReaders);
1303 auto slotValue = tokenfulSlotValue();
1304 return deferredReader(slot)->compare_exchange_strong(slotValue, 0);
1305 }
1306
1307 uint32_t unlockSharedInline() {
1308 uint32_t state = (state_ -= kIncrHasS);
1309 assert(
1310 (state & (kHasE | kBegunE | kMayDefer)) != 0 ||
1311 state < state + kIncrHasS);
1312 if ((state & kHasS) == 0) {
1313 // Only the second half of lock() can be blocked by a non-zero
1314 // reader count, so that's the only thing we need to wake
1315 wakeRegisteredWaiters(state, kWaitingNotS);
1316 }
1317 return state;
1318 }
1319
1320 template <class WaitContext>
1321 bool lockUpgradeImpl(WaitContext& ctx) {
1322 uint32_t state;
1323 do {
1324 if (!waitForZeroBits(state, kHasSolo, kWaitingU, ctx)) {
1325 return false;
1326 }
1327 } while (!state_.compare_exchange_strong(state, state | kHasU));
1328 return true;
1329 }
1330
1331 public:
1332 class FOLLY_NODISCARD ReadHolder {
1333 ReadHolder() : lock_(nullptr) {}
1334
1335 public:
1336 explicit ReadHolder(const SharedMutexImpl* lock)
1337 : lock_(const_cast<SharedMutexImpl*>(lock)) {
1338 if (lock_) {
1339 lock_->lock_shared(token_);
1340 }
1341 }
1342
1343 explicit ReadHolder(const SharedMutexImpl& lock)
1344 : lock_(const_cast<SharedMutexImpl*>(&lock)) {
1345 lock_->lock_shared(token_);
1346 }
1347
1348 ReadHolder(ReadHolder&& rhs) noexcept
1349 : lock_(rhs.lock_), token_(rhs.token_) {
1350 rhs.lock_ = nullptr;
1351 }
1352
1353 // Downgrade from upgrade mode
1354 explicit ReadHolder(UpgradeHolder&& upgraded) : lock_(upgraded.lock_) {
1355 assert(upgraded.lock_ != nullptr);
1356 upgraded.lock_ = nullptr;
1357 lock_->unlock_upgrade_and_lock_shared(token_);
1358 }
1359
1360 // Downgrade from exclusive mode
1361 explicit ReadHolder(WriteHolder&& writer) : lock_(writer.lock_) {
1362 assert(writer.lock_ != nullptr);
1363 writer.lock_ = nullptr;
1364 lock_->unlock_and_lock_shared(token_);
1365 }
1366
1367 ReadHolder& operator=(ReadHolder&& rhs) noexcept {
1368 std::swap(lock_, rhs.lock_);
1369 std::swap(token_, rhs.token_);
1370 return *this;
1371 }
1372
1373 ReadHolder(const ReadHolder& rhs) = delete;
1374 ReadHolder& operator=(const ReadHolder& rhs) = delete;
1375
1376 ~ReadHolder() {
1377 unlock();
1378 }
1379
1380 void unlock() {
1381 if (lock_) {
1382 lock_->unlock_shared(token_);
1383 lock_ = nullptr;
1384 }
1385 }
1386
1387 private:
1388 friend class UpgradeHolder;
1389 friend class WriteHolder;
1390 SharedMutexImpl* lock_;
1391 SharedMutexToken token_;
1392 };
1393
1394 class FOLLY_NODISCARD UpgradeHolder {
1395 UpgradeHolder() : lock_(nullptr) {}
1396
1397 public:
1398 explicit UpgradeHolder(SharedMutexImpl* lock) : lock_(lock) {
1399 if (lock_) {
1400 lock_->lock_upgrade();
1401 }
1402 }
1403
1404 explicit UpgradeHolder(SharedMutexImpl& lock) : lock_(&lock) {
1405 lock_->lock_upgrade();
1406 }
1407
1408 // Downgrade from exclusive mode
1409 explicit UpgradeHolder(WriteHolder&& writer) : lock_(writer.lock_) {
1410 assert(writer.lock_ != nullptr);
1411 writer.lock_ = nullptr;
1412 lock_->unlock_and_lock_upgrade();
1413 }
1414
1415 UpgradeHolder(UpgradeHolder&& rhs) noexcept : lock_(rhs.lock_) {
1416 rhs.lock_ = nullptr;
1417 }
1418
1419 UpgradeHolder& operator=(UpgradeHolder&& rhs) noexcept {
1420 std::swap(lock_, rhs.lock_);
1421 return *this;
1422 }
1423
1424 UpgradeHolder(const UpgradeHolder& rhs) = delete;
1425 UpgradeHolder& operator=(const UpgradeHolder& rhs) = delete;
1426
1427 ~UpgradeHolder() {
1428 unlock();
1429 }
1430
1431 void unlock() {
1432 if (lock_) {
1433 lock_->unlock_upgrade();
1434 lock_ = nullptr;
1435 }
1436 }
1437
1438 private:
1439 friend class WriteHolder;
1440 friend class ReadHolder;
1441 SharedMutexImpl* lock_;
1442 };
1443
1444 class FOLLY_NODISCARD WriteHolder {
1445 WriteHolder() : lock_(nullptr) {}
1446
1447 public:
1448 explicit WriteHolder(SharedMutexImpl* lock) : lock_(lock) {
1449 if (lock_) {
1450 lock_->lock();
1451 }
1452 }
1453
1454 explicit WriteHolder(SharedMutexImpl& lock) : lock_(&lock) {
1455 lock_->lock();
1456 }
1457
1458 // Promotion from upgrade mode
1459 explicit WriteHolder(UpgradeHolder&& upgrade) : lock_(upgrade.lock_) {
1460 assert(upgrade.lock_ != nullptr);
1461 upgrade.lock_ = nullptr;
1462 lock_->unlock_upgrade_and_lock();
1463 }
1464
1465 // README:
1466 //
    // It is intended that WriteHolder(ReadHolder&& rhs) not exist.
1468 //
    // Shared locks (read) cannot safely be upgraded to unique locks (write).
1470 // That upgrade path is a well-known recipe for deadlock, so we explicitly
1471 // disallow it.
1472 //
1473 // If you need to do a conditional mutation, you have a few options:
1474 // 1. Check the condition under a shared lock and release it.
1475 // Then maybe check the condition again under a unique lock and maybe do
1476 // the mutation.
1477 // 2. Check the condition once under an upgradeable lock.
1478 // Then maybe upgrade the lock to a unique lock and do the mutation.
1479 // 3. Check the condition and maybe perform the mutation under a unique
1480 // lock.
1481 //
1482 // Relevant upgradeable lock notes:
1483 // * At most one upgradeable lock can be held at a time for a given shared
1484 // mutex, just like a unique lock.
1485 // * An upgradeable lock may be held concurrently with any number of shared
1486 // locks.
1487 // * An upgradeable lock may be upgraded atomically to a unique lock.
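    //
    // A sketch of option 2 (illustrative only; the condition, the container,
    // and maybeRebuild() are hypothetical):
    //
    //   void maybeRebuild(folly::SharedMutex& mutex, std::vector<int>& data) {
    //     // Upgrade mode excludes other upgraders and writers but not
    //     // readers, so the check does not block the read path.
    //     folly::SharedMutex::UpgradeHolder upgrade(mutex);
    //     if (data.empty()) {
    //       // Atomically promote to exclusive mode, then mutate.
    //       folly::SharedMutex::WriteHolder writer(std::move(upgrade));
    //       data.push_back(42);
    //     }
    //   }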

    WriteHolder(WriteHolder&& rhs) noexcept : lock_(rhs.lock_) {
      rhs.lock_ = nullptr;
    }

    WriteHolder& operator=(WriteHolder&& rhs) noexcept {
      std::swap(lock_, rhs.lock_);
      return *this;
    }

    WriteHolder(const WriteHolder& rhs) = delete;
    WriteHolder& operator=(const WriteHolder& rhs) = delete;

    ~WriteHolder() {
      unlock();
    }

    void unlock() {
      if (lock_) {
        lock_->unlock();
        lock_ = nullptr;
      }
    }

   private:
    friend class ReadHolder;
    friend class UpgradeHolder;
    SharedMutexImpl* lock_;
  };

  // Adapters for Synchronized<>
  friend void acquireRead(SharedMutexImpl& lock) {
    lock.lock_shared();
  }
  friend void acquireReadWrite(SharedMutexImpl& lock) {
    lock.lock();
  }
  friend void releaseRead(SharedMutexImpl& lock) {
    lock.unlock_shared();
  }
  friend void releaseReadWrite(SharedMutexImpl& lock) {
    lock.unlock();
  }
  friend bool acquireRead(SharedMutexImpl& lock, unsigned int ms) {
    return lock.try_lock_shared_for(std::chrono::milliseconds(ms));
  }
  friend bool acquireReadWrite(SharedMutexImpl& lock, unsigned int ms) {
    return lock.try_lock_for(std::chrono::milliseconds(ms));
  }
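
  // These adapters exist so that SharedMutex can be used as the mutex type
  // of a folly::Synchronized<>.  A minimal sketch (illustrative only,
  // assuming folly/Synchronized.h is included; the data layout is
  // hypothetical):
  //
  //   folly::Synchronized<std::vector<int>, folly::SharedMutex> items;
  //
  //   items.wlock()->push_back(1);       // exclusive access
  //   size_t n = items.rlock()->size();  // shared access; concurrent
  //                                      // readers proceed in parallel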
};

typedef SharedMutexImpl<true> SharedMutexReadPriority;
typedef SharedMutexImpl<false> SharedMutexWritePriority;
typedef SharedMutexWritePriority SharedMutex;
typedef SharedMutexImpl<false, void, std::atomic, false, false>
    SharedMutexSuppressTSAN;
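
// A minimal usage sketch for the aliases above (illustrative only; most
// code can simply use folly::SharedMutex).  The standard guards shown here
// are one of several ways to hold the lock:
//
//   #include <mutex>
//   #include <shared_mutex>
//
//   folly::SharedMutex mutex;
//
//   void reader() {
//     std::shared_lock<folly::SharedMutex> guard(mutex); // lock_shared()
//     // ... read shared state ...
//   }
//
//   void writer() {
//     std::unique_lock<folly::SharedMutex> guard(mutex); // lock()
//     // ... mutate shared state ...
//   }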

// Prevent the compiler from instantiating these in other translation units.
// They are instantiated once in SharedMutex.cpp
extern template class SharedMutexImpl<true>;
extern template class SharedMutexImpl<false>;
1549
template <
    bool ReaderPriority,
    typename Tag_,
    template <typename> class Atom,
    bool BlockImmediately,
    bool AnnotateForThreadSanitizer>
alignas(hardware_destructive_interference_size) typename SharedMutexImpl<
    ReaderPriority,
    Tag_,
    Atom,
    BlockImmediately,
    AnnotateForThreadSanitizer>::DeferredReaderSlot
    SharedMutexImpl<
        ReaderPriority,
        Tag_,
        Atom,
        BlockImmediately,
        AnnotateForThreadSanitizer>::deferredReaders
        [kMaxDeferredReaders * kDeferredSeparationFactor] = {};

template <
    bool ReaderPriority,
    typename Tag_,
    template <typename> class Atom,
    bool BlockImmediately,
    bool AnnotateForThreadSanitizer>
FOLLY_SHAREDMUTEX_TLS uint32_t SharedMutexImpl<
    ReaderPriority,
    Tag_,
    Atom,
    BlockImmediately,
    AnnotateForThreadSanitizer>::tls_lastTokenlessSlot = 0;

template <
    bool ReaderPriority,
    typename Tag_,
    template <typename> class Atom,
    bool BlockImmediately,
    bool AnnotateForThreadSanitizer>
FOLLY_SHAREDMUTEX_TLS uint32_t SharedMutexImpl<
    ReaderPriority,
    Tag_,
    Atom,
    BlockImmediately,
    AnnotateForThreadSanitizer>::tls_lastDeferredReaderSlot = 0;

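// Releases one token-less deferred shared lock: scans the deferred-reader
// slots, starting from the slot this thread most recently used for a
// token-less acquisition, and clears the first slot found holding the
// token-less marker value.  Returns false if no such slot remains, in which
// case the caller releases the inline reader count instead (see the fallback
// to unlockSharedInline() in lockSharedImpl below).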
template <
    bool ReaderPriority,
    typename Tag_,
    template <typename> class Atom,
    bool BlockImmediately,
    bool AnnotateForThreadSanitizer>
bool SharedMutexImpl<
    ReaderPriority,
    Tag_,
    Atom,
    BlockImmediately,
    AnnotateForThreadSanitizer>::tryUnlockTokenlessSharedDeferred() {
  auto bestSlot = tls_lastTokenlessSlot;
  for (uint32_t i = 0; i < kMaxDeferredReaders; ++i) {
    auto slotPtr = deferredReader(bestSlot ^ i);
    auto slotValue = slotPtr->load(std::memory_order_relaxed);
    if (slotValue == tokenlessSlotValue() &&
        slotPtr->compare_exchange_strong(slotValue, 0)) {
      tls_lastTokenlessSlot = bestSlot ^ i;
      return true;
    }
  }
  return false;
}

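// Slow path for shared acquisition, used by the lock_shared() family of
// operations.  Each pass of the loop first waits (subject to the
// WaitContext's blocking and timeout policy) for the exclusive bit to
// clear, then either records the reader inline in state_ or parks it in a
// deferred-reader slot chosen via AccessSpreader, so that concurrent
// readers tend to land on different slots.  Returns false only when the
// WaitContext permits timing out and the wait fails.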
template <
    bool ReaderPriority,
    typename Tag_,
    template <typename> class Atom,
    bool BlockImmediately,
    bool AnnotateForThreadSanitizer>
template <class WaitContext>
bool SharedMutexImpl<
    ReaderPriority,
    Tag_,
    Atom,
    BlockImmediately,
    AnnotateForThreadSanitizer>::
    lockSharedImpl(uint32_t& state, Token* token, WaitContext& ctx) {
  while (true) {
    if (UNLIKELY((state & kHasE) != 0) &&
        !waitForZeroBits(state, kHasE, kWaitingS, ctx) && ctx.canTimeOut()) {
      return false;
    }

    uint32_t slot = tls_lastDeferredReaderSlot;
    uintptr_t slotValue = 1; // any non-zero value will do

    bool canAlreadyDefer = (state & kMayDefer) != 0;
    bool aboveDeferThreshold =
        (state & kHasS) >= (kNumSharedToStartDeferring - 1) * kIncrHasS;
    bool drainInProgress = ReaderPriority && (state & kBegunE) != 0;
    if (canAlreadyDefer || (aboveDeferThreshold && !drainInProgress)) {
      // Try using the most recent slot first.
      slotValue = deferredReader(slot)->load(std::memory_order_relaxed);
      if (slotValue != 0) {
        // starting point for our empty-slot search, can change after
        // calling waitForZeroBits
        uint32_t bestSlot =
            (uint32_t)folly::AccessSpreader<Atom>::current(kMaxDeferredReaders);

        // deferred readers are already enabled, or it is time to
        // enable them if we can find a slot
        for (uint32_t i = 0; i < kDeferredSearchDistance; ++i) {
          slot = bestSlot ^ i;
          assert(slot < kMaxDeferredReaders);
          slotValue = deferredReader(slot)->load(std::memory_order_relaxed);
          if (slotValue == 0) {
            // found empty slot
            tls_lastDeferredReaderSlot = slot;
            break;
          }
        }
      }
    }

    if (slotValue != 0) {
      // not yet deferred, or no empty slots
      if (state_.compare_exchange_strong(state, state + kIncrHasS)) {
        // successfully recorded the read lock inline
        if (token != nullptr) {
          token->type_ = Token::Type::INLINE_SHARED;
        }
        return true;
      }
      // state is updated, try again
      continue;
    }

    // record that deferred readers might be in use if necessary
    if ((state & kMayDefer) == 0) {
      if (!state_.compare_exchange_strong(state, state | kMayDefer)) {
        // keep going if CAS failed because somebody else set the bit
        // for us
        if ((state & (kHasE | kMayDefer)) != kMayDefer) {
          continue;
        }
      }
      // Either our CAS succeeded or somebody else set kMayDefer for us, so
      // conceptually state |= kMayDefer at this point; we skip updating the
      // local copy because state is reloaded below after the slot CAS.
    }

    // try to use the slot
    bool gotSlot = deferredReader(slot)->compare_exchange_strong(
        slotValue,
        token == nullptr ? tokenlessSlotValue() : tokenfulSlotValue());

    // If we got the slot, we need to verify that an exclusive lock
    // didn't happen since we last checked.  If we didn't get the slot we
    // need to recheck state_ anyway to make sure we don't waste too much
    // work.  It is also possible that since we checked state_ someone
    // has acquired and released the write lock, clearing kMayDefer.
    // Both cases are covered by looking for the readers-possible bit,
    // because it is off when the exclusive lock bit is set.
    state = state_.load(std::memory_order_acquire);

    if (!gotSlot) {
      continue;
    }

    if (token == nullptr) {
      tls_lastTokenlessSlot = slot;
    }

    if ((state & kMayDefer) != 0) {
      assert((state & kHasE) == 0);
      // success
      if (token != nullptr) {
        token->type_ = Token::Type::DEFERRED_SHARED;
        token->slot_ = (uint16_t)slot;
      }
      return true;
    }

    // release the slot before retrying
    if (token == nullptr) {
      // We can't rely on slot.  Token-less slot values can be freed by
      // any unlock_shared(), so we need to do the full deferredReader
      // search during unlock.  Unlike unlock_shared(), we can't trust
      // kPrevDefer here.  This deferred lock isn't visible to lock()
      // (that's the whole reason we're undoing it) so there might have
      // subsequently been an unlock() and lock() with no intervening
      // transition to deferred mode.
      if (!tryUnlockTokenlessSharedDeferred()) {
        unlockSharedInline();
      }
    } else {
      if (!tryUnlockSharedDeferred(slot)) {
        unlockSharedInline();
      }
    }

    // We got here not because the lock was unavailable, but because
    // we lost a compare-and-swap.  Try-lock is typically allowed to
    // have spurious failures, but there is no lock efficiency gain
    // from exploiting that freedom here.
  }
}

} // namespace folly