1 | /* |
2 | * Copyright (c) Facebook, Inc. and its affiliates. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
16 | |
17 | // @author Nathan Bronson ([email protected]) |
18 | |
19 | #pragma once |
20 | |
21 | #include <stdint.h> |
22 | |
#include <atomic>
#include <chrono>
#include <cstring>
#include <limits>
#include <mutex>
#include <thread>
#include <type_traits>
26 | |
27 | #include <folly/CPortability.h> |
28 | #include <folly/Likely.h> |
29 | #include <folly/concurrency/CacheLocality.h> |
30 | #include <folly/detail/Futex.h> |
31 | #include <folly/portability/Asm.h> |
32 | #include <folly/portability/SysResource.h> |
33 | #include <folly/synchronization/SanitizeThread.h> |
34 | |
35 | // SharedMutex is a reader-writer lock. It is small, very fast, scalable |
36 | // on multi-core, and suitable for use when readers or writers may block. |
37 | // Unlike most other reader-writer locks, its throughput with concurrent |
38 | // readers scales linearly; it is able to acquire and release the lock |
39 | // in shared mode without cache line ping-ponging. It is suitable for |
40 | // a wide range of lock hold times because it starts with spinning, |
41 | // proceeds to using sched_yield with a preemption heuristic, and then |
42 | // waits using futex and precise wakeups. |
43 | // |
44 | // SharedMutex provides all of the methods of folly::RWSpinLock, |
45 | // boost::shared_mutex, boost::upgrade_mutex, and C++14's |
46 | // std::shared_timed_mutex. All operations that can block are available |
47 | // in try, try-for, and try-until (system_clock or steady_clock) versions. |
48 | // |
49 | // SharedMutexReadPriority gives priority to readers, |
50 | // SharedMutexWritePriority gives priority to writers. SharedMutex is an |
51 | // alias for SharedMutexWritePriority, because writer starvation is more |
52 | // likely than reader starvation for the read-heavy workloads targeted |
53 | // by SharedMutex. |
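//
// A minimal usage sketch (the std::map and the surrounding functions are
// illustrative, not part of this header; SharedMutex itself is the
// write-priority alias described above):
//
//   folly::SharedMutex mutex;
//   std::map<int, int> table; // protected by mutex
//
//   int readValue(int key) {
//     mutex.lock_shared(); // many readers may hold this concurrently
//     auto it = table.find(key);
//     int v = (it == table.end()) ? -1 : it->second;
//     mutex.unlock_shared();
//     return v;
//   }
//
//   void writeValue(int key, int v) {
//     mutex.lock(); // exclusive; waits for readers to drain
//     table[key] = v;
//     mutex.unlock();
//   }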
54 | // |
55 | // In my tests SharedMutex is as good or better than the other |
56 | // reader-writer locks in use at Facebook for almost all use cases, |
57 | // sometimes by a wide margin. (If it is rare that there are actually |
58 | // concurrent readers then RWSpinLock can be a few nanoseconds faster.) |
59 | // I compared it to folly::RWSpinLock, folly::RWTicketSpinLock64, |
60 | // boost::shared_mutex, pthread_rwlock_t, and a RWLock that internally uses |
61 | // spinlocks to guard state and pthread_mutex_t+pthread_cond_t to block. |
// (Thrift's ReadWriteMutex is built on pthread_rwlock_t underneath.)
63 | // It is generally as good or better than the rest when evaluating size, |
64 | // speed, scalability, or latency outliers. In the corner cases where |
65 | // it is not the fastest (such as single-threaded use or heavy write |
66 | // contention) it is never very much worse than the best. See the bottom |
67 | // of folly/test/SharedMutexTest.cpp for lots of microbenchmark results. |
68 | // |
69 | // Comparison to folly::RWSpinLock: |
70 | // |
71 | // * SharedMutex is faster than RWSpinLock when there are actually |
72 | // concurrent read accesses (sometimes much faster), and ~5 nanoseconds |
73 | // slower when there is not actually any contention. SharedMutex is |
74 | // faster in every (benchmarked) scenario where the shared mode of |
75 | // the lock is actually useful. |
76 | // |
77 | // * Concurrent shared access to SharedMutex scales linearly, while total |
78 | // RWSpinLock throughput drops as more threads try to access the lock |
79 | // in shared mode. Under very heavy read contention SharedMutex can |
80 | // be two orders of magnitude faster than RWSpinLock (or any reader |
81 | // writer lock that doesn't use striping or deferral). |
82 | // |
83 | // * SharedMutex can safely protect blocking calls, because after an |
84 | // initial period of spinning it waits using futex(). |
85 | // |
86 | // * RWSpinLock prioritizes readers, SharedMutex has both reader- and |
87 | // writer-priority variants, but defaults to write priority. |
88 | // |
89 | // * RWSpinLock's upgradeable mode blocks new readers, while SharedMutex's |
90 | // doesn't. Both semantics are reasonable. The boost documentation |
91 | // doesn't explicitly talk about this behavior (except by omitting |
92 | // any statement that those lock modes conflict), but the boost |
93 | // implementations do allow new readers while the upgradeable mode |
94 | // is held. See https://github.com/boostorg/thread/blob/master/ |
95 | // include/boost/thread/pthread/shared_mutex.hpp |
96 | // |
97 | // * RWSpinLock::UpgradedHolder maps to SharedMutex::UpgradeHolder |
98 | // (UpgradeableHolder would be even more pedantically correct). |
99 | // SharedMutex's holders have fewer methods (no reset) and are less |
100 | // tolerant (promotion and downgrade crash if the donor doesn't own |
101 | // the lock, and you must use the default constructor rather than |
102 | // passing a nullptr to the pointer constructor). |
103 | // |
104 | // Both SharedMutex and RWSpinLock provide "exclusive", "upgrade", |
105 | // and "shared" modes. At all times num_threads_holding_exclusive + |
106 | // num_threads_holding_upgrade <= 1, and num_threads_holding_exclusive == |
107 | // 0 || num_threads_holding_shared == 0. RWSpinLock has the additional |
108 | // constraint that num_threads_holding_shared cannot increase while |
109 | // num_threads_holding_upgrade is non-zero. |
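//
// As a sketch of those rules (illustrative; the interleaving shown is just
// one legal schedule):
//
//   folly::SharedMutex mu;
//   // thread 1
//   mu.lock_upgrade();            // at most one upgrade/exclusive owner
//   // thread 2, concurrently
//   mu.lock_shared();             // ok: shared coexists with upgrade mode
//   mu.unlock_shared();
//   // thread 1, after deciding it needs to write
//   mu.unlock_upgrade_and_lock(); // converts to exclusive, waits for readers
//   mu.unlock();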
110 | // |
111 | // Comparison to the internal RWLock: |
112 | // |
113 | // * SharedMutex doesn't allow a maximum reader count to be configured, |
114 | // so it can't be used as a semaphore in the same way as RWLock. |
115 | // |
116 | // * SharedMutex is 4 bytes, RWLock is 256. |
117 | // |
118 | // * SharedMutex is as fast or faster than RWLock in all of my |
119 | // microbenchmarks, and has positive rather than negative scalability. |
120 | // |
121 | // * RWLock and SharedMutex are both writer priority locks. |
122 | // |
123 | // * SharedMutex avoids latency outliers as well as RWLock. |
124 | // |
125 | // * SharedMutex uses different names (t != 0 below): |
126 | // |
127 | // RWLock::lock(0) => SharedMutex::lock() |
128 | // |
129 | // RWLock::lock(t) => SharedMutex::try_lock_for(milliseconds(t)) |
130 | // |
131 | // RWLock::tryLock() => SharedMutex::try_lock() |
132 | // |
133 | // RWLock::unlock() => SharedMutex::unlock() |
134 | // |
135 | // RWLock::enter(0) => SharedMutex::lock_shared() |
136 | // |
137 | // RWLock::enter(t) => |
138 | // SharedMutex::try_lock_shared_for(milliseconds(t)) |
139 | // |
140 | // RWLock::tryEnter() => SharedMutex::try_lock_shared() |
141 | // |
142 | // RWLock::leave() => SharedMutex::unlock_shared() |
143 | // |
144 | // * RWLock allows the reader count to be adjusted by a value other |
145 | // than 1 during enter() or leave(). SharedMutex doesn't currently |
146 | // implement this feature. |
147 | // |
148 | // * RWLock's methods are marked const, SharedMutex's aren't. |
149 | // |
150 | // Reader-writer locks have the potential to allow concurrent access |
151 | // to shared read-mostly data, but in practice they often provide no |
152 | // improvement over a mutex. The problem is the cache coherence protocol |
153 | // of modern CPUs. Coherence is provided by making sure that when a cache |
154 | // line is written it is present in only one core's cache. Since a memory |
155 | // write is required to acquire a reader-writer lock in shared mode, the |
156 | // cache line holding the lock is invalidated in all of the other caches. |
157 | // This leads to cache misses when another thread wants to acquire or |
158 | // release the lock concurrently. When the RWLock is colocated with the |
// data it protects (common), cache misses can also continue to occur when
160 | // a thread that already holds the lock tries to read the protected data. |
161 | // |
162 | // Ideally, a reader-writer lock would allow multiple cores to acquire |
163 | // and release the lock in shared mode without incurring any cache misses. |
// This requires that each core record its shared access in a cache line
165 | // that isn't read or written by other read-locking cores. (Writers will |
166 | // have to check all of the cache lines.) Typical server hardware when |
167 | // this comment was written has 16 L1 caches and cache lines of 64 bytes, |
168 | // so a lock striped over all L1 caches would occupy a prohibitive 1024 |
169 | // bytes. Nothing says that we need a separate set of per-core memory |
170 | // locations for each lock, however. Each SharedMutex instance is only |
171 | // 4 bytes, but all locks together share a 2K area in which they make a |
172 | // core-local record of lock acquisitions. |
173 | // |
174 | // SharedMutex's strategy of using a shared set of core-local stripes has |
175 | // a potential downside, because it means that acquisition of any lock in |
176 | // write mode can conflict with acquisition of any lock in shared mode. |
177 | // If a lock instance doesn't actually experience concurrency then this |
// downside will outweigh the upside of improved scalability for readers.
179 | // To avoid this problem we dynamically detect concurrent accesses to |
180 | // SharedMutex, and don't start using the deferred mode unless we actually |
181 | // observe concurrency. See kNumSharedToStartDeferring. |
182 | // |
183 | // It is explicitly allowed to call unlock_shared() from a different |
184 | // thread than lock_shared(), so long as they are properly paired. |
185 | // unlock_shared() needs to find the location at which lock_shared() |
186 | // recorded the lock, which might be in the lock itself or in any of |
187 | // the shared slots. If you can conveniently pass state from lock |
188 | // acquisition to release then the fastest mechanism is to std::move |
// the SharedMutex::ReadHolder instance or a SharedMutex::Token (using
190 | // lock_shared(Token&) and unlock_shared(Token&)). The guard or token |
191 | // will tell unlock_shared where in deferredReaders[] to look for the |
192 | // deferred lock. The Token-less version of unlock_shared() works in all |
193 | // cases, but is optimized for the common (no inter-thread handoff) case. |
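//
// A sketch of the token-based fast path (the inter-thread handoff is elided;
// Token and ReadHolder are the types declared later in this header):
//
//   folly::SharedMutex mu;
//   folly::SharedMutex::Token token;
//
//   mu.lock_shared(token);   // records which deferred slot (if any) was used
//   // ... read the protected data, possibly on another thread ...
//   mu.unlock_shared(token); // releases exactly the recorded slot
//
//   // RAII equivalent: moving the holder carries its token along with it.
//   folly::SharedMutex::ReadHolder holder(mu);
//   folly::SharedMutex::ReadHolder moved(std::move(holder));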
194 | // |
195 | // In both read- and write-priority mode, a waiting lock() (exclusive mode) |
196 | // only blocks readers after it has waited for an active upgrade lock to be |
197 | // released; until the upgrade lock is released (or upgraded or downgraded) |
198 | // readers will still be able to enter. Preferences about lock acquisition |
199 | // are not guaranteed to be enforced perfectly (even if they were, there |
200 | // is theoretically the chance that a thread could be arbitrarily suspended |
201 | // between calling lock() and SharedMutex code actually getting executed). |
202 | // |
203 | // try_*_for methods always try at least once, even if the duration |
204 | // is zero or negative. The duration type must be compatible with |
205 | // std::chrono::steady_clock. try_*_until methods also always try at |
206 | // least once. std::chrono::system_clock and std::chrono::steady_clock |
207 | // are supported. |
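//
// For example (a sketch; the timeout values are arbitrary):
//
//   using namespace std::chrono;
//   folly::SharedMutex mu;
//
//   if (mu.try_lock_for(milliseconds(10))) {   // relative, steady_clock based
//     // ... exclusive access ...
//     mu.unlock();
//   }
//   if (mu.try_lock_shared_until(steady_clock::now() + microseconds(50))) {
//     // ... shared access ...
//     mu.unlock_shared();
//   }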
208 | // |
209 | // If you have observed by profiling that your SharedMutex-s are getting |
210 | // cache misses on deferredReaders[] due to another SharedMutex user, then |
211 | // you can use the tag type to create your own instantiation of the type. |
212 | // The contention threshold (see kNumSharedToStartDeferring) should make |
213 | // this unnecessary in all but the most extreme cases. Make sure to check |
214 | // that the increased icache and dcache footprint of the tagged result is |
215 | // worth it. |
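//
// A sketch of a tagged instantiation (MySubsystemTag is a placeholder name;
// the template parameters follow SharedMutexImpl as declared below):
//
//   struct MySubsystemTag {};
//   using MySubsystemSharedMutex =
//       folly::SharedMutexImpl</* ReaderPriority = */ false, MySubsystemTag>;
//
// Each distinct instantiation gets its own static deferredReaders[] array,
// so tagged locks cannot contend on deferred slots with unrelated
// SharedMutex users.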
216 | |
217 | // SharedMutex's use of thread local storage is an optimization, so |
218 | // for the case where thread local storage is not supported, define it |
219 | // away. |
220 | |
221 | // Note about TSAN (ThreadSanitizer): the SharedMutexWritePriority version |
222 | // (the default) of this mutex is annotated appropriately so that TSAN can |
223 | // perform lock inversion analysis. However, the SharedMutexReadPriority version |
224 | // is not annotated. This is because TSAN's lock order heuristic |
225 | // assumes that two calls to lock_shared must be ordered, which leads |
226 | // to too many false positives for the reader-priority case. |
227 | // |
228 | // Suppose thread A holds a SharedMutexWritePriority lock in shared mode and an |
229 | // independent thread B is waiting for exclusive access. Then a thread C's |
230 | // lock_shared can't proceed until A has released the lock. Discounting |
231 | // situations that never use exclusive mode (so no lock is necessary at all) |
232 | // this means that without higher-level reasoning it is not safe to ignore |
233 | // reader <-> reader interactions. |
234 | // |
235 | // This reasoning does not apply to SharedMutexReadPriority, because there are |
236 | // no actions by a thread B that can make C need to wait for A. Since the |
237 | // overwhelming majority of SharedMutex instances use write priority, we |
238 | // restrict the TSAN annotations to only SharedMutexWritePriority. |
239 | |
240 | #ifndef FOLLY_SHAREDMUTEX_TLS |
241 | #if !FOLLY_MOBILE |
242 | #define FOLLY_SHAREDMUTEX_TLS FOLLY_TLS |
243 | #else |
244 | #define FOLLY_SHAREDMUTEX_TLS |
245 | #endif |
246 | #endif |
247 | |
248 | namespace folly { |
249 | |
250 | struct SharedMutexToken { |
251 | enum class Type : uint16_t { |
252 | INVALID = 0, |
253 | INLINE_SHARED, |
254 | DEFERRED_SHARED, |
255 | }; |
256 | |
257 | Type type_; |
258 | uint16_t slot_; |
259 | }; |
260 | |
261 | namespace detail { |
262 | // Returns a guard that gives permission for the current thread to |
263 | // annotate, and adjust the annotation bits in, the SharedMutex at ptr. |
264 | std::unique_lock<std::mutex> sharedMutexAnnotationGuard(void* ptr); |
265 | } // namespace detail |
266 | |
267 | template < |
268 | bool ReaderPriority, |
269 | typename Tag_ = void, |
270 | template <typename> class Atom = std::atomic, |
271 | bool BlockImmediately = false, |
272 | bool AnnotateForThreadSanitizer = kIsSanitizeThread && !ReaderPriority> |
273 | class SharedMutexImpl { |
274 | public: |
275 | static constexpr bool kReaderPriority = ReaderPriority; |
276 | |
277 | typedef Tag_ Tag; |
278 | |
279 | typedef SharedMutexToken Token; |
280 | |
281 | class FOLLY_NODISCARD ReadHolder; |
282 | class FOLLY_NODISCARD UpgradeHolder; |
283 | class FOLLY_NODISCARD WriteHolder; |
284 | |
285 | constexpr SharedMutexImpl() noexcept : state_(0) {} |
286 | |
287 | SharedMutexImpl(const SharedMutexImpl&) = delete; |
288 | SharedMutexImpl(SharedMutexImpl&&) = delete; |
289 | SharedMutexImpl& operator=(const SharedMutexImpl&) = delete; |
290 | SharedMutexImpl& operator=(SharedMutexImpl&&) = delete; |
291 | |
// It is an error to destroy a SharedMutex that still has
293 | // any outstanding locks. This is checked if NDEBUG isn't defined. |
294 | // SharedMutex's exclusive mode can be safely used to guard the lock's |
295 | // own destruction. If, for example, you acquire the lock in exclusive |
296 | // mode and then observe that the object containing the lock is no longer |
297 | // needed, you can unlock() and then immediately destroy the lock. |
298 | // See https://sourceware.org/bugzilla/show_bug.cgi?id=13690 for a |
// description of why this property needs to be explicitly mentioned.
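// For example (a sketch; 'node' and its reference count are illustrative):
//
//   node->mutex_.lock();
//   bool lastRef = --node->refCount_ == 0;
//   node->mutex_.unlock();
//   if (lastRef) {
//     delete node; // ok: the exclusive section above guarded the decision
//   }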
300 | ~SharedMutexImpl() { |
301 | auto state = state_.load(std::memory_order_relaxed); |
302 | if (UNLIKELY((state & kHasS) != 0)) { |
303 | cleanupTokenlessSharedDeferred(state); |
304 | } |
305 | |
306 | if (folly::kIsDebug) { |
307 | // These asserts check that everybody has released the lock before it |
308 | // is destroyed. If you arrive here while debugging that is likely |
309 | // the problem. (You could also have general heap corruption.) |
310 | |
311 | // if a futexWait fails to go to sleep because the value has been |
312 | // changed, we don't necessarily clean up the wait bits, so it is |
313 | // possible they will be set here in a correct system |
314 | assert((state & ~(kWaitingAny | kMayDefer | kAnnotationCreated)) == 0); |
315 | if ((state & kMayDefer) != 0) { |
316 | for (uint32_t slot = 0; slot < kMaxDeferredReaders; ++slot) { |
317 | auto slotValue = |
318 | deferredReader(slot)->load(std::memory_order_relaxed); |
319 | assert(!slotValueIsThis(slotValue)); |
320 | (void)slotValue; |
321 | } |
322 | } |
323 | } |
324 | annotateDestroy(); |
325 | } |
326 | |
327 | // Checks if an exclusive lock could succeed so that lock elision could be |
// enabled. Unlike the two eligible_for_lock_{upgrade|shared}_elision
// functions below, this is a conservative check, since kMayDefer only
// indicates that deferred readers may exist.
331 | bool eligible_for_lock_elision() const { |
332 | // We rely on the transaction for linearization. Wait bits are |
333 | // irrelevant because a successful transaction will be in and out |
334 | // without affecting the wakeup. kBegunE is also okay for a similar |
335 | // reason. |
336 | auto state = state_.load(std::memory_order_relaxed); |
337 | return (state & (kHasS | kMayDefer | kHasE | kHasU)) == 0; |
338 | } |
339 | |
340 | // Checks if an upgrade lock could succeed so that lock elision could be |
341 | // enabled. |
342 | bool eligible_for_lock_upgrade_elision() const { |
343 | auto state = state_.load(std::memory_order_relaxed); |
344 | return (state & (kHasE | kHasU)) == 0; |
345 | } |
346 | |
347 | // Checks if a shared lock could succeed so that lock elision could be |
348 | // enabled. |
349 | bool eligible_for_lock_shared_elision() const { |
350 | // No need to honor kBegunE because a transaction doesn't block anybody |
351 | auto state = state_.load(std::memory_order_relaxed); |
352 | return (state & kHasE) == 0; |
353 | } |
354 | |
355 | void lock() { |
356 | WaitForever ctx; |
357 | (void)lockExclusiveImpl(kHasSolo, ctx); |
358 | annotateAcquired(annotate_rwlock_level::wrlock); |
359 | } |
360 | |
361 | bool try_lock() { |
362 | WaitNever ctx; |
363 | auto result = lockExclusiveImpl(kHasSolo, ctx); |
364 | annotateTryAcquired(result, annotate_rwlock_level::wrlock); |
365 | return result; |
366 | } |
367 | |
368 | template <class Rep, class Period> |
369 | bool try_lock_for(const std::chrono::duration<Rep, Period>& duration) { |
370 | WaitForDuration<Rep, Period> ctx(duration); |
371 | auto result = lockExclusiveImpl(kHasSolo, ctx); |
372 | annotateTryAcquired(result, annotate_rwlock_level::wrlock); |
373 | return result; |
374 | } |
375 | |
376 | template <class Clock, class Duration> |
377 | bool try_lock_until( |
378 | const std::chrono::time_point<Clock, Duration>& absDeadline) { |
379 | WaitUntilDeadline<Clock, Duration> ctx{absDeadline}; |
380 | auto result = lockExclusiveImpl(kHasSolo, ctx); |
381 | annotateTryAcquired(result, annotate_rwlock_level::wrlock); |
382 | return result; |
383 | } |
384 | |
385 | void unlock() { |
386 | annotateReleased(annotate_rwlock_level::wrlock); |
387 | // It is possible that we have a left-over kWaitingNotS if the last |
388 | // unlock_shared() that let our matching lock() complete finished |
389 | // releasing before lock()'s futexWait went to sleep. Clean it up now |
390 | auto state = (state_ &= ~(kWaitingNotS | kPrevDefer | kHasE)); |
391 | assert((state & ~(kWaitingAny | kAnnotationCreated)) == 0); |
392 | wakeRegisteredWaiters(state, kWaitingE | kWaitingU | kWaitingS); |
393 | } |
394 | |
395 | // Managing the token yourself makes unlock_shared a bit faster |
396 | |
397 | void lock_shared() { |
398 | WaitForever ctx; |
399 | (void)lockSharedImpl(nullptr, ctx); |
400 | annotateAcquired(annotate_rwlock_level::rdlock); |
401 | } |
402 | |
403 | void lock_shared(Token& token) { |
404 | WaitForever ctx; |
405 | (void)lockSharedImpl(&token, ctx); |
406 | annotateAcquired(annotate_rwlock_level::rdlock); |
407 | } |
408 | |
409 | bool try_lock_shared() { |
410 | WaitNever ctx; |
411 | auto result = lockSharedImpl(nullptr, ctx); |
412 | annotateTryAcquired(result, annotate_rwlock_level::rdlock); |
413 | return result; |
414 | } |
415 | |
416 | bool try_lock_shared(Token& token) { |
417 | WaitNever ctx; |
418 | auto result = lockSharedImpl(&token, ctx); |
419 | annotateTryAcquired(result, annotate_rwlock_level::rdlock); |
420 | return result; |
421 | } |
422 | |
423 | template <class Rep, class Period> |
424 | bool try_lock_shared_for(const std::chrono::duration<Rep, Period>& duration) { |
425 | WaitForDuration<Rep, Period> ctx(duration); |
426 | auto result = lockSharedImpl(nullptr, ctx); |
427 | annotateTryAcquired(result, annotate_rwlock_level::rdlock); |
428 | return result; |
429 | } |
430 | |
431 | template <class Rep, class Period> |
432 | bool try_lock_shared_for( |
433 | const std::chrono::duration<Rep, Period>& duration, |
434 | Token& token) { |
435 | WaitForDuration<Rep, Period> ctx(duration); |
436 | auto result = lockSharedImpl(&token, ctx); |
437 | annotateTryAcquired(result, annotate_rwlock_level::rdlock); |
438 | return result; |
439 | } |
440 | |
441 | template <class Clock, class Duration> |
442 | bool try_lock_shared_until( |
443 | const std::chrono::time_point<Clock, Duration>& absDeadline) { |
444 | WaitUntilDeadline<Clock, Duration> ctx{absDeadline}; |
445 | auto result = lockSharedImpl(nullptr, ctx); |
446 | annotateTryAcquired(result, annotate_rwlock_level::rdlock); |
447 | return result; |
448 | } |
449 | |
450 | template <class Clock, class Duration> |
451 | bool try_lock_shared_until( |
452 | const std::chrono::time_point<Clock, Duration>& absDeadline, |
453 | Token& token) { |
454 | WaitUntilDeadline<Clock, Duration> ctx{absDeadline}; |
455 | auto result = lockSharedImpl(&token, ctx); |
456 | annotateTryAcquired(result, annotate_rwlock_level::rdlock); |
457 | return result; |
458 | } |
459 | |
460 | void unlock_shared() { |
461 | annotateReleased(annotate_rwlock_level::rdlock); |
462 | |
463 | auto state = state_.load(std::memory_order_acquire); |
464 | |
465 | // kPrevDefer can only be set if HasE or BegunE is set |
466 | assert((state & (kPrevDefer | kHasE | kBegunE)) != kPrevDefer); |
467 | |
468 | // lock() strips kMayDefer immediately, but then copies it to |
469 | // kPrevDefer so we can tell if the pre-lock() lock_shared() might |
470 | // have deferred |
471 | if ((state & (kMayDefer | kPrevDefer)) == 0 || |
472 | !tryUnlockTokenlessSharedDeferred()) { |
473 | // Matching lock_shared() couldn't have deferred, or the deferred |
474 | // lock has already been inlined by applyDeferredReaders() |
475 | unlockSharedInline(); |
476 | } |
477 | } |
478 | |
479 | void unlock_shared(Token& token) { |
480 | annotateReleased(annotate_rwlock_level::rdlock); |
481 | |
482 | assert( |
483 | token.type_ == Token::Type::INLINE_SHARED || |
484 | token.type_ == Token::Type::DEFERRED_SHARED); |
485 | |
486 | if (token.type_ != Token::Type::DEFERRED_SHARED || |
487 | !tryUnlockSharedDeferred(token.slot_)) { |
488 | unlockSharedInline(); |
489 | } |
490 | if (folly::kIsDebug) { |
491 | token.type_ = Token::Type::INVALID; |
492 | } |
493 | } |
494 | |
495 | void unlock_and_lock_shared() { |
496 | annotateReleased(annotate_rwlock_level::wrlock); |
497 | annotateAcquired(annotate_rwlock_level::rdlock); |
498 | // We can't use state_ -=, because we need to clear 2 bits (1 of which |
499 | // has an uncertain initial state) and set 1 other. We might as well |
500 | // clear the relevant wake bits at the same time. Note that since S |
501 | // doesn't block the beginning of a transition to E (writer priority |
502 | // can cut off new S, reader priority grabs BegunE and blocks deferred |
503 | // S) we need to wake E as well. |
504 | auto state = state_.load(std::memory_order_acquire); |
505 | do { |
506 | assert( |
507 | (state & ~(kWaitingAny | kPrevDefer | kAnnotationCreated)) == kHasE); |
508 | } while (!state_.compare_exchange_strong( |
509 | state, (state & ~(kWaitingAny | kPrevDefer | kHasE)) + kIncrHasS)); |
510 | if ((state & (kWaitingE | kWaitingU | kWaitingS)) != 0) { |
511 | futexWakeAll(kWaitingE | kWaitingU | kWaitingS); |
512 | } |
513 | } |
514 | |
515 | void unlock_and_lock_shared(Token& token) { |
516 | unlock_and_lock_shared(); |
517 | token.type_ = Token::Type::INLINE_SHARED; |
518 | } |
519 | |
520 | void lock_upgrade() { |
521 | WaitForever ctx; |
522 | (void)lockUpgradeImpl(ctx); |
523 | // For TSAN: treat upgrade locks as equivalent to read locks |
524 | annotateAcquired(annotate_rwlock_level::rdlock); |
525 | } |
526 | |
527 | bool try_lock_upgrade() { |
528 | WaitNever ctx; |
529 | auto result = lockUpgradeImpl(ctx); |
530 | annotateTryAcquired(result, annotate_rwlock_level::rdlock); |
531 | return result; |
532 | } |
533 | |
534 | template <class Rep, class Period> |
535 | bool try_lock_upgrade_for( |
536 | const std::chrono::duration<Rep, Period>& duration) { |
537 | WaitForDuration<Rep, Period> ctx(duration); |
538 | auto result = lockUpgradeImpl(ctx); |
539 | annotateTryAcquired(result, annotate_rwlock_level::rdlock); |
540 | return result; |
541 | } |
542 | |
543 | template <class Clock, class Duration> |
544 | bool try_lock_upgrade_until( |
545 | const std::chrono::time_point<Clock, Duration>& absDeadline) { |
546 | WaitUntilDeadline<Clock, Duration> ctx{absDeadline}; |
547 | auto result = lockUpgradeImpl(ctx); |
548 | annotateTryAcquired(result, annotate_rwlock_level::rdlock); |
549 | return result; |
550 | } |
551 | |
552 | void unlock_upgrade() { |
553 | annotateReleased(annotate_rwlock_level::rdlock); |
554 | auto state = (state_ -= kHasU); |
555 | assert((state & (kWaitingNotS | kHasSolo)) == 0); |
556 | wakeRegisteredWaiters(state, kWaitingE | kWaitingU); |
557 | } |
558 | |
559 | void unlock_upgrade_and_lock() { |
560 | // no waiting necessary, so waitMask is empty |
561 | WaitForever ctx; |
562 | (void)lockExclusiveImpl(0, ctx); |
563 | annotateReleased(annotate_rwlock_level::rdlock); |
564 | annotateAcquired(annotate_rwlock_level::wrlock); |
565 | } |
566 | |
567 | void unlock_upgrade_and_lock_shared() { |
568 | // No need to annotate for TSAN here because we model upgrade and shared |
569 | // locks as the same. |
570 | auto state = (state_ -= kHasU - kIncrHasS); |
571 | assert((state & (kWaitingNotS | kHasSolo)) == 0); |
572 | wakeRegisteredWaiters(state, kWaitingE | kWaitingU); |
573 | } |
574 | |
575 | void unlock_upgrade_and_lock_shared(Token& token) { |
576 | unlock_upgrade_and_lock_shared(); |
577 | token.type_ = Token::Type::INLINE_SHARED; |
578 | } |
579 | |
580 | void unlock_and_lock_upgrade() { |
581 | annotateReleased(annotate_rwlock_level::wrlock); |
582 | annotateAcquired(annotate_rwlock_level::rdlock); |
583 | // We can't use state_ -=, because we need to clear 2 bits (1 of |
584 | // which has an uncertain initial state) and set 1 other. We might |
585 | // as well clear the relevant wake bits at the same time. |
586 | auto state = state_.load(std::memory_order_acquire); |
587 | while (true) { |
588 | assert( |
589 | (state & ~(kWaitingAny | kPrevDefer | kAnnotationCreated)) == kHasE); |
590 | auto after = |
591 | (state & ~(kWaitingNotS | kWaitingS | kPrevDefer | kHasE)) + kHasU; |
592 | if (state_.compare_exchange_strong(state, after)) { |
593 | if ((state & kWaitingS) != 0) { |
594 | futexWakeAll(kWaitingS); |
595 | } |
596 | return; |
597 | } |
598 | } |
599 | } |
600 | |
601 | private: |
602 | typedef typename folly::detail::Futex<Atom> Futex; |
603 | |
604 | // Internally we use four kinds of wait contexts. These are structs |
605 | // that provide a doWait method that returns true if a futex wake |
606 | // was issued that intersects with the waitMask, false if there was a |
607 | // timeout and no more waiting should be performed. Spinning occurs |
608 | // before the wait context is invoked. |
609 | |
610 | struct WaitForever { |
611 | bool canBlock() { |
612 | return true; |
613 | } |
614 | bool canTimeOut() { |
615 | return false; |
616 | } |
617 | bool shouldTimeOut() { |
618 | return false; |
619 | } |
620 | |
621 | bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) { |
622 | detail::futexWait(&futex, expected, waitMask); |
623 | return true; |
624 | } |
625 | }; |
626 | |
627 | struct WaitNever { |
628 | bool canBlock() { |
629 | return false; |
630 | } |
631 | bool canTimeOut() { |
632 | return true; |
633 | } |
634 | bool shouldTimeOut() { |
635 | return true; |
636 | } |
637 | |
638 | bool doWait( |
639 | Futex& /* futex */, |
640 | uint32_t /* expected */, |
641 | uint32_t /* waitMask */) { |
642 | return false; |
643 | } |
644 | }; |
645 | |
646 | template <class Rep, class Period> |
647 | struct WaitForDuration { |
648 | std::chrono::duration<Rep, Period> duration_; |
649 | bool deadlineComputed_; |
650 | std::chrono::steady_clock::time_point deadline_; |
651 | |
652 | explicit WaitForDuration(const std::chrono::duration<Rep, Period>& duration) |
653 | : duration_(duration), deadlineComputed_(false) {} |
654 | |
655 | std::chrono::steady_clock::time_point deadline() { |
656 | if (!deadlineComputed_) { |
657 | deadline_ = std::chrono::steady_clock::now() + duration_; |
658 | deadlineComputed_ = true; |
659 | } |
660 | return deadline_; |
661 | } |
662 | |
663 | bool canBlock() { |
664 | return duration_.count() > 0; |
665 | } |
666 | bool canTimeOut() { |
667 | return true; |
668 | } |
669 | |
670 | bool shouldTimeOut() { |
671 | return std::chrono::steady_clock::now() > deadline(); |
672 | } |
673 | |
674 | bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) { |
675 | auto result = |
676 | detail::futexWaitUntil(&futex, expected, deadline(), waitMask); |
677 | return result != folly::detail::FutexResult::TIMEDOUT; |
678 | } |
679 | }; |
680 | |
681 | template <class Clock, class Duration> |
682 | struct WaitUntilDeadline { |
683 | std::chrono::time_point<Clock, Duration> absDeadline_; |
684 | |
685 | bool canBlock() { |
686 | return true; |
687 | } |
688 | bool canTimeOut() { |
689 | return true; |
690 | } |
691 | bool shouldTimeOut() { |
692 | return Clock::now() > absDeadline_; |
693 | } |
694 | |
695 | bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) { |
696 | auto result = |
697 | detail::futexWaitUntil(&futex, expected, absDeadline_, waitMask); |
698 | return result != folly::detail::FutexResult::TIMEDOUT; |
699 | } |
700 | }; |
701 | |
702 | void annotateLazyCreate() { |
703 | if (AnnotateForThreadSanitizer && |
704 | (state_.load() & kAnnotationCreated) == 0) { |
705 | auto guard = detail::sharedMutexAnnotationGuard(this); |
706 | // check again |
707 | if ((state_.load() & kAnnotationCreated) == 0) { |
708 | state_.fetch_or(kAnnotationCreated); |
709 | annotate_benign_race_sized( |
&state_, sizeof(state_), "init TSAN", __FILE__, __LINE__);
711 | annotate_rwlock_create(this, __FILE__, __LINE__); |
712 | } |
713 | } |
714 | } |
715 | |
716 | void annotateDestroy() { |
717 | if (AnnotateForThreadSanitizer) { |
718 | annotateLazyCreate(); |
719 | annotate_rwlock_destroy(this, __FILE__, __LINE__); |
720 | } |
721 | } |
722 | |
723 | void annotateAcquired(annotate_rwlock_level w) { |
724 | if (AnnotateForThreadSanitizer) { |
725 | annotateLazyCreate(); |
726 | annotate_rwlock_acquired(this, w, __FILE__, __LINE__); |
727 | } |
728 | } |
729 | |
730 | void annotateTryAcquired(bool result, annotate_rwlock_level w) { |
731 | if (AnnotateForThreadSanitizer) { |
732 | annotateLazyCreate(); |
733 | annotate_rwlock_try_acquired(this, w, result, __FILE__, __LINE__); |
734 | } |
735 | } |
736 | |
737 | void annotateReleased(annotate_rwlock_level w) { |
738 | if (AnnotateForThreadSanitizer) { |
739 | assert((state_.load() & kAnnotationCreated) != 0); |
740 | annotate_rwlock_released(this, w, __FILE__, __LINE__); |
741 | } |
742 | } |
743 | |
744 | // 32 bits of state |
745 | Futex state_{}; |
746 | |
747 | // S count needs to be on the end, because we explicitly allow it to |
748 | // underflow. This can occur while we are in the middle of applying |
749 | // deferred locks (we remove them from deferredReaders[] before |
750 | // inlining them), or during token-less unlock_shared() if a racing |
751 | // lock_shared();unlock_shared() moves the deferredReaders slot while |
752 | // the first unlock_shared() is scanning. The former case is cleaned |
753 | // up before we finish applying the locks. The latter case can persist |
754 | // until destruction, when it is cleaned up. |
755 | static constexpr uint32_t kIncrHasS = 1 << 11; |
756 | static constexpr uint32_t kHasS = ~(kIncrHasS - 1); |
757 | |
758 | // Set if annotation has been completed for this instance. That annotation |
759 | // (and setting this bit afterward) must be guarded by one of the mutexes in |
760 | // annotationCreationGuards. |
761 | static constexpr uint32_t kAnnotationCreated = 1 << 10; |
762 | |
763 | // If false, then there are definitely no deferred read locks for this |
764 | // instance. Cleared after initialization and when exclusively locked. |
765 | static constexpr uint32_t kMayDefer = 1 << 9; |
766 | |
767 | // lock() cleared kMayDefer as soon as it starts draining readers (so |
768 | // that it doesn't have to do a second CAS once drain completes), but |
769 | // unlock_shared() still needs to know whether to scan deferredReaders[] |
770 | // or not. We copy kMayDefer to kPrevDefer when setting kHasE or |
771 | // kBegunE, and clear it when clearing those bits. |
772 | static constexpr uint32_t kPrevDefer = 1 << 8; |
773 | |
774 | // Exclusive-locked blocks all read locks and write locks. This bit |
775 | // may be set before all readers have finished, but in that case the |
776 | // thread that sets it won't return to the caller until all read locks |
777 | // have been released. |
778 | static constexpr uint32_t kHasE = 1 << 7; |
779 | |
780 | // Exclusive-draining means that lock() is waiting for existing readers |
781 | // to leave, but that new readers may still acquire shared access. |
782 | // This is only used in reader priority mode. New readers during |
783 | // drain must be inline. The difference between this and kHasU is that |
784 | // kBegunE prevents kMayDefer from being set. |
785 | static constexpr uint32_t kBegunE = 1 << 6; |
786 | |
787 | // At most one thread may have either exclusive or upgrade lock |
788 | // ownership. Unlike exclusive mode, ownership of the lock in upgrade |
789 | // mode doesn't preclude other threads holding the lock in shared mode. |
790 | // boost's concept for this doesn't explicitly say whether new shared |
// locks can be acquired once lock_upgrade has succeeded, but doesn't
792 | // list that as disallowed. RWSpinLock disallows new read locks after |
793 | // lock_upgrade has been acquired, but the boost implementation doesn't. |
794 | // We choose the latter. |
795 | static constexpr uint32_t kHasU = 1 << 5; |
796 | |
797 | // There are three states that we consider to be "solo", in that they |
798 | // cannot coexist with other solo states. These are kHasE, kBegunE, |
799 | // and kHasU. Note that S doesn't conflict with any of these, because |
800 | // setting the kHasE is only one of the two steps needed to actually |
801 | // acquire the lock in exclusive mode (the other is draining the existing |
802 | // S holders). |
803 | static constexpr uint32_t kHasSolo = kHasE | kBegunE | kHasU; |
804 | |
805 | // Once a thread sets kHasE it needs to wait for the current readers |
806 | // to exit the lock. We give this a separate wait identity from the |
807 | // waiting to set kHasE so that we can perform partial wakeups (wake |
808 | // one instead of wake all). |
809 | static constexpr uint32_t kWaitingNotS = 1 << 4; |
810 | |
811 | // When waking writers we can either wake them all, in which case we |
812 | // can clear kWaitingE, or we can call futexWake(1). futexWake tells |
813 | // us if anybody woke up, but even if we detect that nobody woke up we |
814 | // can't clear the bit after the fact without issuing another wakeup. |
815 | // To avoid thundering herds when there are lots of pending lock() |
816 | // without needing to call futexWake twice when there is only one |
817 | // waiter, kWaitingE actually encodes if we have observed multiple |
818 | // concurrent waiters. Tricky: ABA issues on futexWait mean that when |
819 | // we see kWaitingESingle we can't assume that there is only one. |
820 | static constexpr uint32_t kWaitingESingle = 1 << 2; |
821 | static constexpr uint32_t kWaitingEMultiple = 1 << 3; |
822 | static constexpr uint32_t kWaitingE = kWaitingESingle | kWaitingEMultiple; |
823 | |
824 | // kWaitingU is essentially a 1 bit saturating counter. It always |
825 | // requires a wakeAll. |
826 | static constexpr uint32_t kWaitingU = 1 << 1; |
827 | |
828 | // All blocked lock_shared() should be awoken, so it is correct (not |
829 | // suboptimal) to wakeAll if there are any shared readers. |
830 | static constexpr uint32_t kWaitingS = 1 << 0; |
831 | |
832 | // kWaitingAny is a mask of all of the bits that record the state of |
833 | // threads, rather than the state of the lock. It is convenient to be |
834 | // able to mask them off during asserts. |
835 | static constexpr uint32_t kWaitingAny = |
836 | kWaitingNotS | kWaitingE | kWaitingU | kWaitingS; |
837 | |
838 | // The reader count at which a reader will attempt to use the lock |
839 | // in deferred mode. If this value is 2, then the second concurrent |
840 | // reader will set kMayDefer and use deferredReaders[]. kMayDefer is |
841 | // cleared during exclusive access, so this threshold must be reached |
842 | // each time a lock is held in exclusive mode. |
843 | static constexpr uint32_t kNumSharedToStartDeferring = 2; |
844 | |
845 | // The typical number of spins that a thread will wait for a state |
846 | // transition. There is no bound on the number of threads that can wait |
847 | // for a writer, so we are pretty conservative here to limit the chance |
848 | // that we are starving the writer of CPU. Each spin is 6 or 7 nanos, |
849 | // almost all of which is in the pause instruction. |
850 | static constexpr uint32_t kMaxSpinCount = !BlockImmediately ? 1000 : 2; |
851 | |
852 | // The maximum number of soft yields before falling back to futex. |
853 | // If the preemption heuristic is activated we will fall back before |
854 | // this. A soft yield takes ~900 nanos (two sched_yield plus a call |
855 | // to getrusage, with checks of the goal at each step). Soft yields |
856 | // aren't compatible with deterministic execution under test (unlike |
857 | // futexWaitUntil, which has a capricious but deterministic back end). |
858 | static constexpr uint32_t kMaxSoftYieldCount = !BlockImmediately ? 1000 : 0; |
859 | |
860 | // If AccessSpreader assigns indexes from 0..k*n-1 on a system where some |
861 | // level of the memory hierarchy is symmetrically divided into k pieces |
862 | // (NUMA nodes, last-level caches, L1 caches, ...), then slot indexes |
863 | // that are the same after integer division by k share that resource. |
864 | // Our strategy for deferred readers is to probe up to numSlots/4 slots, |
865 | // using the full granularity of AccessSpreader for the start slot |
866 | // and then search outward. We can use AccessSpreader::current(n) |
867 | // without managing our own spreader if kMaxDeferredReaders <= |
868 | // AccessSpreader::kMaxCpus, which is currently 128. |
869 | // |
870 | // Our 2-socket E5-2660 machines have 8 L1 caches on each chip, |
871 | // with 64 byte cache lines. That means we need 64*16 bytes of |
872 | // deferredReaders[] to give each L1 its own playground. On x86_64 |
873 | // each DeferredReaderSlot is 8 bytes, so we need kMaxDeferredReaders |
874 | // * kDeferredSeparationFactor >= 64 * 16 / 8 == 128. If |
875 | // kDeferredSearchDistance * kDeferredSeparationFactor <= |
876 | // 64 / 8 then we will search only within a single cache line, which |
877 | // guarantees we won't have inter-L1 contention. We give ourselves |
878 | // a factor of 2 on the core count, which should hold us for a couple |
879 | // processor generations. deferredReaders[] is 2048 bytes currently. |
880 | public: |
881 | static constexpr uint32_t kMaxDeferredReaders = 64; |
882 | static constexpr uint32_t kDeferredSearchDistance = 2; |
883 | static constexpr uint32_t kDeferredSeparationFactor = 4; |
884 | |
885 | private: |
886 | static_assert( |
887 | !(kMaxDeferredReaders & (kMaxDeferredReaders - 1)), |
888 | "kMaxDeferredReaders must be a power of 2" ); |
889 | static_assert( |
890 | !(kDeferredSearchDistance & (kDeferredSearchDistance - 1)), |
891 | "kDeferredSearchDistance must be a power of 2" ); |
892 | |
893 | // The number of deferred locks that can be simultaneously acquired |
894 | // by a thread via the token-less methods without performing any heap |
895 | // allocations. Each of these costs 3 pointers (24 bytes, probably) |
896 | // per thread. There's not much point in making this larger than |
897 | // kDeferredSearchDistance. |
898 | static constexpr uint32_t kTokenStackTLSCapacity = 2; |
899 | |
900 | // We need to make sure that if there is a lock_shared() |
901 | // and lock_shared(token) followed by unlock_shared() and |
902 | // unlock_shared(token), the token-less unlock doesn't null |
903 | // out deferredReaders[token.slot_]. If we allowed that, then |
904 | // unlock_shared(token) wouldn't be able to assume that its lock |
905 | // had been inlined by applyDeferredReaders when it finds that |
906 | // deferredReaders[token.slot_] no longer points to this. We accomplish |
907 | // this by stealing bit 0 from the pointer to record that the slot's |
908 | // element has no token, hence our use of uintptr_t in deferredReaders[]. |
909 | static constexpr uintptr_t kTokenless = 0x1; |
910 | |
911 | // This is the starting location for Token-less unlock_shared(). |
912 | static FOLLY_SHAREDMUTEX_TLS uint32_t tls_lastTokenlessSlot; |
913 | |
914 | // Last deferred reader slot used. |
915 | static FOLLY_SHAREDMUTEX_TLS uint32_t tls_lastDeferredReaderSlot; |
916 | |
917 | // Only indexes divisible by kDeferredSeparationFactor are used. |
918 | // If any of those elements points to a SharedMutexImpl, then it |
919 | // should be considered that there is a shared lock on that instance. |
920 | // See kTokenless. |
921 | public: |
922 | typedef Atom<uintptr_t> DeferredReaderSlot; |
923 | |
924 | private: |
925 | alignas(hardware_destructive_interference_size) static DeferredReaderSlot |
926 | deferredReaders[kMaxDeferredReaders * kDeferredSeparationFactor]; |
927 | |
928 | // Performs an exclusive lock, waiting for state_ & waitMask to be |
929 | // zero first |
930 | template <class WaitContext> |
931 | bool lockExclusiveImpl(uint32_t preconditionGoalMask, WaitContext& ctx) { |
932 | uint32_t state = state_.load(std::memory_order_acquire); |
933 | if (LIKELY( |
934 | (state & (preconditionGoalMask | kMayDefer | kHasS)) == 0 && |
935 | state_.compare_exchange_strong(state, (state | kHasE) & ~kHasU))) { |
936 | return true; |
937 | } else { |
938 | return lockExclusiveImpl(state, preconditionGoalMask, ctx); |
939 | } |
940 | } |
941 | |
942 | template <class WaitContext> |
943 | bool lockExclusiveImpl( |
944 | uint32_t& state, |
945 | uint32_t preconditionGoalMask, |
946 | WaitContext& ctx) { |
947 | while (true) { |
948 | if (UNLIKELY((state & preconditionGoalMask) != 0) && |
949 | !waitForZeroBits(state, preconditionGoalMask, kWaitingE, ctx) && |
950 | ctx.canTimeOut()) { |
951 | return false; |
952 | } |
953 | |
954 | uint32_t after = (state & kMayDefer) == 0 ? 0 : kPrevDefer; |
955 | if (!kReaderPriority || (state & (kMayDefer | kHasS)) == 0) { |
956 | // Block readers immediately, either because we are in write |
957 | // priority mode or because we can acquire the lock in one |
958 | // step. Note that if state has kHasU, then we are doing an |
959 | // unlock_upgrade_and_lock() and we should clear it (reader |
960 | // priority branch also does this). |
961 | after |= (state | kHasE) & ~(kHasU | kMayDefer); |
962 | } else { |
963 | after |= (state | kBegunE) & ~(kHasU | kMayDefer); |
964 | } |
965 | if (state_.compare_exchange_strong(state, after)) { |
966 | auto before = state; |
967 | state = after; |
968 | |
969 | // If we set kHasE (writer priority) then no new readers can |
970 | // arrive. If we set kBegunE then they can still enter, but |
971 | // they must be inline. Either way we need to either spin on |
972 | // deferredReaders[] slots, or inline them so that we can wait on |
973 | // kHasS to zero itself. deferredReaders[] is pointers, which on |
974 | // x86_64 are bigger than futex() can handle, so we inline the |
975 | // deferred locks instead of trying to futexWait on each slot. |
976 | // Readers are responsible for rechecking state_ after recording |
977 | // a deferred read to avoid atomicity problems between the state_ |
978 | // CAS and applyDeferredReader's reads of deferredReaders[]. |
979 | if (UNLIKELY((before & kMayDefer) != 0)) { |
980 | applyDeferredReaders(state, ctx); |
981 | } |
982 | while (true) { |
983 | assert((state & (kHasE | kBegunE)) != 0 && (state & kHasU) == 0); |
984 | if (UNLIKELY((state & kHasS) != 0) && |
985 | !waitForZeroBits(state, kHasS, kWaitingNotS, ctx) && |
986 | ctx.canTimeOut()) { |
987 | // Ugh. We blocked new readers and other writers for a while, |
988 | // but were unable to complete. Move on. On the plus side |
989 | // we can clear kWaitingNotS because nobody else can piggyback |
990 | // on it. |
991 | state = (state_ &= ~(kPrevDefer | kHasE | kBegunE | kWaitingNotS)); |
992 | wakeRegisteredWaiters(state, kWaitingE | kWaitingU | kWaitingS); |
993 | return false; |
994 | } |
995 | |
996 | if (kReaderPriority && (state & kHasE) == 0) { |
997 | assert((state & kBegunE) != 0); |
998 | if (!state_.compare_exchange_strong( |
999 | state, (state & ~kBegunE) | kHasE)) { |
1000 | continue; |
1001 | } |
1002 | } |
1003 | |
1004 | return true; |
1005 | } |
1006 | } |
1007 | } |
1008 | } |
1009 | |
1010 | template <class WaitContext> |
1011 | bool waitForZeroBits( |
1012 | uint32_t& state, |
1013 | uint32_t goal, |
1014 | uint32_t waitMask, |
1015 | WaitContext& ctx) { |
1016 | uint32_t spinCount = 0; |
1017 | while (true) { |
1018 | state = state_.load(std::memory_order_acquire); |
1019 | if ((state & goal) == 0) { |
1020 | return true; |
1021 | } |
1022 | asm_volatile_pause(); |
1023 | ++spinCount; |
1024 | if (UNLIKELY(spinCount >= kMaxSpinCount)) { |
1025 | return ctx.canBlock() && |
1026 | yieldWaitForZeroBits(state, goal, waitMask, ctx); |
1027 | } |
1028 | } |
1029 | } |
1030 | |
1031 | template <class WaitContext> |
1032 | bool yieldWaitForZeroBits( |
1033 | uint32_t& state, |
1034 | uint32_t goal, |
1035 | uint32_t waitMask, |
1036 | WaitContext& ctx) { |
1037 | #ifdef RUSAGE_THREAD |
1038 | struct rusage usage; |
1039 | std::memset(&usage, 0, sizeof(usage)); |
1040 | long before = -1; |
1041 | #endif |
1042 | for (uint32_t yieldCount = 0; yieldCount < kMaxSoftYieldCount; |
1043 | ++yieldCount) { |
1044 | for (int softState = 0; softState < 3; ++softState) { |
1045 | if (softState < 2) { |
1046 | std::this_thread::yield(); |
1047 | } else { |
1048 | #ifdef RUSAGE_THREAD |
1049 | getrusage(RUSAGE_THREAD, &usage); |
1050 | #endif |
1051 | } |
1052 | if (((state = state_.load(std::memory_order_acquire)) & goal) == 0) { |
1053 | return true; |
1054 | } |
1055 | if (ctx.shouldTimeOut()) { |
1056 | return false; |
1057 | } |
1058 | } |
1059 | #ifdef RUSAGE_THREAD |
1060 | if (before >= 0 && usage.ru_nivcsw >= before + 2) { |
1061 | // One involuntary csw might just be occasional background work, |
1062 | // but if we get two in a row then we guess that there is someone |
1063 | // else who can profitably use this CPU. Fall back to futex |
1064 | break; |
1065 | } |
1066 | before = usage.ru_nivcsw; |
1067 | #endif |
1068 | } |
1069 | return futexWaitForZeroBits(state, goal, waitMask, ctx); |
1070 | } |
1071 | |
1072 | template <class WaitContext> |
1073 | bool futexWaitForZeroBits( |
1074 | uint32_t& state, |
1075 | uint32_t goal, |
1076 | uint32_t waitMask, |
1077 | WaitContext& ctx) { |
1078 | assert( |
1079 | waitMask == kWaitingNotS || waitMask == kWaitingE || |
1080 | waitMask == kWaitingU || waitMask == kWaitingS); |
1081 | |
1082 | while (true) { |
1083 | state = state_.load(std::memory_order_acquire); |
1084 | if ((state & goal) == 0) { |
1085 | return true; |
1086 | } |
1087 | |
1088 | auto after = state; |
1089 | if (waitMask == kWaitingE) { |
1090 | if ((state & kWaitingESingle) != 0) { |
1091 | after |= kWaitingEMultiple; |
1092 | } else { |
1093 | after |= kWaitingESingle; |
1094 | } |
1095 | } else { |
1096 | after |= waitMask; |
1097 | } |
1098 | |
1099 | // CAS is better than atomic |= here, because it lets us avoid |
1100 | // setting the wait flag when the goal is concurrently achieved |
1101 | if (after != state && !state_.compare_exchange_strong(state, after)) { |
1102 | continue; |
1103 | } |
1104 | |
1105 | if (!ctx.doWait(state_, after, waitMask)) { |
1106 | // timed out |
1107 | return false; |
1108 | } |
1109 | } |
1110 | } |
1111 | |
1112 | // Wakes up waiters registered in state_ as appropriate, clearing the |
1113 | // awaiting bits for anybody that was awoken. Tries to perform direct |
1114 | // single wakeup of an exclusive waiter if appropriate |
1115 | void wakeRegisteredWaiters(uint32_t& state, uint32_t wakeMask) { |
1116 | if (UNLIKELY((state & wakeMask) != 0)) { |
1117 | wakeRegisteredWaitersImpl(state, wakeMask); |
1118 | } |
1119 | } |
1120 | |
1121 | void wakeRegisteredWaitersImpl(uint32_t& state, uint32_t wakeMask) { |
1122 | // If there are multiple lock() pending only one of them will actually |
1123 | // get to wake up, so issuing futexWakeAll will make a thundering herd. |
1124 | // There's nothing stopping us from issuing futexWake(1) instead, |
1125 | // so long as the wait bits are still an accurate reflection of |
1126 | // the waiters. If we notice (via futexWake's return value) that |
1127 | // nobody woke up then we can try again with the normal wake-all path. |
1128 | // Note that we can't just clear the bits at that point; we need to |
1129 | // clear the bits and then issue another wakeup. |
1130 | // |
1131 | // It is possible that we wake an E waiter but an outside S grabs the |
1132 | // lock instead, at which point we should wake pending U and S waiters. |
1133 | // Rather than tracking state to make the failing E regenerate the |
1134 | // wakeup, we just disable the optimization in the case that there |
1135 | // are waiting U or S that we are eligible to wake. |
1136 | if ((wakeMask & kWaitingE) == kWaitingE && |
1137 | (state & wakeMask) == kWaitingE && |
1138 | detail::futexWake(&state_, 1, kWaitingE) > 0) { |
1139 | // somebody woke up, so leave state_ as is and clear it later |
1140 | return; |
1141 | } |
1142 | |
1143 | if ((state & wakeMask) != 0) { |
1144 | auto prev = state_.fetch_and(~wakeMask); |
1145 | if ((prev & wakeMask) != 0) { |
1146 | futexWakeAll(wakeMask); |
1147 | } |
1148 | state = prev & ~wakeMask; |
1149 | } |
1150 | } |
1151 | |
1152 | void futexWakeAll(uint32_t wakeMask) { |
1153 | detail::futexWake(&state_, std::numeric_limits<int>::max(), wakeMask); |
1154 | } |
1155 | |
1156 | DeferredReaderSlot* deferredReader(uint32_t slot) { |
1157 | return &deferredReaders[slot * kDeferredSeparationFactor]; |
1158 | } |
1159 | |
1160 | uintptr_t tokenfulSlotValue() { |
1161 | return reinterpret_cast<uintptr_t>(this); |
1162 | } |
1163 | |
1164 | uintptr_t tokenlessSlotValue() { |
1165 | return tokenfulSlotValue() | kTokenless; |
1166 | } |
1167 | |
1168 | bool slotValueIsThis(uintptr_t slotValue) { |
1169 | return (slotValue & ~kTokenless) == tokenfulSlotValue(); |
1170 | } |
1171 | |
1172 | // Clears any deferredReaders[] that point to this, adjusting the inline |
1173 | // shared lock count to compensate. Does some spinning and yielding |
1174 | // to avoid the work. Always finishes the application, even if ctx |
1175 | // times out. |
1176 | template <class WaitContext> |
1177 | void applyDeferredReaders(uint32_t& state, WaitContext& ctx) { |
1178 | uint32_t slot = 0; |
1179 | |
1180 | uint32_t spinCount = 0; |
1181 | while (true) { |
1182 | while (!slotValueIsThis( |
1183 | deferredReader(slot)->load(std::memory_order_acquire))) { |
1184 | if (++slot == kMaxDeferredReaders) { |
1185 | return; |
1186 | } |
1187 | } |
1188 | asm_volatile_pause(); |
1189 | if (UNLIKELY(++spinCount >= kMaxSpinCount)) { |
1190 | applyDeferredReaders(state, ctx, slot); |
1191 | return; |
1192 | } |
1193 | } |
1194 | } |
1195 | |
1196 | template <class WaitContext> |
1197 | void applyDeferredReaders(uint32_t& state, WaitContext& ctx, uint32_t slot) { |
1198 | #ifdef RUSAGE_THREAD |
1199 | struct rusage usage; |
1200 | std::memset(&usage, 0, sizeof(usage)); |
1201 | long before = -1; |
1202 | #endif |
1203 | for (uint32_t yieldCount = 0; yieldCount < kMaxSoftYieldCount; |
1204 | ++yieldCount) { |
1205 | for (int softState = 0; softState < 3; ++softState) { |
1206 | if (softState < 2) { |
1207 | std::this_thread::yield(); |
1208 | } else { |
1209 | #ifdef RUSAGE_THREAD |
1210 | getrusage(RUSAGE_THREAD, &usage); |
1211 | #endif |
1212 | } |
1213 | while (!slotValueIsThis( |
1214 | deferredReader(slot)->load(std::memory_order_acquire))) { |
1215 | if (++slot == kMaxDeferredReaders) { |
1216 | return; |
1217 | } |
1218 | } |
1219 | if (ctx.shouldTimeOut()) { |
1220 | // finish applying immediately on timeout |
1221 | break; |
1222 | } |
1223 | } |
1224 | #ifdef RUSAGE_THREAD |
1225 | if (before >= 0 && usage.ru_nivcsw >= before + 2) { |
1226 | // heuristic says run queue is not empty |
1227 | break; |
1228 | } |
1229 | before = usage.ru_nivcsw; |
1230 | #endif |
1231 | } |
1232 | |
1233 | uint32_t movedSlotCount = 0; |
1234 | for (; slot < kMaxDeferredReaders; ++slot) { |
1235 | auto slotPtr = deferredReader(slot); |
1236 | auto slotValue = slotPtr->load(std::memory_order_acquire); |
1237 | if (slotValueIsThis(slotValue) && |
1238 | slotPtr->compare_exchange_strong(slotValue, 0)) { |
1239 | ++movedSlotCount; |
1240 | } |
1241 | } |
1242 | |
1243 | if (movedSlotCount > 0) { |
1244 | state = (state_ += movedSlotCount * kIncrHasS); |
1245 | } |
1246 | assert((state & (kHasE | kBegunE)) != 0); |
1247 | |
1248 | // if state + kIncrHasS overflows (off the end of state) then either |
// we have 2^(32-11) readers (almost certainly an application bug)
1250 | // or we had an underflow (also a bug) |
1251 | assert(state < state + kIncrHasS); |
1252 | } |
1253 | |
// It is straightforward to make a token-less lock_shared() and
1255 | // unlock_shared() either by making the token-less version always use |
1256 | // INLINE_SHARED mode or by removing the token version. Supporting |
1257 | // deferred operation for both types is trickier than it appears, because |
// the purpose of the token is so that unlock_shared doesn't have to
1259 | // look in other slots for its deferred lock. Token-less unlock_shared |
1260 | // might place a deferred lock in one place and then release a different |
1261 | // slot that was originally used by the token-ful version. If this was |
1262 | // important we could solve the problem by differentiating the deferred |
1263 | // locks so that cross-variety release wouldn't occur. The best way |
1264 | // is probably to steal a bit from the pointer, making deferredLocks[] |
1265 | // an array of Atom<uintptr_t>. |
1266 | |
1267 | template <class WaitContext> |
1268 | bool lockSharedImpl(Token* token, WaitContext& ctx) { |
1269 | uint32_t state = state_.load(std::memory_order_relaxed); |
1270 | if ((state & (kHasS | kMayDefer | kHasE)) == 0 && |
1271 | state_.compare_exchange_strong(state, state + kIncrHasS)) { |
1272 | if (token != nullptr) { |
1273 | token->type_ = Token::Type::INLINE_SHARED; |
1274 | } |
1275 | return true; |
1276 | } |
1277 | return lockSharedImpl(state, token, ctx); |
1278 | } |
1279 | |
1280 | template <class WaitContext> |
1281 | bool lockSharedImpl(uint32_t& state, Token* token, WaitContext& ctx); |
1282 | |
1283 | // Updates the state in/out argument as if the locks were made inline, |
1284 | // but does not update state_ |
1285 | void cleanupTokenlessSharedDeferred(uint32_t& state) { |
1286 | for (uint32_t i = 0; i < kMaxDeferredReaders; ++i) { |
1287 | auto slotPtr = deferredReader(i); |
1288 | auto slotValue = slotPtr->load(std::memory_order_relaxed); |
1289 | if (slotValue == tokenlessSlotValue()) { |
1290 | slotPtr->store(0, std::memory_order_relaxed); |
1291 | state += kIncrHasS; |
1292 | if ((state & kHasS) == 0) { |
1293 | break; |
1294 | } |
1295 | } |
1296 | } |
1297 | } |
1298 | |
1299 | bool tryUnlockTokenlessSharedDeferred(); |
1300 | |
1301 | bool tryUnlockSharedDeferred(uint32_t slot) { |
1302 | assert(slot < kMaxDeferredReaders); |
1303 | auto slotValue = tokenfulSlotValue(); |
1304 | return deferredReader(slot)->compare_exchange_strong(slotValue, 0); |
1305 | } |
1306 | |
1307 | uint32_t unlockSharedInline() { |
1308 | uint32_t state = (state_ -= kIncrHasS); |
1309 | assert( |
1310 | (state & (kHasE | kBegunE | kMayDefer)) != 0 || |
1311 | state < state + kIncrHasS); |
1312 | if ((state & kHasS) == 0) { |
1313 | // Only the second half of lock() can be blocked by a non-zero |
1314 | // reader count, so that's the only thing we need to wake |
1315 | wakeRegisteredWaiters(state, kWaitingNotS); |
1316 | } |
1317 | return state; |
1318 | } |
1319 | |
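  // Acquire in upgrade mode: wait for the kHasSolo bits (another upgrade
  // or exclusive owner) to clear, then set kHasU with a CAS.  Shared
  // holders may coexist with the upgrade holder, so the reader count is
  // deliberately not part of the wait condition.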
1320 | template <class WaitContext> |
1321 | bool lockUpgradeImpl(WaitContext& ctx) { |
1322 | uint32_t state; |
1323 | do { |
1324 | if (!waitForZeroBits(state, kHasSolo, kWaitingU, ctx)) { |
1325 | return false; |
1326 | } |
1327 | } while (!state_.compare_exchange_strong(state, state | kHasU)); |
1328 | return true; |
1329 | } |
1330 | |
1331 | public: |
1332 | class FOLLY_NODISCARD ReadHolder { |
1333 | ReadHolder() : lock_(nullptr) {} |
1334 | |
1335 | public: |
1336 | explicit ReadHolder(const SharedMutexImpl* lock) |
1337 | : lock_(const_cast<SharedMutexImpl*>(lock)) { |
1338 | if (lock_) { |
1339 | lock_->lock_shared(token_); |
1340 | } |
1341 | } |
1342 | |
1343 | explicit ReadHolder(const SharedMutexImpl& lock) |
1344 | : lock_(const_cast<SharedMutexImpl*>(&lock)) { |
1345 | lock_->lock_shared(token_); |
1346 | } |
1347 | |
1348 | ReadHolder(ReadHolder&& rhs) noexcept |
1349 | : lock_(rhs.lock_), token_(rhs.token_) { |
1350 | rhs.lock_ = nullptr; |
1351 | } |
1352 | |
1353 | // Downgrade from upgrade mode |
1354 | explicit ReadHolder(UpgradeHolder&& upgraded) : lock_(upgraded.lock_) { |
1355 | assert(upgraded.lock_ != nullptr); |
1356 | upgraded.lock_ = nullptr; |
1357 | lock_->unlock_upgrade_and_lock_shared(token_); |
1358 | } |
1359 | |
1360 | // Downgrade from exclusive mode |
1361 | explicit ReadHolder(WriteHolder&& writer) : lock_(writer.lock_) { |
1362 | assert(writer.lock_ != nullptr); |
1363 | writer.lock_ = nullptr; |
1364 | lock_->unlock_and_lock_shared(token_); |
1365 | } |
1366 | |
1367 | ReadHolder& operator=(ReadHolder&& rhs) noexcept { |
1368 | std::swap(lock_, rhs.lock_); |
1369 | std::swap(token_, rhs.token_); |
1370 | return *this; |
1371 | } |
1372 | |
1373 | ReadHolder(const ReadHolder& rhs) = delete; |
1374 | ReadHolder& operator=(const ReadHolder& rhs) = delete; |
1375 | |
1376 | ~ReadHolder() { |
1377 | unlock(); |
1378 | } |
1379 | |
1380 | void unlock() { |
1381 | if (lock_) { |
1382 | lock_->unlock_shared(token_); |
1383 | lock_ = nullptr; |
1384 | } |
1385 | } |
1386 | |
1387 | private: |
1388 | friend class UpgradeHolder; |
1389 | friend class WriteHolder; |
1390 | SharedMutexImpl* lock_; |
1391 | SharedMutexToken token_; |
1392 | }; |
1393 | |
1394 | class FOLLY_NODISCARD UpgradeHolder { |
1395 | UpgradeHolder() : lock_(nullptr) {} |
1396 | |
1397 | public: |
1398 | explicit UpgradeHolder(SharedMutexImpl* lock) : lock_(lock) { |
1399 | if (lock_) { |
1400 | lock_->lock_upgrade(); |
1401 | } |
1402 | } |
1403 | |
1404 | explicit UpgradeHolder(SharedMutexImpl& lock) : lock_(&lock) { |
1405 | lock_->lock_upgrade(); |
1406 | } |
1407 | |
1408 | // Downgrade from exclusive mode |
1409 | explicit UpgradeHolder(WriteHolder&& writer) : lock_(writer.lock_) { |
1410 | assert(writer.lock_ != nullptr); |
1411 | writer.lock_ = nullptr; |
1412 | lock_->unlock_and_lock_upgrade(); |
1413 | } |
1414 | |
1415 | UpgradeHolder(UpgradeHolder&& rhs) noexcept : lock_(rhs.lock_) { |
1416 | rhs.lock_ = nullptr; |
1417 | } |
1418 | |
1419 | UpgradeHolder& operator=(UpgradeHolder&& rhs) noexcept { |
1420 | std::swap(lock_, rhs.lock_); |
1421 | return *this; |
1422 | } |
1423 | |
1424 | UpgradeHolder(const UpgradeHolder& rhs) = delete; |
1425 | UpgradeHolder& operator=(const UpgradeHolder& rhs) = delete; |
1426 | |
1427 | ~UpgradeHolder() { |
1428 | unlock(); |
1429 | } |
1430 | |
1431 | void unlock() { |
1432 | if (lock_) { |
1433 | lock_->unlock_upgrade(); |
1434 | lock_ = nullptr; |
1435 | } |
1436 | } |
1437 | |
1438 | private: |
1439 | friend class WriteHolder; |
1440 | friend class ReadHolder; |
1441 | SharedMutexImpl* lock_; |
1442 | }; |
1443 | |
1444 | class FOLLY_NODISCARD WriteHolder { |
1445 | WriteHolder() : lock_(nullptr) {} |
1446 | |
1447 | public: |
1448 | explicit WriteHolder(SharedMutexImpl* lock) : lock_(lock) { |
1449 | if (lock_) { |
1450 | lock_->lock(); |
1451 | } |
1452 | } |
1453 | |
1454 | explicit WriteHolder(SharedMutexImpl& lock) : lock_(&lock) { |
1455 | lock_->lock(); |
1456 | } |
1457 | |
1458 | // Promotion from upgrade mode |
1459 | explicit WriteHolder(UpgradeHolder&& upgrade) : lock_(upgrade.lock_) { |
1460 | assert(upgrade.lock_ != nullptr); |
1461 | upgrade.lock_ = nullptr; |
1462 | lock_->unlock_upgrade_and_lock(); |
1463 | } |
1464 | |
1465 | // README: |
1466 | // |
    // It is intentional that WriteHolder(ReadHolder&& rhs) does not exist.
    //
    // Shared locks (read) cannot safely be upgraded to unique locks
    // (write).  That upgrade path is a well-known recipe for deadlock, so
    // we explicitly disallow it.
1472 | // |
1473 | // If you need to do a conditional mutation, you have a few options: |
1474 | // 1. Check the condition under a shared lock and release it. |
1475 | // Then maybe check the condition again under a unique lock and maybe do |
1476 | // the mutation. |
    // 2. Check the condition once under an upgradeable lock.
    //    Then maybe upgrade the lock to a unique lock and do the mutation
    //    (a usage sketch follows at the end of this comment).
1479 | // 3. Check the condition and maybe perform the mutation under a unique |
1480 | // lock. |
1481 | // |
1482 | // Relevant upgradeable lock notes: |
1483 | // * At most one upgradeable lock can be held at a time for a given shared |
1484 | // mutex, just like a unique lock. |
1485 | // * An upgradeable lock may be held concurrently with any number of shared |
1486 | // locks. |
1487 | // * An upgradeable lock may be upgraded atomically to a unique lock. |
1488 | |
1489 | WriteHolder(WriteHolder&& rhs) noexcept : lock_(rhs.lock_) { |
1490 | rhs.lock_ = nullptr; |
1491 | } |
1492 | |
1493 | WriteHolder& operator=(WriteHolder&& rhs) noexcept { |
1494 | std::swap(lock_, rhs.lock_); |
1495 | return *this; |
1496 | } |
1497 | |
1498 | WriteHolder(const WriteHolder& rhs) = delete; |
1499 | WriteHolder& operator=(const WriteHolder& rhs) = delete; |
1500 | |
1501 | ~WriteHolder() { |
1502 | unlock(); |
1503 | } |
1504 | |
1505 | void unlock() { |
1506 | if (lock_) { |
1507 | lock_->unlock(); |
1508 | lock_ = nullptr; |
1509 | } |
1510 | } |
1511 | |
1512 | private: |
1513 | friend class ReadHolder; |
1514 | friend class UpgradeHolder; |
1515 | SharedMutexImpl* lock_; |
1516 | }; |
1517 | |
1518 | // Adapters for Synchronized<> |
1519 | friend void acquireRead(SharedMutexImpl& lock) { |
1520 | lock.lock_shared(); |
1521 | } |
1522 | friend void acquireReadWrite(SharedMutexImpl& lock) { |
1523 | lock.lock(); |
1524 | } |
1525 | friend void releaseRead(SharedMutexImpl& lock) { |
1526 | lock.unlock_shared(); |
1527 | } |
1528 | friend void releaseReadWrite(SharedMutexImpl& lock) { |
1529 | lock.unlock(); |
1530 | } |
1531 | friend bool acquireRead(SharedMutexImpl& lock, unsigned int ms) { |
1532 | return lock.try_lock_shared_for(std::chrono::milliseconds(ms)); |
1533 | } |
1534 | friend bool acquireReadWrite(SharedMutexImpl& lock, unsigned int ms) { |
1535 | return lock.try_lock_for(std::chrono::milliseconds(ms)); |
1536 | } |
1537 | }; |
1538 | |
1539 | typedef SharedMutexImpl<true> SharedMutexReadPriority; |
1540 | typedef SharedMutexImpl<false> SharedMutexWritePriority; |
1541 | typedef SharedMutexWritePriority SharedMutex; |
1542 | typedef SharedMutexImpl<false, void, std::atomic, false, false> |
1543 | SharedMutexSuppressTSAN; |
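
// A brief usage sketch of the aliases above (variable names are arbitrary):
//
//   folly::SharedMutex m;                  // write-priority, the default
//   folly::SharedMutexReadPriority mRead;  // prefers readers instead
//
// Both variants provide lock()/unlock() for writers and
// lock_shared()/unlock_shared() for readers.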
1544 | |
1545 | // Prevent the compiler from instantiating these in other translation units. |
1546 | // They are instantiated once in SharedMutex.cpp |
1547 | extern template class SharedMutexImpl<true>; |
1548 | extern template class SharedMutexImpl<false>; |
1549 | |
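// Storage for the deferred-reader slots.  The array is a static member, so
// the slots are shared by every instance of a given specialization; the
// cache-line alignment together with kDeferredSeparationFactor keeps the
// slots that are actually probed on separate cache lines, so concurrent
// readers on different cores avoid false sharing.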
1550 | template < |
1551 | bool ReaderPriority, |
1552 | typename Tag_, |
1553 | template <typename> class Atom, |
1554 | bool BlockImmediately, |
1555 | bool AnnotateForThreadSanitizer> |
1556 | alignas(hardware_destructive_interference_size) typename SharedMutexImpl< |
1557 | ReaderPriority, |
1558 | Tag_, |
1559 | Atom, |
1560 | BlockImmediately, |
1561 | AnnotateForThreadSanitizer>::DeferredReaderSlot |
1562 | SharedMutexImpl< |
1563 | ReaderPriority, |
1564 | Tag_, |
1565 | Atom, |
1566 | BlockImmediately, |
1567 | AnnotateForThreadSanitizer>::deferredReaders |
1568 | [kMaxDeferredReaders * kDeferredSeparationFactor] = {}; |
1569 | |
1570 | template < |
1571 | bool ReaderPriority, |
1572 | typename Tag_, |
1573 | template <typename> class Atom, |
1574 | bool BlockImmediately, |
1575 | bool AnnotateForThreadSanitizer> |
1576 | FOLLY_SHAREDMUTEX_TLS uint32_t SharedMutexImpl< |
1577 | ReaderPriority, |
1578 | Tag_, |
1579 | Atom, |
1580 | BlockImmediately, |
1581 | AnnotateForThreadSanitizer>::tls_lastTokenlessSlot = 0; |
1582 | |
1583 | template < |
1584 | bool ReaderPriority, |
1585 | typename Tag_, |
1586 | template <typename> class Atom, |
1587 | bool BlockImmediately, |
1588 | bool AnnotateForThreadSanitizer> |
1589 | FOLLY_SHAREDMUTEX_TLS uint32_t SharedMutexImpl< |
1590 | ReaderPriority, |
1591 | Tag_, |
1592 | Atom, |
1593 | BlockImmediately, |
1594 | AnnotateForThreadSanitizer>::tls_lastDeferredReaderSlot = 0; |
1595 | |
1596 | template < |
1597 | bool ReaderPriority, |
1598 | typename Tag_, |
1599 | template <typename> class Atom, |
1600 | bool BlockImmediately, |
1601 | bool AnnotateForThreadSanitizer> |
1602 | bool SharedMutexImpl< |
1603 | ReaderPriority, |
1604 | Tag_, |
1605 | Atom, |
1606 | BlockImmediately, |
1607 | AnnotateForThreadSanitizer>::tryUnlockTokenlessSharedDeferred() { |
1608 | auto bestSlot = tls_lastTokenlessSlot; |
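  // Search every slot, starting from the one this thread most recently
  // used for a token-less deferred lock; the XOR walk visits each index
  // exactly once when kMaxDeferredReaders is a power of two.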
1609 | for (uint32_t i = 0; i < kMaxDeferredReaders; ++i) { |
1610 | auto slotPtr = deferredReader(bestSlot ^ i); |
1611 | auto slotValue = slotPtr->load(std::memory_order_relaxed); |
1612 | if (slotValue == tokenlessSlotValue() && |
1613 | slotPtr->compare_exchange_strong(slotValue, 0)) { |
1614 | tls_lastTokenlessSlot = bestSlot ^ i; |
1615 | return true; |
1616 | } |
1617 | } |
1618 | return false; |
1619 | } |
1620 | |
1621 | template < |
1622 | bool ReaderPriority, |
1623 | typename Tag_, |
1624 | template <typename> class Atom, |
1625 | bool BlockImmediately, |
1626 | bool AnnotateForThreadSanitizer> |
1627 | template <class WaitContext> |
1628 | bool SharedMutexImpl< |
1629 | ReaderPriority, |
1630 | Tag_, |
1631 | Atom, |
1632 | BlockImmediately, |
1633 | AnnotateForThreadSanitizer>:: |
1634 | lockSharedImpl(uint32_t& state, Token* token, WaitContext& ctx) { |
1635 | while (true) { |
1636 | if (UNLIKELY((state & kHasE) != 0) && |
1637 | !waitForZeroBits(state, kHasE, kWaitingS, ctx) && ctx.canTimeOut()) { |
1638 | return false; |
1639 | } |
1640 | |
1641 | uint32_t slot = tls_lastDeferredReaderSlot; |
1642 | uintptr_t slotValue = 1; // any non-zero value will do |
1643 | |
1644 | bool canAlreadyDefer = (state & kMayDefer) != 0; |
1645 | bool aboveDeferThreshold = |
1646 | (state & kHasS) >= (kNumSharedToStartDeferring - 1) * kIncrHasS; |
1647 | bool drainInProgress = ReaderPriority && (state & kBegunE) != 0; |
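    // Defer only when deferral is already enabled for this lock, or when
    // there are enough inline readers to make the slot search worthwhile
    // and (under reader priority) no exclusive acquisition has begun.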
1648 | if (canAlreadyDefer || (aboveDeferThreshold && !drainInProgress)) { |
      // Try using the most recent slot first.
1650 | slotValue = deferredReader(slot)->load(std::memory_order_relaxed); |
1651 | if (slotValue != 0) { |
1652 | // starting point for our empty-slot search, can change after |
1653 | // calling waitForZeroBits |
1654 | uint32_t bestSlot = |
1655 | (uint32_t)folly::AccessSpreader<Atom>::current(kMaxDeferredReaders); |
1656 | |
1657 | // deferred readers are already enabled, or it is time to |
1658 | // enable them if we can find a slot |
1659 | for (uint32_t i = 0; i < kDeferredSearchDistance; ++i) { |
1660 | slot = bestSlot ^ i; |
1661 | assert(slot < kMaxDeferredReaders); |
1662 | slotValue = deferredReader(slot)->load(std::memory_order_relaxed); |
1663 | if (slotValue == 0) { |
1664 | // found empty slot |
1665 | tls_lastDeferredReaderSlot = slot; |
1666 | break; |
1667 | } |
1668 | } |
1669 | } |
1670 | } |
1671 | |
1672 | if (slotValue != 0) { |
1673 | // not yet deferred, or no empty slots |
1674 | if (state_.compare_exchange_strong(state, state + kIncrHasS)) { |
1675 | // successfully recorded the read lock inline |
1676 | if (token != nullptr) { |
1677 | token->type_ = Token::Type::INLINE_SHARED; |
1678 | } |
1679 | return true; |
1680 | } |
1681 | // state is updated, try again |
1682 | continue; |
1683 | } |
1684 | |
1685 | // record that deferred readers might be in use if necessary |
1686 | if ((state & kMayDefer) == 0) { |
1687 | if (!state_.compare_exchange_strong(state, state | kMayDefer)) { |
1688 | // keep going if CAS failed because somebody else set the bit |
1689 | // for us |
1690 | if ((state & (kHasE | kMayDefer)) != kMayDefer) { |
1691 | continue; |
1692 | } |
1693 | } |
      // kMayDefer is now set in state_; our local copy of state may be
      // stale, but it is reloaded below after the slot CAS.
1695 | } |
1696 | |
1697 | // try to use the slot |
1698 | bool gotSlot = deferredReader(slot)->compare_exchange_strong( |
1699 | slotValue, |
1700 | token == nullptr ? tokenlessSlotValue() : tokenfulSlotValue()); |
1701 | |
1702 | // If we got the slot, we need to verify that an exclusive lock |
1703 | // didn't happen since we last checked. If we didn't get the slot we |
1704 | // need to recheck state_ anyway to make sure we don't waste too much |
1705 | // work. It is also possible that since we checked state_ someone |
1706 | // has acquired and released the write lock, clearing kMayDefer. |
1707 | // Both cases are covered by looking for the readers-possible bit, |
1708 | // because it is off when the exclusive lock bit is set. |
1709 | state = state_.load(std::memory_order_acquire); |
1710 | |
1711 | if (!gotSlot) { |
1712 | continue; |
1713 | } |
1714 | |
1715 | if (token == nullptr) { |
1716 | tls_lastTokenlessSlot = slot; |
1717 | } |
1718 | |
1719 | if ((state & kMayDefer) != 0) { |
1720 | assert((state & kHasE) == 0); |
1721 | // success |
1722 | if (token != nullptr) { |
1723 | token->type_ = Token::Type::DEFERRED_SHARED; |
1724 | token->slot_ = (uint16_t)slot; |
1725 | } |
1726 | return true; |
1727 | } |
1728 | |
1729 | // release the slot before retrying |
1730 | if (token == nullptr) { |
      // We can't rely on our specific slot here: token-less slot values
      // can be freed by any unlock_shared(), so we need to do the full
      // deferredReader search during unlock.  Unlike unlock_shared(), we
      // also can't trust
1734 | // kPrevDefer here. This deferred lock isn't visible to lock() |
1735 | // (that's the whole reason we're undoing it) so there might have |
1736 | // subsequently been an unlock() and lock() with no intervening |
1737 | // transition to deferred mode. |
1738 | if (!tryUnlockTokenlessSharedDeferred()) { |
1739 | unlockSharedInline(); |
1740 | } |
1741 | } else { |
1742 | if (!tryUnlockSharedDeferred(slot)) { |
1743 | unlockSharedInline(); |
1744 | } |
1745 | } |
1746 | |
1747 | // We got here not because the lock was unavailable, but because |
1748 | // we lost a compare-and-swap. Try-lock is typically allowed to |
1749 | // have spurious failures, but there is no lock efficiency gain |
1750 | // from exploiting that freedom here. |
1751 | } |
1752 | } |
1753 | |
1754 | } // namespace folly |
1755 | |