#ifndef JEMALLOC_INTERNAL_TSD_H
#define JEMALLOC_INTERNAL_TSD_H

#include "jemalloc/internal/arena_types.h"
#include "jemalloc/internal/assert.h"
#include "jemalloc/internal/bin_types.h"
#include "jemalloc/internal/jemalloc_internal_externs.h"
#include "jemalloc/internal/prof_types.h"
#include "jemalloc/internal/ql.h"
#include "jemalloc/internal/rtree_tsd.h"
#include "jemalloc/internal/tcache_types.h"
#include "jemalloc/internal/tcache_structs.h"
#include "jemalloc/internal/util.h"
#include "jemalloc/internal/witness.h"

/*
 * Thread-Specific-Data layout
 * --- data accessed on tcache fast path: state, rtree_ctx, stats, prof ---
 * s: state
 * e: tcache_enabled
 * m: thread_allocated (config_stats)
 * f: thread_deallocated (config_stats)
 * p: prof_tdata (config_prof)
 * c: rtree_ctx (rtree cache accessed on deallocation)
 * t: tcache
 * --- data not accessed on tcache fast path: arena-related fields ---
 * d: arenas_tdata_bypass
 * r: reentrancy_level
 * x: narenas_tdata
 * i: iarena
 * a: arena
 * o: arenas_tdata
 * Loading TSD data is on the critical path of basically all malloc operations.
 * In particular, tcache and rtree_ctx rely on hot CPU cache to be effective.
 * Use a compact layout to reduce cache footprint.
 * +--- 64-bit and 64B cacheline; 1B each letter; First byte on the left. ---+
 * |---------------------------- 1st cacheline ----------------------------|
 * | sedrxxxx mmmmmmmm ffffffff pppppppp [c * 32 ........ ........ .......] |
 * |---------------------------- 2nd cacheline ----------------------------|
 * | [c * 64 ........ ........ ........ ........ ........ ........ .......] |
 * |---------------------------- 3rd cacheline ----------------------------|
 * | [c * 32 ........ ........ .......] iiiiiiii aaaaaaaa oooooooo [t...... |
 * +-------------------------------------------------------------------------+
 * Note: the entire tcache is embedded into TSD and spans multiple cachelines.
 *
 * The last 3 members (i, a and o) before tcache aren't really needed on the
 * tcache fast path.  However, we have a number of unused tcache bins and
 * witnesses (never touched unless config_debug) at the end of tcache, so we
 * place them there to avoid breaking the cachelines and possibly paging in an
 * extra page.
 */
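/*
 * Illustrative sketch (not part of this header): with C11 _Static_assert and
 * offsetof (from stddef.h), the intent of the diagram above could be
 * spot-checked at compile time, e.g.
 *
 *   _Static_assert(offsetof(tsd_t, state) == 0,
 *       "state is expected to start the first cacheline");
 *
 * The diagram is a guide rather than a contract; the exact offsets follow
 * from the MALLOC_TSD field list below.
 */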
#ifdef JEMALLOC_JET
typedef void (*test_callback_t)(int *);
# define MALLOC_TSD_TEST_DATA_INIT 0x72b65c10
# define MALLOC_TEST_TSD \
	O(test_data,            int,             int) \
	O(test_callback,        test_callback_t, int)
# define MALLOC_TEST_TSD_INITIALIZER , MALLOC_TSD_TEST_DATA_INIT, NULL
#else
# define MALLOC_TEST_TSD
# define MALLOC_TEST_TSD_INITIALIZER
#endif

/* O(name, type, nullable type) */
#define MALLOC_TSD \
	O(tcache_enabled,       bool,             bool) \
	O(arenas_tdata_bypass,  bool,             bool) \
	O(reentrancy_level,     int8_t,           int8_t) \
	O(narenas_tdata,        uint32_t,         uint32_t) \
	O(offset_state,         uint64_t,         uint64_t) \
	O(thread_allocated,     uint64_t,         uint64_t) \
	O(thread_deallocated,   uint64_t,         uint64_t) \
	O(bytes_until_sample,   int64_t,          int64_t) \
	O(prof_tdata,           prof_tdata_t *,   prof_tdata_t *) \
	O(rtree_ctx,            rtree_ctx_t,      rtree_ctx_t) \
	O(iarena,               arena_t *,        arena_t *) \
	O(arena,                arena_t *,        arena_t *) \
	O(arenas_tdata,         arena_tdata_t *,  arena_tdata_t *) \
	O(binshards,            tsd_binshards_t,  tsd_binshards_t) \
	O(tcache,               tcache_t,         tcache_t) \
	O(witness_tsd,          witness_tsd_t,    witness_tsdn_t) \
	MALLOC_TEST_TSD

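/*
 * MALLOC_TSD is an x-macro: each consumer defines O(name, type, nullable
 * type) to stamp out one declaration per field and then expands MALLOC_TSD.
 * As an illustrative sketch (the real expansions appear further down in this
 * file), the struct-field consumer turns
 *
 *   O(thread_allocated,     uint64_t,         uint64_t)
 *
 * into
 *
 *   uint64_t cant_access_tsd_items_directly_use_a_getter_or_setter_thread_allocated;
 *
 * Note that TSD_INITIALIZER below must supply one initializer per field, in
 * the same order as MALLOC_TSD (plus the leading state initializer).
 */
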
#define TSD_INITIALIZER { \
	ATOMIC_INIT(tsd_state_uninitialized), \
	TCACHE_ENABLED_ZERO_INITIALIZER, \
	false, \
	0, \
	0, \
	0, \
	0, \
	0, \
	0, \
	NULL, \
	RTREE_CTX_ZERO_INITIALIZER, \
	NULL, \
	NULL, \
	NULL, \
	TSD_BINSHARDS_ZERO_INITIALIZER, \
	TCACHE_ZERO_INITIALIZER, \
	WITNESS_TSD_INITIALIZER \
	MALLOC_TEST_TSD_INITIALIZER \
}

void *malloc_tsd_malloc(size_t size);
void malloc_tsd_dalloc(void *wrapper);
void malloc_tsd_cleanup_register(bool (*f)(void));
tsd_t *malloc_tsd_boot0(void);
void malloc_tsd_boot1(void);
void tsd_cleanup(void *arg);
tsd_t *tsd_fetch_slow(tsd_t *tsd, bool internal);
void tsd_state_set(tsd_t *tsd, uint8_t new_state);
void tsd_slow_update(tsd_t *tsd);
void tsd_prefork(tsd_t *tsd);
void tsd_postfork_parent(tsd_t *tsd);
void tsd_postfork_child(tsd_t *tsd);

/*
 * Call ..._inc when your module wants to take all threads down the slow paths,
 * and ..._dec when it no longer needs to.
 */
void tsd_global_slow_inc(tsdn_t *tsdn);
void tsd_global_slow_dec(tsdn_t *tsdn);
bool tsd_global_slow(void);
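/*
 * Illustrative sketch (hypothetical module, not part of this header): a
 * subsystem that needs every thread on the slow path for the duration of some
 * global operation might do
 *
 *   void
 *   my_module_global_pause(tsdn_t *tsdn) {
 *       tsd_global_slow_inc(tsdn);
 *       ...do the work that requires slow-path behavior...
 *       tsd_global_slow_dec(tsdn);
 *   }
 *
 * Incrementing the counter typically flips other threads' states to
 * tsd_state_nominal_recompute (see the state enum below), so they re-evaluate
 * their fast / slow status the next time they fetch tsd.
 */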

enum {
	/* Common case --> jnz. */
	tsd_state_nominal = 0,
	/* Initialized but on slow path. */
	tsd_state_nominal_slow = 1,
	/*
	 * Some thread has changed global state in such a way that all nominal
	 * threads need to recompute their fast / slow status the next time
	 * they get a chance.
	 *
	 * Any thread can change another thread's status *to* recompute, but
	 * threads are the only ones who can change their status *from*
	 * recompute.
	 */
	tsd_state_nominal_recompute = 2,
	/*
	 * The above nominal states should be lower values.  We use
	 * tsd_state_nominal_max to separate nominal states from threads in
	 * the process of being born / dying.
	 */
	tsd_state_nominal_max = 2,

	/*
	 * A thread might free() during its death as its only allocator
	 * action; in such scenarios, we need tsd, but set up in such a way
	 * that no cleanup is necessary.
	 */
	tsd_state_minimal_initialized = 3,
	/* States during which we know we're in thread death. */
	tsd_state_purgatory = 4,
	tsd_state_reincarnated = 5,
	/*
	 * What it says on the tin; tsd that hasn't been initialized.  Note
	 * that even when the tsd struct lives in TLS, we need to keep track
	 * of stuff like whether or not our pthread destructors have been
	 * scheduled, so this really truly is different from the nominal
	 * state.
	 */
	tsd_state_uninitialized = 6
};

/*
 * Some TSD accesses can only be done in a nominal state.  To enforce this, we
 * wrap TSD member access in a function that asserts on TSD state, and mangle
 * field names to prevent touching them accidentally.
 */
#define TSD_MANGLE(n) cant_access_tsd_items_directly_use_a_getter_or_setter_##n

#ifdef JEMALLOC_U8_ATOMICS
# define tsd_state_t atomic_u8_t
# define tsd_atomic_load atomic_load_u8
# define tsd_atomic_store atomic_store_u8
# define tsd_atomic_exchange atomic_exchange_u8
#else
# define tsd_state_t atomic_u32_t
# define tsd_atomic_load atomic_load_u32
# define tsd_atomic_store atomic_store_u32
# define tsd_atomic_exchange atomic_exchange_u32
#endif

/* The actual tsd. */
struct tsd_s {
	/*
	 * The contents should be treated as totally opaque outside the tsd
	 * module.  Access any thread-local state through the getters and
	 * setters below.
	 */

	/*
	 * We manually limit the state to just a single byte, unless the 8-bit
	 * atomics are unavailable (which is rare).
	 */
	tsd_state_t state;
#define O(n, t, nt) \
	t TSD_MANGLE(n);
MALLOC_TSD
#undef O
};

JEMALLOC_ALWAYS_INLINE uint8_t
tsd_state_get(tsd_t *tsd) {
	/*
	 * This should be atomic.  Unfortunately, compilers right now can't
	 * tell that this can be done as a memory comparison, and force a
	 * load into a register, which hurts fast-path performance.
	 */
	/* return atomic_load_u8(&tsd->state, ATOMIC_RELAXED); */
	return *(uint8_t *)&tsd->state;
}

/*
 * Wrapper around tsd_t that makes it possible to avoid implicit conversion
 * between tsd_t and tsdn_t, where tsdn_t is "nullable" and has to be
 * explicitly converted to tsd_t, which is non-nullable.
 */
struct tsdn_s {
	tsd_t tsd;
};
#define TSDN_NULL ((tsdn_t *)0)
JEMALLOC_ALWAYS_INLINE tsdn_t *
tsd_tsdn(tsd_t *tsd) {
	return (tsdn_t *)tsd;
}

JEMALLOC_ALWAYS_INLINE bool
tsdn_null(const tsdn_t *tsdn) {
	return tsdn == NULL;
}

JEMALLOC_ALWAYS_INLINE tsd_t *
tsdn_tsd(tsdn_t *tsdn) {
	assert(!tsdn_null(tsdn));

	return &tsdn->tsd;
}
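
/*
 * Illustrative sketch of the intended usage pattern (do_something is a
 * hypothetical function, not part of this header): callers that may run
 * before tsd is available take a tsdn_t * and check it before converting.
 *
 *   static void
 *   do_something(tsdn_t *tsdn) {
 *       if (tsdn_null(tsdn)) {
 *           return;
 *       }
 *       tsd_t *tsd = tsdn_tsd(tsdn);
 *       ...
 *   }
 *
 * The other direction needs no check: tsd_tsdn(tsd) is always valid for a
 * non-NULL tsd.  tsdn_rtree_ctx() near the bottom of this file follows this
 * shape, with a fallback instead of an early return.
 */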

/*
 * We put the platform-specific data declarations and inlines into their own
 * header files to avoid cluttering this file.  They define tsd_boot0,
 * tsd_boot1, tsd_boot, tsd_booted_get, tsd_get_allocates, tsd_get, and
 * tsd_set.
 */
#ifdef JEMALLOC_MALLOC_THREAD_CLEANUP
#include "jemalloc/internal/tsd_malloc_thread_cleanup.h"
#elif (defined(JEMALLOC_TLS))
#include "jemalloc/internal/tsd_tls.h"
#elif (defined(_WIN32))
#include "jemalloc/internal/tsd_win.h"
#else
#include "jemalloc/internal/tsd_generic.h"
#endif

/*
 * tsd_foop_get_unsafe(tsd) returns a pointer to the thread-local instance of
 * foo.  This omits some safety checks, and so can be used during tsd
 * initialization and cleanup.
 */
#define O(n, t, nt) \
JEMALLOC_ALWAYS_INLINE t * \
tsd_##n##p_get_unsafe(tsd_t *tsd) { \
	return &tsd->TSD_MANGLE(n); \
}
MALLOC_TSD
#undef O
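
/*
 * For instance (illustrative expansion, following the MALLOC_TSD entry for
 * thread_allocated above), this stamps out
 *
 *   JEMALLOC_ALWAYS_INLINE uint64_t *
 *   tsd_thread_allocatedp_get_unsafe(tsd_t *tsd);
 *
 * and likewise one tsd_*p_get_unsafe() per MALLOC_TSD entry.
 */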

/* tsd_foop_get(tsd) returns a pointer to the thread-local instance of foo. */
#define O(n, t, nt) \
JEMALLOC_ALWAYS_INLINE t * \
tsd_##n##p_get(tsd_t *tsd) { \
	/* \
	 * Because the state might change asynchronously if it's \
	 * nominal, we need to make sure that we only read it once. \
	 */ \
	uint8_t state = tsd_state_get(tsd); \
	assert(state == tsd_state_nominal || \
	    state == tsd_state_nominal_slow || \
	    state == tsd_state_nominal_recompute || \
	    state == tsd_state_reincarnated || \
	    state == tsd_state_minimal_initialized); \
	return tsd_##n##p_get_unsafe(tsd); \
}
MALLOC_TSD
#undef O

/*
 * tsdn_foop_get(tsdn) returns either the thread-local instance of foo (if tsdn
 * isn't NULL), or NULL (if tsdn is NULL), cast to the nullable pointer type.
 */
#define O(n, t, nt) \
JEMALLOC_ALWAYS_INLINE nt * \
tsdn_##n##p_get(tsdn_t *tsdn) { \
	if (tsdn_null(tsdn)) { \
		return NULL; \
	} \
	tsd_t *tsd = tsdn_tsd(tsdn); \
	return (nt *)tsd_##n##p_get(tsd); \
}
MALLOC_TSD
#undef O

/* tsd_foo_get(tsd) returns the value of the thread-local instance of foo. */
#define O(n, t, nt) \
JEMALLOC_ALWAYS_INLINE t \
tsd_##n##_get(tsd_t *tsd) { \
	return *tsd_##n##p_get(tsd); \
}
MALLOC_TSD
#undef O

/* tsd_foo_set(tsd, val) updates the thread-local instance of foo to be val. */
#define O(n, t, nt) \
JEMALLOC_ALWAYS_INLINE void \
tsd_##n##_set(tsd_t *tsd, t val) { \
	assert(tsd_state_get(tsd) != tsd_state_reincarnated && \
	    tsd_state_get(tsd) != tsd_state_minimal_initialized); \
	*tsd_##n##p_get(tsd) = val; \
}
MALLOC_TSD
#undef O
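
/*
 * Illustrative sketch of the resulting API (the names are generated from the
 * MALLOC_TSD entries above; usize stands in for a size accounted by the
 * caller and is not defined here):
 *
 *   tsd_t *tsd = tsd_fetch();
 *   uint64_t allocated = tsd_thread_allocated_get(tsd);
 *   tsd_thread_allocated_set(tsd, allocated + usize);
 *
 * Setters assert that the thread is not in a reincarnated / minimal state,
 * so they must not be used on such tsds.
 */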

JEMALLOC_ALWAYS_INLINE void
tsd_assert_fast(tsd_t *tsd) {
	/*
	 * Note that our fastness assertion does *not* include global slowness
	 * counters; it's not in general possible to ensure that they won't
	 * change asynchronously from underneath us.
	 */
	assert(!malloc_slow && tsd_tcache_enabled_get(tsd) &&
	    tsd_reentrancy_level_get(tsd) == 0);
}

JEMALLOC_ALWAYS_INLINE bool
tsd_fast(tsd_t *tsd) {
	bool fast = (tsd_state_get(tsd) == tsd_state_nominal);
	if (fast) {
		tsd_assert_fast(tsd);
	}

	return fast;
}

JEMALLOC_ALWAYS_INLINE tsd_t *
tsd_fetch_impl(bool init, bool minimal) {
	tsd_t *tsd = tsd_get(init);

	if (!init && tsd_get_allocates() && tsd == NULL) {
		return NULL;
	}
	assert(tsd != NULL);

	if (unlikely(tsd_state_get(tsd) != tsd_state_nominal)) {
		return tsd_fetch_slow(tsd, minimal);
	}
	assert(tsd_fast(tsd));
	tsd_assert_fast(tsd);

	return tsd;
}

/* Get a minimal TSD that requires no cleanup.  See comments in free(). */
JEMALLOC_ALWAYS_INLINE tsd_t *
tsd_fetch_min(void) {
	return tsd_fetch_impl(true, true);
}

/* For internal background threads use only. */
JEMALLOC_ALWAYS_INLINE tsd_t *
tsd_internal_fetch(void) {
	tsd_t *tsd = tsd_fetch_min();
	/* Use reincarnated state to prevent full initialization. */
	tsd_state_set(tsd, tsd_state_reincarnated);

	return tsd;
}

JEMALLOC_ALWAYS_INLINE tsd_t *
tsd_fetch(void) {
	return tsd_fetch_impl(true, false);
}

static inline bool
tsd_nominal(tsd_t *tsd) {
	return (tsd_state_get(tsd) <= tsd_state_nominal_max);
}

JEMALLOC_ALWAYS_INLINE tsdn_t *
tsdn_fetch(void) {
	if (!tsd_booted_get()) {
		return NULL;
	}

	return tsd_tsdn(tsd_fetch_impl(false, false));
}

JEMALLOC_ALWAYS_INLINE rtree_ctx_t *
tsd_rtree_ctx(tsd_t *tsd) {
	return tsd_rtree_ctxp_get(tsd);
}

JEMALLOC_ALWAYS_INLINE rtree_ctx_t *
tsdn_rtree_ctx(tsdn_t *tsdn, rtree_ctx_t *fallback) {
	/*
	 * If tsd cannot be accessed, initialize the fallback rtree_ctx and
	 * return a pointer to it.
	 */
	if (unlikely(tsdn_null(tsdn))) {
		rtree_ctx_data_init(fallback);
		return fallback;
	}
	return tsd_rtree_ctx(tsdn_tsd(tsdn));
}
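
/*
 * Illustrative sketch of the fallback pattern (the variable names are
 * hypothetical): callers that might hold a NULL tsdn keep an rtree_ctx on
 * their own stack so the lookup cache still has somewhere to live.
 *
 *   rtree_ctx_t rtree_ctx_fallback;
 *   rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback);
 *   ...pass rtree_ctx to rtree lookups...
 */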

#endif /* JEMALLOC_INTERNAL_TSD_H */