#ifndef JEMALLOC_INTERNAL_TSD_H
#define JEMALLOC_INTERNAL_TSD_H

#include "jemalloc/internal/arena_types.h"
#include "jemalloc/internal/assert.h"
#include "jemalloc/internal/bin_types.h"
#include "jemalloc/internal/jemalloc_internal_externs.h"
#include "jemalloc/internal/prof_types.h"
#include "jemalloc/internal/ql.h"
#include "jemalloc/internal/rtree_tsd.h"
#include "jemalloc/internal/tcache_types.h"
#include "jemalloc/internal/tcache_structs.h"
#include "jemalloc/internal/util.h"
#include "jemalloc/internal/witness.h"

/*
 * Thread-Specific-Data layout
 * --- data accessed on tcache fast path: state, rtree_ctx, stats, prof ---
 * s: state
 * e: tcache_enabled
 * m: thread_allocated (config_stats)
 * f: thread_deallocated (config_stats)
 * p: prof_tdata (config_prof)
 * c: rtree_ctx (rtree cache accessed on deallocation)
 * t: tcache
 * --- data not accessed on tcache fast path: arena-related fields ---
 * d: arenas_tdata_bypass
 * r: reentrancy_level
 * x: narenas_tdata
 * i: iarena
 * a: arena
 * o: arenas_tdata
 * Loading TSD data is on the critical path of basically all malloc operations.
 * In particular, tcache and rtree_ctx rely on hot CPU cache to be effective.
 * Use a compact layout to reduce cache footprint.
 * +--- 64-bit and 64B cacheline; 1B each letter; First byte on the left. ---+
 * |---------------------------- 1st cacheline ----------------------------|
 * | sedrxxxx mmmmmmmm ffffffff pppppppp [c * 32 ........ ........ .......] |
 * |---------------------------- 2nd cacheline ----------------------------|
 * | [c * 64 ........ ........ ........ ........ ........ ........ .......] |
 * |---------------------------- 3rd cacheline ----------------------------|
 * | [c * 32 ........ ........ .......] iiiiiiii aaaaaaaa oooooooo [t...... |
 * +-------------------------------------------------------------------------+
 * Note: the entire tcache is embedded into TSD and spans multiple cachelines.
 *
 * The last 3 members (i, a and o) before tcache aren't really needed on the
 * tcache fast path. However, we have a number of unused tcache bins and
 * witnesses (never touched unless config_debug) at the end of tcache, so we
 * place them there to avoid breaking the cachelines and possibly paging in
 * an extra page.
 */
#ifdef JEMALLOC_JET
typedef void (*test_callback_t)(int *);
# define MALLOC_TSD_TEST_DATA_INIT 0x72b65c10
# define MALLOC_TEST_TSD \
    O(test_data, int, int) \
    O(test_callback, test_callback_t, int)
# define MALLOC_TEST_TSD_INITIALIZER , MALLOC_TSD_TEST_DATA_INIT, NULL
#else
# define MALLOC_TEST_TSD
# define MALLOC_TEST_TSD_INITIALIZER
#endif

/* O(name, type, nullable type) */
#define MALLOC_TSD \
    O(tcache_enabled, bool, bool) \
    O(arenas_tdata_bypass, bool, bool) \
    O(reentrancy_level, int8_t, int8_t) \
    O(narenas_tdata, uint32_t, uint32_t) \
    O(offset_state, uint64_t, uint64_t) \
    O(thread_allocated, uint64_t, uint64_t) \
    O(thread_deallocated, uint64_t, uint64_t) \
    O(bytes_until_sample, int64_t, int64_t) \
    O(prof_tdata, prof_tdata_t *, prof_tdata_t *) \
    O(rtree_ctx, rtree_ctx_t, rtree_ctx_t) \
    O(iarena, arena_t *, arena_t *) \
    O(arena, arena_t *, arena_t *) \
    O(arenas_tdata, arena_tdata_t *, arena_tdata_t *) \
    O(binshards, tsd_binshards_t, tsd_binshards_t) \
    O(tcache, tcache_t, tcache_t) \
    O(witness_tsd, witness_tsd_t, witness_tsdn_t) \
    MALLOC_TEST_TSD
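
/*
 * Illustrative sketch (not part of this header): MALLOC_TSD is an x-macro.
 * Each consumer defines O(name, type, nullable_type) to stamp out one
 * declaration or function per TSD field, then undefines it. For example, a
 * hypothetical consumer printing per-field sizes could write:
 *
 *   #define O(n, t, nt) printf(#n ": %zu bytes\n", sizeof(t));
 *   MALLOC_TSD
 *   #undef O
 *
 * which expands to one statement per field, starting with
 *   printf("tcache_enabled" ": %zu bytes\n", sizeof(bool));
 */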

/*
 * The initializers below are positional: they must stay in the same order
 * as the MALLOC_TSD fields above, preceded by the state field.
 */
#define TSD_INITIALIZER { \
    ATOMIC_INIT(tsd_state_uninitialized), \
    TCACHE_ENABLED_ZERO_INITIALIZER, \
    false, \
    0, \
    0, \
    0, \
    0, \
    0, \
    0, \
    NULL, \
    RTREE_CTX_ZERO_INITIALIZER, \
    NULL, \
    NULL, \
    NULL, \
    TSD_BINSHARDS_ZERO_INITIALIZER, \
    TCACHE_ZERO_INITIALIZER, \
    WITNESS_TSD_INITIALIZER \
    MALLOC_TEST_TSD_INITIALIZER \
}

void *malloc_tsd_malloc(size_t size);
void malloc_tsd_dalloc(void *wrapper);
void malloc_tsd_cleanup_register(bool (*f)(void));
tsd_t *malloc_tsd_boot0(void);
void malloc_tsd_boot1(void);
void tsd_cleanup(void *arg);
tsd_t *tsd_fetch_slow(tsd_t *tsd, bool internal);
void tsd_state_set(tsd_t *tsd, uint8_t new_state);
void tsd_slow_update(tsd_t *tsd);
void tsd_prefork(tsd_t *tsd);
void tsd_postfork_parent(tsd_t *tsd);
void tsd_postfork_child(tsd_t *tsd);

/*
 * Call ..._inc when your module wants to take all threads down the slow
 * paths, and ..._dec when it no longer needs to.
 */
void tsd_global_slow_inc(tsdn_t *tsdn);
void tsd_global_slow_dec(tsdn_t *tsdn);
bool tsd_global_slow(void);
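
/*
 * A minimal usage sketch (hypothetical caller, not jemalloc code): a module
 * that must force every thread onto the slow path while it mutates global
 * state might bracket the mutation like so:
 *
 *   tsd_global_slow_inc(tsdn);
 *   ... change state that fast paths must not observe mid-update ...
 *   tsd_global_slow_dec(tsdn);
 *
 * The inc/dec pairing suggests a counter, so presumably independent modules
 * can hold the slow-path request concurrently.
 */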

enum {
	/* Common case --> jnz. */
	tsd_state_nominal = 0,
	/* Initialized but on slow path. */
	tsd_state_nominal_slow = 1,
	/*
	 * Some thread has changed global state in such a way that all nominal
	 * threads need to recompute their fast / slow status the next time
	 * they get a chance.
	 *
	 * Any thread can change another thread's status *to* recompute, but
	 * only the thread itself can change its status *from* recompute.
	 */
	tsd_state_nominal_recompute = 2,
	/*
	 * The above nominal states should be lower values. We use
	 * tsd_state_nominal_max to separate nominal states from threads in
	 * the process of being born / dying.
	 */
	tsd_state_nominal_max = 2,

	/*
	 * A thread might free() during its death as its only allocator
	 * action; in such scenarios, we need tsd, but set up in such a way
	 * that no cleanup is necessary.
	 */
	tsd_state_minimal_initialized = 3,
	/* States during which we know we're in thread death. */
	tsd_state_purgatory = 4,
	tsd_state_reincarnated = 5,
	/*
	 * What it says on the tin; tsd that hasn't been initialized. Note
	 * that even when the tsd struct lives in TLS, we need to keep track
	 * of stuff like whether or not our pthread destructors have been
	 * scheduled, so this really truly is different from the nominal
	 * state.
	 */
	tsd_state_uninitialized = 6
};

/*
 * Some TSD accesses can only be done in a nominal state. To enforce this,
 * we wrap TSD member access in a function that asserts on TSD state, and
 * mangle field names to prevent touching them accidentally.
 */
#define TSD_MANGLE(n) cant_access_tsd_items_directly_use_a_getter_or_setter_##n
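
/*
 * For example (expansion shown for illustration), TSD_MANGLE(tcache) yields
 * the field name cant_access_tsd_items_directly_use_a_getter_or_setter_tcache,
 * so any accidental direct access is self-documenting at the point of the
 * compiler error.
 */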

#ifdef JEMALLOC_U8_ATOMICS
# define tsd_state_t atomic_u8_t
# define tsd_atomic_load atomic_load_u8
# define tsd_atomic_store atomic_store_u8
# define tsd_atomic_exchange atomic_exchange_u8
#else
# define tsd_state_t atomic_u32_t
# define tsd_atomic_load atomic_load_u32
# define tsd_atomic_store atomic_store_u32
# define tsd_atomic_exchange atomic_exchange_u32
#endif

/* The actual tsd. */
struct tsd_s {
	/*
	 * The contents should be treated as totally opaque outside the tsd
	 * module. Access any thread-local state through the getters and
	 * setters below.
	 */

	/*
	 * We manually limit the state to a single byte, unless the 8-bit
	 * atomics are unavailable (which is rare).
	 */
	tsd_state_t state;
#define O(n, t, nt) \
	t TSD_MANGLE(n);
MALLOC_TSD
#undef O
};

JEMALLOC_ALWAYS_INLINE uint8_t
tsd_state_get(tsd_t *tsd) {
	/*
	 * This should be atomic. Unfortunately, compilers right now can't
	 * tell that this can be done as a memory comparison, and force a
	 * load into a register, which hurts fast-path performance.
	 */
	/* return atomic_load_u8(&tsd->state, ATOMIC_RELAXED); */
	return *(uint8_t *)&tsd->state;
}

/*
 * Wrapper around tsd_t that makes it possible to avoid implicit conversion
 * between tsd_t and tsdn_t, where tsdn_t is "nullable" and has to be
 * explicitly converted to tsd_t, which is non-nullable.
 */
struct tsdn_s {
	tsd_t tsd;
};
#define TSDN_NULL ((tsdn_t *)0)
JEMALLOC_ALWAYS_INLINE tsdn_t *
tsd_tsdn(tsd_t *tsd) {
	return (tsdn_t *)tsd;
}

JEMALLOC_ALWAYS_INLINE bool
tsdn_null(const tsdn_t *tsdn) {
	return tsdn == NULL;
}

JEMALLOC_ALWAYS_INLINE tsd_t *
tsdn_tsd(tsdn_t *tsdn) {
	assert(!tsdn_null(tsdn));

	return &tsdn->tsd;
}
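
/*
 * Usage sketch (foo and bar are hypothetical): code that can run before TSD
 * is available takes a tsdn_t * and branches on NULL; code that requires
 * TSD takes a tsd_t *. Conversions between the two are always explicit:
 *
 *   void foo(tsdn_t *tsdn) {
 *           if (tsdn_null(tsdn)) {
 *                   return;                      // no thread-local state yet
 *           }
 *           tsd_t *tsd = tsdn_tsd(tsdn);         // checked "downcast"
 *           bar(tsd_tsdn(tsd));                  // "upcast" is always safe
 *   }
 */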

/*
 * We put the platform-specific data declarations and inlines into their own
 * header files to avoid cluttering this file. They define tsd_boot0,
 * tsd_boot1, tsd_boot, tsd_booted_get, tsd_get_allocates, tsd_get, and
 * tsd_set.
 */
#ifdef JEMALLOC_MALLOC_THREAD_CLEANUP
#include "jemalloc/internal/tsd_malloc_thread_cleanup.h"
#elif (defined(JEMALLOC_TLS))
#include "jemalloc/internal/tsd_tls.h"
#elif (defined(_WIN32))
#include "jemalloc/internal/tsd_win.h"
#else
#include "jemalloc/internal/tsd_generic.h"
#endif

/*
 * tsd_foop_get_unsafe(tsd) returns a pointer to the thread-local instance of
 * foo. This omits some safety checks, and so can be used during tsd
 * initialization and cleanup.
 */
#define O(n, t, nt) \
JEMALLOC_ALWAYS_INLINE t * \
tsd_##n##p_get_unsafe(tsd_t *tsd) { \
	return &tsd->TSD_MANGLE(n); \
}
MALLOC_TSD
#undef O

/* tsd_foop_get(tsd) returns a pointer to the thread-local instance of foo. */
#define O(n, t, nt) \
JEMALLOC_ALWAYS_INLINE t * \
tsd_##n##p_get(tsd_t *tsd) { \
	/* \
	 * Because the state might change asynchronously if it's \
	 * nominal, we need to make sure that we only read it once. \
	 */ \
	uint8_t state = tsd_state_get(tsd); \
	assert(state == tsd_state_nominal || \
	    state == tsd_state_nominal_slow || \
	    state == tsd_state_nominal_recompute || \
	    state == tsd_state_reincarnated || \
	    state == tsd_state_minimal_initialized); \
	return tsd_##n##p_get_unsafe(tsd); \
}
MALLOC_TSD
#undef O

/*
 * tsdn_foop_get(tsdn) returns either the thread-local instance of foo (if
 * tsdn isn't NULL), or NULL (if tsdn is NULL), cast to the nullable pointer
 * type.
 */
#define O(n, t, nt) \
JEMALLOC_ALWAYS_INLINE nt * \
tsdn_##n##p_get(tsdn_t *tsdn) { \
	if (tsdn_null(tsdn)) { \
		return NULL; \
	} \
	tsd_t *tsd = tsdn_tsd(tsdn); \
	return (nt *)tsd_##n##p_get(tsd); \
}
MALLOC_TSD
#undef O
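
/*
 * Concretely (expansion shown for illustration), the entry
 * O(tcache_enabled, bool, bool) generates this family of pointer accessors:
 *
 *   bool *tsd_tcache_enabledp_get_unsafe(tsd_t *tsd);
 *   bool *tsd_tcache_enabledp_get(tsd_t *tsd);
 *   bool *tsdn_tcache_enabledp_get(tsdn_t *tsdn);   // NULL if tsdn is NULL
 *
 * plus the value getter and setter defined below.
 */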

/* tsd_foo_get(tsd) returns the value of the thread-local instance of foo. */
#define O(n, t, nt) \
JEMALLOC_ALWAYS_INLINE t \
tsd_##n##_get(tsd_t *tsd) { \
	return *tsd_##n##p_get(tsd); \
}
MALLOC_TSD
#undef O

/* tsd_foo_set(tsd, val) updates the thread-local instance of foo to be val. */
#define O(n, t, nt) \
JEMALLOC_ALWAYS_INLINE void \
tsd_##n##_set(tsd_t *tsd, t val) { \
	assert(tsd_state_get(tsd) != tsd_state_reincarnated && \
	    tsd_state_get(tsd) != tsd_state_minimal_initialized); \
	*tsd_##n##p_get(tsd) = val; \
}
MALLOC_TSD
#undef O
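
/*
 * Usage sketch (illustrative; usize is a hypothetical local holding the
 * usable size of a new allocation): the stats fast path reads and writes
 * its counters through the generated accessors:
 *
 *   uint64_t allocated = tsd_thread_allocated_get(tsd);
 *   tsd_thread_allocated_set(tsd, allocated + usize);
 */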

JEMALLOC_ALWAYS_INLINE void
tsd_assert_fast(tsd_t *tsd) {
	/*
	 * Note that our fastness assertion does *not* include global slowness
	 * counters; it's not in general possible to ensure that they won't
	 * change asynchronously from underneath us.
	 */
	assert(!malloc_slow && tsd_tcache_enabled_get(tsd) &&
	    tsd_reentrancy_level_get(tsd) == 0);
}

JEMALLOC_ALWAYS_INLINE bool
tsd_fast(tsd_t *tsd) {
	bool fast = (tsd_state_get(tsd) == tsd_state_nominal);
	if (fast) {
		tsd_assert_fast(tsd);
	}

	return fast;
}

JEMALLOC_ALWAYS_INLINE tsd_t *
tsd_fetch_impl(bool init, bool minimal) {
	tsd_t *tsd = tsd_get(init);

	if (!init && tsd_get_allocates() && tsd == NULL) {
		return NULL;
	}
	assert(tsd != NULL);

	if (unlikely(tsd_state_get(tsd) != tsd_state_nominal)) {
		return tsd_fetch_slow(tsd, minimal);
	}
	assert(tsd_fast(tsd));
	tsd_assert_fast(tsd);

	return tsd;
}

/* Get a minimal TSD that requires no cleanup. See comments in free(). */
JEMALLOC_ALWAYS_INLINE tsd_t *
tsd_fetch_min(void) {
	return tsd_fetch_impl(true, true);
}

/* For internal background threads use only. */
JEMALLOC_ALWAYS_INLINE tsd_t *
tsd_internal_fetch(void) {
	tsd_t *tsd = tsd_fetch_min();
	/* Use reincarnated state to prevent full initialization. */
	tsd_state_set(tsd, tsd_state_reincarnated);

	return tsd;
}

JEMALLOC_ALWAYS_INLINE tsd_t *
tsd_fetch(void) {
	return tsd_fetch_impl(true, false);
}
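
/*
 * Usage sketch (illustrative): a typical allocator entry point fetches TSD
 * once and then branches on fastness:
 *
 *   tsd_t *tsd = tsd_fetch();
 *   if (tsd_fast(tsd)) {
 *           ... tcache fast path ...
 *   } else {
 *           ... slow path ...
 *   }
 */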

static inline bool
tsd_nominal(tsd_t *tsd) {
	return (tsd_state_get(tsd) <= tsd_state_nominal_max);
}

JEMALLOC_ALWAYS_INLINE tsdn_t *
tsdn_fetch(void) {
	if (!tsd_booted_get()) {
		return NULL;
	}

	return tsd_tsdn(tsd_fetch_impl(false, false));
}

JEMALLOC_ALWAYS_INLINE rtree_ctx_t *
tsd_rtree_ctx(tsd_t *tsd) {
	return tsd_rtree_ctxp_get(tsd);
}

JEMALLOC_ALWAYS_INLINE rtree_ctx_t *
tsdn_rtree_ctx(tsdn_t *tsdn, rtree_ctx_t *fallback) {
	/*
	 * If tsd cannot be accessed, initialize the fallback rtree_ctx and
	 * return a pointer to it.
	 */
	if (unlikely(tsdn_null(tsdn))) {
		rtree_ctx_data_init(fallback);
		return fallback;
	}
	return tsd_rtree_ctx(tsdn_tsd(tsdn));
}
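
/*
 * Usage sketch (illustrative; fallback is a hypothetical local): callers
 * that may run without TSD keep a stack-allocated fallback so rtree lookups
 * still have a ctx to use:
 *
 *   rtree_ctx_t fallback;
 *   rtree_ctx_t *ctx = tsdn_rtree_ctx(tsdn, &fallback);
 *   ... pass ctx to rtree lookups ...
 */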

#endif /* JEMALLOC_INTERNAL_TSD_H */