#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/jemalloc_internal_includes.h"

#include "jemalloc/internal/sec.h"

static edata_t *sec_alloc(tsdn_t *tsdn, pai_t *self, size_t size,
    size_t alignment, bool zero, bool guarded, bool frequent_reuse,
    bool *deferred_work_generated);
static bool sec_expand(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    size_t old_size, size_t new_size, bool zero, bool *deferred_work_generated);
static bool sec_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    size_t old_size, size_t new_size, bool *deferred_work_generated);
static void sec_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    bool *deferred_work_generated);

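/* Initialize a bin: no fill in flight, no cached bytes, empty freelist. */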
static void
sec_bin_init(sec_bin_t *bin) {
	bin->being_batch_filled = false;
	bin->bytes_cur = 0;
	edata_list_active_init(&bin->freelist);
}

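/*
 * Carve the shards and their bins out of a single base allocation, then
 * initialize each shard.  Returns true on failure (base allocation or mutex
 * initialization), false on success.
 */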
bool
sec_init(tsdn_t *tsdn, sec_t *sec, base_t *base, pai_t *fallback,
    const sec_opts_t *opts) {
	assert(opts->max_alloc >= PAGE);

	size_t max_alloc = PAGE_FLOOR(opts->max_alloc);
	pszind_t npsizes = sz_psz2ind(max_alloc) + 1;

	size_t sz_shards = opts->nshards * sizeof(sec_shard_t);
	size_t sz_bins = opts->nshards * (size_t)npsizes * sizeof(sec_bin_t);
	size_t sz_alloc = sz_shards + sz_bins;
	void *dynalloc = base_alloc(tsdn, base, sz_alloc, CACHELINE);
	if (dynalloc == NULL) {
		return true;
	}
	sec_shard_t *shard_cur = (sec_shard_t *)dynalloc;
	sec->shards = shard_cur;
	sec_bin_t *bin_cur = (sec_bin_t *)&shard_cur[opts->nshards];
	/* Just for asserts, below. */
	sec_bin_t *bin_start = bin_cur;

	for (size_t i = 0; i < opts->nshards; i++) {
		sec_shard_t *shard = shard_cur;
		shard_cur++;
		bool err = malloc_mutex_init(&shard->mtx, "sec_shard",
		    WITNESS_RANK_SEC_SHARD, malloc_mutex_rank_exclusive);
		if (err) {
			return true;
		}
		shard->enabled = true;
		shard->bins = bin_cur;
		for (pszind_t j = 0; j < npsizes; j++) {
			sec_bin_init(&shard->bins[j]);
			bin_cur++;
		}
		shard->bytes_cur = 0;
		shard->to_flush_next = 0;
	}
	/*
	 * Should have exactly matched the bin_start to the first unused byte
	 * after the shards.
	 */
	assert((void *)shard_cur == (void *)bin_start);
	/* And the last bin to use up the last bytes of the allocation. */
	assert((char *)bin_cur == ((char *)dynalloc + sz_alloc));
	sec->fallback = fallback;

	sec->opts = *opts;
	sec->npsizes = npsizes;

	/*
	 * Initialize these last so that an improper use of an SEC whose
	 * initialization failed will segfault in an easy-to-spot way.
	 */
	sec->pai.alloc = &sec_alloc;
	sec->pai.alloc_batch = &pai_alloc_batch_default;
	sec->pai.expand = &sec_expand;
	sec->pai.shrink = &sec_shrink;
	sec->pai.dalloc = &sec_dalloc;
	sec->pai.dalloc_batch = &pai_dalloc_batch_default;

	return false;
}

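/*
 * Pick the shard the current thread should use, lazily choosing one at random
 * and caching the choice in TSD so that a given thread always hits the same
 * shard.
 */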
static sec_shard_t *
sec_shard_pick(tsdn_t *tsdn, sec_t *sec) {
	/*
	 * Eventually, we should implement affinity, tracking the source shard
	 * using the edata_t's newly freed up fields.  For now, just randomly
	 * distribute across all shards.
	 */
	if (tsdn_null(tsdn)) {
		return &sec->shards[0];
	}
	tsd_t *tsd = tsdn_tsd(tsdn);
	uint8_t *idxp = tsd_sec_shardp_get(tsd);
	if (*idxp == (uint8_t)-1) {
		/*
		 * First use; initialize using the trick from Daniel Lemire's
		 * "A fast alternative to the modulo reduction".  Use a 64-bit
		 * number to store 32 bits, since we'll deliberately overflow
		 * when we multiply by the number of shards.
		 */
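		/*
		 * For example, with nshards == 3, a rand32 drawn from
		 * [0, 2^32) maps to (rand32 * 3) >> 32, which lands in
		 * [0, 3) without any division.
		 */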
		uint64_t rand32 = prng_lg_range_u64(tsd_prng_statep_get(tsd),
		    32);
		uint32_t idx =
		    (uint32_t)((rand32 * (uint64_t)sec->opts.nshards) >> 32);
		assert(idx < (uint32_t)sec->opts.nshards);
		*idxp = (uint8_t)idx;
	}
	return &sec->shards[*idxp];
}

/*
 * Perhaps surprisingly, this can be called on the alloc pathways; if we hit an
 * empty cache, we'll try to fill it, which can push the shard over its limit.
 */
static void
sec_flush_some_and_unlock(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	edata_list_active_t to_flush;
	edata_list_active_init(&to_flush);
	while (shard->bytes_cur > sec->opts.bytes_after_flush) {
		/* Pick a victim. */
		sec_bin_t *bin = &shard->bins[shard->to_flush_next];

		/* Update our victim-picking state. */
		shard->to_flush_next++;
		if (shard->to_flush_next == sec->npsizes) {
			shard->to_flush_next = 0;
		}

		assert(shard->bytes_cur >= bin->bytes_cur);
		if (bin->bytes_cur != 0) {
			shard->bytes_cur -= bin->bytes_cur;
			bin->bytes_cur = 0;
			edata_list_active_concat(&to_flush, &bin->freelist);
		}
		/*
		 * Either bin->bytes_cur was 0, in which case we didn't touch
		 * the bin list but it should be empty anyways (or else we
		 * missed a bytes_cur update on a list modification), or it
		 * *wasn't* 0 and we emptied it ourselves.  Either way, it
		 * should be empty now.
		 */
		assert(edata_list_active_empty(&bin->freelist));
	}

	malloc_mutex_unlock(tsdn, &shard->mtx);
	bool deferred_work_generated = false;
	pai_dalloc_batch(tsdn, sec->fallback, &to_flush,
	    &deferred_work_generated);
}

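/*
 * Try to satisfy an allocation from the bin's freelist, with the shard mutex
 * held.  Returns NULL if the shard has been disabled or the bin is empty.
 */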
static edata_t *
sec_shard_alloc_locked(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard,
    sec_bin_t *bin) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	if (!shard->enabled) {
		return NULL;
	}
	edata_t *edata = edata_list_active_first(&bin->freelist);
	if (edata != NULL) {
		edata_list_active_remove(&bin->freelist, edata);
		assert(edata_size_get(edata) <= bin->bytes_cur);
		bin->bytes_cur -= edata_size_get(edata);
		assert(edata_size_get(edata) <= shard->bytes_cur);
		shard->bytes_cur -= edata_size_get(edata);
	}
	return edata;
}

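/*
 * The bin was empty: ask the fallback for 1 + batch_fill_extra extents in a
 * single batch, return the first one to the caller, and cache the rest.
 * Caching them may push the shard over max_bytes, in which case some bins get
 * flushed before we return.
 */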
static edata_t *
sec_batch_fill_and_alloc(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard,
    sec_bin_t *bin, size_t size) {
	malloc_mutex_assert_not_owner(tsdn, &shard->mtx);

	edata_list_active_t result;
	edata_list_active_init(&result);
	bool deferred_work_generated = false;
	size_t nalloc = pai_alloc_batch(tsdn, sec->fallback, size,
	    1 + sec->opts.batch_fill_extra, &result, &deferred_work_generated);

	edata_t *ret = edata_list_active_first(&result);
	if (ret != NULL) {
		edata_list_active_remove(&result, ret);
	}

	malloc_mutex_lock(tsdn, &shard->mtx);
	bin->being_batch_filled = false;
	/*
	 * Handle the easy case first: nothing to cache.  Note that this can
	 * only happen in case of OOM, since sec_alloc checks the expected
	 * number of allocs, and doesn't bother going down the batch_fill
	 * pathway if there won't be anything left to cache.  So to be in this
	 * code path, we must have asked for > 1 alloc, but gotten at most 1
	 * back.
	 */
	if (nalloc <= 1) {
		malloc_mutex_unlock(tsdn, &shard->mtx);
		return ret;
	}

	size_t new_cached_bytes = (nalloc - 1) * size;

	edata_list_active_concat(&bin->freelist, &result);
	bin->bytes_cur += new_cached_bytes;
	shard->bytes_cur += new_cached_bytes;

	if (shard->bytes_cur > sec->opts.max_bytes) {
		sec_flush_some_and_unlock(tsdn, sec, shard);
	} else {
		malloc_mutex_unlock(tsdn, &shard->mtx);
	}

	return ret;
}

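/*
 * pai_t alloc hook.  Requests the SEC can't serve (zeroed, aligned beyond a
 * page, larger than max_alloc, or when there are no shards) go straight to the
 * fallback.  Otherwise we try the picked shard's bin for this size class; on a
 * miss we either batch-fill the bin (if batch_fill_extra is enabled) or do a
 * plain fallback allocation.
 */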
static edata_t *
sec_alloc(tsdn_t *tsdn, pai_t *self, size_t size, size_t alignment, bool zero,
    bool guarded, bool frequent_reuse, bool *deferred_work_generated) {
	assert((size & PAGE_MASK) == 0);
	assert(!guarded);

	sec_t *sec = (sec_t *)self;

	if (zero || alignment > PAGE || sec->opts.nshards == 0
	    || size > sec->opts.max_alloc) {
		return pai_alloc(tsdn, sec->fallback, size, alignment, zero,
		    /* guarded */ false, frequent_reuse,
		    deferred_work_generated);
	}
	pszind_t pszind = sz_psz2ind(size);
	assert(pszind < sec->npsizes);

	sec_shard_t *shard = sec_shard_pick(tsdn, sec);
	sec_bin_t *bin = &shard->bins[pszind];
	bool do_batch_fill = false;

	malloc_mutex_lock(tsdn, &shard->mtx);
	edata_t *edata = sec_shard_alloc_locked(tsdn, sec, shard, bin);
	if (edata == NULL) {
		if (!bin->being_batch_filled
		    && sec->opts.batch_fill_extra > 0) {
			bin->being_batch_filled = true;
			do_batch_fill = true;
		}
	}
	malloc_mutex_unlock(tsdn, &shard->mtx);
	if (edata == NULL) {
		if (do_batch_fill) {
			edata = sec_batch_fill_and_alloc(tsdn, sec, shard, bin,
			    size);
		} else {
			edata = pai_alloc(tsdn, sec->fallback, size, alignment,
			    zero, /* guarded */ false, frequent_reuse,
			    deferred_work_generated);
		}
	}
	return edata;
}

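/*
 * Expand and shrink are pure pass-throughs to the fallback; the SEC only
 * caches and reuses whole extents.
 */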
static bool
sec_expand(tsdn_t *tsdn, pai_t *self, edata_t *edata, size_t old_size,
    size_t new_size, bool zero, bool *deferred_work_generated) {
	sec_t *sec = (sec_t *)self;
	return pai_expand(tsdn, sec->fallback, edata, old_size, new_size, zero,
	    deferred_work_generated);
}

static bool
sec_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata, size_t old_size,
    size_t new_size, bool *deferred_work_generated) {
	sec_t *sec = (sec_t *)self;
	return pai_shrink(tsdn, sec->fallback, edata, old_size, new_size,
	    deferred_work_generated);
}

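/*
 * Empty every bin in the shard and hand the extents back to the fallback in a
 * single batch.  Unlike the common flush path, this runs with the shard mutex
 * held throughout; see the comment below for why that's acceptable.
 */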
static void
sec_flush_all_locked(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	shard->bytes_cur = 0;
	edata_list_active_t to_flush;
	edata_list_active_init(&to_flush);
	for (pszind_t i = 0; i < sec->npsizes; i++) {
		sec_bin_t *bin = &shard->bins[i];
		bin->bytes_cur = 0;
		edata_list_active_concat(&to_flush, &bin->freelist);
	}

	/*
	 * Ordinarily we would try to avoid doing the batch deallocation while
	 * holding the shard mutex, but the flush_all pathways only happen when
	 * we're disabling the HPA or resetting the arena, both of which are
	 * rare pathways.
	 */
	bool deferred_work_generated = false;
	pai_dalloc_batch(tsdn, sec->fallback, &to_flush,
	    &deferred_work_generated);
}

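/*
 * Cache a freed extent in its size class's bin; if that pushes the shard over
 * max_bytes, flush some bins back to the fallback.  The shard mutex is dropped
 * before returning in either case.
 */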
static void
sec_shard_dalloc_and_unlock(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard,
    edata_t *edata) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	assert(shard->bytes_cur <= sec->opts.max_bytes);
	size_t size = edata_size_get(edata);
	pszind_t pszind = sz_psz2ind(size);
	assert(pszind < sec->npsizes);
	/*
	 * Prepending here results in LIFO allocation per bin, which seems
	 * reasonable.
	 */
	sec_bin_t *bin = &shard->bins[pszind];
	edata_list_active_prepend(&bin->freelist, edata);
	bin->bytes_cur += size;
	shard->bytes_cur += size;
	if (shard->bytes_cur > sec->opts.max_bytes) {
		/*
		 * We've exceeded the shard limit.  As a nod in the direction
		 * of fragmentation avoidance, we evict whole bins at a time
		 * (cycling through them round-robin) rather than individual
		 * extents, and we keep going until the shard drops below
		 * bytes_after_flush so that we don't flush again on every
		 * subsequent dalloc.  The evicted extents are handed back to
		 * the fallback only after the mutex has been dropped, inside
		 * sec_flush_some_and_unlock.  This also has the advantage of
		 * not requiring advanced cache balancing strategies.
		 */
		sec_flush_some_and_unlock(tsdn, sec, shard);
		malloc_mutex_assert_not_owner(tsdn, &shard->mtx);
	} else {
		malloc_mutex_unlock(tsdn, &shard->mtx);
	}
}

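/*
 * pai_t dalloc hook.  Extents too large to cache, or belonging to an SEC with
 * no shards, go straight back to the fallback; so does everything else once
 * the chosen shard has been disabled.
 */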
static void
sec_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    bool *deferred_work_generated) {
	sec_t *sec = (sec_t *)self;
	if (sec->opts.nshards == 0
	    || edata_size_get(edata) > sec->opts.max_alloc) {
		pai_dalloc(tsdn, sec->fallback, edata,
		    deferred_work_generated);
		return;
	}
	sec_shard_t *shard = sec_shard_pick(tsdn, sec);
	malloc_mutex_lock(tsdn, &shard->mtx);
	if (shard->enabled) {
		sec_shard_dalloc_and_unlock(tsdn, sec, shard, edata);
	} else {
		malloc_mutex_unlock(tsdn, &shard->mtx);
		pai_dalloc(tsdn, sec->fallback, edata,
		    deferred_work_generated);
	}
}

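/* Flush every shard's cached extents back to the fallback. */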
void
sec_flush(tsdn_t *tsdn, sec_t *sec) {
	for (size_t i = 0; i < sec->opts.nshards; i++) {
		malloc_mutex_lock(tsdn, &sec->shards[i].mtx);
		sec_flush_all_locked(tsdn, sec, &sec->shards[i]);
		malloc_mutex_unlock(tsdn, &sec->shards[i].mtx);
	}
}

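/*
 * Stop each shard from caching new extents, then flush whatever it already
 * holds.
 */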
void
sec_disable(tsdn_t *tsdn, sec_t *sec) {
	for (size_t i = 0; i < sec->opts.nshards; i++) {
		malloc_mutex_lock(tsdn, &sec->shards[i].mtx);
		sec->shards[i].enabled = false;
		sec_flush_all_locked(tsdn, sec, &sec->shards[i]);
		malloc_mutex_unlock(tsdn, &sec->shards[i].mtx);
	}
}

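/* Accumulate the bytes currently cached across all shards into stats->bytes. */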
void
sec_stats_merge(tsdn_t *tsdn, sec_t *sec, sec_stats_t *stats) {
	size_t sum = 0;
	for (size_t i = 0; i < sec->opts.nshards; i++) {
		/*
		 * We could save these lock acquisitions by making bytes_cur
		 * atomic, but stats collection is rare anyways and we expect
		 * the number and type of stats to get more interesting.
		 */
		malloc_mutex_lock(tsdn, &sec->shards[i].mtx);
		sum += sec->shards[i].bytes_cur;
		malloc_mutex_unlock(tsdn, &sec->shards[i].mtx);
	}
	stats->bytes += sum;
}

void
sec_mutex_stats_read(tsdn_t *tsdn, sec_t *sec,
    mutex_prof_data_t *mutex_prof_data) {
	for (size_t i = 0; i < sec->opts.nshards; i++) {
		malloc_mutex_lock(tsdn, &sec->shards[i].mtx);
		malloc_mutex_prof_accum(tsdn, mutex_prof_data,
		    &sec->shards[i].mtx);
		malloc_mutex_unlock(tsdn, &sec->shards[i].mtx);
	}
}

void
sec_prefork2(tsdn_t *tsdn, sec_t *sec) {
	for (size_t i = 0; i < sec->opts.nshards; i++) {
		malloc_mutex_prefork(tsdn, &sec->shards[i].mtx);
	}
}

void
sec_postfork_parent(tsdn_t *tsdn, sec_t *sec) {
	for (size_t i = 0; i < sec->opts.nshards; i++) {
		malloc_mutex_postfork_parent(tsdn, &sec->shards[i].mtx);
	}
}

void
sec_postfork_child(tsdn_t *tsdn, sec_t *sec) {
	for (size_t i = 0; i < sec->opts.nshards; i++) {
		malloc_mutex_postfork_child(tsdn, &sec->shards[i].mtx);
	}
}