#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/jemalloc_internal_includes.h"

#include "jemalloc/internal/hpa.h"

#include "jemalloc/internal/fb.h"
#include "jemalloc/internal/witness.h"

#define HPA_EDEN_SIZE (128 * HUGEPAGE)
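/*
 * On configurations where HUGEPAGE is 2 MiB, this works out to 256 MiB of
 * address space per eden refill; eden is then carved into hugepage-sized
 * pageslabs one at a time as shards grow.  (2 MiB is the common x86-64
 * hugepage size, an illustrative value rather than something the code
 * assumes.)
 */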

static edata_t *hpa_alloc(tsdn_t *tsdn, pai_t *self, size_t size,
    size_t alignment, bool zero, bool guarded, bool frequent_reuse,
    bool *deferred_work_generated);
static size_t hpa_alloc_batch(tsdn_t *tsdn, pai_t *self, size_t size,
    size_t nallocs, edata_list_active_t *results,
    bool *deferred_work_generated);
static bool hpa_expand(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    size_t old_size, size_t new_size, bool zero, bool *deferred_work_generated);
static bool hpa_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    size_t old_size, size_t new_size, bool *deferred_work_generated);
static void hpa_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    bool *deferred_work_generated);
static void hpa_dalloc_batch(tsdn_t *tsdn, pai_t *self,
    edata_list_active_t *list, bool *deferred_work_generated);
static uint64_t hpa_time_until_deferred_work(tsdn_t *tsdn, pai_t *self);

bool
hpa_supported() {
#ifdef _WIN32
	/*
	 * At least until the API and implementation is somewhat settled, we
	 * don't want to try to debug the VM subsystem on the hardest-to-test
	 * platform.
	 */
	return false;
#endif
	if (!pages_can_hugify) {
		return false;
	}
	/*
	 * We fundamentally rely on an address-space-hungry growth strategy
	 * for hugepages; that only makes sense with 64-bit pointers
	 * (LG_SIZEOF_PTR == 3).
	 */
	if (LG_SIZEOF_PTR != 3) {
		return false;
	}
	/*
	 * If we couldn't detect the value of HUGEPAGE, HUGEPAGE_PAGES becomes
	 * this sentinel value -- see the comment in pages.h.
	 */
	if (HUGEPAGE_PAGES == 1) {
		return false;
	}
	return true;
}

static void
hpa_do_consistency_checks(hpa_shard_t *shard) {
	assert(shard->base != NULL);
}

bool
hpa_central_init(hpa_central_t *central, base_t *base, const hpa_hooks_t *hooks) {
	/* malloc_conf processing should have filtered out these cases. */
	assert(hpa_supported());
	bool err;
	err = malloc_mutex_init(&central->grow_mtx, "hpa_central_grow",
	    WITNESS_RANK_HPA_CENTRAL_GROW, malloc_mutex_rank_exclusive);
	if (err) {
		return true;
	}
	err = malloc_mutex_init(&central->mtx, "hpa_central",
	    WITNESS_RANK_HPA_CENTRAL, malloc_mutex_rank_exclusive);
	if (err) {
		return true;
	}
	central->base = base;
	central->eden = NULL;
	central->eden_len = 0;
	central->age_counter = 0;
	central->hooks = *hooks;
	return false;
}

static hpdata_t *
hpa_alloc_ps(tsdn_t *tsdn, hpa_central_t *central) {
	return (hpdata_t *)base_alloc(tsdn, central->base, sizeof(hpdata_t),
	    CACHELINE);
}

hpdata_t *
hpa_central_extract(tsdn_t *tsdn, hpa_central_t *central, size_t size,
    bool *oom) {
	/* Don't yet support big allocations; these should get filtered out. */
	assert(size <= HUGEPAGE);
	/*
	 * Should only try to extract from the central allocator if the local
	 * shard is exhausted.  We should hold the grow_mtx on that shard.
	 */
	witness_assert_positive_depth_to_rank(
	    tsdn_witness_tsdp_get(tsdn), WITNESS_RANK_HPA_SHARD_GROW);

	malloc_mutex_lock(tsdn, &central->grow_mtx);
	*oom = false;

	hpdata_t *ps = NULL;

	/* Is eden a perfect fit? */
	if (central->eden != NULL && central->eden_len == HUGEPAGE) {
		ps = hpa_alloc_ps(tsdn, central);
		if (ps == NULL) {
			*oom = true;
			malloc_mutex_unlock(tsdn, &central->grow_mtx);
			return NULL;
		}
		hpdata_init(ps, central->eden, central->age_counter++);
		central->eden = NULL;
		central->eden_len = 0;
		malloc_mutex_unlock(tsdn, &central->grow_mtx);
		return ps;
	}

	/*
	 * We're about to try to allocate from eden by splitting.  If eden is
	 * NULL, we have to allocate it too.  Otherwise, we just have to
	 * allocate an hpdata_t for the new pageslab.
	 */
	if (central->eden == NULL) {
		/*
		 * During development, we're primarily concerned with systems
		 * with overcommit.  Eventually, we should be more careful
		 * here.
		 */
		bool commit = true;
		/* Allocate address space, bailing if we fail. */
		void *new_eden = pages_map(NULL, HPA_EDEN_SIZE, HUGEPAGE,
		    &commit);
		if (new_eden == NULL) {
			*oom = true;
			malloc_mutex_unlock(tsdn, &central->grow_mtx);
			return NULL;
		}
		ps = hpa_alloc_ps(tsdn, central);
		if (ps == NULL) {
			pages_unmap(new_eden, HPA_EDEN_SIZE);
			*oom = true;
			malloc_mutex_unlock(tsdn, &central->grow_mtx);
			return NULL;
		}
		central->eden = new_eden;
		central->eden_len = HPA_EDEN_SIZE;
	} else {
		/* Eden is already nonempty; only need an hpdata for ps. */
		ps = hpa_alloc_ps(tsdn, central);
		if (ps == NULL) {
			*oom = true;
			malloc_mutex_unlock(tsdn, &central->grow_mtx);
			return NULL;
		}
	}
	assert(ps != NULL);
	assert(central->eden != NULL);
	assert(central->eden_len > HUGEPAGE);
	assert(central->eden_len % HUGEPAGE == 0);
	assert(HUGEPAGE_ADDR2BASE(central->eden) == central->eden);

	hpdata_init(ps, central->eden, central->age_counter++);

	char *eden_char = (char *)central->eden;
	eden_char += HUGEPAGE;
	central->eden = (void *)eden_char;
	central->eden_len -= HUGEPAGE;

	malloc_mutex_unlock(tsdn, &central->grow_mtx);

	return ps;
}
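
/*
 * A worked example of the carving above: starting from an empty eden, the
 * first extraction maps HPA_EDEN_SIZE (128 hugepages) of address space, hands
 * the caller a pageslab covering the first hugepage, and leaves eden pointing
 * one HUGEPAGE further along with eden_len == 127 * HUGEPAGE.  The next 126
 * extractions split eden the same way without any new mappings; the 128th
 * finds eden_len == HUGEPAGE and takes the perfect-fit path at the top of the
 * function, consuming eden entirely.
 */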

bool
hpa_shard_init(hpa_shard_t *shard, hpa_central_t *central, emap_t *emap,
    base_t *base, edata_cache_t *edata_cache, unsigned ind,
    const hpa_shard_opts_t *opts) {
	/* malloc_conf processing should have filtered out these cases. */
	assert(hpa_supported());
	bool err;
	err = malloc_mutex_init(&shard->grow_mtx, "hpa_shard_grow",
	    WITNESS_RANK_HPA_SHARD_GROW, malloc_mutex_rank_exclusive);
	if (err) {
		return true;
	}
	err = malloc_mutex_init(&shard->mtx, "hpa_shard",
	    WITNESS_RANK_HPA_SHARD, malloc_mutex_rank_exclusive);
	if (err) {
		return true;
	}

	assert(edata_cache != NULL);
	shard->central = central;
	shard->base = base;
	edata_cache_fast_init(&shard->ecf, edata_cache);
	psset_init(&shard->psset);
	shard->age_counter = 0;
	shard->ind = ind;
	shard->emap = emap;

	shard->opts = *opts;

	shard->npending_purge = 0;
	nstime_init_zero(&shard->last_purge);

	shard->stats.npurge_passes = 0;
	shard->stats.npurges = 0;
	shard->stats.nhugifies = 0;
	shard->stats.ndehugifies = 0;

	/*
	 * Fill these in last, so that if an hpa_shard gets used despite
	 * initialization failing, we'll at least crash instead of just
	 * operating on corrupted data.
	 */
	shard->pai.alloc = &hpa_alloc;
	shard->pai.alloc_batch = &hpa_alloc_batch;
	shard->pai.expand = &hpa_expand;
	shard->pai.shrink = &hpa_shrink;
	shard->pai.dalloc = &hpa_dalloc;
	shard->pai.dalloc_batch = &hpa_dalloc_batch;
	shard->pai.time_until_deferred_work = &hpa_time_until_deferred_work;

	hpa_do_consistency_checks(shard);

	return false;
}

/*
 * Note that the stats functions here follow the usual stats naming
 * conventions; "merge" obtains the stats from some live object instance,
 * while "accum" only combines the stats from one stats object into another.
 * Hence the lack of locking here.
 */
static void
hpa_shard_nonderived_stats_accum(hpa_shard_nonderived_stats_t *dst,
    hpa_shard_nonderived_stats_t *src) {
	dst->npurge_passes += src->npurge_passes;
	dst->npurges += src->npurges;
	dst->nhugifies += src->nhugifies;
	dst->ndehugifies += src->ndehugifies;
}

void
hpa_shard_stats_accum(hpa_shard_stats_t *dst, hpa_shard_stats_t *src) {
	psset_stats_accum(&dst->psset_stats, &src->psset_stats);
	hpa_shard_nonderived_stats_accum(&dst->nonderived_stats,
	    &src->nonderived_stats);
}

void
hpa_shard_stats_merge(tsdn_t *tsdn, hpa_shard_t *shard,
    hpa_shard_stats_t *dst) {
	hpa_do_consistency_checks(shard);

	malloc_mutex_lock(tsdn, &shard->grow_mtx);
	malloc_mutex_lock(tsdn, &shard->mtx);
	psset_stats_accum(&dst->psset_stats, &shard->psset.stats);
	hpa_shard_nonderived_stats_accum(&dst->nonderived_stats, &shard->stats);
	malloc_mutex_unlock(tsdn, &shard->mtx);
	malloc_mutex_unlock(tsdn, &shard->grow_mtx);
}

static bool
hpa_good_hugification_candidate(hpa_shard_t *shard, hpdata_t *ps) {
	/*
	 * Note that this needs to be >= rather than just >, because of the
	 * important special case in which the hugification threshold is
	 * exactly HUGEPAGE.
	 */
	return hpdata_nactive_get(ps) * PAGE
	    >= shard->opts.hugification_threshold;
}
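
/*
 * A worked example: with 4 KiB pages and 2 MiB hugepages (so HUGEPAGE_PAGES
 * == 512), a hugification_threshold of HUGEPAGE demands that all 512 pages be
 * active, while a threshold of 0.9 * HUGEPAGE is met once 461 pages are
 * active (461 * 4096 >= 0.9 * 2097152).  The sizes here are illustrative, not
 * assumptions the code makes.
 */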

static size_t
hpa_adjusted_ndirty(tsdn_t *tsdn, hpa_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	return psset_ndirty(&shard->psset) - shard->npending_purge;
}

static size_t
hpa_ndirty_max(tsdn_t *tsdn, hpa_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	if (shard->opts.dirty_mult == (fxp_t)-1) {
		return (size_t)-1;
	}
	return fxp_mul_frac(psset_nactive(&shard->psset),
	    shard->opts.dirty_mult);
}
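
/*
 * A worked example (assuming the 16.16 fixed-point encoding fxp_t uses):
 * with dirty_mult == 0.25 (stored as 0x4000) and 1000 active pages,
 * hpa_ndirty_max returns fxp_mul_frac(1000, 0.25) == 250, so the shard
 * tolerates up to 250 dirty pages before purging kicks in.  The sentinel
 * (fxp_t)-1 disables the limit entirely.
 */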

static bool
hpa_hugify_blocked_by_ndirty(tsdn_t *tsdn, hpa_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	hpdata_t *to_hugify = psset_pick_hugify(&shard->psset);
	if (to_hugify == NULL) {
		return false;
	}
	return hpa_adjusted_ndirty(tsdn, shard)
	    + hpdata_nretained_get(to_hugify) > hpa_ndirty_max(tsdn, shard);
}

static bool
hpa_should_purge(tsdn_t *tsdn, hpa_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	if (hpa_adjusted_ndirty(tsdn, shard) > hpa_ndirty_max(tsdn, shard)) {
		return true;
	}
	if (hpa_hugify_blocked_by_ndirty(tsdn, shard)) {
		return true;
	}
	return false;
}

static void
hpa_update_purge_hugify_eligibility(tsdn_t *tsdn, hpa_shard_t *shard,
    hpdata_t *ps) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	if (hpdata_changing_state_get(ps)) {
		hpdata_purge_allowed_set(ps, false);
		hpdata_disallow_hugify(ps);
		return;
	}
	/*
	 * Hugepages are distinctly costly to purge, so try to avoid it unless
	 * they're *particularly* full of dirty pages.  Eventually, we should
	 * use a smarter / more dynamic heuristic for situations where we have
	 * to manually hugify.
	 *
	 * In situations where we don't manually hugify, this problem is
	 * reduced.  The "bad" situation we're trying to avoid is one that's
	 * common in some Linux configurations (where both enabled and defrag
	 * are set to madvise) and that can lead to long latency spikes on the
	 * first access after a hugification.  The ideal policy in such
	 * configurations is probably time-based for both purging and
	 * hugifying; only hugify a hugepage if it's met the criteria for some
	 * extended period of time, and only dehugify it if it's failed to
	 * meet the criteria for an extended period of time.  When background
	 * threads are on, we should try to take this hit on one of them, as
	 * well.
	 *
	 * I think the ideal setting is THP always enabled, and defrag set to
	 * deferred; in that case we don't need any explicit calls on the
	 * allocator's end at all; we just try to pack allocations in a
	 * hugepage-friendly manner and let the OS hugify in the background.
	 */
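	/*
	 * (The Linux knobs referred to above live in
	 * /sys/kernel/mm/transparent_hugepage/enabled and
	 * /sys/kernel/mm/transparent_hugepage/defrag.)
	 */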
	hpdata_purge_allowed_set(ps, hpdata_ndirty_get(ps) > 0);
	if (hpa_good_hugification_candidate(shard, ps)
	    && !hpdata_huge_get(ps)) {
		nstime_t now;
		shard->central->hooks.curtime(&now, /* first_reading */ true);
		hpdata_allow_hugify(ps, now);
	}
	/*
	 * Once a hugepage has become eligible for hugification, we don't mark
	 * it as ineligible just because it stops meeting the criteria (this
	 * could lead to situations where a hugepage that spends most of its
	 * time meeting the criteria never quite gets hugified if there are
	 * intervening deallocations).  The idea is that the hugification
	 * delay will allow them to get purged, resetting their
	 * "hugify-allowed" bit.  If they don't get purged, then the
	 * hugification isn't hurting and might help.  As an exception, we
	 * don't hugify hugepages that are now empty; it definitely doesn't
	 * help there until the hugepage gets reused, which is likely not for
	 * a while.
	 */
	if (hpdata_nactive_get(ps) == 0) {
		hpdata_disallow_hugify(ps);
	}
}

static bool
hpa_shard_has_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	hpdata_t *to_hugify = psset_pick_hugify(&shard->psset);
	return to_hugify != NULL || hpa_should_purge(tsdn, shard);
}

/* Returns whether or not we purged anything. */
static bool
hpa_try_purge(tsdn_t *tsdn, hpa_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);

	hpdata_t *to_purge = psset_pick_purge(&shard->psset);
	if (to_purge == NULL) {
		return false;
	}
	assert(hpdata_purge_allowed_get(to_purge));
	assert(!hpdata_changing_state_get(to_purge));

	/*
	 * Don't let anyone else purge or hugify this page while we're purging
	 * it (allocations and deallocations are OK).
	 */
	psset_update_begin(&shard->psset, to_purge);
	assert(hpdata_alloc_allowed_get(to_purge));
	hpdata_mid_purge_set(to_purge, true);
	hpdata_purge_allowed_set(to_purge, false);
	hpdata_disallow_hugify(to_purge);
	/*
	 * Unlike with hugification (where concurrent allocations are
	 * allowed), concurrent allocation out of a hugepage being purged is
	 * unsafe; we might hand out an extent for an allocation and then
	 * purge it (clearing out user data).
	 */
	hpdata_alloc_allowed_set(to_purge, false);
	psset_update_end(&shard->psset, to_purge);

	/* Gather all the metadata we'll need during the purge. */
	bool dehugify = hpdata_huge_get(to_purge);
	hpdata_purge_state_t purge_state;
	size_t num_to_purge = hpdata_purge_begin(to_purge, &purge_state);

	shard->npending_purge += num_to_purge;

	malloc_mutex_unlock(tsdn, &shard->mtx);

	/* Actually do the purging, now that the lock is dropped. */
	if (dehugify) {
		shard->central->hooks.dehugify(hpdata_addr_get(to_purge),
		    HUGEPAGE);
	}
	size_t total_purged = 0;
	uint64_t purges_this_pass = 0;
	void *purge_addr;
	size_t purge_size;
	while (hpdata_purge_next(to_purge, &purge_state, &purge_addr,
	    &purge_size)) {
		total_purged += purge_size;
		assert(total_purged <= HUGEPAGE);
		purges_this_pass++;
		shard->central->hooks.purge(purge_addr, purge_size);
	}

	malloc_mutex_lock(tsdn, &shard->mtx);
	/* The shard updates. */
	shard->npending_purge -= num_to_purge;
	shard->stats.npurge_passes++;
	shard->stats.npurges += purges_this_pass;
	shard->central->hooks.curtime(&shard->last_purge,
	    /* first_reading */ false);
	if (dehugify) {
		shard->stats.ndehugifies++;
	}

	/* The hpdata updates. */
	psset_update_begin(&shard->psset, to_purge);
	if (dehugify) {
		hpdata_dehugify(to_purge);
	}
	hpdata_purge_end(to_purge, &purge_state);
	hpdata_mid_purge_set(to_purge, false);

	hpdata_alloc_allowed_set(to_purge, true);
	hpa_update_purge_hugify_eligibility(tsdn, shard, to_purge);

	psset_update_end(&shard->psset, to_purge);

	return true;
}
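
/*
 * In outline, the purge protocol above: hpdata_purge_begin() snapshots the
 * dirty ranges to purge while the shard mutex is held, hpdata_purge_next()
 * walks that snapshot with the mutex dropped (safe because allocation out of
 * the slab is forbidden for the duration, though deallocation is not), and
 * hpdata_purge_end(), once the mutex is retaken, presumably reconciles the
 * snapshot with any deallocations that raced with the purge.
 */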

/* Returns whether or not we hugified anything. */
static bool
hpa_try_hugify(tsdn_t *tsdn, hpa_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);

	if (hpa_hugify_blocked_by_ndirty(tsdn, shard)) {
		return false;
	}

	hpdata_t *to_hugify = psset_pick_hugify(&shard->psset);
	if (to_hugify == NULL) {
		return false;
	}
	assert(hpdata_hugify_allowed_get(to_hugify));
	assert(!hpdata_changing_state_get(to_hugify));

	/* Make sure that it's been hugifiable for long enough. */
	nstime_t time_hugify_allowed = hpdata_time_hugify_allowed(to_hugify);
	uint64_t millis = shard->central->hooks.ms_since(&time_hugify_allowed);
	if (millis < shard->opts.hugify_delay_ms) {
		return false;
	}

	/*
	 * Don't let anyone else purge or hugify this page while we're
	 * hugifying it (allocations and deallocations are OK).
	 */
	psset_update_begin(&shard->psset, to_hugify);
	hpdata_mid_hugify_set(to_hugify, true);
	hpdata_purge_allowed_set(to_hugify, false);
	hpdata_disallow_hugify(to_hugify);
	assert(hpdata_alloc_allowed_get(to_hugify));
	psset_update_end(&shard->psset, to_hugify);

	malloc_mutex_unlock(tsdn, &shard->mtx);

	shard->central->hooks.hugify(hpdata_addr_get(to_hugify), HUGEPAGE);

	malloc_mutex_lock(tsdn, &shard->mtx);
	shard->stats.nhugifies++;

	psset_update_begin(&shard->psset, to_hugify);
	hpdata_hugify(to_hugify);
	hpdata_mid_hugify_set(to_hugify, false);
	hpa_update_purge_hugify_eligibility(tsdn, shard, to_hugify);
	psset_update_end(&shard->psset, to_hugify);

	return true;
}

/*
 * Execution of deferred work is forced if it's triggered by an explicit
 * hpa_shard_do_deferred_work() call.
 */
static void
hpa_shard_maybe_do_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard,
    bool forced) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	if (!forced && shard->opts.deferral_allowed) {
		return;
	}
	/*
	 * If we're on a background thread, do work so long as there's work to
	 * be done.  Otherwise, bound latency to not be *too* bad by doing at
	 * most a small fixed number of operations.
	 */
	bool hugified = false;
	bool purged = false;
	size_t max_ops = (forced ? (size_t)-1 : 16);
	size_t nops = 0;
	do {
		/*
		 * Always purge before hugifying, to make sure we get some
		 * ability to hit our quiescence targets.
		 */
		purged = false;
		while (hpa_should_purge(tsdn, shard) && nops < max_ops) {
			purged = hpa_try_purge(tsdn, shard);
			if (purged) {
				nops++;
			}
		}
		hugified = hpa_try_hugify(tsdn, shard);
		if (hugified) {
			nops++;
		}
		malloc_mutex_assert_owner(tsdn, &shard->mtx);
	} while ((hugified || purged) && nops < max_ops);
}
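
/*
 * Usage note: the forced path is reached via hpa_shard_do_deferred_work()
 * below, and via hpa_shard_set_deferral_allowed() when deferral is being
 * switched off; in those cases max_ops is effectively unbounded.  The
 * non-forced path, taken on ordinary allocation and deallocation, caps
 * itself at 16 operations to bound tail latency.
 */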

static edata_t *
hpa_try_alloc_one_no_grow(tsdn_t *tsdn, hpa_shard_t *shard, size_t size,
    bool *oom) {
	bool err;
	edata_t *edata = edata_cache_fast_get(tsdn, &shard->ecf);
	if (edata == NULL) {
		*oom = true;
		return NULL;
	}

	hpdata_t *ps = psset_pick_alloc(&shard->psset, size);
	if (ps == NULL) {
		edata_cache_fast_put(tsdn, &shard->ecf, edata);
		return NULL;
	}

	psset_update_begin(&shard->psset, ps);

	if (hpdata_empty(ps)) {
		/*
		 * If the pageslab used to be empty, treat it as though it's
		 * brand new for fragmentation-avoidance purposes; what we're
		 * trying to approximate is the age of the allocations *in*
		 * that pageslab, and the allocations in the new pageslab are
		 * definitionally the youngest in this hpa shard.
		 */
		hpdata_age_set(ps, shard->age_counter++);
	}

	void *addr = hpdata_reserve_alloc(ps, size);
	edata_init(edata, shard->ind, addr, size, /* slab */ false,
	    SC_NSIZES, /* sn */ hpdata_age_get(ps), extent_state_active,
	    /* zeroed */ false, /* committed */ true, EXTENT_PAI_HPA,
	    EXTENT_NOT_HEAD);
	edata_ps_set(edata, ps);

	/*
	 * This could theoretically be moved outside of the critical section,
	 * but that introduces the potential for a race.  Without the lock,
	 * the (initially nonempty, since this is the reuse pathway) pageslab
	 * we allocated out of could become otherwise empty while the lock is
	 * dropped.  This would force us to deal with a pageslab eviction down
	 * the error pathway, which is a pain.
	 */
	err = emap_register_boundary(tsdn, shard->emap, edata,
	    SC_NSIZES, /* slab */ false);
	if (err) {
		hpdata_unreserve(ps, edata_addr_get(edata),
		    edata_size_get(edata));
		/*
		 * We should arguably reset dirty state here, but this would
		 * require some sort of prepare + commit functionality that's
		 * a little much to deal with for now.
		 *
		 * We don't have a do_deferred_work down this pathway, on the
		 * principle that we didn't *really* affect shard state (we
		 * tweaked the stats, but our tweaks weren't really accurate).
		 */
		psset_update_end(&shard->psset, ps);
		edata_cache_fast_put(tsdn, &shard->ecf, edata);
		*oom = true;
		return NULL;
	}

	hpa_update_purge_hugify_eligibility(tsdn, shard, ps);
	psset_update_end(&shard->psset, ps);
	return edata;
}

static size_t
hpa_try_alloc_batch_no_grow(tsdn_t *tsdn, hpa_shard_t *shard, size_t size,
    bool *oom, size_t nallocs, edata_list_active_t *results,
    bool *deferred_work_generated) {
	malloc_mutex_lock(tsdn, &shard->mtx);
	size_t nsuccess = 0;
	for (; nsuccess < nallocs; nsuccess++) {
		edata_t *edata = hpa_try_alloc_one_no_grow(tsdn, shard, size,
		    oom);
		if (edata == NULL) {
			break;
		}
		edata_list_active_append(results, edata);
	}

	hpa_shard_maybe_do_deferred_work(tsdn, shard, /* forced */ false);
	*deferred_work_generated = hpa_shard_has_deferred_work(tsdn, shard);
	malloc_mutex_unlock(tsdn, &shard->mtx);
	return nsuccess;
}

static size_t
hpa_alloc_batch_psset(tsdn_t *tsdn, hpa_shard_t *shard, size_t size,
    size_t nallocs, edata_list_active_t *results,
    bool *deferred_work_generated) {
	assert(size <= shard->opts.slab_max_alloc);
	bool oom = false;

	size_t nsuccess = hpa_try_alloc_batch_no_grow(tsdn, shard, size, &oom,
	    nallocs, results, deferred_work_generated);

	if (nsuccess == nallocs || oom) {
		return nsuccess;
	}

	/*
	 * We didn't OOM, but weren't able to fill everything requested of us;
	 * try to grow.
	 */
	malloc_mutex_lock(tsdn, &shard->grow_mtx);
	/*
	 * Check for grow races; maybe some earlier thread expanded the psset
	 * in between when we dropped the main mutex and grabbed the grow
	 * mutex.
	 */
	nsuccess += hpa_try_alloc_batch_no_grow(tsdn, shard, size, &oom,
	    nallocs - nsuccess, results, deferred_work_generated);
	if (nsuccess == nallocs || oom) {
		malloc_mutex_unlock(tsdn, &shard->grow_mtx);
		return nsuccess;
	}

	/*
	 * Note that we don't hold shard->mtx here (while growing);
	 * deallocations (and allocations of smaller sizes) may still succeed
	 * while we're doing this potentially expensive system call.
	 */
	hpdata_t *ps = hpa_central_extract(tsdn, shard->central, size, &oom);
	if (ps == NULL) {
		malloc_mutex_unlock(tsdn, &shard->grow_mtx);
		return nsuccess;
	}

	/*
	 * We got the pageslab; allocate from it.  This does an unlock
	 * followed by a lock on the same mutex, and holds the grow mutex
	 * while doing deferred work, but this is an uncommon path; the
	 * simplicity is worth it.
	 */
	malloc_mutex_lock(tsdn, &shard->mtx);
	psset_insert(&shard->psset, ps);
	malloc_mutex_unlock(tsdn, &shard->mtx);

	nsuccess += hpa_try_alloc_batch_no_grow(tsdn, shard, size, &oom,
	    nallocs - nsuccess, results, deferred_work_generated);
	/*
	 * Drop grow_mtx before doing deferred work; other threads blocked on
	 * it should be allowed to proceed while we're working.
	 */
	malloc_mutex_unlock(tsdn, &shard->grow_mtx);

	return nsuccess;
}

static hpa_shard_t *
hpa_from_pai(pai_t *self) {
	assert(self->alloc == &hpa_alloc);
	assert(self->expand == &hpa_expand);
	assert(self->shrink == &hpa_shrink);
	assert(self->dalloc == &hpa_dalloc);
	return (hpa_shard_t *)self;
}
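
/*
 * The cast above relies on a layout requirement: the pai_t must be the first
 * member of hpa_shard_t, so that a pai_t * and the hpa_shard_t * containing
 * it share an address.  A minimal sketch of the pattern (field names
 * abbreviated; see hpa.h for the real definition):
 *
 *	struct hpa_shard_s {
 *		pai_t pai;	<- must remain the first member
 *		...
 *	};
 */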

static size_t
hpa_alloc_batch(tsdn_t *tsdn, pai_t *self, size_t size, size_t nallocs,
    edata_list_active_t *results, bool *deferred_work_generated) {
	assert(nallocs > 0);
	assert((size & PAGE_MASK) == 0);
	witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
	    WITNESS_RANK_CORE, 0);
	hpa_shard_t *shard = hpa_from_pai(self);

	if (size > shard->opts.slab_max_alloc) {
		return 0;
	}

	size_t nsuccess = hpa_alloc_batch_psset(tsdn, shard, size, nallocs,
	    results, deferred_work_generated);

	witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
	    WITNESS_RANK_CORE, 0);

	/*
	 * Guard the sanity checks with config_debug because the loop cannot
	 * be proven non-circular by the compiler, even if everything within
	 * the loop is optimized away.
	 */
	if (config_debug) {
		edata_t *edata;
		ql_foreach(edata, &results->head, ql_link_active) {
			emap_assert_mapped(tsdn, shard->emap, edata);
			assert(edata_pai_get(edata) == EXTENT_PAI_HPA);
			assert(edata_state_get(edata) == extent_state_active);
			assert(edata_arena_ind_get(edata) == shard->ind);
			assert(edata_szind_get_maybe_invalid(edata) ==
			    SC_NSIZES);
			assert(!edata_slab_get(edata));
			assert(edata_committed_get(edata));
			assert(edata_base_get(edata) == edata_addr_get(edata));
			assert(edata_base_get(edata) != NULL);
		}
	}
	return nsuccess;
}

static edata_t *
hpa_alloc(tsdn_t *tsdn, pai_t *self, size_t size, size_t alignment, bool zero,
    bool guarded, bool frequent_reuse, bool *deferred_work_generated) {
	assert((size & PAGE_MASK) == 0);
	assert(!guarded);
	witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
	    WITNESS_RANK_CORE, 0);

	/* We don't handle alignment or zeroing for now. */
	if (alignment > PAGE || zero) {
		return NULL;
	}
	/*
	 * An alloc with alignment == PAGE and zero == false is equivalent to
	 * a batch alloc of 1.  Just do that, so we can share code.
	 */
	edata_list_active_t results;
	edata_list_active_init(&results);
	size_t nallocs = hpa_alloc_batch(tsdn, self, size, /* nallocs */ 1,
	    &results, deferred_work_generated);
	assert(nallocs == 0 || nallocs == 1);
	edata_t *edata = edata_list_active_first(&results);
	return edata;
}

static bool
hpa_expand(tsdn_t *tsdn, pai_t *self, edata_t *edata, size_t old_size,
    size_t new_size, bool zero, bool *deferred_work_generated) {
	/* Expand not yet supported. */
	return true;
}

static bool
hpa_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    size_t old_size, size_t new_size, bool *deferred_work_generated) {
	/* Shrink not yet supported. */
	return true;
}

static void
hpa_dalloc_prepare_unlocked(tsdn_t *tsdn, hpa_shard_t *shard, edata_t *edata) {
	malloc_mutex_assert_not_owner(tsdn, &shard->mtx);

	assert(edata_pai_get(edata) == EXTENT_PAI_HPA);
	assert(edata_state_get(edata) == extent_state_active);
	assert(edata_arena_ind_get(edata) == shard->ind);
	assert(edata_szind_get_maybe_invalid(edata) == SC_NSIZES);
	assert(edata_committed_get(edata));
	assert(edata_base_get(edata) != NULL);

	/*
	 * Another thread shouldn't be trying to touch the metadata of an
	 * allocation being freed.  The one exception is a merge attempt from
	 * a lower-addressed PAC extent; in this case we have a nominal race
	 * on the edata metadata bits, but in practice the fact that the PAI
	 * bits are different will prevent any further access.  The race is
	 * bad, but benign in practice, and the long term plan is to track
	 * enough state in the rtree to prevent these merge attempts in the
	 * first place.
	 */
	edata_addr_set(edata, edata_base_get(edata));
	edata_zeroed_set(edata, false);
	emap_deregister_boundary(tsdn, shard->emap, edata);
}

static void
hpa_dalloc_locked(tsdn_t *tsdn, hpa_shard_t *shard, edata_t *edata) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);

	/*
	 * Release the metadata early, to avoid having to remember to do it
	 * while we're also doing tricky purging logic.  First, we need to
	 * grab a few bits of metadata from it.
	 *
	 * Note that the shard mutex protects ps's metadata too; it wouldn't
	 * be correct to try to read most information out of it without the
	 * lock.
	 */
	hpdata_t *ps = edata_ps_get(edata);
	/* Currently, all edatas come from pageslabs. */
	assert(ps != NULL);
	void *unreserve_addr = edata_addr_get(edata);
	size_t unreserve_size = edata_size_get(edata);
	edata_cache_fast_put(tsdn, &shard->ecf, edata);

	psset_update_begin(&shard->psset, ps);
	hpdata_unreserve(ps, unreserve_addr, unreserve_size);
	hpa_update_purge_hugify_eligibility(tsdn, shard, ps);
	psset_update_end(&shard->psset, ps);
}

static void
hpa_dalloc_batch(tsdn_t *tsdn, pai_t *self, edata_list_active_t *list,
    bool *deferred_work_generated) {
	hpa_shard_t *shard = hpa_from_pai(self);

	edata_t *edata;
	ql_foreach(edata, &list->head, ql_link_active) {
		hpa_dalloc_prepare_unlocked(tsdn, shard, edata);
	}

	malloc_mutex_lock(tsdn, &shard->mtx);
	/* Now, remove from the list. */
	while ((edata = edata_list_active_first(list)) != NULL) {
		edata_list_active_remove(list, edata);
		hpa_dalloc_locked(tsdn, shard, edata);
	}
	hpa_shard_maybe_do_deferred_work(tsdn, shard, /* forced */ false);
	*deferred_work_generated = hpa_shard_has_deferred_work(tsdn, shard);

	malloc_mutex_unlock(tsdn, &shard->mtx);
}

static void
hpa_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    bool *deferred_work_generated) {
	assert(!edata_guarded_get(edata));
	/* Just a dalloc_batch of size 1; this lets us share logic. */
	edata_list_active_t dalloc_list;
	edata_list_active_init(&dalloc_list);
	edata_list_active_append(&dalloc_list, edata);
	hpa_dalloc_batch(tsdn, self, &dalloc_list, deferred_work_generated);
}

/*
 * Calculate time until either purging or hugification ought to happen.
 * Called by background threads.
 */
static uint64_t
hpa_time_until_deferred_work(tsdn_t *tsdn, pai_t *self) {
	hpa_shard_t *shard = hpa_from_pai(self);
	uint64_t time_ns = BACKGROUND_THREAD_DEFERRED_MAX;

	malloc_mutex_lock(tsdn, &shard->mtx);

	hpdata_t *to_hugify = psset_pick_hugify(&shard->psset);
	if (to_hugify != NULL) {
		nstime_t time_hugify_allowed =
		    hpdata_time_hugify_allowed(to_hugify);
		uint64_t since_hugify_allowed_ms =
		    shard->central->hooks.ms_since(&time_hugify_allowed);
		/*
		 * If not enough time has passed since hugification was
		 * allowed, sleep for the rest.
		 */
		if (since_hugify_allowed_ms < shard->opts.hugify_delay_ms) {
			time_ns = shard->opts.hugify_delay_ms -
			    since_hugify_allowed_ms;
			time_ns *= 1000 * 1000;
		} else {
			malloc_mutex_unlock(tsdn, &shard->mtx);
			return BACKGROUND_THREAD_DEFERRED_MIN;
		}
	}

	if (hpa_should_purge(tsdn, shard)) {
		/*
		 * If we haven't purged before, no need to check interval
		 * between purges.  Simply purge as soon as possible.
		 */
		if (shard->stats.npurge_passes == 0) {
			malloc_mutex_unlock(tsdn, &shard->mtx);
			return BACKGROUND_THREAD_DEFERRED_MIN;
		}
		uint64_t since_last_purge_ms = shard->central->hooks.ms_since(
		    &shard->last_purge);

		if (since_last_purge_ms < shard->opts.min_purge_interval_ms) {
			uint64_t until_purge_ns;
			until_purge_ns = shard->opts.min_purge_interval_ms -
			    since_last_purge_ms;
			until_purge_ns *= 1000 * 1000;

			if (until_purge_ns < time_ns) {
				time_ns = until_purge_ns;
			}
		} else {
			time_ns = BACKGROUND_THREAD_DEFERRED_MIN;
		}
	}
	malloc_mutex_unlock(tsdn, &shard->mtx);
	return time_ns;
}
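
/*
 * A worked example of the ms-to-ns conversion above (with illustrative
 * values, not defaults): with hugify_delay_ms == 10000 and 2500 ms elapsed
 * since hugification became allowed, the shard asks to be woken in
 * (10000 - 2500) * 1000 * 1000 ns, i.e. 7.5 seconds, unless a purge deadline
 * arrives sooner.
 */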

void
hpa_shard_disable(tsdn_t *tsdn, hpa_shard_t *shard) {
	hpa_do_consistency_checks(shard);

	malloc_mutex_lock(tsdn, &shard->mtx);
	edata_cache_fast_disable(tsdn, &shard->ecf);
	malloc_mutex_unlock(tsdn, &shard->mtx);
}

static void
hpa_shard_assert_stats_empty(psset_bin_stats_t *bin_stats) {
	assert(bin_stats->npageslabs == 0);
	assert(bin_stats->nactive == 0);
}

static void
hpa_assert_empty(tsdn_t *tsdn, hpa_shard_t *shard, psset_t *psset) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	for (int huge = 0; huge <= 1; huge++) {
		hpa_shard_assert_stats_empty(&psset->stats.full_slabs[huge]);
		for (pszind_t i = 0; i < PSSET_NPSIZES; i++) {
			hpa_shard_assert_stats_empty(
			    &psset->stats.nonfull_slabs[i][huge]);
		}
	}
}

void
hpa_shard_destroy(tsdn_t *tsdn, hpa_shard_t *shard) {
	hpa_do_consistency_checks(shard);
	/*
	 * By the time we're here, the arena code should have dalloc'd all the
	 * active extents, which means we should have eventually evicted
	 * everything from the psset, so it shouldn't be able to serve even a
	 * 1-page allocation.
	 */
	if (config_debug) {
		malloc_mutex_lock(tsdn, &shard->mtx);
		hpa_assert_empty(tsdn, shard, &shard->psset);
		malloc_mutex_unlock(tsdn, &shard->mtx);
	}
	hpdata_t *ps;
	while ((ps = psset_pick_alloc(&shard->psset, PAGE)) != NULL) {
		/* There should be no allocations anywhere. */
		assert(hpdata_empty(ps));
		psset_remove(&shard->psset, ps);
		shard->central->hooks.unmap(hpdata_addr_get(ps), HUGEPAGE);
	}
}

void
hpa_shard_set_deferral_allowed(tsdn_t *tsdn, hpa_shard_t *shard,
    bool deferral_allowed) {
	hpa_do_consistency_checks(shard);

	malloc_mutex_lock(tsdn, &shard->mtx);
	bool deferral_previously_allowed = shard->opts.deferral_allowed;
	shard->opts.deferral_allowed = deferral_allowed;
	if (deferral_previously_allowed && !deferral_allowed) {
		hpa_shard_maybe_do_deferred_work(tsdn, shard,
		    /* forced */ true);
	}
	malloc_mutex_unlock(tsdn, &shard->mtx);
}

void
hpa_shard_do_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard) {
	hpa_do_consistency_checks(shard);

	malloc_mutex_lock(tsdn, &shard->mtx);
	hpa_shard_maybe_do_deferred_work(tsdn, shard, /* forced */ true);
	malloc_mutex_unlock(tsdn, &shard->mtx);
}

void
hpa_shard_prefork3(tsdn_t *tsdn, hpa_shard_t *shard) {
	hpa_do_consistency_checks(shard);

	malloc_mutex_prefork(tsdn, &shard->grow_mtx);
}

void
hpa_shard_prefork4(tsdn_t *tsdn, hpa_shard_t *shard) {
	hpa_do_consistency_checks(shard);

	malloc_mutex_prefork(tsdn, &shard->mtx);
}

void
hpa_shard_postfork_parent(tsdn_t *tsdn, hpa_shard_t *shard) {
	hpa_do_consistency_checks(shard);

	malloc_mutex_postfork_parent(tsdn, &shard->grow_mtx);
	malloc_mutex_postfork_parent(tsdn, &shard->mtx);
}

void
hpa_shard_postfork_child(tsdn_t *tsdn, hpa_shard_t *shard) {
	hpa_do_consistency_checks(shard);

	malloc_mutex_postfork_child(tsdn, &shard->grow_mtx);
	malloc_mutex_postfork_child(tsdn, &shard->mtx);
}