#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/jemalloc_internal_includes.h"

#include "jemalloc/internal/hpa.h"

#include "jemalloc/internal/fb.h"
#include "jemalloc/internal/witness.h"

#define HPA_EDEN_SIZE (128 * HUGEPAGE)

static edata_t *hpa_alloc(tsdn_t *tsdn, pai_t *self, size_t size,
    size_t alignment, bool zero, bool guarded, bool frequent_reuse,
    bool *deferred_work_generated);
static size_t hpa_alloc_batch(tsdn_t *tsdn, pai_t *self, size_t size,
    size_t nallocs, edata_list_active_t *results, bool *deferred_work_generated);
static bool hpa_expand(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    size_t old_size, size_t new_size, bool zero, bool *deferred_work_generated);
static bool hpa_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    size_t old_size, size_t new_size, bool *deferred_work_generated);
static void hpa_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    bool *deferred_work_generated);
static void hpa_dalloc_batch(tsdn_t *tsdn, pai_t *self,
    edata_list_active_t *list, bool *deferred_work_generated);
static uint64_t hpa_time_until_deferred_work(tsdn_t *tsdn, pai_t *self);

bool
hpa_supported() {
#ifdef _WIN32
	/*
	 * At least until the API and implementation are somewhat settled, we
	 * don't want to try to debug the VM subsystem on the hardest-to-test
	 * platform.
	 */
	return false;
#endif
	if (!pages_can_hugify) {
		return false;
	}
	/*
	 * We fundamentally rely on an address-space-hungry growth strategy for
	 * hugepages.
	 */
	if (LG_SIZEOF_PTR != 3) {
		return false;
	}
	/*
	 * If we couldn't detect the value of HUGEPAGE, HUGEPAGE_PAGES becomes
	 * this sentinel value -- see the comment in pages.h.
	 */
	if (HUGEPAGE_PAGES == 1) {
		return false;
	}
	return true;
}

static void
hpa_do_consistency_checks(hpa_shard_t *shard) {
	assert(shard->base != NULL);
}

bool
hpa_central_init(hpa_central_t *central, base_t *base, const hpa_hooks_t *hooks) {
	/* malloc_conf processing should have filtered out these cases. */
	assert(hpa_supported());
	bool err;
	err = malloc_mutex_init(&central->grow_mtx, "hpa_central_grow",
	    WITNESS_RANK_HPA_CENTRAL_GROW, malloc_mutex_rank_exclusive);
	if (err) {
		return true;
	}
	err = malloc_mutex_init(&central->mtx, "hpa_central",
	    WITNESS_RANK_HPA_CENTRAL, malloc_mutex_rank_exclusive);
	if (err) {
		return true;
	}
	central->base = base;
	central->eden = NULL;
	central->eden_len = 0;
	central->age_counter = 0;
	central->hooks = *hooks;
	return false;
}

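/*
 * Allocate the metadata (an hpdata_t) for a new pageslab from the central
 * base allocator.
 */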
static hpdata_t *
hpa_alloc_ps(tsdn_t *tsdn, hpa_central_t *central) {
	return (hpdata_t *)base_alloc(tsdn, central->base, sizeof(hpdata_t),
	    CACHELINE);
}

hpdata_t *
hpa_central_extract(tsdn_t *tsdn, hpa_central_t *central, size_t size,
    bool *oom) {
	/* Don't yet support big allocations; these should get filtered out. */
	assert(size <= HUGEPAGE);
	/*
	 * Should only try to extract from the central allocator if the local
	 * shard is exhausted. We should hold the grow_mtx on that shard.
	 */
	witness_assert_positive_depth_to_rank(
	    tsdn_witness_tsdp_get(tsdn), WITNESS_RANK_HPA_SHARD_GROW);

	malloc_mutex_lock(tsdn, &central->grow_mtx);
	*oom = false;

	hpdata_t *ps = NULL;

	/* Is eden a perfect fit? */
	if (central->eden != NULL && central->eden_len == HUGEPAGE) {
		ps = hpa_alloc_ps(tsdn, central);
		if (ps == NULL) {
			*oom = true;
			malloc_mutex_unlock(tsdn, &central->grow_mtx);
			return NULL;
		}
		hpdata_init(ps, central->eden, central->age_counter++);
		central->eden = NULL;
		central->eden_len = 0;
		malloc_mutex_unlock(tsdn, &central->grow_mtx);
		return ps;
	}

	/*
	 * We're about to try to allocate from eden by splitting. If eden is
	 * NULL, we have to allocate it too. Otherwise, we just have to
	 * allocate an hpdata_t to track the new pageslab.
	 */
	if (central->eden == NULL) {
		/*
		 * During development, we're primarily concerned with systems
		 * with overcommit. Eventually, we should be more careful here.
		 */
		bool commit = true;
		/* Allocate address space, bailing if we fail. */
		void *new_eden = pages_map(NULL, HPA_EDEN_SIZE, HUGEPAGE,
		    &commit);
		if (new_eden == NULL) {
			*oom = true;
			malloc_mutex_unlock(tsdn, &central->grow_mtx);
			return NULL;
		}
		ps = hpa_alloc_ps(tsdn, central);
		if (ps == NULL) {
			pages_unmap(new_eden, HPA_EDEN_SIZE);
			*oom = true;
			malloc_mutex_unlock(tsdn, &central->grow_mtx);
			return NULL;
		}
		central->eden = new_eden;
		central->eden_len = HPA_EDEN_SIZE;
	} else {
		/* Eden is already nonempty; only need an hpdata_t for ps. */
		ps = hpa_alloc_ps(tsdn, central);
		if (ps == NULL) {
			*oom = true;
			malloc_mutex_unlock(tsdn, &central->grow_mtx);
			return NULL;
		}
	}
	assert(ps != NULL);
	assert(central->eden != NULL);
	assert(central->eden_len > HUGEPAGE);
	assert(central->eden_len % HUGEPAGE == 0);
	assert(HUGEPAGE_ADDR2BASE(central->eden) == central->eden);

	hpdata_init(ps, central->eden, central->age_counter++);

	char *eden_char = (char *)central->eden;
	eden_char += HUGEPAGE;
	central->eden = (void *)eden_char;
	central->eden_len -= HUGEPAGE;

	malloc_mutex_unlock(tsdn, &central->grow_mtx);

	return ps;
}

bool
hpa_shard_init(hpa_shard_t *shard, hpa_central_t *central, emap_t *emap,
    base_t *base, edata_cache_t *edata_cache, unsigned ind,
    const hpa_shard_opts_t *opts) {
	/* malloc_conf processing should have filtered out these cases. */
	assert(hpa_supported());
	bool err;
	err = malloc_mutex_init(&shard->grow_mtx, "hpa_shard_grow",
	    WITNESS_RANK_HPA_SHARD_GROW, malloc_mutex_rank_exclusive);
	if (err) {
		return true;
	}
	err = malloc_mutex_init(&shard->mtx, "hpa_shard",
	    WITNESS_RANK_HPA_SHARD, malloc_mutex_rank_exclusive);
	if (err) {
		return true;
	}

	assert(edata_cache != NULL);
	shard->central = central;
	shard->base = base;
	edata_cache_fast_init(&shard->ecf, edata_cache);
	psset_init(&shard->psset);
	shard->age_counter = 0;
	shard->ind = ind;
	shard->emap = emap;

	shard->opts = *opts;

	shard->npending_purge = 0;
	nstime_init_zero(&shard->last_purge);

	shard->stats.npurge_passes = 0;
	shard->stats.npurges = 0;
	shard->stats.nhugifies = 0;
	shard->stats.ndehugifies = 0;

	/*
	 * Fill these in last, so that if an hpa_shard gets used despite
	 * initialization failing, we'll at least crash instead of just
	 * operating on corrupted data.
	 */
	shard->pai.alloc = &hpa_alloc;
	shard->pai.alloc_batch = &hpa_alloc_batch;
	shard->pai.expand = &hpa_expand;
	shard->pai.shrink = &hpa_shrink;
	shard->pai.dalloc = &hpa_dalloc;
	shard->pai.dalloc_batch = &hpa_dalloc_batch;
	shard->pai.time_until_deferred_work = &hpa_time_until_deferred_work;

	hpa_do_consistency_checks(shard);

	return false;
}

/*
 * Note that the stats functions here follow the usual stats naming conventions;
 * "merge" obtains the stats from some live object or instance, while "accum"
 * only combines the stats from one stats object into another. Hence the lack
 * of locking here.
 */
static void
hpa_shard_nonderived_stats_accum(hpa_shard_nonderived_stats_t *dst,
    hpa_shard_nonderived_stats_t *src) {
	dst->npurge_passes += src->npurge_passes;
	dst->npurges += src->npurges;
	dst->nhugifies += src->nhugifies;
	dst->ndehugifies += src->ndehugifies;
}

void
hpa_shard_stats_accum(hpa_shard_stats_t *dst, hpa_shard_stats_t *src) {
	psset_stats_accum(&dst->psset_stats, &src->psset_stats);
	hpa_shard_nonderived_stats_accum(&dst->nonderived_stats,
	    &src->nonderived_stats);
}

void
hpa_shard_stats_merge(tsdn_t *tsdn, hpa_shard_t *shard,
    hpa_shard_stats_t *dst) {
	hpa_do_consistency_checks(shard);

	malloc_mutex_lock(tsdn, &shard->grow_mtx);
	malloc_mutex_lock(tsdn, &shard->mtx);
	psset_stats_accum(&dst->psset_stats, &shard->psset.stats);
	hpa_shard_nonderived_stats_accum(&dst->nonderived_stats, &shard->stats);
	malloc_mutex_unlock(tsdn, &shard->mtx);
	malloc_mutex_unlock(tsdn, &shard->grow_mtx);
}

static bool
hpa_good_hugification_candidate(hpa_shard_t *shard, hpdata_t *ps) {
	/*
	 * Note that this needs to be >= rather than just >, because of the
	 * important special case in which the hugification threshold is exactly
	 * HUGEPAGE.
	 */
	return hpdata_nactive_get(ps) * PAGE
	    >= shard->opts.hugification_threshold;
}

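/*
 * Dirty pages currently in the shard, not counting those already queued up to
 * be purged by an in-flight purge pass.
 */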
static size_t
hpa_adjusted_ndirty(tsdn_t *tsdn, hpa_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	return psset_ndirty(&shard->psset) - shard->npending_purge;
}

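/*
 * Upper bound on the number of dirty pages we tolerate, as a fraction
 * (dirty_mult) of the active pages; a dirty_mult of (fxp_t)-1 disables the
 * limit.
 */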
static size_t
hpa_ndirty_max(tsdn_t *tsdn, hpa_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	if (shard->opts.dirty_mult == (fxp_t)-1) {
		return (size_t)-1;
	}
	return fxp_mul_frac(psset_nactive(&shard->psset),
	    shard->opts.dirty_mult);
}

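/*
 * Check whether the dirty-page limit blocks hugification: hugifying the
 * current candidate effectively brings its retained (previously purged) pages
 * back, so count them against the limit along with the adjusted dirty total.
 */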
static bool
hpa_hugify_blocked_by_ndirty(tsdn_t *tsdn, hpa_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	hpdata_t *to_hugify = psset_pick_hugify(&shard->psset);
	if (to_hugify == NULL) {
		return false;
	}
	return hpa_adjusted_ndirty(tsdn, shard)
	    + hpdata_nretained_get(to_hugify) > hpa_ndirty_max(tsdn, shard);
}

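/*
 * We purge when we exceed the dirty-page limit, or when purging is needed to
 * unblock a pending hugification.
 */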
static bool
hpa_should_purge(tsdn_t *tsdn, hpa_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	if (hpa_adjusted_ndirty(tsdn, shard) > hpa_ndirty_max(tsdn, shard)) {
		return true;
	}
	if (hpa_hugify_blocked_by_ndirty(tsdn, shard)) {
		return true;
	}
	return false;
}

static void
hpa_update_purge_hugify_eligibility(tsdn_t *tsdn, hpa_shard_t *shard,
    hpdata_t *ps) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	if (hpdata_changing_state_get(ps)) {
		hpdata_purge_allowed_set(ps, false);
		hpdata_disallow_hugify(ps);
		return;
	}
	/*
	 * Hugepages are distinctly costly to purge, so try to avoid it unless
	 * they're *particularly* full of dirty pages. Eventually, we should
	 * use a smarter / more dynamic heuristic for situations where we have
	 * to manually hugify.
	 *
	 * In situations where we don't manually hugify, this problem is
	 * reduced. The "bad" situation we're trying to avoid is one that's
	 * common in some Linux configurations (where both enabled and defrag
	 * are set to madvise) that can lead to long latency spikes on the first
	 * access after a hugification. The ideal policy in such configurations
	 * is probably time-based for both purging and hugifying; only hugify a
	 * hugepage if it's met the criteria for some extended period of time,
	 * and only dehugify it if it's failed to meet the criteria for an
	 * extended period of time. When background threads are on, we should
	 * try to take this hit on one of them, as well.
	 *
	 * I think the ideal setting is THP always enabled, and defrag set to
	 * deferred; in that case we don't need any explicit calls on the
	 * allocator's end at all; we just try to pack allocations in a
	 * hugepage-friendly manner and let the OS hugify in the background.
	 */
	hpdata_purge_allowed_set(ps, hpdata_ndirty_get(ps) > 0);
	if (hpa_good_hugification_candidate(shard, ps)
	    && !hpdata_huge_get(ps)) {
		nstime_t now;
		shard->central->hooks.curtime(&now, /* first_reading */ true);
		hpdata_allow_hugify(ps, now);
	}
	/*
	 * Once a hugepage has become eligible for hugification, we don't mark
	 * it as ineligible just because it stops meeting the criteria (this
	 * could lead to situations where a hugepage that spends most of its
	 * time meeting the criteria never quite gets hugified if there are
	 * intervening deallocations). The idea is that the hugification delay
	 * will allow them to get purged, resetting their "hugify-allowed" bit.
	 * If they don't get purged, then the hugification isn't hurting and
	 * might help. As an exception, we don't hugify hugepages that are now
	 * empty; it definitely doesn't help there until the hugepage gets
	 * reused, which is likely not for a while.
	 */
	if (hpdata_nactive_get(ps) == 0) {
		hpdata_disallow_hugify(ps);
	}
}

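/*
 * A shard has deferred work if there's a pageslab ready to be hugified, or if
 * purging is called for.
 */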
static bool
hpa_shard_has_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	hpdata_t *to_hugify = psset_pick_hugify(&shard->psset);
	return to_hugify != NULL || hpa_should_purge(tsdn, shard);
}

/* Returns whether or not we purged anything. */
static bool
hpa_try_purge(tsdn_t *tsdn, hpa_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);

	hpdata_t *to_purge = psset_pick_purge(&shard->psset);
	if (to_purge == NULL) {
		return false;
	}
	assert(hpdata_purge_allowed_get(to_purge));
	assert(!hpdata_changing_state_get(to_purge));

	/*
	 * Don't let anyone else purge or hugify this page while
	 * we're purging it (allocations and deallocations are
	 * OK).
	 */
	psset_update_begin(&shard->psset, to_purge);
	assert(hpdata_alloc_allowed_get(to_purge));
	hpdata_mid_purge_set(to_purge, true);
	hpdata_purge_allowed_set(to_purge, false);
	hpdata_disallow_hugify(to_purge);
	/*
	 * Unlike with hugification (where concurrent
	 * allocations are allowed), concurrent allocation out
	 * of a hugepage being purged is unsafe; we might hand
	 * out an extent for an allocation and then purge it
	 * (clearing out user data).
	 */
	hpdata_alloc_allowed_set(to_purge, false);
	psset_update_end(&shard->psset, to_purge);

	/* Gather all the metadata we'll need during the purge. */
	bool dehugify = hpdata_huge_get(to_purge);
	hpdata_purge_state_t purge_state;
	size_t num_to_purge = hpdata_purge_begin(to_purge, &purge_state);

	shard->npending_purge += num_to_purge;

	malloc_mutex_unlock(tsdn, &shard->mtx);

	/* Actually do the purging, now that the lock is dropped. */
	if (dehugify) {
		shard->central->hooks.dehugify(hpdata_addr_get(to_purge),
		    HUGEPAGE);
	}
	size_t total_purged = 0;
	uint64_t purges_this_pass = 0;
	void *purge_addr;
	size_t purge_size;
	while (hpdata_purge_next(to_purge, &purge_state, &purge_addr,
	    &purge_size)) {
		total_purged += purge_size;
		assert(total_purged <= HUGEPAGE);
		purges_this_pass++;
		shard->central->hooks.purge(purge_addr, purge_size);
	}

	malloc_mutex_lock(tsdn, &shard->mtx);
	/* The shard updates. */
	shard->npending_purge -= num_to_purge;
	shard->stats.npurge_passes++;
	shard->stats.npurges += purges_this_pass;
	shard->central->hooks.curtime(&shard->last_purge,
	    /* first_reading */ false);
	if (dehugify) {
		shard->stats.ndehugifies++;
	}

	/* The hpdata updates. */
	psset_update_begin(&shard->psset, to_purge);
	if (dehugify) {
		hpdata_dehugify(to_purge);
	}
	hpdata_purge_end(to_purge, &purge_state);
	hpdata_mid_purge_set(to_purge, false);

	hpdata_alloc_allowed_set(to_purge, true);
	hpa_update_purge_hugify_eligibility(tsdn, shard, to_purge);

	psset_update_end(&shard->psset, to_purge);

	return true;
}

/* Returns whether or not we hugified anything. */
static bool
hpa_try_hugify(tsdn_t *tsdn, hpa_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);

	if (hpa_hugify_blocked_by_ndirty(tsdn, shard)) {
		return false;
	}

	hpdata_t *to_hugify = psset_pick_hugify(&shard->psset);
	if (to_hugify == NULL) {
		return false;
	}
	assert(hpdata_hugify_allowed_get(to_hugify));
	assert(!hpdata_changing_state_get(to_hugify));

	/* Make sure that it's been hugifiable for long enough. */
	nstime_t time_hugify_allowed = hpdata_time_hugify_allowed(to_hugify);
	uint64_t millis = shard->central->hooks.ms_since(&time_hugify_allowed);
	if (millis < shard->opts.hugify_delay_ms) {
		return false;
	}

	/*
	 * Don't let anyone else purge or hugify this page while
	 * we're hugifying it (allocations and deallocations are
	 * OK).
	 */
	psset_update_begin(&shard->psset, to_hugify);
	hpdata_mid_hugify_set(to_hugify, true);
	hpdata_purge_allowed_set(to_hugify, false);
	hpdata_disallow_hugify(to_hugify);
	assert(hpdata_alloc_allowed_get(to_hugify));
	psset_update_end(&shard->psset, to_hugify);

	malloc_mutex_unlock(tsdn, &shard->mtx);

	shard->central->hooks.hugify(hpdata_addr_get(to_hugify), HUGEPAGE);

	malloc_mutex_lock(tsdn, &shard->mtx);
	shard->stats.nhugifies++;

	psset_update_begin(&shard->psset, to_hugify);
	hpdata_hugify(to_hugify);
	hpdata_mid_hugify_set(to_hugify, false);
	hpa_update_purge_hugify_eligibility(tsdn, shard, to_hugify);
	psset_update_end(&shard->psset, to_hugify);

	return true;
}

/*
 * Execution of deferred work is forced if it's triggered by an explicit
 * hpa_shard_do_deferred_work() call.
 */
static void
hpa_shard_maybe_do_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard,
    bool forced) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	if (!forced && shard->opts.deferral_allowed) {
		return;
	}
	/*
	 * If we're on a background thread, do work so long as there's work to
	 * be done. Otherwise, bound latency to not be *too* bad by doing at
	 * most a small fixed number of operations.
	 */
	bool hugified = false;
	bool purged = false;
	size_t max_ops = (forced ? (size_t)-1 : 16);
	size_t nops = 0;
	do {
		/*
		 * Always purge before hugifying, to make sure we get some
		 * ability to hit our quiescence targets.
		 */
		purged = false;
		while (hpa_should_purge(tsdn, shard) && nops < max_ops) {
			purged = hpa_try_purge(tsdn, shard);
			if (purged) {
				nops++;
			}
		}
		hugified = hpa_try_hugify(tsdn, shard);
		if (hugified) {
			nops++;
		}
		malloc_mutex_assert_owner(tsdn, &shard->mtx);
	} while ((hugified || purged) && nops < max_ops);
}

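/*
 * Try to satisfy a single allocation of the given size from the existing
 * psset, without growing it. Returns NULL on failure; *oom is set when the
 * failure came from running out of metadata (edata cache or emap) rather than
 * a lack of free space. Called with the shard mutex held.
 */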
static edata_t *
hpa_try_alloc_one_no_grow(tsdn_t *tsdn, hpa_shard_t *shard, size_t size,
    bool *oom) {
	bool err;
	edata_t *edata = edata_cache_fast_get(tsdn, &shard->ecf);
	if (edata == NULL) {
		*oom = true;
		return NULL;
	}

	hpdata_t *ps = psset_pick_alloc(&shard->psset, size);
	if (ps == NULL) {
		edata_cache_fast_put(tsdn, &shard->ecf, edata);
		return NULL;
	}

	psset_update_begin(&shard->psset, ps);

	if (hpdata_empty(ps)) {
		/*
		 * If the pageslab used to be empty, treat it as though it's
		 * brand new for fragmentation-avoidance purposes; what we're
		 * trying to approximate is the age of the allocations *in* that
		 * pageslab, and the allocations in the new pageslab are
		 * definitionally the youngest in this hpa shard.
		 */
		hpdata_age_set(ps, shard->age_counter++);
	}

	void *addr = hpdata_reserve_alloc(ps, size);
	edata_init(edata, shard->ind, addr, size, /* slab */ false,
	    SC_NSIZES, /* sn */ hpdata_age_get(ps), extent_state_active,
	    /* zeroed */ false, /* committed */ true, EXTENT_PAI_HPA,
	    EXTENT_NOT_HEAD);
	edata_ps_set(edata, ps);

	/*
	 * This could theoretically be moved outside of the critical section,
	 * but that introduces the potential for a race. Without the lock, the
	 * (initially nonempty, since this is the reuse pathway) pageslab we
	 * allocated out of could become otherwise empty while the lock is
	 * dropped. This would force us to deal with a pageslab eviction down
	 * the error pathway, which is a pain.
	 */
	err = emap_register_boundary(tsdn, shard->emap, edata,
	    SC_NSIZES, /* slab */ false);
	if (err) {
		hpdata_unreserve(ps, edata_addr_get(edata),
		    edata_size_get(edata));
		/*
		 * We should arguably reset dirty state here, but this would
		 * require some sort of prepare + commit functionality that's a
		 * little much to deal with for now.
		 *
		 * We don't have a do_deferred_work down this pathway, on the
		 * principle that we didn't *really* affect shard state (we
		 * tweaked the stats, but our tweaks weren't really accurate).
		 */
		psset_update_end(&shard->psset, ps);
		edata_cache_fast_put(tsdn, &shard->ecf, edata);
		*oom = true;
		return NULL;
	}

	hpa_update_purge_hugify_eligibility(tsdn, shard, ps);
	psset_update_end(&shard->psset, ps);
	return edata;
}

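/*
 * Grab the shard mutex and allocate up to nallocs extents of the given size,
 * appending them to results. Returns the number of successful allocations;
 * also runs any non-forced deferred work and reports whether more remains.
 */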
static size_t
hpa_try_alloc_batch_no_grow(tsdn_t *tsdn, hpa_shard_t *shard, size_t size,
    bool *oom, size_t nallocs, edata_list_active_t *results,
    bool *deferred_work_generated) {
	malloc_mutex_lock(tsdn, &shard->mtx);
	size_t nsuccess = 0;
	for (; nsuccess < nallocs; nsuccess++) {
		edata_t *edata = hpa_try_alloc_one_no_grow(tsdn, shard, size,
		    oom);
		if (edata == NULL) {
			break;
		}
		edata_list_active_append(results, edata);
	}

	hpa_shard_maybe_do_deferred_work(tsdn, shard, /* forced */ false);
	*deferred_work_generated = hpa_shard_has_deferred_work(tsdn, shard);
	malloc_mutex_unlock(tsdn, &shard->mtx);
	return nsuccess;
}

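/*
 * Batch allocation out of the psset: first try to satisfy the request from
 * the pageslabs we already have, and only if that falls short (without
 * OOMing) grow the shard by extracting a fresh pageslab from hpa_central.
 */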
static size_t
hpa_alloc_batch_psset(tsdn_t *tsdn, hpa_shard_t *shard, size_t size,
    size_t nallocs, edata_list_active_t *results,
    bool *deferred_work_generated) {
	assert(size <= shard->opts.slab_max_alloc);
	bool oom = false;

	size_t nsuccess = hpa_try_alloc_batch_no_grow(tsdn, shard, size, &oom,
	    nallocs, results, deferred_work_generated);

	if (nsuccess == nallocs || oom) {
		return nsuccess;
	}

	/*
	 * We didn't OOM, but weren't able to fill everything requested of us;
	 * try to grow.
	 */
	malloc_mutex_lock(tsdn, &shard->grow_mtx);
	/*
	 * Check for grow races; maybe some earlier thread expanded the psset
	 * in between when we dropped the main mutex and grabbed the grow mutex.
	 */
	nsuccess += hpa_try_alloc_batch_no_grow(tsdn, shard, size, &oom,
	    nallocs - nsuccess, results, deferred_work_generated);
	if (nsuccess == nallocs || oom) {
		malloc_mutex_unlock(tsdn, &shard->grow_mtx);
		return nsuccess;
	}

	/*
	 * Note that we don't hold shard->mtx here (while growing);
	 * deallocations (and allocations of smaller sizes) may still succeed
	 * while we're doing this potentially expensive system call.
	 */
	hpdata_t *ps = hpa_central_extract(tsdn, shard->central, size, &oom);
	if (ps == NULL) {
		malloc_mutex_unlock(tsdn, &shard->grow_mtx);
		return nsuccess;
	}

	/*
	 * We got the pageslab; allocate from it. This does an unlock followed
	 * by a lock on the same mutex, and holds the grow mutex while doing
	 * deferred work, but this is an uncommon path; the simplicity is worth
	 * it.
	 */
	malloc_mutex_lock(tsdn, &shard->mtx);
	psset_insert(&shard->psset, ps);
	malloc_mutex_unlock(tsdn, &shard->mtx);

	nsuccess += hpa_try_alloc_batch_no_grow(tsdn, shard, size, &oom,
	    nallocs - nsuccess, results, deferred_work_generated);
	/*
	 * Drop grow_mtx before doing deferred work; other threads blocked on it
	 * should be allowed to proceed while we're working.
	 */
	malloc_mutex_unlock(tsdn, &shard->grow_mtx);

	return nsuccess;
}

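/*
 * Recover the shard from its embedded pai_t. The asserts sanity-check that
 * this pai_t really does belong to an HPA shard.
 */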
static hpa_shard_t *
hpa_from_pai(pai_t *self) {
	assert(self->alloc == &hpa_alloc);
	assert(self->expand == &hpa_expand);
	assert(self->shrink == &hpa_shrink);
	assert(self->dalloc == &hpa_dalloc);
	return (hpa_shard_t *)self;
}

static size_t
hpa_alloc_batch(tsdn_t *tsdn, pai_t *self, size_t size, size_t nallocs,
    edata_list_active_t *results, bool *deferred_work_generated) {
	assert(nallocs > 0);
	assert((size & PAGE_MASK) == 0);
	witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
	    WITNESS_RANK_CORE, 0);
	hpa_shard_t *shard = hpa_from_pai(self);

	if (size > shard->opts.slab_max_alloc) {
		return 0;
	}

	size_t nsuccess = hpa_alloc_batch_psset(tsdn, shard, size, nallocs,
	    results, deferred_work_generated);

	witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
	    WITNESS_RANK_CORE, 0);

	/*
	 * Guard the sanity checks with config_debug because the loop cannot be
	 * proven non-circular by the compiler, even if everything within the
	 * loop is optimized away.
	 */
	if (config_debug) {
		edata_t *edata;
		ql_foreach(edata, &results->head, ql_link_active) {
			emap_assert_mapped(tsdn, shard->emap, edata);
			assert(edata_pai_get(edata) == EXTENT_PAI_HPA);
			assert(edata_state_get(edata) == extent_state_active);
			assert(edata_arena_ind_get(edata) == shard->ind);
			assert(edata_szind_get_maybe_invalid(edata) ==
			    SC_NSIZES);
			assert(!edata_slab_get(edata));
			assert(edata_committed_get(edata));
			assert(edata_base_get(edata) == edata_addr_get(edata));
			assert(edata_base_get(edata) != NULL);
		}
	}
	return nsuccess;
}

static edata_t *
hpa_alloc(tsdn_t *tsdn, pai_t *self, size_t size, size_t alignment, bool zero,
    bool guarded, bool frequent_reuse, bool *deferred_work_generated) {
	assert((size & PAGE_MASK) == 0);
	assert(!guarded);
	witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
	    WITNESS_RANK_CORE, 0);

	/* We don't handle alignment or zeroing for now. */
	if (alignment > PAGE || zero) {
		return NULL;
	}
	/*
	 * An alloc with alignment == PAGE and zero == false is equivalent to a
	 * batch alloc of 1. Just do that, so we can share code.
	 */
	edata_list_active_t results;
	edata_list_active_init(&results);
	size_t nallocs = hpa_alloc_batch(tsdn, self, size, /* nallocs */ 1,
	    &results, deferred_work_generated);
	assert(nallocs == 0 || nallocs == 1);
	edata_t *edata = edata_list_active_first(&results);
	return edata;
}

static bool
hpa_expand(tsdn_t *tsdn, pai_t *self, edata_t *edata, size_t old_size,
    size_t new_size, bool zero, bool *deferred_work_generated) {
	/* Expand not yet supported. */
	return true;
}

static bool
hpa_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    size_t old_size, size_t new_size, bool *deferred_work_generated) {
	/* Shrink not yet supported. */
	return true;
}

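/*
 * The part of deallocation that can be done without holding the shard mutex:
 * per-edata sanity checks plus removing the edata from the emap.
 */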
static void
hpa_dalloc_prepare_unlocked(tsdn_t *tsdn, hpa_shard_t *shard, edata_t *edata) {
	malloc_mutex_assert_not_owner(tsdn, &shard->mtx);

	assert(edata_pai_get(edata) == EXTENT_PAI_HPA);
	assert(edata_state_get(edata) == extent_state_active);
	assert(edata_arena_ind_get(edata) == shard->ind);
	assert(edata_szind_get_maybe_invalid(edata) == SC_NSIZES);
	assert(edata_committed_get(edata));
	assert(edata_base_get(edata) != NULL);

	/*
	 * Another thread shouldn't be trying to touch the metadata of an
	 * allocation being freed. The one exception is a merge attempt from a
	 * lower-addressed PAC extent; in this case we have a nominal race on
	 * the edata metadata bits, but in practice the fact that the PAI bits
	 * are different will prevent any further access. The race is bad, but
	 * benign in practice, and the long term plan is to track enough state
	 * in the rtree to prevent these merge attempts in the first place.
	 */
	edata_addr_set(edata, edata_base_get(edata));
	edata_zeroed_set(edata, false);
	emap_deregister_boundary(tsdn, shard->emap, edata);
}

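/*
 * The part of deallocation that needs the shard mutex: return the edata's
 * pages to its pageslab, return the edata itself to the cache, and update the
 * pageslab's purge/hugify eligibility.
 */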
static void
hpa_dalloc_locked(tsdn_t *tsdn, hpa_shard_t *shard, edata_t *edata) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);

	/*
	 * Release the metadata early, to avoid having to remember to do it
	 * while we're also doing tricky purging logic. First, we need to grab
	 * a few bits of metadata from it.
	 *
	 * Note that the shard mutex protects ps's metadata too; it wouldn't be
	 * correct to try to read most information out of it without the lock.
	 */
	hpdata_t *ps = edata_ps_get(edata);
	/* Currently, all edatas come from pageslabs. */
	assert(ps != NULL);
	void *unreserve_addr = edata_addr_get(edata);
	size_t unreserve_size = edata_size_get(edata);
	edata_cache_fast_put(tsdn, &shard->ecf, edata);

	psset_update_begin(&shard->psset, ps);
	hpdata_unreserve(ps, unreserve_addr, unreserve_size);
	hpa_update_purge_hugify_eligibility(tsdn, shard, ps);
	psset_update_end(&shard->psset, ps);
}

static void
hpa_dalloc_batch(tsdn_t *tsdn, pai_t *self, edata_list_active_t *list,
    bool *deferred_work_generated) {
	hpa_shard_t *shard = hpa_from_pai(self);

	edata_t *edata;
	ql_foreach(edata, &list->head, ql_link_active) {
		hpa_dalloc_prepare_unlocked(tsdn, shard, edata);
	}

	malloc_mutex_lock(tsdn, &shard->mtx);
	/* Now, remove from the list. */
	while ((edata = edata_list_active_first(list)) != NULL) {
		edata_list_active_remove(list, edata);
		hpa_dalloc_locked(tsdn, shard, edata);
	}
	hpa_shard_maybe_do_deferred_work(tsdn, shard, /* forced */ false);
	*deferred_work_generated =
	    hpa_shard_has_deferred_work(tsdn, shard);

	malloc_mutex_unlock(tsdn, &shard->mtx);
}

static void
hpa_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    bool *deferred_work_generated) {
	assert(!edata_guarded_get(edata));
	/* Just a dalloc_batch of size 1; this lets us share logic. */
	edata_list_active_t dalloc_list;
	edata_list_active_init(&dalloc_list);
	edata_list_active_append(&dalloc_list, edata);
	hpa_dalloc_batch(tsdn, self, &dalloc_list, deferred_work_generated);
}

/*
 * Calculate time until either purging or hugification ought to happen.
 * Called by background threads.
 */
static uint64_t
hpa_time_until_deferred_work(tsdn_t *tsdn, pai_t *self) {
	hpa_shard_t *shard = hpa_from_pai(self);
	uint64_t time_ns = BACKGROUND_THREAD_DEFERRED_MAX;

	malloc_mutex_lock(tsdn, &shard->mtx);

	hpdata_t *to_hugify = psset_pick_hugify(&shard->psset);
	if (to_hugify != NULL) {
		nstime_t time_hugify_allowed =
		    hpdata_time_hugify_allowed(to_hugify);
		uint64_t since_hugify_allowed_ms =
		    shard->central->hooks.ms_since(&time_hugify_allowed);
		/*
		 * If not enough time has passed since hugification was allowed,
		 * sleep for the rest.
		 */
		if (since_hugify_allowed_ms < shard->opts.hugify_delay_ms) {
			time_ns = shard->opts.hugify_delay_ms -
			    since_hugify_allowed_ms;
			time_ns *= 1000 * 1000;
		} else {
			malloc_mutex_unlock(tsdn, &shard->mtx);
			return BACKGROUND_THREAD_DEFERRED_MIN;
		}
	}

	if (hpa_should_purge(tsdn, shard)) {
		/*
		 * If we haven't purged before, no need to check interval
		 * between purges. Simply purge as soon as possible.
		 */
		if (shard->stats.npurge_passes == 0) {
			malloc_mutex_unlock(tsdn, &shard->mtx);
			return BACKGROUND_THREAD_DEFERRED_MIN;
		}
		uint64_t since_last_purge_ms = shard->central->hooks.ms_since(
		    &shard->last_purge);

		if (since_last_purge_ms < shard->opts.min_purge_interval_ms) {
			uint64_t until_purge_ns;
			until_purge_ns = shard->opts.min_purge_interval_ms -
			    since_last_purge_ms;
			until_purge_ns *= 1000 * 1000;

			if (until_purge_ns < time_ns) {
				time_ns = until_purge_ns;
			}
		} else {
			time_ns = BACKGROUND_THREAD_DEFERRED_MIN;
		}
	}
	malloc_mutex_unlock(tsdn, &shard->mtx);
	return time_ns;
}

void
hpa_shard_disable(tsdn_t *tsdn, hpa_shard_t *shard) {
	hpa_do_consistency_checks(shard);

	malloc_mutex_lock(tsdn, &shard->mtx);
	edata_cache_fast_disable(tsdn, &shard->ecf);
	malloc_mutex_unlock(tsdn, &shard->mtx);
}

static void
hpa_shard_assert_stats_empty(psset_bin_stats_t *bin_stats) {
	assert(bin_stats->npageslabs == 0);
	assert(bin_stats->nactive == 0);
}

static void
hpa_assert_empty(tsdn_t *tsdn, hpa_shard_t *shard, psset_t *psset) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	for (int huge = 0; huge <= 1; huge++) {
		hpa_shard_assert_stats_empty(&psset->stats.full_slabs[huge]);
		for (pszind_t i = 0; i < PSSET_NPSIZES; i++) {
			hpa_shard_assert_stats_empty(
			    &psset->stats.nonfull_slabs[i][huge]);
		}
	}
}

void
hpa_shard_destroy(tsdn_t *tsdn, hpa_shard_t *shard) {
	hpa_do_consistency_checks(shard);
	/*
	 * By the time we're here, the arena code should have dalloc'd all the
	 * active extents, which means we should have eventually evicted
	 * everything from the psset, so it shouldn't be able to serve even a
	 * 1-page allocation.
	 */
	if (config_debug) {
		malloc_mutex_lock(tsdn, &shard->mtx);
		hpa_assert_empty(tsdn, shard, &shard->psset);
		malloc_mutex_unlock(tsdn, &shard->mtx);
	}
	hpdata_t *ps;
	while ((ps = psset_pick_alloc(&shard->psset, PAGE)) != NULL) {
		/* There should be no allocations anywhere. */
		assert(hpdata_empty(ps));
		psset_remove(&shard->psset, ps);
		shard->central->hooks.unmap(hpdata_addr_get(ps), HUGEPAGE);
	}
}

void
hpa_shard_set_deferral_allowed(tsdn_t *tsdn, hpa_shard_t *shard,
    bool deferral_allowed) {
	hpa_do_consistency_checks(shard);

	malloc_mutex_lock(tsdn, &shard->mtx);
	bool deferral_previously_allowed = shard->opts.deferral_allowed;
	shard->opts.deferral_allowed = deferral_allowed;
	if (deferral_previously_allowed && !deferral_allowed) {
		hpa_shard_maybe_do_deferred_work(tsdn, shard,
		    /* forced */ true);
	}
	malloc_mutex_unlock(tsdn, &shard->mtx);
}

void
hpa_shard_do_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard) {
	hpa_do_consistency_checks(shard);

	malloc_mutex_lock(tsdn, &shard->mtx);
	hpa_shard_maybe_do_deferred_work(tsdn, shard, /* forced */ true);
	malloc_mutex_unlock(tsdn, &shard->mtx);
}

void
hpa_shard_prefork3(tsdn_t *tsdn, hpa_shard_t *shard) {
	hpa_do_consistency_checks(shard);

	malloc_mutex_prefork(tsdn, &shard->grow_mtx);
}

void
hpa_shard_prefork4(tsdn_t *tsdn, hpa_shard_t *shard) {
	hpa_do_consistency_checks(shard);

	malloc_mutex_prefork(tsdn, &shard->mtx);
}

void
hpa_shard_postfork_parent(tsdn_t *tsdn, hpa_shard_t *shard) {
	hpa_do_consistency_checks(shard);

	malloc_mutex_postfork_parent(tsdn, &shard->grow_mtx);
	malloc_mutex_postfork_parent(tsdn, &shard->mtx);
}

void
hpa_shard_postfork_child(tsdn_t *tsdn, hpa_shard_t *shard) {
	hpa_do_consistency_checks(shard);

	malloc_mutex_postfork_child(tsdn, &shard->grow_mtx);
	malloc_mutex_postfork_child(tsdn, &shard->mtx);
}