#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/jemalloc_internal_includes.h"

#include "jemalloc/internal/sec.h"

static edata_t *sec_alloc(tsdn_t *tsdn, pai_t *self, size_t size,
    size_t alignment, bool zero, bool guarded, bool frequent_reuse,
    bool *deferred_work_generated);
static bool sec_expand(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    size_t old_size, size_t new_size, bool zero, bool *deferred_work_generated);
static bool sec_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    size_t old_size, size_t new_size, bool *deferred_work_generated);
static void sec_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    bool *deferred_work_generated);

static void
sec_bin_init(sec_bin_t *bin) {
	bin->being_batch_filled = false;
	bin->bytes_cur = 0;
	edata_list_active_init(&bin->freelist);
}

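/*
 * Set up a SEC: carve the shard array and the per-shard bin arrays out of a
 * single base allocation, initialize each shard's mutex and bins, and install
 * the pai_t function pointers.  Returns true on error (OOM or mutex init
 * failure), per the usual jemalloc convention.
 */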
bool
sec_init(tsdn_t *tsdn, sec_t *sec, base_t *base, pai_t *fallback,
    const sec_opts_t *opts) {
	assert(opts->max_alloc >= PAGE);

	size_t max_alloc = PAGE_FLOOR(opts->max_alloc);
	pszind_t npsizes = sz_psz2ind(max_alloc) + 1;

	size_t sz_shards = opts->nshards * sizeof(sec_shard_t);
	size_t sz_bins = opts->nshards * (size_t)npsizes * sizeof(sec_bin_t);
	size_t sz_alloc = sz_shards + sz_bins;
	void *dynalloc = base_alloc(tsdn, base, sz_alloc, CACHELINE);
	if (dynalloc == NULL) {
		return true;
	}
	sec_shard_t *shard_cur = (sec_shard_t *)dynalloc;
	sec->shards = shard_cur;
	sec_bin_t *bin_cur = (sec_bin_t *)&shard_cur[opts->nshards];
	/* Just for asserts, below. */
	sec_bin_t *bin_start = bin_cur;

	for (size_t i = 0; i < opts->nshards; i++) {
		sec_shard_t *shard = shard_cur;
		shard_cur++;
		bool err = malloc_mutex_init(&shard->mtx, "sec_shard",
		    WITNESS_RANK_SEC_SHARD, malloc_mutex_rank_exclusive);
		if (err) {
			return true;
		}
		shard->enabled = true;
		shard->bins = bin_cur;
		for (pszind_t j = 0; j < npsizes; j++) {
			sec_bin_init(&shard->bins[j]);
			bin_cur++;
		}
		shard->bytes_cur = 0;
		shard->to_flush_next = 0;
	}
	/*
	 * Should have exactly matched the bin_start to the first unused byte
	 * after the shards.
	 */
	assert((void *)shard_cur == (void *)bin_start);
	/* And the last bin to use up the last bytes of the allocation. */
	assert((char *)bin_cur == ((char *)dynalloc + sz_alloc));
	sec->fallback = fallback;

	sec->opts = *opts;
	sec->npsizes = npsizes;

	/*
	 * Initialize these last so that an improper use of an SEC whose
	 * initialization failed will segfault in an easy-to-spot way.
	 */
	sec->pai.alloc = &sec_alloc;
	sec->pai.alloc_batch = &pai_alloc_batch_default;
	sec->pai.expand = &sec_expand;
	sec->pai.shrink = &sec_shrink;
	sec->pai.dalloc = &sec_dalloc;
	sec->pai.dalloc_batch = &pai_dalloc_batch_default;

	return false;
}

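/*
 * Pick the shard this thread should use.  The choice is made once per thread
 * and cached in TSD; the mapping (rand32 * nshards) >> 32 turns a uniform
 * 32-bit random value into an index in [0, nshards) without a division.
 * Threads without TSD fall back to shard 0.
 */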
static sec_shard_t *
sec_shard_pick(tsdn_t *tsdn, sec_t *sec) {
	/*
	 * Eventually, we should implement affinity, tracking source shard
	 * using the edata_t's newly freed up fields.  For now, just randomly
	 * distribute across all shards.
	 */
	if (tsdn_null(tsdn)) {
		return &sec->shards[0];
	}
	tsd_t *tsd = tsdn_tsd(tsdn);
	uint8_t *idxp = tsd_sec_shardp_get(tsd);
	if (*idxp == (uint8_t)-1) {
		/*
		 * First use; initialize using the trick from Daniel Lemire's
		 * "A fast alternative to the modulo reduction": use a 64-bit
		 * number to store 32 bits, since we'll deliberately overflow
		 * when we multiply by the number of shards.
		 */
		uint64_t rand32 = prng_lg_range_u64(tsd_prng_statep_get(tsd),
		    32);
		uint32_t idx =
		    (uint32_t)((rand32 * (uint64_t)sec->opts.nshards) >> 32);
		assert(idx < (uint32_t)sec->opts.nshards);
		*idxp = (uint8_t)idx;
	}
	return &sec->shards[*idxp];
}

/*
 * Perhaps surprisingly, this can be called on the alloc pathways; if we hit an
 * empty cache, we'll try to fill it, which can push the shard over its limit.
 */
static void
sec_flush_some_and_unlock(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	edata_list_active_t to_flush;
	edata_list_active_init(&to_flush);
	while (shard->bytes_cur > sec->opts.bytes_after_flush) {
		/* Pick a victim. */
		sec_bin_t *bin = &shard->bins[shard->to_flush_next];

		/* Update our victim-picking state. */
		shard->to_flush_next++;
		if (shard->to_flush_next == sec->npsizes) {
			shard->to_flush_next = 0;
		}

		assert(shard->bytes_cur >= bin->bytes_cur);
		if (bin->bytes_cur != 0) {
			shard->bytes_cur -= bin->bytes_cur;
			bin->bytes_cur = 0;
			edata_list_active_concat(&to_flush, &bin->freelist);
		}
		/*
		 * Either bin->bytes_cur was 0, in which case we didn't touch
		 * the bin list but it should be empty anyways (or else we
		 * missed a bytes_cur update on a list modification), or it
		 * *wasn't* 0 and we emptied it ourselves.  Either way, it
		 * should be empty now.
		 */
		assert(edata_list_active_empty(&bin->freelist));
	}

	malloc_mutex_unlock(tsdn, &shard->mtx);
	bool deferred_work_generated = false;
	pai_dalloc_batch(tsdn, sec->fallback, &to_flush,
	    &deferred_work_generated);
}

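/*
 * Pop an extent (if any) from the bin's freelist, updating the bin and shard
 * byte counts.  Returns NULL if the shard is disabled or the bin is empty.
 */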
static edata_t *
sec_shard_alloc_locked(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard,
    sec_bin_t *bin) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	if (!shard->enabled) {
		return NULL;
	}
	edata_t *edata = edata_list_active_first(&bin->freelist);
	if (edata != NULL) {
		edata_list_active_remove(&bin->freelist, edata);
		assert(edata_size_get(edata) <= bin->bytes_cur);
		bin->bytes_cur -= edata_size_get(edata);
		assert(edata_size_get(edata) <= shard->bytes_cur);
		shard->bytes_cur -= edata_size_get(edata);
	}
	return edata;
}

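/*
 * The bin was empty: ask the fallback for 1 + batch_fill_extra extents of the
 * given size in one call, return the first to the caller, and cache the rest
 * in the bin.  If caching the extras pushes the shard over max_bytes, flush
 * back down to bytes_after_flush before returning.
 */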
static edata_t *
sec_batch_fill_and_alloc(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard,
    sec_bin_t *bin, size_t size) {
	malloc_mutex_assert_not_owner(tsdn, &shard->mtx);

	edata_list_active_t result;
	edata_list_active_init(&result);
	bool deferred_work_generated = false;
	size_t nalloc = pai_alloc_batch(tsdn, sec->fallback, size,
	    1 + sec->opts.batch_fill_extra, &result, &deferred_work_generated);

	edata_t *ret = edata_list_active_first(&result);
	if (ret != NULL) {
		edata_list_active_remove(&result, ret);
	}

	malloc_mutex_lock(tsdn, &shard->mtx);
	bin->being_batch_filled = false;
	/*
	 * Handle the easy case first: nothing to cache.  Note that this can
	 * only happen in case of OOM, since sec_alloc checks the expected
	 * number of allocs, and doesn't bother going down the batch_fill
	 * pathway if there won't be anything left to cache.  So to be in this
	 * code path, we must have asked for > 1 alloc, but gotten at most 1
	 * back.
	 */
	if (nalloc <= 1) {
		malloc_mutex_unlock(tsdn, &shard->mtx);
		return ret;
	}

	size_t new_cached_bytes = (nalloc - 1) * size;

	edata_list_active_concat(&bin->freelist, &result);
	bin->bytes_cur += new_cached_bytes;
	shard->bytes_cur += new_cached_bytes;

	if (shard->bytes_cur > sec->opts.max_bytes) {
		sec_flush_some_and_unlock(tsdn, sec, shard);
	} else {
		malloc_mutex_unlock(tsdn, &shard->mtx);
	}

	return ret;
}

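/*
 * The pai_t alloc hook.  Requests the cache can't serve (zeroed, aligned
 * beyond a page, larger than max_alloc, or with no shards configured) go
 * straight to the fallback; otherwise we try the bin for this size class and,
 * on a miss, either batch-fill it or fall back to a single allocation.
 */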
static edata_t *
sec_alloc(tsdn_t *tsdn, pai_t *self, size_t size, size_t alignment, bool zero,
    bool guarded, bool frequent_reuse, bool *deferred_work_generated) {
	assert((size & PAGE_MASK) == 0);
	assert(!guarded);

	sec_t *sec = (sec_t *)self;

	if (zero || alignment > PAGE || sec->opts.nshards == 0
	    || size > sec->opts.max_alloc) {
		return pai_alloc(tsdn, sec->fallback, size, alignment, zero,
		    /* guarded */ false, frequent_reuse,
		    deferred_work_generated);
	}
	pszind_t pszind = sz_psz2ind(size);
	assert(pszind < sec->npsizes);

	sec_shard_t *shard = sec_shard_pick(tsdn, sec);
	sec_bin_t *bin = &shard->bins[pszind];
	bool do_batch_fill = false;

	malloc_mutex_lock(tsdn, &shard->mtx);
	edata_t *edata = sec_shard_alloc_locked(tsdn, sec, shard, bin);
	if (edata == NULL) {
		if (!bin->being_batch_filled
		    && sec->opts.batch_fill_extra > 0) {
			bin->being_batch_filled = true;
			do_batch_fill = true;
		}
	}
	malloc_mutex_unlock(tsdn, &shard->mtx);
	if (edata == NULL) {
		if (do_batch_fill) {
			edata = sec_batch_fill_and_alloc(tsdn, sec, shard, bin,
			    size);
		} else {
			edata = pai_alloc(tsdn, sec->fallback, size, alignment,
			    zero, /* guarded */ false, frequent_reuse,
			    deferred_work_generated);
		}
	}
	return edata;
}

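/*
 * Expand and shrink aren't cached; both simply forward to the fallback
 * allocator.
 */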
static bool
sec_expand(tsdn_t *tsdn, pai_t *self, edata_t *edata, size_t old_size,
    size_t new_size, bool zero, bool *deferred_work_generated) {
	sec_t *sec = (sec_t *)self;
	return pai_expand(tsdn, sec->fallback, edata, old_size, new_size, zero,
	    deferred_work_generated);
}

static bool
sec_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata, size_t old_size,
    size_t new_size, bool *deferred_work_generated) {
	sec_t *sec = (sec_t *)self;
	return pai_shrink(tsdn, sec->fallback, edata, old_size, new_size,
	    deferred_work_generated);
}

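/*
 * Empty every bin in the shard, handing all cached extents back to the
 * fallback allocator in a single batch.  The caller must hold the shard
 * mutex.
 */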
static void
sec_flush_all_locked(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	shard->bytes_cur = 0;
	edata_list_active_t to_flush;
	edata_list_active_init(&to_flush);
	for (pszind_t i = 0; i < sec->npsizes; i++) {
		sec_bin_t *bin = &shard->bins[i];
		bin->bytes_cur = 0;
		edata_list_active_concat(&to_flush, &bin->freelist);
	}

	/*
	 * Ordinarily we would try to avoid doing the batch deallocation while
	 * holding the shard mutex, but the flush_all pathways only happen when
	 * we're disabling the HPA or resetting the arena, both of which are
	 * rare pathways.
	 */
	bool deferred_work_generated = false;
	pai_dalloc_batch(tsdn, sec->fallback, &to_flush,
	    &deferred_work_generated);
}

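/*
 * Cache a freed extent in its size class's bin, flushing if this pushes the
 * shard over max_bytes.  Consumes the shard mutex: the lock is always dropped
 * before returning.
 */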
static void
sec_shard_dalloc_and_unlock(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard,
    edata_t *edata) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	assert(shard->bytes_cur <= sec->opts.max_bytes);
	size_t size = edata_size_get(edata);
	pszind_t pszind = sz_psz2ind(size);
	assert(pszind < sec->npsizes);
	/*
	 * Prepending here results in LIFO allocation per bin, which seems
	 * reasonable.
	 */
	sec_bin_t *bin = &shard->bins[pszind];
	edata_list_active_prepend(&bin->freelist, edata);
	bin->bytes_cur += size;
	shard->bytes_cur += size;
	if (shard->bytes_cur > sec->opts.max_bytes) {
		/*
		 * We've exceeded the shard limit.  As a nod in the direction
		 * of fragmentation avoidance, we flush bins round-robin
		 * (rather than just the bin we deallocated into) until we're
		 * back under bytes_after_flush, handing the flushed extents
		 * back to the fallback allocator.  This has the extra
		 * advantage of not requiring advanced cache balancing
		 * strategies.
		 */
		sec_flush_some_and_unlock(tsdn, sec, shard);
		malloc_mutex_assert_not_owner(tsdn, &shard->mtx);
	} else {
		malloc_mutex_unlock(tsdn, &shard->mtx);
	}
}

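/*
 * The pai_t dalloc hook.  Extents the cache won't hold (too large, or no
 * shards configured) and deallocations against a disabled shard go straight
 * to the fallback; everything else is cached in the picked shard.
 */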
static void
sec_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    bool *deferred_work_generated) {
	sec_t *sec = (sec_t *)self;
	if (sec->opts.nshards == 0
	    || edata_size_get(edata) > sec->opts.max_alloc) {
		pai_dalloc(tsdn, sec->fallback, edata,
		    deferred_work_generated);
		return;
	}
	sec_shard_t *shard = sec_shard_pick(tsdn, sec);
	malloc_mutex_lock(tsdn, &shard->mtx);
	if (shard->enabled) {
		sec_shard_dalloc_and_unlock(tsdn, sec, shard, edata);
	} else {
		malloc_mutex_unlock(tsdn, &shard->mtx);
		pai_dalloc(tsdn, sec->fallback, edata,
		    deferred_work_generated);
	}
}

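/* Flush every shard's cached extents back to the fallback allocator. */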
void
sec_flush(tsdn_t *tsdn, sec_t *sec) {
	for (size_t i = 0; i < sec->opts.nshards; i++) {
		malloc_mutex_lock(tsdn, &sec->shards[i].mtx);
		sec_flush_all_locked(tsdn, sec, &sec->shards[i]);
		malloc_mutex_unlock(tsdn, &sec->shards[i].mtx);
	}
}

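/*
 * Flush each shard and mark it disabled, so that subsequent allocations and
 * deallocations bypass the cache.
 */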
void
sec_disable(tsdn_t *tsdn, sec_t *sec) {
	for (size_t i = 0; i < sec->opts.nshards; i++) {
		malloc_mutex_lock(tsdn, &sec->shards[i].mtx);
		sec->shards[i].enabled = false;
		sec_flush_all_locked(tsdn, sec, &sec->shards[i]);
		malloc_mutex_unlock(tsdn, &sec->shards[i].mtx);
	}
}

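/* Accumulate the number of cached bytes across all shards into stats. */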
void
sec_stats_merge(tsdn_t *tsdn, sec_t *sec, sec_stats_t *stats) {
	size_t sum = 0;
	for (size_t i = 0; i < sec->opts.nshards; i++) {
		/*
		 * We could save these lock acquisitions by making bytes_cur
		 * atomic, but stats collection is rare anyways and we expect
		 * the number and type of stats to get more interesting.
		 */
		malloc_mutex_lock(tsdn, &sec->shards[i].mtx);
		sum += sec->shards[i].bytes_cur;
		malloc_mutex_unlock(tsdn, &sec->shards[i].mtx);
	}
	stats->bytes += sum;
}

void
sec_mutex_stats_read(tsdn_t *tsdn, sec_t *sec,
    mutex_prof_data_t *mutex_prof_data) {
	for (size_t i = 0; i < sec->opts.nshards; i++) {
		malloc_mutex_lock(tsdn, &sec->shards[i].mtx);
		malloc_mutex_prof_accum(tsdn, mutex_prof_data,
		    &sec->shards[i].mtx);
		malloc_mutex_unlock(tsdn, &sec->shards[i].mtx);
	}
}

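/*
 * Fork hooks: acquire every shard mutex before fork, then release (parent) or
 * reinitialize (child) them afterwards.
 */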
void
sec_prefork2(tsdn_t *tsdn, sec_t *sec) {
	for (size_t i = 0; i < sec->opts.nshards; i++) {
		malloc_mutex_prefork(tsdn, &sec->shards[i].mtx);
	}
}

void
sec_postfork_parent(tsdn_t *tsdn, sec_t *sec) {
	for (size_t i = 0; i < sec->opts.nshards; i++) {
		malloc_mutex_postfork_parent(tsdn, &sec->shards[i].mtx);
	}
}

void
sec_postfork_child(tsdn_t *tsdn, sec_t *sec) {
	for (size_t i = 0; i < sec->opts.nshards; i++) {
		malloc_mutex_postfork_child(tsdn, &sec->shards[i].mtx);
	}
}