#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/jemalloc_internal_includes.h"

#include "jemalloc/internal/hpdata.h"

static int
hpdata_age_comp(const hpdata_t *a, const hpdata_t *b) {
	uint64_t a_age = hpdata_age_get(a);
	uint64_t b_age = hpdata_age_get(b);
	/*
	 * hpdata ages are operation counts in the psset; no two should be the
	 * same.
	 */
	assert(a_age != b_age);
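	/* Branchless three-way compare; the assert above rules out 0. */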
	return (a_age > b_age) - (a_age < b_age);
}

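/*
 * Generate the pairing-heap functions for the hpdata age heap (insert,
 * remove, first, etc.), ordered by age via the comparator above.
 */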
ph_gen(, hpdata_age_heap, hpdata_t, age_link, hpdata_age_comp)

void
hpdata_init(hpdata_t *hpdata, void *addr, uint64_t age) {
	hpdata_addr_set(hpdata, addr);
	hpdata_age_set(hpdata, age);
	hpdata->h_huge = false;
	hpdata->h_alloc_allowed = true;
	hpdata->h_in_psset_alloc_container = false;
	hpdata->h_purge_allowed = false;
	hpdata->h_hugify_allowed = false;
	hpdata->h_in_psset_hugify_container = false;
	hpdata->h_mid_purge = false;
	hpdata->h_mid_hugify = false;
	hpdata->h_updating = false;
	hpdata->h_in_psset = false;
	hpdata_longest_free_range_set(hpdata, HUGEPAGE_PAGES);
	hpdata->h_nactive = 0;
	fb_init(hpdata->active_pages, HUGEPAGE_PAGES);
	hpdata->h_ntouched = 0;
	fb_init(hpdata->touched_pages, HUGEPAGE_PAGES);

	hpdata_assert_consistent(hpdata);
}

void *
hpdata_reserve_alloc(hpdata_t *hpdata, size_t sz) {
	hpdata_assert_consistent(hpdata);
	/*
	 * This is a metadata change; the hpdata should therefore either not be
	 * in the psset, or should have explicitly marked itself as being
	 * mid-update.
	 */
	assert(!hpdata->h_in_psset || hpdata->h_updating);
	assert(hpdata->h_alloc_allowed);
	assert((sz & PAGE_MASK) == 0);
	size_t npages = sz >> LG_PAGE;
	assert(npages <= hpdata_longest_free_range_get(hpdata));

	size_t result;

	size_t start = 0;
	/*
	 * These are dead stores, but the compiler will issue warnings on them
	 * since it can't tell statically that found is always true below.
	 */
	size_t begin = 0;
	size_t len = 0;

	size_t largest_unchosen_range = 0;
	while (true) {
		bool found = fb_urange_iter(hpdata->active_pages,
		    HUGEPAGE_PAGES, start, &begin, &len);
		/*
		 * A precondition to this function is that hpdata must be able
		 * to serve the allocation.
		 */
		assert(found);
		assert(len <= hpdata_longest_free_range_get(hpdata));
		if (len >= npages) {
			/*
			 * We use first-fit within the page slabs; this gives
			 * bounded worst-case fragmentation within a slab. It's
			 * not necessarily right; we could experiment with
			 * various other options.
			 */
			break;
		}
		if (len > largest_unchosen_range) {
			largest_unchosen_range = len;
		}
		start = begin + len;
	}
	/* We found a range; remember it. */
	result = begin;
	fb_set_range(hpdata->active_pages, HUGEPAGE_PAGES, begin, npages);
	hpdata->h_nactive += npages;

	/*
	 * We might be about to dirty some memory for the first time; update
	 * our count if so.  (new_dirty counts the pages in the range that had
	 * not been touched yet.)
	 */
	size_t new_dirty = fb_ucount(hpdata->touched_pages, HUGEPAGE_PAGES,
	    result, npages);
	fb_set_range(hpdata->touched_pages, HUGEPAGE_PAGES, result, npages);
	hpdata->h_ntouched += new_dirty;

	/*
	 * If we allocated out of a range that was the longest in the hpdata,
	 * it might be the only one of that size and we'll have to adjust the
	 * metadata.  Ranges before the one we chose were already recorded in
	 * largest_unchosen_range above, so the rescan can start just past our
	 * allocation, and can stop early if it sees another range of the old
	 * maximum length.
	 */
	if (len == hpdata_longest_free_range_get(hpdata)) {
		start = begin + npages;
		while (start < HUGEPAGE_PAGES) {
			bool found = fb_urange_iter(hpdata->active_pages,
			    HUGEPAGE_PAGES, start, &begin, &len);
			if (!found) {
				break;
			}
			assert(len <= hpdata_longest_free_range_get(hpdata));
			if (len == hpdata_longest_free_range_get(hpdata)) {
				largest_unchosen_range = len;
				break;
			}
			if (len > largest_unchosen_range) {
				largest_unchosen_range = len;
			}
			start = begin + len;
		}
		hpdata_longest_free_range_set(hpdata, largest_unchosen_range);
	}

	hpdata_assert_consistent(hpdata);
	return (void *)(
	    (uintptr_t)hpdata_addr_get(hpdata) + (result << LG_PAGE));
}

void
hpdata_unreserve(hpdata_t *hpdata, void *addr, size_t sz) {
	hpdata_assert_consistent(hpdata);
	/* See the comment in reserve. */
	assert(!hpdata->h_in_psset || hpdata->h_updating);
	assert(((uintptr_t)addr & PAGE_MASK) == 0);
	assert((sz & PAGE_MASK) == 0);
	size_t begin = ((uintptr_t)addr - (uintptr_t)hpdata_addr_get(hpdata))
	    >> LG_PAGE;
	assert(begin < HUGEPAGE_PAGES);
	size_t npages = sz >> LG_PAGE;
	size_t old_longest_range = hpdata_longest_free_range_get(hpdata);

	fb_unset_range(hpdata->active_pages, HUGEPAGE_PAGES, begin, npages);
	/* We might have just created a new, larger range. */
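	/*
	 * fb_fls scans downward from begin for the closest preceding active
	 * page (it returns -1 if there is none, so the +1 yields 0), and
	 * fb_ffs scans upward from the last freed page for the next active
	 * page (returning HUGEPAGE_PAGES if there is none).  The coalesced
	 * free range is therefore [new_begin, new_end).
	 */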
	size_t new_begin = (fb_fls(hpdata->active_pages, HUGEPAGE_PAGES,
	    begin) + 1);
	size_t new_end = fb_ffs(hpdata->active_pages, HUGEPAGE_PAGES,
	    begin + npages - 1);
	size_t new_range_len = new_end - new_begin;

	if (new_range_len > old_longest_range) {
		hpdata_longest_free_range_set(hpdata, new_range_len);
	}

	hpdata->h_nactive -= npages;

	hpdata_assert_consistent(hpdata);
}
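
/*
 * Example round trip (a sketch, not a real call site; actual callers live in
 * the HPA and psset code, and hugepage_addr / age here are made up):
 *
 *	hpdata_t ps;
 *	hpdata_init(&ps, hugepage_addr, age);
 *	void *alloc = hpdata_reserve_alloc(&ps, 2 * PAGE);
 *	...use the two pages at alloc...
 *	hpdata_unreserve(&ps, alloc, 2 * PAGE);
 */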

size_t
hpdata_purge_begin(hpdata_t *hpdata, hpdata_purge_state_t *purge_state) {
	hpdata_assert_consistent(hpdata);
	/*
	 * See the comment below; we might purge any inactive extent, so it's
	 * unsafe for any other thread to turn any inactive extent active while
	 * we're operating on it.
	 */
	assert(!hpdata_alloc_allowed_get(hpdata));

	purge_state->npurged = 0;
	purge_state->next_purge_search_begin = 0;

	/*
	 * Initialize to_purge.
	 *
	 * It's possible to end up in situations where two dirty extents are
	 * separated by a retained extent:
	 * - 1 page allocated.
	 * - 1 page allocated.
	 * - 1 page allocated.
	 *
	 * If the middle page is freed and purged, and then the first and third
	 * pages are freed, and then another purge pass happens, the hpdata
	 * looks like this:
	 * - 1 page dirty.
	 * - 1 page retained.
	 * - 1 page dirty.
	 *
	 * But it's safe to do a single 3-page purge.
	 *
	 * We do this by first computing the dirty pages, and then filling in
	 * any gaps by extending each range in the dirty bitmap until the next
	 * active page.  This purges more pages, but the expensive part of
	 * purging is the TLB shootdowns, rather than the kernel state
	 * tracking; doing a little bit more of the latter is fine if it saves
	 * us from doing some of the former.
	 */

	/*
	 * The dirty pages are those that are touched but not active.  Note
	 * that in a normal-ish case, HUGEPAGE_PAGES is something like 512 and
	 * the fb_group_t is 64 bits, so this is 64 bytes, spread across 8
	 * fb_group_ts.
	 */
	fb_group_t dirty_pages[FB_NGROUPS(HUGEPAGE_PAGES)];
	fb_init(dirty_pages, HUGEPAGE_PAGES);
	fb_bit_not(dirty_pages, hpdata->active_pages, HUGEPAGE_PAGES);
	fb_bit_and(dirty_pages, dirty_pages, hpdata->touched_pages,
	    HUGEPAGE_PAGES);
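	/* I.e., dirty_pages == touched_pages & ~active_pages. */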

	fb_init(purge_state->to_purge, HUGEPAGE_PAGES);
	size_t next_bit = 0;
	while (next_bit < HUGEPAGE_PAGES) {
		size_t next_dirty = fb_ffs(dirty_pages, HUGEPAGE_PAGES,
		    next_bit);
		/* Recall that fb_ffs returns nbits if no set bit is found. */
		if (next_dirty == HUGEPAGE_PAGES) {
			break;
		}
		size_t next_active = fb_ffs(hpdata->active_pages,
		    HUGEPAGE_PAGES, next_dirty);
		/*
		 * Don't purge past the end of the dirty extent, into retained
		 * pages.  This helps the kernel a tiny bit, but honestly it's
		 * mostly helpful for testing (where we tend to write test
		 * cases that think in terms of the dirty ranges).
		 */
		ssize_t last_dirty = fb_fls(dirty_pages, HUGEPAGE_PAGES,
		    next_active - 1);
		assert(last_dirty >= 0);
		assert((size_t)last_dirty >= next_dirty);
		assert((size_t)last_dirty - next_dirty + 1 <= HUGEPAGE_PAGES);

		fb_set_range(purge_state->to_purge, HUGEPAGE_PAGES, next_dirty,
		    last_dirty - next_dirty + 1);
		next_bit = next_active + 1;
	}

	/* We should purge, at least, everything dirty. */
	size_t ndirty = hpdata->h_ntouched - hpdata->h_nactive;
	purge_state->ndirty_to_purge = ndirty;
	assert(ndirty <= fb_scount(
	    purge_state->to_purge, HUGEPAGE_PAGES, 0, HUGEPAGE_PAGES));
	assert(ndirty == fb_scount(dirty_pages, HUGEPAGE_PAGES, 0,
	    HUGEPAGE_PAGES));

	hpdata_assert_consistent(hpdata);

	return ndirty;
}

bool
hpdata_purge_next(hpdata_t *hpdata, hpdata_purge_state_t *purge_state,
    void **r_purge_addr, size_t *r_purge_size) {
	/*
	 * Note that we don't have a consistency check here; we're accessing
	 * hpdata without synchronization, and therefore have no right to
	 * expect a consistent state.
	 */
	assert(!hpdata_alloc_allowed_get(hpdata));

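	/* A previous call may have already searched past the last page. */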
	if (purge_state->next_purge_search_begin == HUGEPAGE_PAGES) {
		return false;
	}
	size_t purge_begin;
	size_t purge_len;
	bool found_range = fb_srange_iter(purge_state->to_purge, HUGEPAGE_PAGES,
	    purge_state->next_purge_search_begin, &purge_begin, &purge_len);
	if (!found_range) {
		return false;
	}

	*r_purge_addr = (void *)(
	    (uintptr_t)hpdata_addr_get(hpdata) + purge_begin * PAGE);
	*r_purge_size = purge_len * PAGE;

	purge_state->next_purge_search_begin = purge_begin + purge_len;
	purge_state->npurged += purge_len;
	assert(purge_state->npurged <= HUGEPAGE_PAGES);

	return true;
}

void
hpdata_purge_end(hpdata_t *hpdata, hpdata_purge_state_t *purge_state) {
	assert(!hpdata_alloc_allowed_get(hpdata));
	hpdata_assert_consistent(hpdata);
	/* See the comment in reserve. */
	assert(!hpdata->h_in_psset || hpdata->h_updating);

	assert(purge_state->npurged == fb_scount(purge_state->to_purge,
	    HUGEPAGE_PAGES, 0, HUGEPAGE_PAGES));
	assert(purge_state->npurged >= purge_state->ndirty_to_purge);

	/* Compute touched_pages &= ~to_purge (inverting to_purge in place). */
	fb_bit_not(purge_state->to_purge, purge_state->to_purge,
	    HUGEPAGE_PAGES);
	fb_bit_and(hpdata->touched_pages, hpdata->touched_pages,
	    purge_state->to_purge, HUGEPAGE_PAGES);
	assert(hpdata->h_ntouched >= purge_state->ndirty_to_purge);
	hpdata->h_ntouched -= purge_state->ndirty_to_purge;

	hpdata_assert_consistent(hpdata);
}
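
/*
 * Example purge pass (a sketch, not a real call site; the actual driver lives
 * in the HPA, and pages_purge_forced stands in for whatever purging primitive
 * is in use).  Allocations must stay disallowed for the whole pass:
 *
 *	hpdata_purge_state_t purge_state;
 *	size_t ndirty = hpdata_purge_begin(ps, &purge_state);
 *	void *purge_addr;
 *	size_t purge_size;
 *	while (hpdata_purge_next(ps, &purge_state, &purge_addr,
 *	    &purge_size)) {
 *		pages_purge_forced(purge_addr, purge_size);
 *	}
 *	hpdata_purge_end(ps, &purge_state);
 */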

void
hpdata_hugify(hpdata_t *hpdata) {
	hpdata_assert_consistent(hpdata);
	hpdata->h_huge = true;
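	/*
	 * Once the range is backed by a hugepage, the kernel manages it as a
	 * single unit of physical memory; conservatively treat every page in
	 * it as touched from now on.
	 */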
	fb_set_range(hpdata->touched_pages, HUGEPAGE_PAGES, 0, HUGEPAGE_PAGES);
	hpdata->h_ntouched = HUGEPAGE_PAGES;
	hpdata_assert_consistent(hpdata);
}

void
hpdata_dehugify(hpdata_t *hpdata) {
	hpdata_assert_consistent(hpdata);
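	/*
	 * We leave h_ntouched and the touched bitmap alone; dehugifying
	 * doesn't release any memory by itself, and any purging is driven
	 * separately by the caller.
	 */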
	hpdata->h_huge = false;
	hpdata_assert_consistent(hpdata);
}