#define JEMALLOC_PAGES_C_
#include "jemalloc/internal/jemalloc_preamble.h"

#include "jemalloc/internal/pages.h"

#include "jemalloc/internal/jemalloc_internal_includes.h"

#include "jemalloc/internal/assert.h"
#include "jemalloc/internal/malloc_io.h"

#ifdef JEMALLOC_SYSCTL_VM_OVERCOMMIT
#include <sys/sysctl.h>
#ifdef __FreeBSD__
#include <vm/vm_param.h>
#endif
#endif

/******************************************************************************/
/* Data. */

/* Actual operating system page size, detected during bootstrap, <= PAGE. */
static size_t os_page;

#ifndef _WIN32
#  define PAGES_PROT_COMMIT (PROT_READ | PROT_WRITE)
#  define PAGES_PROT_DECOMMIT (PROT_NONE)
static int mmap_flags;
#endif
static bool os_overcommits;

const char *thp_mode_names[] = {
	"default",
	"always",
	"never",
	"not supported"
};
thp_mode_t opt_thp = THP_MODE_DEFAULT;
thp_mode_t init_system_thp_mode;

/* Runtime support for lazy purge. Irrelevant when !pages_can_purge_lazy. */
static bool pages_can_purge_lazy_runtime = true;

/******************************************************************************/
/*
 * Function prototypes for static functions that are referenced prior to
 * definition.
 */

static void os_pages_unmap(void *addr, size_t size);

/******************************************************************************/

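/*
 * Thin portability wrapper around mmap()/VirtualAlloc(). addr is a hint
 * only: if the OS maps somewhere else, the mapping is released and NULL is
 * returned. When the OS overcommits, *commit is forced to true, since
 * commit/decommit is not meaningful in that case.
 */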
static void *
os_pages_map(void *addr, size_t size, size_t alignment, bool *commit) {
	assert(ALIGNMENT_ADDR2BASE(addr, os_page) == addr);
	assert(ALIGNMENT_CEILING(size, os_page) == size);
	assert(size != 0);

	if (os_overcommits) {
		*commit = true;
	}

	void *ret;
#ifdef _WIN32
	/*
	 * If VirtualAlloc can't allocate at the given address when one is
	 * given, it fails and returns NULL.
	 */
	ret = VirtualAlloc(addr, size, MEM_RESERVE | (*commit ? MEM_COMMIT : 0),
	    PAGE_READWRITE);
#else
	/*
	 * We don't use MAP_FIXED here, because it can cause the *replacement*
	 * of existing mappings, and we only want to create new mappings.
	 */
	{
		int prot = *commit ? PAGES_PROT_COMMIT : PAGES_PROT_DECOMMIT;

		ret = mmap(addr, size, prot, mmap_flags, -1, 0);
	}
	assert(ret != NULL);

	if (ret == MAP_FAILED) {
		ret = NULL;
	} else if (addr != NULL && ret != addr) {
		/*
		 * We succeeded in mapping memory, but not in the right place.
		 */
		os_pages_unmap(ret, size);
		ret = NULL;
	}
#endif
	assert(ret == NULL || (addr == NULL && ret != addr) || (addr != NULL &&
	    ret == addr));
	return ret;
}

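/*
 * Trim an over-sized mapping to the size-byte subrange beginning leadsize
 * bytes in. Windows cannot release part of a mapping, so the whole mapping
 * is unmapped and the target subrange remapped (which can fail); POSIX
 * systems simply munmap() the leading and trailing excess.
 */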
static void *
os_pages_trim(void *addr, size_t alloc_size, size_t leadsize, size_t size,
    bool *commit) {
	void *ret = (void *)((uintptr_t)addr + leadsize);

	assert(alloc_size >= leadsize + size);
#ifdef _WIN32
	os_pages_unmap(addr, alloc_size);
	void *new_addr = os_pages_map(ret, size, PAGE, commit);
	if (new_addr == ret) {
		return ret;
	}
	if (new_addr != NULL) {
		os_pages_unmap(new_addr, size);
	}
	return NULL;
#else
	size_t trailsize = alloc_size - leadsize - size;

	if (leadsize != 0) {
		os_pages_unmap(addr, leadsize);
	}
	if (trailsize != 0) {
		os_pages_unmap((void *)((uintptr_t)ret + size), trailsize);
	}
	return ret;
#endif
}

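/*
 * Release a mapping. Failure is unexpected, so it is reported (and is fatal
 * under opt_abort) rather than propagated to the caller.
 */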
static void
os_pages_unmap(void *addr, size_t size) {
	assert(ALIGNMENT_ADDR2BASE(addr, os_page) == addr);
	assert(ALIGNMENT_CEILING(size, os_page) == size);

#ifdef _WIN32
	if (VirtualFree(addr, 0, MEM_RELEASE) == 0)
#else
	if (munmap(addr, size) == -1)
#endif
	{
		char buf[BUFERROR_BUF];

		buferror(get_errno(), buf, sizeof(buf));
		malloc_printf("<jemalloc>: Error in "
#ifdef _WIN32
		    "VirtualFree"
#else
		    "munmap"
#endif
		    "(): %s\n", buf);
		if (opt_abort) {
			abort();
		}
	}
}

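/*
 * Reliable but slow path for aligned mapping: over-allocate by
 * (alignment - os_page) bytes, then trim to an aligned subrange. The loop
 * matters only on Windows, where os_pages_trim() must unmap and remap and
 * can therefore lose the address range to another thread.
 */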
static void *
pages_map_slow(size_t size, size_t alignment, bool *commit) {
	size_t alloc_size = size + alignment - os_page;
	/* Beware size_t wrap-around. */
	if (alloc_size < size) {
		return NULL;
	}

	void *ret;
	do {
		void *pages = os_pages_map(NULL, alloc_size, alignment, commit);
		if (pages == NULL) {
			return NULL;
		}
		size_t leadsize = ALIGNMENT_CEILING((uintptr_t)pages, alignment)
		    - (uintptr_t)pages;
		ret = os_pages_trim(pages, alloc_size, leadsize, size, commit);
	} while (ret == NULL);

	assert(ret != NULL);
	assert(PAGE_ADDR2BASE(ret) == ret);
	return ret;
}

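/*
 * Map size bytes with the given alignment (both multiples of PAGE).
 * Illustrative call only, assuming a 2 MiB request aligned to 2 MiB:
 *
 *	bool commit = false;
 *	void *p = pages_map(NULL, (size_t)2 << 20, (size_t)2 << 20, &commit);
 *	if (p != NULL && !commit) {
 *		pages_commit(p, (size_t)2 << 20);
 *	}
 */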
void *
pages_map(void *addr, size_t size, size_t alignment, bool *commit) {
	assert(alignment >= PAGE);
	assert(ALIGNMENT_ADDR2BASE(addr, alignment) == addr);

#if defined(__FreeBSD__) && defined(MAP_EXCL)
	/*
	 * FreeBSD has mechanisms both to mmap at a specific address without
	 * touching existing mappings, and to mmap with a specific alignment.
	 */
	{
		if (os_overcommits) {
			*commit = true;
		}

		int prot = *commit ? PAGES_PROT_COMMIT : PAGES_PROT_DECOMMIT;
		int flags = mmap_flags;

		if (addr != NULL) {
			flags |= MAP_FIXED | MAP_EXCL;
		} else {
			unsigned alignment_bits = ffs_zu(alignment);
			assert(alignment_bits > 1);
			flags |= MAP_ALIGNED(alignment_bits - 1);
		}

		void *ret = mmap(addr, size, prot, flags, -1, 0);
		if (ret == MAP_FAILED) {
			ret = NULL;
		}

		return ret;
	}
#endif
	/*
	 * Ideally, there would be a way to specify alignment to mmap() (like
	 * NetBSD has), but in the absence of such a feature, we have to work
	 * hard to efficiently create aligned mappings. The reliable, but
	 * slow method is to create a mapping that is over-sized, then trim the
	 * excess. However, that always results in one or two calls to
	 * os_pages_unmap(), and it can leave holes in the process's virtual
	 * memory map if memory grows downward.
	 *
	 * Optimistically try mapping precisely the right amount before falling
	 * back to the slow method, with the expectation that the optimistic
	 * approach works most of the time.
	 */

	void *ret = os_pages_map(addr, size, os_page, commit);
	if (ret == NULL || ret == addr) {
		return ret;
	}
	assert(addr == NULL);
	if (ALIGNMENT_ADDR2OFFSET(ret, alignment) != 0) {
		os_pages_unmap(ret, size);
		return pages_map_slow(size, alignment, commit);
	}

	assert(PAGE_ADDR2BASE(ret) == ret);
	return ret;
}

void
pages_unmap(void *addr, size_t size) {
	assert(PAGE_ADDR2BASE(addr) == addr);
	assert(PAGE_CEILING(size) == size);

	os_pages_unmap(addr, size);
}

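/*
 * Change the commit state of an existing mapping. Like the rest of the pages
 * API, returns true on failure. Deliberately refuses (returns true) when the
 * OS overcommits, because commit state is not tracked in that case.
 */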
static bool
pages_commit_impl(void *addr, size_t size, bool commit) {
	assert(PAGE_ADDR2BASE(addr) == addr);
	assert(PAGE_CEILING(size) == size);

	if (os_overcommits) {
		return true;
	}

#ifdef _WIN32
	return (commit ? (addr != VirtualAlloc(addr, size, MEM_COMMIT,
	    PAGE_READWRITE)) : (!VirtualFree(addr, size, MEM_DECOMMIT)));
#else
	{
		int prot = commit ? PAGES_PROT_COMMIT : PAGES_PROT_DECOMMIT;
		void *result = mmap(addr, size, prot, mmap_flags | MAP_FIXED,
		    -1, 0);
		if (result == MAP_FAILED) {
			return true;
		}
		if (result != addr) {
			/*
			 * We succeeded in mapping memory, but not in the right
			 * place.
			 */
			os_pages_unmap(result, size);
			return true;
		}
		return false;
	}
#endif
}

bool
pages_commit(void *addr, size_t size) {
	return pages_commit_impl(addr, size, true);
}

bool
pages_decommit(void *addr, size_t size) {
	return pages_commit_impl(addr, size, false);
}

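/*
 * Lazy purge: hint that the contents are no longer needed while leaving the
 * range mapped, so physical pages are reclaimed only under memory pressure
 * (MADV_FREE / MEM_RESET semantics). Returns true if the pages were not
 * purged.
 */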
bool
pages_purge_lazy(void *addr, size_t size) {
	assert(ALIGNMENT_ADDR2BASE(addr, os_page) == addr);
	assert(PAGE_CEILING(size) == size);

	if (!pages_can_purge_lazy) {
		return true;
	}
	if (!pages_can_purge_lazy_runtime) {
		/*
		 * Built with lazy purge enabled, but detected it was not
		 * supported on the current system.
		 */
		return true;
	}

#ifdef _WIN32
	VirtualAlloc(addr, size, MEM_RESET, PAGE_READWRITE);
	return false;
#elif defined(JEMALLOC_PURGE_MADVISE_FREE)
	return (madvise(addr, size,
#  ifdef MADV_FREE
	    MADV_FREE
#  else
	    JEMALLOC_MADV_FREE
#  endif
	    ) != 0);
#elif defined(JEMALLOC_PURGE_MADVISE_DONTNEED) && \
    !defined(JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS)
	return (madvise(addr, size, MADV_DONTNEED) != 0);
#else
	not_reached();
#endif
}

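/*
 * Forced purge: immediately discard contents so that the next access
 * observes zero-filled pages, either via MADV_DONTNEED (where it has that
 * semantic) or by overlaying a fresh demand-zeroed mapping.
 */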
bool
pages_purge_forced(void *addr, size_t size) {
	assert(PAGE_ADDR2BASE(addr) == addr);
	assert(PAGE_CEILING(size) == size);

	if (!pages_can_purge_forced) {
		return true;
	}

#if defined(JEMALLOC_PURGE_MADVISE_DONTNEED) && \
    defined(JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS)
	return (madvise(addr, size, MADV_DONTNEED) != 0);
#elif defined(JEMALLOC_MAPS_COALESCE)
	/* Try to overlay a new demand-zeroed mapping. */
	return pages_commit(addr, size);
#else
	not_reached();
#endif
}

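/*
 * Advise the kernel to back the range with transparent huge pages
 * (MADV_HUGEPAGE). The aligned variants additionally assert HUGEPAGE
 * alignment; the advice itself is only a hint.
 */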
static bool
pages_huge_impl(void *addr, size_t size, bool aligned) {
	if (aligned) {
		assert(HUGEPAGE_ADDR2BASE(addr) == addr);
		assert(HUGEPAGE_CEILING(size) == size);
	}
#ifdef JEMALLOC_HAVE_MADVISE_HUGE
	return (madvise(addr, size, MADV_HUGEPAGE) != 0);
#else
	return true;
#endif
}

bool
pages_huge(void *addr, size_t size) {
	return pages_huge_impl(addr, size, true);
}

static bool
pages_huge_unaligned(void *addr, size_t size) {
	return pages_huge_impl(addr, size, false);
}

static bool
pages_nohuge_impl(void *addr, size_t size, bool aligned) {
	if (aligned) {
		assert(HUGEPAGE_ADDR2BASE(addr) == addr);
		assert(HUGEPAGE_CEILING(size) == size);
	}

#ifdef JEMALLOC_HAVE_MADVISE_HUGE
	return (madvise(addr, size, MADV_NOHUGEPAGE) != 0);
#else
	return false;
#endif
}

bool
pages_nohuge(void *addr, size_t size) {
	return pages_nohuge_impl(addr, size, true);
}

static bool
pages_nohuge_unaligned(void *addr, size_t size) {
	return pages_nohuge_impl(addr, size, false);
}

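/*
 * Exclude the range from (or re-include it in) core dumps via
 * MADV_DONTDUMP/MADV_DODUMP; a silent no-op where unsupported.
 */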
bool
pages_dontdump(void *addr, size_t size) {
	assert(PAGE_ADDR2BASE(addr) == addr);
	assert(PAGE_CEILING(size) == size);
#ifdef JEMALLOC_MADVISE_DONTDUMP
	return madvise(addr, size, MADV_DONTDUMP) != 0;
#else
	return false;
#endif
}

bool
pages_dodump(void *addr, size_t size) {
	assert(PAGE_ADDR2BASE(addr) == addr);
	assert(PAGE_CEILING(size) == size);
#ifdef JEMALLOC_MADVISE_DONTDUMP
	return madvise(addr, size, MADV_DODUMP) != 0;
#else
	return false;
#endif
}

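/*
 * Detect the actual OS page size; pages_boot() verifies that the
 * compile-time PAGE is at least this large.
 */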
static size_t
os_page_detect(void) {
#ifdef _WIN32
	SYSTEM_INFO si;
	GetSystemInfo(&si);
	return si.dwPageSize;
#elif defined(__FreeBSD__)
	/*
	 * This returns the value obtained from
	 * the auxv vector, avoiding a syscall.
	 */
	return getpagesize();
#else
	long result = sysconf(_SC_PAGESIZE);
	if (result == -1) {
		return LG_PAGE;
	}
	return (size_t)result;
#endif
}

#ifdef JEMALLOC_SYSCTL_VM_OVERCOMMIT
static bool
os_overcommits_sysctl(void) {
	int vm_overcommit;
	size_t sz;

	sz = sizeof(vm_overcommit);
#if defined(__FreeBSD__) && defined(VM_OVERCOMMIT)
	int mib[2];

	mib[0] = CTL_VM;
	mib[1] = VM_OVERCOMMIT;
	if (sysctl(mib, 2, &vm_overcommit, &sz, NULL, 0) != 0) {
		return false; /* Error. */
	}
#else
	if (sysctlbyname("vm.overcommit", &vm_overcommit, &sz, NULL, 0) != 0) {
		return false; /* Error. */
	}
#endif

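	/*
	 * The low two bits of vm.overcommit enable strict swap accounting
	 * (an assumption based on FreeBSD's swap-reserve flags); overcommit
	 * is in effect only when both are clear.
	 */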
	return ((vm_overcommit & 0x3) == 0);
}
#endif

#ifdef JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY
/*
 * Use syscall(2) rather than {open,read,close}(2) when possible to avoid
 * reentry during bootstrapping if another library has interposed system call
 * wrappers.
 */
static bool
os_overcommits_proc(void) {
	int fd;
	char buf[1];

#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_open)
#  if defined(O_CLOEXEC)
	fd = (int)syscall(SYS_open, "/proc/sys/vm/overcommit_memory", O_RDONLY |
	    O_CLOEXEC);
#  else
	fd = (int)syscall(SYS_open, "/proc/sys/vm/overcommit_memory", O_RDONLY);
	if (fd != -1) {
		fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
	}
#  endif
#elif defined(JEMALLOC_USE_SYSCALL) && defined(SYS_openat)
#  if defined(O_CLOEXEC)
	fd = (int)syscall(SYS_openat,
	    AT_FDCWD, "/proc/sys/vm/overcommit_memory", O_RDONLY | O_CLOEXEC);
#  else
	fd = (int)syscall(SYS_openat,
	    AT_FDCWD, "/proc/sys/vm/overcommit_memory", O_RDONLY);
	if (fd != -1) {
		fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
	}
#  endif
#else
#  if defined(O_CLOEXEC)
	fd = open("/proc/sys/vm/overcommit_memory", O_RDONLY | O_CLOEXEC);
#  else
	fd = open("/proc/sys/vm/overcommit_memory", O_RDONLY);
	if (fd != -1) {
		fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
	}
#  endif
#endif

	if (fd == -1) {
		return false; /* Error. */
	}

	ssize_t nread = malloc_read_fd(fd, &buf, sizeof(buf));
#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_close)
	syscall(SYS_close, fd);
#else
	close(fd);
#endif

	if (nread < 1) {
		return false; /* Error. */
	}
	/*
	 * /proc/sys/vm/overcommit_memory meanings:
	 * 0: Heuristic overcommit.
	 * 1: Always overcommit.
	 * 2: Never overcommit.
	 */
	return (buf[0] == '0' || buf[0] == '1');
}
#endif

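/*
 * Apply opt_thp to a region when it differs from the system-wide THP mode
 * that new mappings inherit; otherwise the inherited state already matches.
 */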
void
pages_set_thp_state(void *ptr, size_t size) {
	if (opt_thp == thp_mode_default || opt_thp == init_system_thp_mode) {
		return;
	}
	assert(opt_thp != thp_mode_not_supported &&
	    init_system_thp_mode != thp_mode_not_supported);

	if (opt_thp == thp_mode_always
	    && init_system_thp_mode != thp_mode_never) {
		assert(init_system_thp_mode == thp_mode_default);
		pages_huge_unaligned(ptr, size);
	} else if (opt_thp == thp_mode_never) {
		assert(init_system_thp_mode == thp_mode_default ||
		    init_system_thp_mode == thp_mode_always);
		pages_nohuge_unaligned(ptr, size);
	}
}

static void
init_thp_state(void) {
	if (!have_madvise_huge) {
		if (metadata_thp_enabled() && opt_abort) {
			malloc_write("<jemalloc>: no MADV_HUGEPAGE support\n");
			abort();
		}
		goto label_error;
	}

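	/*
	 * These strings mirror the possible contents of
	 * /sys/kernel/mm/transparent_hugepage/enabled, with brackets marking
	 * the active mode.
	 */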
	static const char sys_state_madvise[] = "always [madvise] never\n";
	static const char sys_state_always[] = "[always] madvise never\n";
	static const char sys_state_never[] = "always madvise [never]\n";
	char buf[sizeof(sys_state_madvise)];

#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_open)
	int fd = (int)syscall(SYS_open,
	    "/sys/kernel/mm/transparent_hugepage/enabled", O_RDONLY);
#else
	int fd = open("/sys/kernel/mm/transparent_hugepage/enabled", O_RDONLY);
#endif
	if (fd == -1) {
		goto label_error;
	}

	ssize_t nread = malloc_read_fd(fd, &buf, sizeof(buf));
#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_close)
	syscall(SYS_close, fd);
#else
	close(fd);
#endif

	if (nread < 0) {
		goto label_error;
	}

	if (strncmp(buf, sys_state_madvise, (size_t)nread) == 0) {
		init_system_thp_mode = thp_mode_default;
	} else if (strncmp(buf, sys_state_always, (size_t)nread) == 0) {
		init_system_thp_mode = thp_mode_always;
	} else if (strncmp(buf, sys_state_never, (size_t)nread) == 0) {
		init_system_thp_mode = thp_mode_never;
	} else {
		goto label_error;
	}
	return;
label_error:
	opt_thp = init_system_thp_mode = thp_mode_not_supported;
}

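/*
 * Bootstrap: detect the OS page size, choose mmap flags, determine
 * overcommit behavior, record the initial system THP mode, and probe runtime
 * support for lazy purge. Returns true on unrecoverable failure.
 */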
bool
pages_boot(void) {
	os_page = os_page_detect();
	if (os_page > PAGE) {
		malloc_write("<jemalloc>: Unsupported system page size\n");
		if (opt_abort) {
			abort();
		}
		return true;
	}

#ifndef _WIN32
	mmap_flags = MAP_PRIVATE | MAP_ANON;
#endif

#ifdef JEMALLOC_SYSCTL_VM_OVERCOMMIT
	os_overcommits = os_overcommits_sysctl();
#elif defined(JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY)
	os_overcommits = os_overcommits_proc();
#  ifdef MAP_NORESERVE
	if (os_overcommits) {
		mmap_flags |= MAP_NORESERVE;
	}
#  endif
#else
	os_overcommits = false;
#endif

	init_thp_state();

#ifdef __FreeBSD__
	/*
	 * FreeBSD doesn't need the check; madvise(2) is known to work.
	 */
#else
	/* Detect lazy purge runtime support. */
	if (pages_can_purge_lazy) {
		bool committed = false;
		void *madv_free_page = os_pages_map(NULL, PAGE, PAGE,
		    &committed);
		if (madv_free_page == NULL) {
			return true;
		}
		assert(pages_can_purge_lazy_runtime);
		if (pages_purge_lazy(madv_free_page, PAGE)) {
			pages_can_purge_lazy_runtime = false;
		}
		os_pages_unmap(madv_free_page, PAGE);
	}
#endif

	return false;
}