os.c source code [mimalloc/src/os.c]

/* ----------------------------------------------------------------------------
Copyright (c) 2018-2021, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
7	#ifndef _DEFAULT_SOURCE
8	#define _DEFAULT_SOURCE // ensure mmap flags are defined
9	#endif
10
11	#if defined(__sun)
// illumos provides the new mman.h API when any of these are defined;
// otherwise it provides the old API based on caddr_t, which predates the void-pointer one.
// Stock Solaris provides only the former, so we chose to discard those
// flags locally, only here, rather than project-wide.
16	#undef _XOPEN_SOURCE
17	#undef _POSIX_C_SOURCE
18	#endif
19	#include "mimalloc.h"
20	#include "mimalloc-internal.h"
21	#include "mimalloc-atomic.h"
22
23	#include <string.h> // strerror
24
25	#ifdef _MSC_VER
26	#pragma warning(disable:4996) // strerror
27	#endif
28
29	#if defined(__wasi__)
30	#define MI_USE_SBRK
31	#endif
32
33	#if defined(_WIN32)
34	#include <windows.h>
35	#elif defined(__wasi__)
36	#include <unistd.h> // sbrk
37	#else
38	#include <sys/mman.h> // mmap
39	#include <unistd.h> // sysconf
40	#if defined(__linux__)
41	#include <features.h>
42	#include <fcntl.h>
43	#if defined(__GLIBC__)
44	#include <linux/mman.h> // linux mmap flags
45	#else
46	#include <sys/mman.h>
47	#endif
48	#endif
49	#if defined(__APPLE__)
50	#include <TargetConditionals.h>
51	#if !TARGET_IOS_IPHONE && !TARGET_IOS_SIMULATOR
52	#include <mach/vm_statistics.h>
53	#endif
54	#endif
55	#if defined(__FreeBSD__) \|\| defined(__DragonFly__)
56	#include <sys/param.h>
57	#if __FreeBSD_version >= 1200000
58	#include <sys/cpuset.h>
59	#include <sys/domainset.h>
60	#endif
61	#include <sys/sysctl.h>
62	#endif
63	#endif
64
/* -----------------------------------------------------------
  Initialization.
  On windows initializes support for aligned allocation and
  large OS pages (if MIMALLOC_LARGE_OS_PAGES is true).
----------------------------------------------------------- */
70	bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats);
71	bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats);
72
// Round the address of `p` up to a multiple of `alignment` (delegates to `_mi_align_up`).
static void* mi_align_up_ptr(void* p, size_t alignment) {
  const uintptr_t address = (uintptr_t)p;
  return (void*)_mi_align_up(address, alignment);
}
76
// Round the address of `p` down to a multiple of `alignment` (delegates to `_mi_align_down`).
static void* mi_align_down_ptr(void* p, size_t alignment) {
  const uintptr_t address = (uintptr_t)p;
  return (void*)_mi_align_down(address, alignment);
}
80
81
// page size (initialized properly in `os_init`)
static size_t os_page_size = 4096;

// minimal allocation granularity
static size_t os_alloc_granularity = 4096;

// if non-zero, use large page allocation
static size_t large_os_page_size = 0;

// is memory overcommit allowed?
// set dynamically in _mi_os_init (and if true we use MAP_NORESERVE)
static bool os_overcommit = true;

// Returns the current value of the overcommit flag.
bool _mi_os_has_overcommit(void) {
  return os_overcommit;
}

// OS (small) page size
size_t _mi_os_page_size(void) {
  return os_page_size;
}

// if large OS pages are supported (2 or 4MiB), then return the size, otherwise return the small page size (4KiB)
size_t _mi_os_large_page_size(void) {
  return (large_os_page_size != 0 ? large_os_page_size : _mi_os_page_size());
}
108
109	#if !defined(MI_USE_SBRK) && !defined(__wasi__)
110	static bool use_large_os_page(size_t size, size_t alignment) {
111	// if we have access, check the size and alignment requirements
112	if (large_os_page_size == `0` \|\| !mi_option_is_enabled(mi_option_large_os_pages)) return false;
113	return ((size % large_os_page_size) == `0` && (alignment % large_os_page_size) == `0`);
114	}
115	#endif
116
117	// round to a good OS allocation size (bounded by max 12.5% waste)
118	size_t _mi_os_good_alloc_size(size_t size) {
119	size_t align_size;
120	if (size < `512`*MI_KiB) align_size = _mi_os_page_size();
121	else if (size < `2`MI_MiB) align_size = `64`MI_KiB;
122	else if (size < `8`MI_MiB) align_size = `256`MI_KiB;
123	else if (size < `32`MI_MiB) align_size = `1`MI_MiB;
124	else align_size = `4`*MI_MiB;
125	if mi_unlikely(size >= (SIZE_MAX - align_size)) return size; // possible overflow?
126	return _mi_align_up(size, align_size);
127	}
128
129	#if defined(_WIN32)
130	// We use VirtualAlloc2 for aligned allocation, but it is only supported on Windows 10 and Windows Server 2016.
131	// So, we need to look it up dynamically to run on older systems. (use __stdcall for 32-bit compatibility)
132	// NtAllocateVirtualAllocEx is used for huge OS page allocation (1GiB)
133	// We define a minimal MEM_EXTENDED_PARAMETER ourselves in order to be able to compile with older SDK's.
// Locally defined kinds of extended parameter; stored in the `Type` bit-field of
// `MI_MEM_EXTENDED_PARAMETER` so the file compiles with older SDK's.
typedef enum MI_MEM_EXTENDED_PARAMETER_TYPE_E {
  MiMemExtendedParameterInvalidType = 0,
  MiMemExtendedParameterAddressRequirements,
  MiMemExtendedParameterNumaNode,
  MiMemExtendedParameterPartitionHandle,
  MiMemExtendedParameterUserPhysicalHandle,
  MiMemExtendedParameterAttributeFlags,
  MiMemExtendedParameterMax
} MI_MEM_EXTENDED_PARAMETER_TYPE;
143
// Minimal local definition of an extended allocation parameter:
// an 8-bit type tag (plus 56 reserved bits) followed by the argument value.
typedef struct DECLSPEC_ALIGN(8) MI_MEM_EXTENDED_PARAMETER_S {
  struct { DWORD64 Type : 8; DWORD64 Reserved : 56; } Type;
  union  { DWORD64 ULong64; PVOID Pointer; SIZE_T Size; HANDLE Handle; DWORD ULong; } Arg;
} MI_MEM_EXTENDED_PARAMETER;
148
149	typedef struct MI_MEM_ADDRESS_REQUIREMENTS_S {
150	PVOID LowestStartingAddress;
151	PVOID HighestEndingAddress;
152	SIZE_T Alignment;
153	} MI_MEM_ADDRESS_REQUIREMENTS;
154
155	#define MI_MEM_EXTENDED_PARAMETER_NONPAGED_HUGE 0x00000010
156
157	#include <winternl.h>
158	typedef PVOID (__stdcall PVirtualAlloc2)(HANDLE, PVOID, SIZE_T, ULONG, ULONG, MI_MEM_EXTENDED_PARAMETER, ULONG);
159	typedef NTSTATUS (__stdcall PNtAllocateVirtualMemoryEx)(HANDLE, PVOID, SIZE_T, ULONG, ULONG, MI_MEM_EXTENDED_PARAMETER, ULONG);
160	static PVirtualAlloc2 pVirtualAlloc2 = NULL;
161	static PNtAllocateVirtualMemoryEx pNtAllocateVirtualMemoryEx = NULL;
162
163	// Similarly, GetNumaProcesorNodeEx is only supported since Windows 7
164	typedef struct MI_PROCESSOR_NUMBER_S { WORD Group; BYTE Number; BYTE Reserved; } MI_PROCESSOR_NUMBER;
165
166	typedef VOID (__stdcall PGetCurrentProcessorNumberEx)(MI_PROCESSOR_NUMBER ProcNumber);
167	typedef BOOL (__stdcall PGetNumaProcessorNodeEx)(MI_PROCESSOR_NUMBER Processor, PUSHORT NodeNumber);
168	typedef BOOL (__stdcall* PGetNumaNodeProcessorMaskEx)(USHORT Node, PGROUP_AFFINITY ProcessorMask);
169	static PGetCurrentProcessorNumberEx pGetCurrentProcessorNumberEx = NULL;
170	static PGetNumaProcessorNodeEx pGetNumaProcessorNodeEx = NULL;
171	static PGetNumaNodeProcessorMaskEx pGetNumaNodeProcessorMaskEx = NULL;
172
173	static bool mi_win_enable_large_os_pages(void)
174	{
175	if (large_os_page_size > `0`) return true;
176
177	// Try to see if large OS pages are supported
178	// To use large pages on Windows, we first need access permission
179	// Set "Lock pages in memory" permission in the group policy editor
180	// <https://devblogs.microsoft.com/oldnewthing/20110128-00/?p=11643>
181	unsigned long err = `0`;
182	HANDLE token = NULL;
183	BOOL ok = OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES \| TOKEN_QUERY, &token);
184	if (ok) {
185	TOKEN_PRIVILEGES tp;
186	ok = LookupPrivilegeValue(NULL, TEXT("SeLockMemoryPrivilege"), &tp.Privileges[`0`].Luid);
187	if (ok) {
188	tp.PrivilegeCount = `1`;
189	tp.Privileges[`0`].Attributes = SE_PRIVILEGE_ENABLED;
190	ok = AdjustTokenPrivileges(token, FALSE, &tp, `0`, (PTOKEN_PRIVILEGES)NULL, `0`);
191	if (ok) {
192	err = GetLastError();
193	ok = (err == ERROR_SUCCESS);
194	if (ok) {
195	large_os_page_size = GetLargePageMinimum();
196	}
197	}
198	}
199	CloseHandle(token);
200	}
201	if (!ok) {
202	if (err == `0`) err = GetLastError();
203	_mi_warning_message("cannot enable large OS page support, error %lu\n", err);
204	}
205	return (ok!=`0`);
206	}
207
208	void _mi_os_init(void)
209	{
210	os_overcommit = false;
211	// get the page size
212	SYSTEM_INFO si;
213	GetSystemInfo(&si);
214	if (si.dwPageSize > `0`) os_page_size = si.dwPageSize;
215	if (si.dwAllocationGranularity > `0`) os_alloc_granularity = si.dwAllocationGranularity;
216	// get the VirtualAlloc2 function
217	HINSTANCE hDll;
218	hDll = LoadLibrary(TEXT("kernelbase.dll"));
219	if (hDll != NULL) {
220	// use VirtualAlloc2FromApp if possible as it is available to Windows store apps
221	pVirtualAlloc2 = (PVirtualAlloc2)(void ()(void*))GetProcAddress(hDll, "VirtualAlloc2FromApp");
222	if (pVirtualAlloc2==NULL) pVirtualAlloc2 = (PVirtualAlloc2)(void ()(void*))GetProcAddress(hDll, "VirtualAlloc2");
223	FreeLibrary(hDll);
224	}
225	// NtAllocateVirtualMemoryEx is used for huge page allocation
226	hDll = LoadLibrary(TEXT("ntdll.dll"));
227	if (hDll != NULL) {
228	pNtAllocateVirtualMemoryEx = (PNtAllocateVirtualMemoryEx)(void ()(void*))GetProcAddress(hDll, "NtAllocateVirtualMemoryEx");
229	FreeLibrary(hDll);
230	}
231	// Try to use Win7+ numa API
232	hDll = LoadLibrary(TEXT("kernel32.dll"));
233	if (hDll != NULL) {
234	pGetCurrentProcessorNumberEx = (PGetCurrentProcessorNumberEx)(void ()(void*))GetProcAddress(hDll, "GetCurrentProcessorNumberEx");
235	pGetNumaProcessorNodeEx = (PGetNumaProcessorNodeEx)(void ()(void*))GetProcAddress(hDll, "GetNumaProcessorNodeEx");
236	pGetNumaNodeProcessorMaskEx = (PGetNumaNodeProcessorMaskEx)(void ()(void*))GetProcAddress(hDll, "GetNumaNodeProcessorMaskEx");
237	FreeLibrary(hDll);
238	}
239	if (mi_option_is_enabled(mi_option_large_os_pages) \|\| mi_option_is_enabled(mi_option_reserve_huge_os_pages)) {
240	mi_win_enable_large_os_pages();
241	}
242	}
243	#elif defined(__wasi__)
244	void _mi_os_init(void) {
245	os_overcommit = false;
246	os_page_size = `64`MI_KiB; // WebAssembly has a fixed page size: 64KiB*
247	os_alloc_granularity = `16`;
248	}
249
250	#else // generic unix
251
252	static void os_detect_overcommit(void) {
253	#if defined(__linux__)
254	int fd = open("/proc/sys/vm/overcommit_memory", O_RDONLY);
255	if (fd < `0`) return;
256	char buf[`32`];
257	ssize_t nread = read(fd, &buf, sizeof(buf));
258	close(fd);
259	// <https://www.kernel.org/doc/Documentation/vm/overcommit-accounting>
260	// 0: heuristic overcommit, 1: always overcommit, 2: never overcommit (ignore NORESERVE)
261	if (nread >= `1`) {
262	os_overcommit = (buf[`0`] == `'0'` \|\| buf[`0`] == `'1'`);
263	}
264	#elif defined(__FreeBSD__)
265	int val = `0`;
266	size_t olen = sizeof(val);
267	if (sysctlbyname("vm.overcommit", &val, &olen, NULL, `0`) == `0`) {
268	os_overcommit = (val != `0`);
269	}
270	#else
271	// default: overcommit is true
272	#endif
273	}
274
275	void _mi_os_init(void) {
276	// get the page size
277	long result = sysconf(_SC_PAGESIZE);
278	if (result > `0`) {
279	os_page_size = (size_t)result;
280	os_alloc_granularity = os_page_size;
281	}
282	large_os_page_size = `2`MI_MiB; // TODO: can we query the OS for this?*
283	os_detect_overcommit();
284	}
285	#endif
286
287
#if defined(MADV_NORMAL)
// Thin wrapper around `madvise` that hides the platform difference in the
// type of the address argument.
static int mi_madvise(void* addr, size_t length, int advice) {
  #if !defined(__sun)
  return madvise(addr, length, advice);
  #else
  return madvise((caddr_t)addr, length, advice);  // Solaris needs cast (issue #520)
  #endif
}
#endif
297
298
/* -----------------------------------------------------------
  aligned hinting
-------------------------------------------------------------- */
302
303	// On 64-bit systems, we can do efficient aligned allocation by using
304	// the 2TiB to 30TiB area to allocate those.
305	#if (MI_INTPTR_SIZE >= 8)
306	static mi_decl_cache_align _Atomic(uintptr_t)aligned_base;
307
308	// Return a MI_SEGMENT_SIZE aligned address that is probably available.
309	// If this returns NULL, the OS will determine the address but on some OS's that may not be
310	// properly aligned which can be more costly as it needs to be adjusted afterwards.
311	// For a size > 1GiB this always returns NULL in order to guarantee good ASLR randomization;
312	// (otherwise an initial large allocation of say 2TiB has a 50% chance to include (known) addresses
313	// in the middle of the 2TiB - 6TiB address range (see issue #372))
314
315	#define MI_HINT_BASE ((uintptr_t)2 << 40) // 2TiB start
316	#define MI_HINT_AREA ((uintptr_t)4 << 40) // upto 6TiB (since before win8 there is "only" 8TiB available to processes)
317	#define MI_HINT_MAX ((uintptr_t)30 << 40) // wrap after 30TiB (area after 32TiB is used for huge OS pages)
318
319	static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size)
320	{
321	if (try_alignment <= `1` \|\| try_alignment > MI_SEGMENT_SIZE) return NULL;
322	size = _mi_align_up(size, MI_SEGMENT_SIZE);
323	if (size > `1`MI_GiB) return* NULL; // guarantee the chance of fixed valid address is at most 1/(MI_HINT_AREA / 1<<30) = 1/4096.
324	#if (MI_SECURE>0)
325	size += MI_SEGMENT_SIZE; // put in `MI_SEGMENT_SIZE` virtual gaps between hinted blocks; this splits VLA's but increases guarded areas.
326	#endif
327
328	uintptr_t hint = mi_atomic_add_acq_rel(&aligned_base, size);
329	if (hint == `0` \|\| hint > MI_HINT_MAX) { // wrap or initialize
330	uintptr_t init = MI_HINT_BASE;
331	#if (MI_SECURE>0 \|\| MI_DEBUG==0) // security: randomize start of aligned allocations unless in debug mode
332	uintptr_t r = _mi_heap_random_next(mi_get_default_heap());
333	init = init + ((MI_SEGMENT_SIZE * ((r>>`17`) & `0xFFFFF`)) % MI_HINT_AREA); // (randomly 20 bits)4MiB == 0 to 4TiB*
334	#endif
335	uintptr_t expected = hint + size;
336	mi_atomic_cas_strong_acq_rel(&aligned_base, &expected, init);
337	hint = mi_atomic_add_acq_rel(&aligned_base, size); // this may still give 0 or > MI_HINT_MAX but that is ok, it is a hint after all
338	}
339	if (hint%try_alignment != `0`) return NULL;
340	return (void*)hint;
341	}
342	#else
// Variant for systems where MI_INTPTR_SIZE < 8: never provides a hint.
static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size) {
  MI_UNUSED(size);
  MI_UNUSED(try_alignment);
  return NULL;
}
347	#endif
348
/* -----------------------------------------------------------
  Free memory
-------------------------------------------------------------- */
352
353	static bool mi_os_mem_free(void* addr, size_t size, bool was_committed, mi_stats_t* stats)
354	{
355	if (addr == NULL \|\| size == `0`) return true; // \|\| _mi_os_is_huge_reserved(addr)
356	bool err = false;
357	#if defined(_WIN32)
358	DWORD errcode = `0`;
359	err = (VirtualFree(addr, `0`, MEM_RELEASE) == `0`);
360	if (err) { errcode = GetLastError(); }
361	if (errcode == ERROR_INVALID_ADDRESS) {
362	// In mi_os_mem_alloc_aligned the fallback path may have returned a pointer inside
363	// the memory region returned by VirtualAlloc; in that case we need to free using
364	// the start of the region.
365	MEMORY_BASIC_INFORMATION info = { `0` };
366	VirtualQuery(addr, &info, sizeof(info));
367	if (info.AllocationBase < addr && ((uint8_t)addr - (uint8_t)info.AllocationBase) < (ptrdiff_t)MI_SEGMENT_SIZE) {
368	errcode = `0`;
369	err = (VirtualFree(info.AllocationBase, `0`, MEM_RELEASE) == `0`);
370	if (err) { errcode = GetLastError(); }
371	}
372	}
373	if (errcode != `0`) {
374	_mi_warning_message("unable to release OS memory: error code 0x%x, addr: %p, size: %zu\n", errcode, addr, size);
375	}
376	#elif defined(MI_USE_SBRK) \|\| defined(__wasi__)
377	err = false; // sbrk heap cannot be shrunk
378	#else
379	err = (munmap(addr, size) == -`1`);
380	if (err) {
381	_mi_warning_message("unable to release OS memory: %s, addr: %p, size: %zu\n", strerror(errno), addr, size);
382	}
383	#endif
384	if (was_committed) { _mi_stat_decrease(&stats->committed, size); }
385	_mi_stat_decrease(&stats->reserved, size);
386	return !err;
387	}
388
389
/* -----------------------------------------------------------
  Raw allocation on Windows (VirtualAlloc)
-------------------------------------------------------------- */
393
394	#ifdef _WIN32
395
396	#define MEM_COMMIT_RESERVE (MEM_COMMIT\|MEM_RESERVE)
397
398	static void* mi_win_virtual_allocx(void* addr, size_t size, size_t try_alignment, DWORD flags) {
399	#if (MI_INTPTR_SIZE >= 8)
400	// on 64-bit systems, try to use the virtual address area after 2TiB for 4MiB aligned allocations
401	if (addr == NULL) {
402	void* hint = mi_os_get_aligned_hint(try_alignment,size);
403	if (hint != NULL) {
404	void* p = VirtualAlloc(hint, size, flags, PAGE_READWRITE);
405	if (p != NULL) return p;
406	_mi_verbose_message("warning: unable to allocate hinted aligned OS memory (%zu bytes, error code: 0x%x, address: %p, alignment: %zu, flags: 0x%x)\n", size, GetLastError(), hint, try_alignment, flags);
407	// fall through on error
408	}
409	}
410	#endif
411	// on modern Windows try use VirtualAlloc2 for aligned allocation
412	if (try_alignment > `1` && (try_alignment % _mi_os_page_size()) == `0` && pVirtualAlloc2 != NULL) {
413	MI_MEM_ADDRESS_REQUIREMENTS reqs = { `0`, `0`, `0` };
414	reqs.Alignment = try_alignment;
415	MI_MEM_EXTENDED_PARAMETER param = { {`0`, `0`}, {`0`} };
416	param.Type.Type = MiMemExtendedParameterAddressRequirements;
417	param.Arg.Pointer = &reqs;
418	void* p = (*pVirtualAlloc2)(GetCurrentProcess(), addr, size, flags, PAGE_READWRITE, &param, `1`);
419	if (p != NULL) return p;
420	_mi_warning_message("unable to allocate aligned OS memory (%zu bytes, error code: 0x%x, address: %p, alignment: %zu, flags: 0x%x)\n", size, GetLastError(), addr, try_alignment, flags);
421	// fall through on error
422	}
423	// last resort
424	return VirtualAlloc(addr, size, flags, PAGE_READWRITE);
425	}
426
427	static void* mi_win_virtual_alloc(void* addr, size_t size, size_t try_alignment, DWORD flags, bool large_only, bool allow_large, bool* is_large) {
428	mi_assert_internal(!(large_only && !allow_large));
429	static _Atomic(size_t) large_page_try_ok; // = 0;
430	void* p = NULL;
431	// Try to allocate large OS pages (2MiB) if allowed or required.
432	if ((large_only \|\| use_large_os_page(size, try_alignment))
433	&& allow_large && (flags&MEM_COMMIT)!=`0` && (flags&MEM_RESERVE)!=`0`) {
434	size_t try_ok = mi_atomic_load_acquire(&large_page_try_ok);
435	if (!large_only && try_ok > `0`) {
436	// if a large page allocation fails, it seems the calls to VirtualAlloc get very expensive.
437	// therefore, once a large page allocation failed, we don't try again for `large_page_try_ok` times.
438	mi_atomic_cas_strong_acq_rel(&large_page_try_ok, &try_ok, try_ok - `1`);
439	}
440	else {
441	// large OS pages must always reserve and commit.
442	*is_large = true;
443	p = mi_win_virtual_allocx(addr, size, try_alignment, flags \| MEM_LARGE_PAGES);
444	if (large_only) return p;
445	// fall back to non-large page allocation on error (`p == NULL`).
446	if (p == NULL) {
447	mi_atomic_store_release(&large_page_try_ok,`10UL`); // on error, don't try again for the next N allocations
448	}
449	}
450	}
451	// Fall back to regular page allocation
452	if (p == NULL) {
453	*is_large = ((flags&MEM_LARGE_PAGES) != `0`);
454	p = mi_win_virtual_allocx(addr, size, try_alignment, flags);
455	}
456	if (p == NULL) {
457	_mi_warning_message("unable to allocate OS memory (%zu bytes, error code: 0x%x, address: %p, alignment: %zu, flags: 0x%x, large only: %d, allow large: %d)\n", size, GetLastError(), addr, try_alignment, flags, large_only, allow_large);
458	}
459	return p;
460	}
461
/* -----------------------------------------------------------
  Raw allocation using `sbrk` or `wasm_memory_grow`
-------------------------------------------------------------- */
465
466	#elif defined(MI_USE_SBRK) \|\| defined(__wasi__)
// Grow the heap by `size` bytes and return the start of the new area,
// or NULL on failure. A `size` of 0 returns the current end of the heap.
#if defined(MI_USE_SBRK)
  static void* mi_memory_grow( size_t size ) {
    void* p = sbrk(size);
    if (p == (void*)(-1)) return NULL;
    #if !defined(__wasi__) // on wasi this is always zero initialized already (?)
    memset(p,0,size);
    #endif
    return p;
  }
#elif defined(__wasi__)
  static void* mi_memory_grow( size_t size ) {
    // both builtins work in units of pages; convert the result back to a byte address
    size_t base = (size > 0 ? __builtin_wasm_memory_grow(0,_mi_divide_up(size, _mi_os_page_size()))
                            : __builtin_wasm_memory_size(0));
    if (base == SIZE_MAX) return NULL;
    return (void*)(base * _mi_os_page_size());
  }
#endif
484
#if defined(MI_USE_PTHREADS)
// Serializes the calls to `mi_memory_grow` made by `mi_heap_grow`.
static pthread_mutex_t mi_heap_grow_mutex = PTHREAD_MUTEX_INITIALIZER;
#endif

// Allocate `size` bytes by growing the heap, trying to honor `try_alignment`.
// Returns NULL (setting `errno` to ENOMEM and emitting a warning) when the
// heap cannot be grown or no suitably aligned range could be obtained.
static void* mi_heap_grow(size_t size, size_t try_alignment) {
  void* p = NULL;
  if (try_alignment <= 1) {
    // `sbrk` is not thread safe in general so try to protect it (we could skip this on WASM but leave it in for now)
    #if defined(MI_USE_PTHREADS)
    pthread_mutex_lock(&mi_heap_grow_mutex);
    #endif
    p = mi_memory_grow(size);
    #if defined(MI_USE_PTHREADS)
    pthread_mutex_unlock(&mi_heap_grow_mutex);
    #endif
  }
  else {
    void* base = NULL;
    size_t alloc_size = 0;
    // to allocate aligned use a lock to try to avoid thread interaction
    // between getting the current size and actual allocation
    // (also, `sbrk` is not thread safe in general)
    #if defined(MI_USE_PTHREADS)
    pthread_mutex_lock(&mi_heap_grow_mutex);
    #endif
    {
      void* current = mi_memory_grow(0);  // get current size
      if (current != NULL) {
        void* aligned_current = mi_align_up_ptr(current, try_alignment);  // and align from there to minimize wasted space
        alloc_size = _mi_align_up( ((uint8_t*)aligned_current - (uint8_t*)current) + size, _mi_os_page_size());
        base = mi_memory_grow(alloc_size);
      }
    }
    #if defined(MI_USE_PTHREADS)
    pthread_mutex_unlock(&mi_heap_grow_mutex);
    #endif
    if (base != NULL) {
      p = mi_align_up_ptr(base, try_alignment);
      if ((uint8_t*)p + size > (uint8_t*)base + alloc_size) {
        // another thread used wasm_memory_grow/sbrk in-between and we do not have enough
        // space after alignment. Give up (and waste the space as we cannot shrink :-( )
        // (in `mi_os_mem_alloc_aligned` this will fall back to overallocation to align)
        p = NULL;
      }
    }
  }
  if (p == NULL) {
    _mi_warning_message("unable to allocate sbrk/wasm_memory_grow OS memory (%zu bytes, %zu alignment)\n", size, try_alignment);
    errno = ENOMEM;
    return NULL;
  }
  mi_assert_internal( try_alignment == 0 || (uintptr_t)p % try_alignment == 0 );
  return p;
}
539
/* -----------------------------------------------------------
  Raw allocation on Unix's (mmap)
-------------------------------------------------------------- */
543	#else
#define MI_OS_USE_MMAP
// Map `size` bytes with `mmap`, first trying platform-specific ways to get an
// address aligned to `try_alignment` (when `addr` is NULL) and then falling
// back to a regular `mmap`. Returns NULL (instead of MAP_FAILED) on failure.
// The result is not guaranteed to be aligned.
static void* mi_unix_mmapx(void* addr, size_t size, size_t try_alignment, int protect_flags, int flags, int fd) {
  MI_UNUSED(try_alignment);
  #if defined(MAP_ALIGNED)  // BSD
  if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0) {
    size_t n = mi_bsr(try_alignment);
    if (((size_t)1 << n) == try_alignment && n >= 12 && n <= 30) {  // alignment is a power of 2 and 4096 <= alignment <= 1GiB
      // pass the alignment request for this attempt only: `flags` itself is left
      // untouched so the regular mmap below is a real fallback without it.
      void* p = mmap(addr, size, protect_flags, flags | MAP_ALIGNED(n), fd, 0);
      if (p!=MAP_FAILED) return p;
      // fall back to regular mmap
    }
  }
  #elif defined(MAP_ALIGN)  // Solaris
  if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0) {
    void* p = mmap((void*)try_alignment, size, protect_flags, flags | MAP_ALIGN, fd, 0);  // addr parameter is the required alignment
    if (p!=MAP_FAILED) return p;
    // fall back to regular mmap
  }
  #endif
  #if (MI_INTPTR_SIZE >= 8) && !defined(MAP_ALIGNED)
  // on 64-bit systems, use the virtual address area after 2TiB for 4MiB aligned allocations
  if (addr == NULL) {
    void* hint = mi_os_get_aligned_hint(try_alignment, size);
    if (hint != NULL) {
      void* p = mmap(hint, size, protect_flags, flags, fd, 0);
      if (p!=MAP_FAILED) return p;
      // fall back to regular mmap
    }
  }
  #endif
  // regular mmap
  void* p = mmap(addr, size, protect_flags, flags, fd, 0);
  if (p!=MAP_FAILED) return p;
  // failed to allocate
  return NULL;
}
581
// Returns the value passed as the `fd` argument of `mmap`: a VM tag when
// `VM_MAKE_TAG` is available, and -1 otherwise.
static int mi_unix_mmap_fd(void) {
#if defined(VM_MAKE_TAG)
  // macOS: tracking anonymous page with a specific ID. (All up to 98 are taken officially but LLVM sanitizers had taken 99)
  int os_tag = (int)mi_option_get(mi_option_os_tag);
  if (os_tag < 100 || os_tag > 255) os_tag = 100;
  return VM_MAKE_TAG(os_tag);
#else
  return -1;
#endif
}
592
593	static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int protect_flags, bool large_only, bool allow_large, bool* is_large) {
594	void* p = NULL;
595	#if !defined(MAP_ANONYMOUS)
596	#define MAP_ANONYMOUS MAP_ANON
597	#endif
598	#if !defined(MAP_NORESERVE)
599	#define MAP_NORESERVE 0
600	#endif
601	const int fd = mi_unix_mmap_fd();
602	int flags = MAP_PRIVATE \| MAP_ANONYMOUS;
603	if (_mi_os_has_overcommit()) {
604	flags \|= MAP_NORESERVE;
605	}
606	#if defined(PROT_MAX)
607	protect_flags \|= PROT_MAX(PROT_READ \| PROT_WRITE); // BSD
608	#endif
609	// huge page allocation
610	if ((large_only \|\| use_large_os_page(size, try_alignment)) && allow_large) {
611	static _Atomic(size_t) large_page_try_ok; // = 0;
612	size_t try_ok = mi_atomic_load_acquire(&large_page_try_ok);
613	if (!large_only && try_ok > `0`) {
614	// If the OS is not configured for large OS pages, or the user does not have
615	// enough permission, the `mmap` will always fail (but it might also fail for other reasons).
616	// Therefore, once a large page allocation failed, we don't try again for `large_page_try_ok` times
617	// to avoid too many failing calls to mmap.
618	mi_atomic_cas_strong_acq_rel(&large_page_try_ok, &try_ok, try_ok - `1`);
619	}
620	else {
621	int lflags = flags & ~MAP_NORESERVE; // using NORESERVE on huge pages seems to fail on Linux
622	int lfd = fd;
623	#ifdef MAP_ALIGNED_SUPER
624	lflags \|= MAP_ALIGNED_SUPER;
625	#endif
626	#ifdef MAP_HUGETLB
627	lflags \|= MAP_HUGETLB;
628	#endif
629	#ifdef MAP_HUGE_1GB
630	static bool mi_huge_pages_available = true;
631	if ((size % MI_GiB) == `0` && mi_huge_pages_available) {
632	lflags \|= MAP_HUGE_1GB;
633	}
634	else
635	#endif
636	{
637	#ifdef MAP_HUGE_2MB
638	lflags \|= MAP_HUGE_2MB;
639	#endif
640	}
641	#ifdef VM_FLAGS_SUPERPAGE_SIZE_2MB
642	lfd \|= VM_FLAGS_SUPERPAGE_SIZE_2MB;
643	#endif
644	if (large_only \|\| lflags != flags) {
645	// try large OS page allocation
646	*is_large = true;
647	p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, lflags, lfd);
648	#ifdef MAP_HUGE_1GB
649	if (p == NULL && (lflags & MAP_HUGE_1GB) != `0`) {
650	mi_huge_pages_available = false; // don't try huge 1GiB pages again
651	_mi_warning_message("unable to allocate huge (1GiB) page, trying large (2MiB) pages instead (error %i)\n", errno);
652	lflags = ((lflags & ~MAP_HUGE_1GB) \| MAP_HUGE_2MB);
653	p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, lflags, lfd);
654	}
655	#endif
656	if (large_only) return p;
657	if (p == NULL) {
658	mi_atomic_store_release(&large_page_try_ok, (size_t)`8`); // on error, don't try again for the next N allocations
659	}
660	}
661	}
662	}
663	// regular allocation
664	if (p == NULL) {
665	*is_large = false;
666	p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, flags, fd);
667	if (p != NULL) {
668	#if defined(MADV_HUGEPAGE)
669	// Many Linux systems don't allow MAP_HUGETLB but they support instead
670	// transparent huge pages (THP). Generally, it is not required to call `madvise` with MADV_HUGE
671	// though since properly aligned allocations will already use large pages if available
672	// in that case -- in particular for our large regions (in `memory.c`).
673	// However, some systems only allow THP if called with explicit `madvise`, so
674	// when large OS pages are enabled for mimalloc, we call `madvise` anyways.
675	if (allow_large && use_large_os_page(size, try_alignment)) {
676	if (mi_madvise(p, size, MADV_HUGEPAGE) == `0`) {
677	is_large = true; // possibly*
678	};
679	}
680	#elif defined(__sun)
681	if (allow_large && use_large_os_page(size, try_alignment)) {
682	struct memcntl_mha cmd = {`0`};
683	cmd.mha_pagesize = large_os_page_size;
684	cmd.mha_cmd = MHA_MAPSIZE_VA;
685	if (memcntl((caddr_t)p, size, MC_HAT_ADVISE, (caddr_t)&cmd, `0`, `0`) == `0`) {
686	*is_large = true;
687	}
688	}
689	#endif
690	}
691	}
692	if (p == NULL) {
693	_mi_warning_message("unable to allocate OS memory (%zu bytes, error code: %i, address: %p, large only: %d, allow large: %d)\n", size, errno, addr, large_only, allow_large);
694	}
695	return p;
696	}
697	#endif
698
699
/* -----------------------------------------------------------
  Primitive allocation from the OS.
-------------------------------------------------------------- */
703
704	// Note: the `try_alignment` is just a hint and the returned pointer is not guaranteed to be aligned.
705	static void* mi_os_mem_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, mi_stats_t* stats) {
706	mi_assert_internal(size > `0` && (size % _mi_os_page_size()) == `0`);
707	if (size == `0`) return NULL;
708	if (!commit) allow_large = false;
709	if (try_alignment == `0`) try_alignment = `1`; // avoid 0 to ensure there will be no divide by zero when aligning
710
711	void* p = NULL;
712	/*
713	if (commit && allow_large) {
714	p = _mi_os_try_alloc_from_huge_reserved(size, try_alignment);
715	if (p != NULL) {
716	*is_large = true;
717	return p;
718	}
719	}
720	*/
721
722	#if defined(_WIN32)
723	int flags = MEM_RESERVE;
724	if (commit) { flags \|= MEM_COMMIT; }
725	p = mi_win_virtual_alloc(NULL, size, try_alignment, flags, false, allow_large, is_large);
726	#elif defined(MI_USE_SBRK) \|\| defined(__wasi__)
727	MI_UNUSED(allow_large);
728	*is_large = false;
729	p = mi_heap_grow(size, try_alignment);
730	#else
731	int protect_flags = (commit ? (PROT_WRITE \| PROT_READ) : PROT_NONE);
732	p = mi_unix_mmap(NULL, size, try_alignment, protect_flags, false, allow_large, is_large);
733	#endif
734	mi_stat_counter_increase(stats->mmap_calls, `1`);
735	if (p != NULL) {
736	_mi_stat_increase(&stats->reserved, size);
737	if (commit) { _mi_stat_increase(&stats->committed, size); }
738	}
739	return p;
740	}
741
742
743	// Primitive aligned allocation from the OS.
744	// This function guarantees the allocated memory is aligned.
745	static void* mi_os_mem_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, bool* is_large, mi_stats_t* stats) {
746	mi_assert_internal(alignment >= _mi_os_page_size() && ((alignment & (alignment - `1`)) == `0`));
747	mi_assert_internal(size > `0` && (size % _mi_os_page_size()) == `0`);
748	mi_assert_internal(is_large != NULL);
749	if (!commit) allow_large = false;
750	if (!(alignment >= _mi_os_page_size() && ((alignment & (alignment - `1`)) == `0`))) return NULL;
751	size = _mi_align_up(size, _mi_os_page_size());
752
753	// try first with a hint (this will be aligned directly on Win 10+ or BSD)
754	void* p = mi_os_mem_alloc(size, alignment, commit, allow_large, is_large, stats);
755	if (p == NULL) return NULL;
756
757	// if not aligned, free it, overallocate, and unmap around it
758	if (((uintptr_t)p % alignment != `0`)) {
759	mi_os_mem_free(p, size, commit, stats);
760	_mi_warning_message("unable to allocate aligned OS memory directly, fall back to over-allocation (%zu bytes, address: %p, alignment: %zu, commit: %d)\n", size, p, alignment, commit);
761	if (size >= (SIZE_MAX - alignment)) return NULL; // overflow
762	const size_t over_size = size + alignment;
763
764	#if _WIN32
765	// over-allocate uncommitted (virtual) memory
766	p = mi_os_mem_alloc(over_size, `0` /alignment/, false / commit? /, false / allow_large /, is_large, stats);
767	if (p == NULL) return NULL;
768
769	// set p to the aligned part in the full region
770	// note: this is dangerous on Windows as VirtualFree needs the actual region pointer
771	// but in mi_os_mem_free we handle this (hopefully exceptional) situation.
772	p = mi_align_up_ptr(p, alignment);
773
774	// explicitly commit only the aligned part
775	if (commit) {
776	_mi_os_commit(p, size, NULL, stats);
777	}
778	#else
779	// overallocate...
780	p = mi_os_mem_alloc(over_size, `1`, commit, false, is_large, stats);
781	if (p == NULL) return NULL;
782	// and selectively unmap parts around the over-allocated area. (noop on sbrk)
783	void* aligned_p = mi_align_up_ptr(p, alignment);
784	size_t pre_size = (uint8_t)aligned_p - (uint8_t)p;
785	size_t mid_size = _mi_align_up(size, _mi_os_page_size());
786	size_t post_size = over_size - pre_size - mid_size;
787	mi_assert_internal(pre_size < over_size && post_size < over_size && mid_size >= size);
788	if (pre_size > `0`) mi_os_mem_free(p, pre_size, commit, stats);
789	if (post_size > `0`) mi_os_mem_free((uint8_t*)aligned_p + mid_size, post_size, commit, stats);
790	// we can return the aligned pointer on `mmap` (and sbrk) systems
791	p = aligned_p;
792	#endif
793	}
794
795	mi_assert_internal(p == NULL \|\| (p != NULL && ((uintptr_t)p % alignment) == `0`));
796	return p;
797	}
798
799
/* -----------------------------------------------------------
  OS API: alloc, free, alloc_aligned
----------------------------------------------------------- */
803
804	void* _mi_os_alloc(size_t size, mi_stats_t* tld_stats) {
805	MI_UNUSED(tld_stats);
806	mi_stats_t* stats = &_mi_stats_main;
807	if (size == `0`) return NULL;
808	size = _mi_os_good_alloc_size(size);
809	bool is_large = false;
810	return mi_os_mem_alloc(size, `0`, true, false, &is_large, stats);
811	}
812
813	void _mi_os_free_ex(void* p, size_t size, bool was_committed, mi_stats_t* tld_stats) {
814	MI_UNUSED(tld_stats);
815	mi_stats_t* stats = &_mi_stats_main;
816	if (size == `0` \|\| p == NULL) return;
817	size = _mi_os_good_alloc_size(size);
818	mi_os_mem_free(p, size, was_committed, stats);
819	}
820
821	void _mi_os_free(void* p, size_t size, mi_stats_t* stats) {
822	_mi_os_free_ex(p, size, true, stats);
823	}
824
825	void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* large, mi_stats_t* tld_stats)
826	{
827	MI_UNUSED(&mi_os_get_aligned_hint); // suppress unused warnings
828	MI_UNUSED(tld_stats);
829	if (size == `0`) return NULL;
830	size = _mi_os_good_alloc_size(size);
831	alignment = _mi_align_up(alignment, _mi_os_page_size());
832	bool allow_large = false;
833	if (large != NULL) {
834	allow_large = *large;
835	*large = false;
836	}
837	return mi_os_mem_alloc_aligned(size, alignment, commit, allow_large, (large!=NULL?large:&allow_large), &_mi_stats_main /tld->stats/ );
838	}
839
840
841
/* -----------------------------------------------------------
  OS memory API: reset, commit, decommit, protect, unprotect.
----------------------------------------------------------- */
845
846
// OS page align within a given area, either conservative (pages inside the area only),
// or not (straddling pages outside the area is possible).
// Returns the page-aligned start and stores the aligned length in `*newsize` (if non-NULL).
// Returns NULL with `*newsize == 0` when `addr` is NULL, `size` is 0, or no
// page-aligned range remains.
static void* mi_os_page_align_areax(bool conservative, void* addr, size_t size, size_t* newsize) {
  mi_assert(addr != NULL && size > 0);
  if (newsize != NULL) *newsize = 0;
  if (size == 0 || addr == NULL) return NULL;

  // conservative: shrink inward to page boundaries; otherwise grow outward
  void* start = (conservative ? mi_align_up_ptr(addr, _mi_os_page_size())
                              : mi_align_down_ptr(addr, _mi_os_page_size()));
  void* end   = (conservative ? mi_align_down_ptr((uint8_t*)addr + size, _mi_os_page_size())
                              : mi_align_up_ptr((uint8_t*)addr + size, _mi_os_page_size()));
  // subtract as byte pointers; casting to plain `uint8_t` would truncate the addresses
  ptrdiff_t diff = (uint8_t*)end - (uint8_t*)start;
  if (diff <= 0) return NULL;

  mi_assert_internal((conservative && (size_t)diff <= size) || (!conservative && (size_t)diff >= size));
  if (newsize != NULL) *newsize = (size_t)diff;
  return start;
}
866
// Page align to the pages that lie fully inside `[addr, addr+size)`.
static void* mi_os_page_align_area_conservative(void* addr, size_t size, size_t* newsize) {
  const bool conservative = true;
  return mi_os_page_align_areax(conservative, addr, size, newsize);
}
870
// After a failed protect/commit call, print an extra hint when the error was ENOMEM.
// Only active for mmap builds with MI_SECURE >= 2; otherwise `err` is unused.
static void mi_mprotect_hint(int err) {
#if defined(MI_OS_USE_MMAP) && (MI_SECURE>=2) // guard page around every mimalloc page
  if (err != ENOMEM) return;
  _mi_warning_message("the previous warning may have been caused by a low memory map limit.\n"
                      " On Linux this is controlled by the vm.max_map_count. For example:\n"
                      " > sudo sysctl -w vm.max_map_count=262144\n");
#else
  MI_UNUSED(err);
#endif
}
882
883	// Commit/Decommit memory.
884	// Usually commit is aligned liberal, while decommit is aligned conservative.
885	// (but not for the reset version where we want commit to be conservative as well)
886	static bool mi_os_commitx(void* addr, size_t size, bool commit, bool conservative, bool* is_zero, mi_stats_t* stats) {
887	// page align in the range, commit liberally, decommit conservative
888	if (is_zero != NULL) { *is_zero = false; }
889	size_t csize;
890	void* start = mi_os_page_align_areax(conservative, addr, size, &csize);
891	if (csize == `0`) return true; // \|\| _mi_os_is_huge_reserved(addr))
892	int err = `0`;
893	if (commit) {
894	_mi_stat_increase(&stats->committed, size); // use size for precise commit vs. decommit
895	_mi_stat_counter_increase(&stats->commit_calls, `1`);
896	}
897	else {
898	_mi_stat_decrease(&stats->committed, size);
899	}
900
901	#if defined(_WIN32)
902	if (commit) {
903	// is_zero = true; // note: if the memory was already committed, the call succeeds but the memory is not zero'd*
904	void* p = VirtualAlloc(start, csize, MEM_COMMIT, PAGE_READWRITE);
905	err = (p == start ? `0` : GetLastError());
906	}
907	else {
908	BOOL ok = VirtualFree(start, csize, MEM_DECOMMIT);
909	err = (ok ? `0` : GetLastError());
910	}
911	#elif defined(__wasi__)
912	// WebAssembly guests can't control memory protection
913	#elif 0 && defined(MAP_FIXED) && !defined(__APPLE__)
914	// Linux: disabled for now as mmap fixed seems much more expensive than MADV_DONTNEED (and splits VMA's?)
915	if (commit) {
916	// commit: just change the protection
917	err = mprotect(start, csize, (PROT_READ \| PROT_WRITE));
918	if (err != `0`) { err = errno; }
919	}
920	else {
921	// decommit: use mmap with MAP_FIXED to discard the existing memory (and reduce rss)
922	const int fd = mi_unix_mmap_fd();
923	void* p = mmap(start, csize, PROT_NONE, (MAP_FIXED \| MAP_PRIVATE \| MAP_ANONYMOUS \| MAP_NORESERVE), fd, `0`);
924	if (p != start) { err = errno; }
925	}
926	#else
927	// Linux, macOSX and others.
928	if (commit) {
929	// commit: ensure we can access the area
930	err = mprotect(start, csize, (PROT_READ \| PROT_WRITE));
931	if (err != `0`) { err = errno; }
932	}
933	else {
934	#if defined(MADV_DONTNEED) && MI_DEBUG == 0 && MI_SECURE == 0
935	// decommit: use MADV_DONTNEED as it decreases rss immediately (unlike MADV_FREE)
936	// (on the other hand, MADV_FREE would be good enough.. it is just not reflected in the stats :-( )
937	err = madvise(start, csize, MADV_DONTNEED);
938	#else
939	// decommit: just disable access (also used in debug and secure mode to trap on illegal access)
940	err = mprotect(start, csize, PROT_NONE);
941	if (err != `0`) { err = errno; }
942	#endif
943	//#if defined(MADV_FREE_REUSE)
944	// while ((err = mi_madvise(start, csize, MADV_FREE_REUSE)) != 0 && errno == EAGAIN) { errno = 0; }
945	//#endif
946	}
947	#endif
948	if (err != `0`) {
949	_mi_warning_message("%s error: start: %p, csize: 0x%zx, err: %i\n", commit ? "commit" : "decommit", start, csize, err);
950	mi_mprotect_hint(err);
951	}
952	mi_assert_internal(err == `0`);
953	return (err == `0`);
954	}
955
956	bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats) {
957	MI_UNUSED(tld_stats);
958	mi_stats_t* stats = &_mi_stats_main;
959	return mi_os_commitx(addr, size, true, false / liberal /, is_zero, stats);
960	}
961
962	bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* tld_stats) {
963	MI_UNUSED(tld_stats);
964	mi_stats_t* stats = &_mi_stats_main;
965	bool is_zero;
966	return mi_os_commitx(addr, size, false, true / conservative /, &is_zero, stats);
967	}
968
/*
static bool mi_os_commit_unreset(void* addr, size_t size, bool* is_zero, mi_stats_t* stats) {
  return mi_os_commitx(addr, size, true, true // conservative
    , is_zero, stats);
}
*/
975
976	// Signal to the OS that the address range is no longer in use
977	// but may be used later again. This will release physical memory
978	// pages and reduce swapping while keeping the memory committed.
979	// We page align to a conservative area inside the range to reset.
980	static bool mi_os_resetx(void* addr, size_t size, bool reset, mi_stats_t* stats) {
981	// page align conservatively within the range
982	size_t csize;
983	void* start = mi_os_page_align_area_conservative(addr, size, &csize);
984	if (csize == `0`) return true; // \|\| _mi_os_is_huge_reserved(addr)
985	if (reset) _mi_stat_increase(&stats->reset, csize);
986	else _mi_stat_decrease(&stats->reset, csize);
987	if (!reset) return true; // nothing to do on unreset!
988
989	#if (MI_DEBUG>1) && !MI_TRACK_ENABLED
990	if (MI_SECURE==`0`) {
991	memset(start, `0`, csize); // pretend it is eagerly reset
992	}
993	#endif
994
995	#if defined(_WIN32)
996	// Testing shows that for us (on `malloc-large`) MEM_RESET is 2x faster than DiscardVirtualMemory
997	void* p = VirtualAlloc(start, csize, MEM_RESET, PAGE_READWRITE);
998	mi_assert_internal(p == start);
999	#if 1
1000	if (p == start && start != NULL) {
1001	VirtualUnlock(start,csize); // VirtualUnlock after MEM_RESET removes the memory from the working set
1002	}
1003	#endif
1004	if (p != start) return false;
1005	#else
1006	#if defined(MADV_FREE)
1007	static _Atomic(size_t) advice = MI_ATOMIC_VAR_INIT(MADV_FREE);
1008	int oadvice = (int)mi_atomic_load_relaxed(&advice);
1009	int err;
1010	while ((err = mi_madvise(start, csize, oadvice)) != `0` && errno == EAGAIN) { errno = `0`; };
1011	if (err != `0` && errno == EINVAL && oadvice == MADV_FREE) {
1012	// if MADV_FREE is not supported, fall back to MADV_DONTNEED from now on
1013	mi_atomic_store_release(&advice, (size_t)MADV_DONTNEED);
1014	err = mi_madvise(start, csize, MADV_DONTNEED);
1015	}
1016	#elif defined(__wasi__)
1017	int err = `0`;
1018	#else
1019	int err = mi_madvise(start, csize, MADV_DONTNEED);
1020	#endif
1021	if (err != `0`) {
1022	_mi_warning_message("madvise reset error: start: %p, csize: 0x%zx, errno: %i\n", start, csize, errno);
1023	}
1024	//mi_assert(err == 0);
1025	if (err != `0`) return false;
1026	#endif
1027	return true;
1028	}
1029
1030	// Signal to the OS that the address range is no longer in use
1031	// but may be used later again. This will release physical memory
1032	// pages and reduce swapping while keeping the memory committed.
1033	// We page align to a conservative area inside the range to reset.
1034	bool _mi_os_reset(void* addr, size_t size, mi_stats_t* tld_stats) {
1035	MI_UNUSED(tld_stats);
1036	mi_stats_t* stats = &_mi_stats_main;
1037	return mi_os_resetx(addr, size, true, stats);
1038	}
1039
/*
bool _mi_os_unreset(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats) {
  MI_UNUSED(tld_stats);
  mi_stats_t* stats = &_mi_stats_main;
  if (mi_option_is_enabled(mi_option_reset_decommits)) {
    return mi_os_commit_unreset(addr, size, is_zero, stats);  // re-commit it (conservatively!)
  }
  else {
    *is_zero = false;
    return mi_os_resetx(addr, size, false, stats);
  }
}
*/
1053
// Make the pages fully inside `[addr, addr+size)` inaccessible (`protect == true`)
// or readable and writable again (`protect == false`).
// Returns false when the range contains no full page or when the OS call fails
// (in which case a warning is printed).
static bool mi_os_protectx(void* addr, size_t size, bool protect) {
  // page align conservatively within the range
  size_t csize = 0;
  void* start = mi_os_page_align_area_conservative(addr, size, &csize);
  if (csize == 0) return false;
  /*
  if (_mi_os_is_huge_reserved(addr)) {
	  _mi_warning_message("cannot mprotect memory allocated in huge OS pages\n");
  }
  */
  int err = 0;
#ifdef _WIN32
  DWORD oldprotect = 0;
  const DWORD newprotect = (protect ? PAGE_NOACCESS : PAGE_READWRITE);
  if (!VirtualProtect(start, csize, newprotect, &oldprotect)) {
    err = GetLastError();
  }
#elif defined(__wasi__)
  err = 0;
#else
  const int prot = (protect ? PROT_NONE : (PROT_READ | PROT_WRITE));
  if (mprotect(start, csize, prot) != 0) {
    err = errno;
  }
#endif
  if (err == 0) return true;

  _mi_warning_message("mprotect error: start: %p, csize: 0x%zx, err: %i\n", start, csize, err);
  mi_mprotect_hint(err);
  return false;
}
1082
// Remove all access to the full pages inside `[addr, addr+size)`.
bool _mi_os_protect(void* addr, size_t size) {
  const bool protect = true;
  return mi_os_protectx(addr, size, protect);
}
1086
// Restore read/write access to the full pages inside `[addr, addr+size)`.
bool _mi_os_unprotect(void* addr, size_t size) {
  const bool protect = false;
  return mi_os_protectx(addr, size, protect);
}
1090
1091
1092
1093	bool _mi_os_shrink(void* p, size_t oldsize, size_t newsize, mi_stats_t* stats) {
1094	// page align conservatively within the range
1095	mi_assert_internal(oldsize > newsize && p != NULL);
1096	if (oldsize < newsize \|\| p == NULL) return false;
1097	if (oldsize == newsize) return true;
1098
1099	// oldsize and newsize should be page aligned or we cannot shrink precisely
1100	void* addr = (uint8_t*)p + newsize;
1101	size_t size = `0`;
1102	void* start = mi_os_page_align_area_conservative(addr, oldsize - newsize, &size);
1103	if (size == `0` \|\| start != addr) return false;
1104
1105	#ifdef _WIN32
1106	// we cannot shrink on windows, but we can decommit
1107	return _mi_os_decommit(start, size, stats);
1108	#else
1109	return mi_os_mem_free(start, size, true, stats);
1110	#endif
1111	}
1112
1113
/* ----------------------------------------------------------------------------
Support for allocating huge OS pages (1GiB) that are reserved up-front
and possibly associated with a specific NUMA node. (use `numa_node>=0`)
-----------------------------------------------------------------------------*/
1118	#define MI_HUGE_OS_PAGE_SIZE (MI_GiB)
1119
1120	#if defined(_WIN32) && (MI_INTPTR_SIZE >= 8)
1121	static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node)
1122	{
1123	mi_assert_internal(size%MI_GiB == `0`);
1124	mi_assert_internal(addr != NULL);
1125	const DWORD flags = MEM_LARGE_PAGES \| MEM_COMMIT \| MEM_RESERVE;
1126
1127	mi_win_enable_large_os_pages();
1128
1129	MI_MEM_EXTENDED_PARAMETER params[`3`] = { {{`0`,`0`},{`0`}},{{`0`,`0`},{`0`}},{{`0`,`0`},{`0`}} };
1130	// on modern Windows try use NtAllocateVirtualMemoryEx for 1GiB huge pages
1131	static bool mi_huge_pages_available = true;
1132	if (pNtAllocateVirtualMemoryEx != NULL && mi_huge_pages_available) {
1133	params[`0`].Type.Type = MiMemExtendedParameterAttributeFlags;
1134	params[`0`].Arg.ULong64 = MI_MEM_EXTENDED_PARAMETER_NONPAGED_HUGE;
1135	ULONG param_count = `1`;
1136	if (numa_node >= `0`) {
1137	param_count++;
1138	params[`1`].Type.Type = MiMemExtendedParameterNumaNode;
1139	params[`1`].Arg.ULong = (unsigned)numa_node;
1140	}
1141	SIZE_T psize = size;
1142	void* base = addr;
1143	NTSTATUS err = (*pNtAllocateVirtualMemoryEx)(GetCurrentProcess(), &base, &psize, flags, PAGE_READWRITE, params, param_count);
1144	if (err == `0` && base != NULL) {
1145	return base;
1146	}
1147	else {
1148	// fall back to regular large pages
1149	mi_huge_pages_available = false; // don't try further huge pages
1150	_mi_warning_message("unable to allocate using huge (1GiB) pages, trying large (2MiB) pages instead (status 0x%lx)\n", err);
1151	}
1152	}
1153	// on modern Windows try use VirtualAlloc2 for numa aware large OS page allocation
1154	if (pVirtualAlloc2 != NULL && numa_node >= `0`) {
1155	params[`0`].Type.Type = MiMemExtendedParameterNumaNode;
1156	params[`0`].Arg.ULong = (unsigned)numa_node;
1157	return (*pVirtualAlloc2)(GetCurrentProcess(), addr, size, flags, PAGE_READWRITE, params, `1`);
1158	}
1159
1160	// otherwise use regular virtual alloc on older windows
1161	return VirtualAlloc(addr, size, flags, PAGE_READWRITE);
1162	}
1163
1164	#elif defined(MI_OS_USE_MMAP) && (MI_INTPTR_SIZE >= 8) && !defined(__HAIKU__)
#include <sys/syscall.h>
#ifndef MPOL_PREFERRED
#define MPOL_PREFERRED 1
#endif
// Thin wrapper around the raw `mbind` system call.
// When `SYS_mbind` is not defined this does nothing and returns 0.
static long mi_os_mbind(void* start, unsigned long len, unsigned long mode, const unsigned long* nmask, unsigned long maxnode, unsigned flags) {
#if defined(SYS_mbind)
  return syscall(SYS_mbind, start, len, mode, nmask, maxnode, flags);
#else
  MI_UNUSED(start); MI_UNUSED(len); MI_UNUSED(mode); MI_UNUSED(nmask); MI_UNUSED(maxnode); MI_UNUSED(flags);
  return 0;
#endif
}
1179	static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) {
1180	mi_assert_internal(size%MI_GiB == `0`);
1181	bool is_large = true;
1182	void* p = mi_unix_mmap(addr, size, MI_SEGMENT_SIZE, PROT_READ \| PROT_WRITE, true, true, &is_large);
1183	if (p == NULL) return NULL;
1184	if (numa_node >= `0` && numa_node < `8`MI_INTPTR_SIZE) { // at most 64 nodes*
1185	unsigned long numa_mask = (`1UL` << numa_node);
1186	// TODO: does `mbind` work correctly for huge OS pages? should we
1187	// use `set_mempolicy` before calling mmap instead?
1188	// see: <https://lkml.org/lkml/2017/2/9/875>
1189	long err = mi_os_mbind(p, size, MPOL_PREFERRED, &numa_mask, `8`*MI_INTPTR_SIZE, `0`);
1190	if (err != `0`) {
1191	_mi_warning_message("failed to bind huge (1GiB) pages to numa node %d: %s\n", numa_node, strerror(errno));
1192	}
1193	}
1194	return p;
1195	}
1196	#else
// Fallback for platforms without huge OS page support: always fails with NULL.
static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) {
  MI_UNUSED(addr);
  MI_UNUSED(size);
  MI_UNUSED(numa_node);
  return NULL;
}
1201	#endif
1202
1203	#if (MI_INTPTR_SIZE >= 8)
1204	// To ensure proper alignment, use our own area for huge OS pages
1205	static mi_decl_cache_align _Atomic(uintptr_t) mi_huge_start; // = 0
1206
1207	// Claim an aligned address range for huge pages
1208	static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) {
1209	if (total_size != NULL) *total_size = `0`;
1210	const size_t size = pages * MI_HUGE_OS_PAGE_SIZE;
1211
1212	uintptr_t start = `0`;
1213	uintptr_t end = `0`;
1214	uintptr_t huge_start = mi_atomic_load_relaxed(&mi_huge_start);
1215	do {
1216	start = huge_start;
1217	if (start == `0`) {
1218	// Initialize the start address after the 32TiB area
1219	start = ((uintptr_t)`32` << `40`); // 32TiB virtual start address
1220	#if (MI_SECURE>0 \|\| MI_DEBUG==0) // security: randomize start of huge pages unless in debug mode
1221	uintptr_t r = _mi_heap_random_next(mi_get_default_heap());
1222	start = start + ((uintptr_t)MI_HUGE_OS_PAGE_SIZE * ((r>>`17`) & `0x0FFF`)); // (randomly 12bits)1GiB == between 0 to 4TiB*
1223	#endif
1224	}
1225	end = start + size;
1226	mi_assert_internal(end % MI_SEGMENT_SIZE == `0`);
1227	} while (!mi_atomic_cas_strong_acq_rel(&mi_huge_start, &huge_start, end));
1228
1229	if (total_size != NULL) *total_size = size;
1230	return (uint8_t*)start;
1231	}
1232	#else
// Fallback when pointers are narrower than 64 bits: nothing can be claimed.
// Sets `*total_size` (if non-NULL) to 0 and returns NULL.
static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) {
  MI_UNUSED(pages);
  if (total_size != NULL) { *total_size = 0; }
  return NULL;
}
1238	#endif
1239
1240	// Allocate MI_SEGMENT_SIZE aligned huge pages
1241	void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_msecs, size_t* pages_reserved, size_t* psize) {
1242	if (psize != NULL) *psize = `0`;
1243	if (pages_reserved != NULL) *pages_reserved = `0`;
1244	size_t size = `0`;
1245	uint8_t* start = mi_os_claim_huge_pages(pages, &size);
1246	if (start == NULL) return NULL; // or 32-bit systems
1247
1248	// Allocate one page at the time but try to place them contiguously
1249	// We allocate one page at the time to be able to abort if it takes too long
1250	// or to at least allocate as many as available on the system.
1251	mi_msecs_t start_t = _mi_clock_start();
1252	size_t page;
1253	for (page = `0`; page < pages; page++) {
1254	// allocate a page
1255	void* addr = start + (page * MI_HUGE_OS_PAGE_SIZE);
1256	void* p = mi_os_alloc_huge_os_pagesx(addr, MI_HUGE_OS_PAGE_SIZE, numa_node);
1257
1258	// Did we succeed at a contiguous address?
1259	if (p != addr) {
1260	// no success, issue a warning and break
1261	if (p != NULL) {
1262	_mi_warning_message("could not allocate contiguous huge page %zu at %p\n", page, addr);
1263	_mi_os_free(p, MI_HUGE_OS_PAGE_SIZE, &_mi_stats_main);
1264	}
1265	break;
1266	}
1267
1268	// success, record it
1269	_mi_stat_increase(&_mi_stats_main.committed, MI_HUGE_OS_PAGE_SIZE);
1270	_mi_stat_increase(&_mi_stats_main.reserved, MI_HUGE_OS_PAGE_SIZE);
1271
1272	// check for timeout
1273	if (max_msecs > `0`) {
1274	mi_msecs_t elapsed = _mi_clock_end(start_t);
1275	if (page >= `1`) {
1276	mi_msecs_t estimate = ((elapsed / (page+`1`)) * pages);
1277	if (estimate > `2`max_msecs) { // seems like we are going to timeout, break*
1278	elapsed = max_msecs + `1`;
1279	}
1280	}
1281	if (elapsed > max_msecs) {
1282	_mi_warning_message("huge page allocation timed out\n");
1283	break;
1284	}
1285	}
1286	}
1287	mi_assert_internal(page*MI_HUGE_OS_PAGE_SIZE <= size);
1288	if (pages_reserved != NULL) { *pages_reserved = page; }
1289	if (psize != NULL) { psize = page MI_HUGE_OS_PAGE_SIZE; }
1290	return (page == `0` ? NULL : start);
1291	}
1292
1293	// free every huge page in a range individually (as we allocated per page)
1294	// note: needed with VirtualAlloc but could potentially be done in one go on mmap'd systems.
1295	void _mi_os_free_huge_pages(void* p, size_t size, mi_stats_t* stats) {
1296	if (p==NULL \|\| size==`0`) return;
1297	uint8_t* base = (uint8_t*)p;
1298	while (size >= MI_HUGE_OS_PAGE_SIZE) {
1299	_mi_os_free(base, MI_HUGE_OS_PAGE_SIZE, stats);
1300	size -= MI_HUGE_OS_PAGE_SIZE;
1301	base += MI_HUGE_OS_PAGE_SIZE;
1302	}
1303	}
1304
/* ----------------------------------------------------------------------------
Support NUMA aware allocation
-----------------------------------------------------------------------------*/
1308	#ifdef _WIN32
1309	static size_t mi_os_numa_nodex(void) {
1310	USHORT numa_node = `0`;
1311	if (pGetCurrentProcessorNumberEx != NULL && pGetNumaProcessorNodeEx != NULL) {
1312	// Extended API is supported
1313	MI_PROCESSOR_NUMBER pnum;
1314	(*pGetCurrentProcessorNumberEx)(&pnum);
1315	USHORT nnode = `0`;
1316	BOOL ok = (*pGetNumaProcessorNodeEx)(&pnum, &nnode);
1317	if (ok) numa_node = nnode;
1318	}
1319	else {
1320	// Vista or earlier, use older API that is limited to 64 processors. Issue #277
1321	DWORD pnum = GetCurrentProcessorNumber();
1322	UCHAR nnode = `0`;
1323	BOOL ok = GetNumaProcessorNode((UCHAR)pnum, &nnode);
1324	if (ok) numa_node = nnode;
1325	}
1326	return numa_node;
1327	}
1328
1329	static size_t mi_os_numa_node_countx(void) {
1330	ULONG numa_max = `0`;
1331	GetNumaHighestNodeNumber(&numa_max);
1332	// find the highest node number that has actual processors assigned to it. Issue #282
1333	while(numa_max > `0`) {
1334	if (pGetNumaNodeProcessorMaskEx != NULL) {
1335	// Extended API is supported
1336	GROUP_AFFINITY affinity;
1337	if ((*pGetNumaNodeProcessorMaskEx)((USHORT)numa_max, &affinity)) {
1338	if (affinity.Mask != `0`) break; // found the maximum non-empty node
1339	}
1340	}
1341	else {
1342	// Vista or earlier, use older API that is limited to 64 processors.
1343	ULONGLONG mask;
1344	if (GetNumaNodeProcessorMask((UCHAR)numa_max, &mask)) {
1345	if (mask != `0`) break; // found the maximum non-empty node
1346	};
1347	}
1348	// max node was invalid or had no processor assigned, try again
1349	numa_max--;
1350	}
1351	return ((size_t)numa_max + `1`);
1352	}
1353	#elif defined(__linux__)
1354	#include <sys/syscall.h> // getcpu
1355	#include <stdio.h> // access
1356
// Linux: NUMA node of the CPU the calling thread currently runs on, obtained
// with the raw `getcpu` system call. Returns 0 when the call is unavailable or fails.
static size_t mi_os_numa_nodex(void) {
#ifdef SYS_getcpu
  // getcpu stores `unsigned int` values; using a wider type here would read
  // back a wrong node on 64-bit big-endian targets.
  unsigned int node = 0;
  unsigned int ncpu = 0;
  long err = syscall(SYS_getcpu, &ncpu, &node, NULL);
  if (err != 0) return 0;
  return node;
#else
  return 0;
#endif
}
// Linux: count NUMA nodes by probing `/sys/devices/system/node/node<N>` for
// N = 1, 2, ... until an entry is not readable. Uses a stack buffer only.
static size_t mi_os_numa_node_countx(void) {
  char path[128];
  unsigned node = 0;
  while (node < 256) {
    // enumerate node entries -- todo: it there a more efficient way to do this? (but ensure there is no allocation)
    snprintf(path, 127, "/sys/devices/system/node/node%u", node + 1);
    if (access(path,R_OK) != 0) break;
    node++;
  }
  return (node+1);
}
1378	#elif defined(__FreeBSD__) && __FreeBSD_version >= 1200000
1379	static size_t mi_os_numa_nodex(void) {
1380	domainset_t dom;
1381	size_t node;
1382	int policy;
1383	if (cpuset_getdomain(CPU_LEVEL_CPUSET, CPU_WHICH_PID, -`1`, sizeof(dom), &dom, &policy) == -`1`) return `0ul`;
1384	for (node = `0`; node < MAXMEMDOM; node++) {
1385	if (DOMAINSET_ISSET(node, &dom)) return node;
1386	}
1387	return `0ul`;
1388	}
// FreeBSD: number of memory domains as reported by the `vm.ndomains` sysctl; 0 on failure.
static size_t mi_os_numa_node_countx(void) {
  size_t ndomains = 0;
  size_t len = sizeof(ndomains);
  const int rc = sysctlbyname("vm.ndomains", &ndomains, &len, NULL, 0);
  return (rc == -1 ? 0ul : ndomains);
}
1395	#elif defined(__DragonFly__)
// DragonFly: the current NUMA node is always reported as 0.
static size_t mi_os_numa_nodex(void) {
  // TODO: DragonFly does not seem to provide any userland means to get this information.
  const size_t node = 0ul;
  return node;
}
// DragonFly: product of the `hw.ncpu` and `hw.cpu_topology_ht_ids` sysctl values;
// 0 when either sysctl fails.
static size_t mi_os_numa_node_countx(void) {
  size_t ncpus = 0;
  size_t nvirtcoresperphys = 0;
  size_t len = sizeof(size_t);
  const bool have_ncpus = (sysctlbyname("hw.ncpu", &ncpus, &len, NULL, 0) != -1);
  if (!have_ncpus) return 0ul;
  const bool have_ht_ids = (sysctlbyname("hw.cpu_topology_ht_ids", &nvirtcoresperphys, &len, NULL, 0) != -1);
  if (!have_ht_ids) return 0ul;
  return nvirtcoresperphys * ncpus;
}
1407	#else
// Generic fallback: without NUMA support everything is on node 0.
static size_t mi_os_numa_nodex(void) {
  const size_t node = 0;
  return node;
}
// Generic fallback: without NUMA support there is exactly one node.
static size_t mi_os_numa_node_countx(void) {
  const size_t count = 1;
  return count;
}
1414	#endif
1415
1416	_Atomic(size_t) _mi_numa_node_count; // = 0 // cache the node count
1417
1418	size_t _mi_os_numa_node_count_get(void) {
1419	size_t count = mi_atomic_load_acquire(&_mi_numa_node_count);
1420	if (count <= `0`) {
1421	long ncount = mi_option_get(mi_option_use_numa_nodes); // given explicitly?
1422	if (ncount > `0`) {
1423	count = (size_t)ncount;
1424	}
1425	else {
1426	count = mi_os_numa_node_countx(); // or detect dynamically
1427	if (count == `0`) count = `1`;
1428	}
1429	mi_atomic_store_release(&_mi_numa_node_count, count); // save it
1430	_mi_verbose_message("using %zd numa regions\n", count);
1431	}
1432	return count;
1433	}
1434
1435	int _mi_os_numa_node_get(mi_os_tld_t* tld) {
1436	MI_UNUSED(tld);
1437	size_t numa_count = _mi_os_numa_node_count();
1438	if (numa_count<=`1`) return `0`; // optimize on single numa node systems: always node 0
1439	// never more than the node count and >= 0
1440	size_t numa_node = mi_os_numa_nodex();
1441	if (numa_node >= numa_count) { numa_node = numa_node % numa_count; }
1442	return (int)numa_node;
1443	}
1444

Browse the source code of mimalloc/src/os.c