1 | #include <c10/core/Allocator.h> |
2 | #include <c10/core/CPUAllocator.h> |
3 | #include <c10/core/DeviceType.h> |
4 | #include <c10/core/alignment.h> |
5 | #include <c10/core/impl/alloc_cpu.h> |
6 | #include <c10/mobile/CPUCachingAllocator.h> |
7 | #include <c10/mobile/CPUProfilingAllocator.h> |
8 | #include <c10/util/Logging.h> |
9 | |
// Command-line flag that enables verbose per-allocation logging in
// ProfiledCPUMemoryReporter below (every New/Delete/OutOfMemory is LOG(INFO)'d
// with a running total).
// TODO: rename flag to C10
C10_DEFINE_bool(
    caffe2_report_cpu_memory_usage,
    false,
    "If set, print out detailed memory usage");
15 | |
16 | namespace c10 { |
17 | |
18 | struct C10_API DefaultCPUAllocator final : at::Allocator { |
19 | DefaultCPUAllocator() = default; |
20 | at::DataPtr allocate(size_t nbytes) const override { |
21 | void* data = nullptr; |
22 | try { |
23 | data = c10::alloc_cpu(nbytes); |
24 | } catch (c10::Error& e) { |
25 | profiledCPUMemoryReporter().OutOfMemory(nbytes); |
26 | throw e; |
27 | } |
28 | profiledCPUMemoryReporter().New(data, nbytes); |
29 | return {data, data, &ReportAndDelete, at::Device(at::DeviceType::CPU)}; |
30 | } |
31 | |
32 | static void ReportAndDelete(void* ptr) { |
33 | if (!ptr) { |
34 | return; |
35 | } |
36 | profiledCPUMemoryReporter().Delete(ptr); |
37 | free_cpu(ptr); |
38 | } |
39 | |
40 | at::DeleterFnPtr raw_deleter() const override { |
41 | return &ReportAndDelete; |
42 | } |
43 | }; |
44 | |
45 | ProfiledCPUMemoryReporter& profiledCPUMemoryReporter() { |
46 | static ProfiledCPUMemoryReporter reporter_; |
47 | return reporter_; |
48 | } |
49 | |
// QNNPACK and XNNPACK may access the input and/or output tensors out of
// bounds. This is by-design, and chosen to make the implementation of
52 | // micro-kernels both simpler and faster as a result of not having to |
53 | // individually handle the corner cases where the number of processed elements |
54 | // is not a multiple of SIMD register width. This behavior will trigger ASAN |
55 | // though, and may result in a segfault if the accessed memory location just so |
56 | // happens to fall on a page the current process has no read access to. Here we |
57 | // define a custom allocator that allocates the extra storage required to keep |
58 | // this behavior safe. This allocator could have been restricted to QNNPACK and |
59 | // XNNPACK only, but that would have negative performance ramifications, as |
60 | // input tensors must now be reallocated, and copied over, if the tensor is not |
61 | // allocated with this allocator to begin with. Making this allocator the |
62 | // default on mobile builds minimizes the probability of unnecessary |
63 | // reallocations and copies, and also enables acceleration of operations where |
64 | // the output tensor is allocated outside of the function doing the |
65 | // implementation, wherein the implementation cannot simply re-allocate the |
66 | // output with the guarding allocator. |
67 | // |
68 | // PreGuardBytes: Number of guard bytes to allocate before the allocation. |
69 | // PostGuardBytes: Number of guard bytes to allocate after the allocation. |
70 | |
71 | template <uint32_t PreGuardBytes, uint32_t PostGuardBytes> |
72 | class DefaultMobileCPUAllocator final : public at::Allocator { |
73 | public: |
74 | DefaultMobileCPUAllocator() = default; |
75 | ~DefaultMobileCPUAllocator() override = default; |
76 | |
77 | static void deleter(void* const pointer) { |
78 | if (C10_UNLIKELY(!pointer)) { |
79 | return; |
80 | } |
81 | // TODO: enable with better TLS support on mobile |
82 | // profiledCPUMemoryReporter().Delete(pointer); |
83 | auto allocator_ptr = GetThreadLocalCachingAllocator(); |
84 | auto profiling_allocator_ptr = GetThreadLocalProfilingAllocator(); |
85 | if (allocator_ptr != nullptr) { |
86 | allocator_ptr->free(pointer); |
87 | } else if (profiling_allocator_ptr != nullptr) { |
88 | profiling_allocator_ptr->free(pointer); |
89 | } else { |
90 | c10::free_cpu(pointer); |
91 | // This adds extra cost to freeing memory to the default case when |
92 | // caching allocator is not enabled. |
93 | // NOLINTNEXTLINE(clang-analyzer-unix.Malloc) |
94 | CPUCachingAllocator::record_free(pointer); |
95 | auto allocation_planner = GetThreadLocalAllocationPlanner(); |
96 | if (allocation_planner != nullptr) { |
97 | allocation_planner->record_free(pointer); |
98 | } |
99 | } |
100 | } |
101 | |
102 | DataPtr allocate(const size_t nbytes) const override { |
103 | if (C10_UNLIKELY(0u == nbytes)) { |
104 | return { |
105 | nullptr, |
106 | nullptr, |
107 | &deleter, |
108 | at::Device(DeviceType::CPU), |
109 | }; |
110 | } |
111 | |
112 | auto alloc_size = PreGuardBytes + nbytes + PostGuardBytes; |
113 | // NOLINTNEXTLINE(cppcoreguidelines-init-variables) |
114 | void* data; |
115 | auto allocator_ptr = GetThreadLocalCachingAllocator(); |
116 | auto profiling_allocator_ptr = GetThreadLocalProfilingAllocator(); |
117 | if (allocator_ptr != nullptr) { |
118 | data = allocator_ptr->allocate(alloc_size); |
119 | } else if (profiling_allocator_ptr != nullptr) { |
120 | data = profiling_allocator_ptr->allocate(alloc_size); |
121 | } else { |
122 | try { |
123 | data = c10::alloc_cpu(alloc_size); |
124 | } catch (c10::Error& e) { |
125 | profiledCPUMemoryReporter().OutOfMemory(alloc_size); |
126 | throw e; |
127 | } |
128 | auto allocation_planner = GetThreadLocalAllocationPlanner(); |
129 | if (allocation_planner != nullptr) { |
130 | allocation_planner->record_allocation(alloc_size, data); |
131 | } |
132 | } |
133 | profiledCPUMemoryReporter().New(data, alloc_size); |
134 | return { |
135 | reinterpret_cast<uint8_t*>(data) + PreGuardBytes, |
136 | data, |
137 | &deleter, |
138 | at::Device(DeviceType::CPU), |
139 | }; |
140 | } |
141 | |
142 | DeleterFnPtr raw_deleter() const override { |
143 | return deleter; |
144 | } |
145 | }; |
146 | |
// Intentional no-op deleter for DataPtrs that wrap memory the tensor does not
// own.
void NoDelete(void*) {}
148 | |
// Returns whichever allocator is currently registered for DeviceType::CPU.
at::Allocator* GetCPUAllocator() {
  return GetAllocator(DeviceType::CPU);
}
152 | |
// Registers `alloc` for DeviceType::CPU with the given priority (priority
// semantics are those of c10::SetAllocator).
void SetCPUAllocator(at::Allocator* alloc, uint8_t priority) {
  SetAllocator(DeviceType::CPU, alloc, priority);
}
156 | |
// The Mobile CPU allocator must always be present even on non-mobile builds
// because QNNPACK and XNNPACK are not mobile specific.
//
// Pre-guard: 8 bytes for QNNPACK, but set to gAlignment to ensure SIMD
//            alignment, not on the allocated memory, but memory location
//            returned to the user.
// Post-guard: 16 bytes for XNNPACK.

// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,cppcoreguidelines-avoid-non-const-global-variables)
static DefaultMobileCPUAllocator<gAlignment, 16u> g_mobile_cpu_allocator;
167 | |
// Accessor for the process-wide guarded mobile CPU allocator above.
at::Allocator* GetDefaultMobileCPUAllocator() {
  return &g_mobile_cpu_allocator;
}
171 | |
#ifdef C10_MOBILE

// On mobile builds the guarded mobile allocator doubles as the default CPU
// allocator.
at::Allocator* GetDefaultCPUAllocator() {
  return GetDefaultMobileCPUAllocator();
}

REGISTER_ALLOCATOR(DeviceType::CPU, &g_mobile_cpu_allocator);

#else

// Global default CPU Allocator
static DefaultCPUAllocator g_cpu_alloc;

at::Allocator* GetDefaultCPUAllocator() {
  return &g_cpu_alloc;
}

REGISTER_ALLOCATOR(DeviceType::CPU, &g_cpu_alloc);

#endif /* C10_MOBILE */
192 | |
193 | void ProfiledCPUMemoryReporter::New(void* ptr, size_t nbytes) { |
194 | if (nbytes == 0) { |
195 | return; |
196 | } |
197 | auto profile_memory = memoryProfilingEnabled(); |
198 | size_t allocated = 0; |
199 | if (FLAGS_caffe2_report_cpu_memory_usage || profile_memory) { |
200 | std::lock_guard<std::mutex> guard(mutex_); |
201 | size_table_[ptr] = nbytes; |
202 | allocated_ += nbytes; |
203 | allocated = allocated_; |
204 | } |
205 | if (FLAGS_caffe2_report_cpu_memory_usage) { |
206 | LOG(INFO) << "C10 alloc " << nbytes << " bytes, total alloc " << allocated |
207 | << " bytes." ; |
208 | } |
209 | if (profile_memory) { |
210 | reportMemoryUsageToProfiler( |
211 | ptr, |
212 | static_cast<int64_t>(nbytes), |
213 | allocated, |
214 | 0, |
215 | c10::Device(c10::DeviceType::CPU)); |
216 | } |
217 | } |
218 | |
219 | void ProfiledCPUMemoryReporter::Delete(void* ptr) { |
220 | size_t nbytes = 0; |
221 | auto profile_memory = memoryProfilingEnabled(); |
222 | size_t allocated = 0; |
223 | if (FLAGS_caffe2_report_cpu_memory_usage || profile_memory) { |
224 | std::lock_guard<std::mutex> guard(mutex_); |
225 | auto it = size_table_.find(ptr); |
226 | if (it != size_table_.end()) { |
227 | allocated_ -= it->second; |
228 | allocated = allocated_; |
229 | nbytes = it->second; |
230 | size_table_.erase(it); |
231 | } else { |
232 | // C10_LOG_EVERY_MS might log every time in some builds, |
233 | // using a simple counter to avoid spammy logs |
234 | if (log_cnt_++ % 1000 == 0) { |
235 | LOG(WARNING) << "Memory block of unknown size was allocated before " |
236 | << "the profiling started, profiler results will not " |
237 | << "include the deallocation event" ; |
238 | } |
239 | } |
240 | } |
241 | if (nbytes == 0) { |
242 | return; |
243 | } |
244 | if (FLAGS_caffe2_report_cpu_memory_usage) { |
245 | LOG(INFO) << "C10 deleted " << nbytes << " bytes, total alloc " << allocated |
246 | << " bytes." ; |
247 | } |
248 | if (profile_memory) { |
249 | reportMemoryUsageToProfiler( |
250 | ptr, |
251 | -static_cast<int64_t>(nbytes), |
252 | allocated, |
253 | 0, |
254 | c10::Device(c10::DeviceType::CPU)); |
255 | } |
256 | } |
257 | |
258 | void ProfiledCPUMemoryReporter::OutOfMemory(size_t nbytes) { |
259 | auto profile_memory = memoryProfilingEnabled(); |
260 | size_t allocated = 0; |
261 | if (FLAGS_caffe2_report_cpu_memory_usage || profile_memory) { |
262 | std::lock_guard<std::mutex> guard(mutex_); |
263 | |
264 | allocated = allocated_; |
265 | } |
266 | if (nbytes == 0) { |
267 | return; |
268 | } |
269 | if (FLAGS_caffe2_report_cpu_memory_usage) { |
270 | LOG(INFO) << "C10 Out of Memory. Trying to allocate " << nbytes |
271 | << " bytes, total alloc " << allocated << " bytes." ; |
272 | } |
273 | if (profile_memory) { |
274 | reportOutOfMemoryToProfiler( |
275 | static_cast<int64_t>(nbytes), |
276 | allocated, |
277 | 0, |
278 | c10::Device(c10::DeviceType::CPU)); |
279 | } |
280 | } |
281 | |
// Optional process-wide caching allocator override for CPU, and the priority
// it was registered with (see SetCPUCachingAllocator below).
C10_API at::Allocator* cpu_caching_alloc = nullptr;
C10_API uint8_t cpu_caching_alloc_priority = 0;
284 | |
285 | void SetCPUCachingAllocator(Allocator* alloc, uint8_t priority) { |
286 | if (priority >= cpu_caching_alloc_priority) { |
287 | cpu_caching_alloc = alloc; |
288 | cpu_caching_alloc_priority = priority; |
289 | } |
290 | } |
291 | |
292 | Allocator* GetCPUCachingAllocator() { |
293 | if (cpu_caching_alloc == nullptr) { |
294 | VLOG(1) |
295 | << "There is not caching allocator registered for CPU, use the default allocator instead." ; |
296 | return GetAllocator(DeviceType::CPU); |
297 | } |
298 | return cpu_caching_alloc; |
299 | } |
300 | |
301 | } // namespace c10 |
302 | |