#include <c10/core/Allocator.h>
#include <c10/core/CPUAllocator.h>
#include <c10/core/DeviceType.h>
#include <c10/core/alignment.h>
#include <c10/core/impl/alloc_cpu.h>
#include <c10/mobile/CPUCachingAllocator.h>
#include <c10/mobile/CPUProfilingAllocator.h>
#include <c10/util/Logging.h>

// TODO: rename flag to C10
C10_DEFINE_bool(
    caffe2_report_cpu_memory_usage,
    false,
    "If set, print out detailed memory usage");

namespace c10 {

struct C10_API DefaultCPUAllocator final : at::Allocator {
  DefaultCPUAllocator() = default;
  at::DataPtr allocate(size_t nbytes) const override {
    void* data = nullptr;
    try {
      data = c10::alloc_cpu(nbytes);
    } catch (c10::Error&) {
      profiledCPUMemoryReporter().OutOfMemory(nbytes);
      // Rethrow the original exception rather than a copy.
      throw;
    }
    profiledCPUMemoryReporter().New(data, nbytes);
    // DataPtr fields: data pointer, context (same here), deleter, device.
    return {data, data, &ReportAndDelete, at::Device(at::DeviceType::CPU)};
  }

  static void ReportAndDelete(void* ptr) {
    if (!ptr) {
      return;
    }
    profiledCPUMemoryReporter().Delete(ptr);
    free_cpu(ptr);
  }

  at::DeleterFnPtr raw_deleter() const override {
    return &ReportAndDelete;
  }
};

ProfiledCPUMemoryReporter& profiledCPUMemoryReporter() {
  static ProfiledCPUMemoryReporter reporter_;
  return reporter_;
}

// QNNPACK and XNNPACK may access the input and/or output tensors out of
// bounds. This is by design, chosen to make the implementation of the
// micro-kernels both simpler and faster, as they need not individually handle
// the corner cases where the number of processed elements is not a multiple
// of the SIMD register width. This behavior will trigger ASAN though, and may
// result in a segfault if the accessed memory location happens to fall on a
// page the current process has no read access to. Here we define a custom
// allocator that allocates the extra storage required to keep this behavior
// safe. This allocator could have been restricted to QNNPACK and XNNPACK
// only, but that would have had negative performance ramifications, as input
// tensors would have to be reallocated and copied over whenever they were not
// allocated with this allocator to begin with. Making this allocator the
// default on mobile builds minimizes the probability of unnecessary
// reallocations and copies, and also enables acceleration of operations where
// the output tensor is allocated outside of the function doing the
// implementation, in which case the implementation cannot simply re-allocate
// the output with the guarding allocator.
//
// PreGuardBytes: Number of guard bytes to allocate before the allocation.
// PostGuardBytes: Number of guard bytes to allocate after the allocation.
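//
// For a concrete picture (illustrative values, not from the original
// comment): with PreGuardBytes = 8, PostGuardBytes = 16, and a request of
// nbytes = 100, allocate() reserves 8 + 100 + 16 = 124 bytes:
//
//   base            base + 8                  base + 108
//   |<- pre-guard ->|<------ user data ------>|<- post-guard ->|
//   |    8 bytes    |        100 bytes        |    16 bytes    |
//
// The caller receives base + PreGuardBytes as the data pointer, while the
// DataPtr's context keeps the original base pointer, which is what deleter()
// is eventually invoked on.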

template <uint32_t PreGuardBytes, uint32_t PostGuardBytes>
class DefaultMobileCPUAllocator final : public at::Allocator {
 public:
  DefaultMobileCPUAllocator() = default;
  ~DefaultMobileCPUAllocator() override = default;

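  // Frees a pointer through whichever allocator is active on this thread:
  // the caching allocator if set, else the profiling allocator, else plain
  // c10::free_cpu (plus bookkeeping for the caching allocator and, if
  // present, the allocation planner).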
  static void deleter(void* const pointer) {
    if (C10_UNLIKELY(!pointer)) {
      return;
    }
    // TODO: enable with better TLS support on mobile
    // profiledCPUMemoryReporter().Delete(pointer);
    auto allocator_ptr = GetThreadLocalCachingAllocator();
    auto profiling_allocator_ptr = GetThreadLocalProfilingAllocator();
    if (allocator_ptr != nullptr) {
      allocator_ptr->free(pointer);
    } else if (profiling_allocator_ptr != nullptr) {
      profiling_allocator_ptr->free(pointer);
    } else {
      c10::free_cpu(pointer);
      // This adds extra cost to freeing memory in the default case, when
      // the caching allocator is not enabled.
      // NOLINTNEXTLINE(clang-analyzer-unix.Malloc)
      CPUCachingAllocator::record_free(pointer);
      auto allocation_planner = GetThreadLocalAllocationPlanner();
      if (allocation_planner != nullptr) {
        allocation_planner->record_free(pointer);
      }
    }
  }

  DataPtr allocate(const size_t nbytes) const override {
    if (C10_UNLIKELY(0u == nbytes)) {
      return {
          nullptr,
          nullptr,
          &deleter,
          at::Device(DeviceType::CPU),
      };
    }

    auto alloc_size = PreGuardBytes + nbytes + PostGuardBytes;
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    void* data;
    auto allocator_ptr = GetThreadLocalCachingAllocator();
    auto profiling_allocator_ptr = GetThreadLocalProfilingAllocator();
    if (allocator_ptr != nullptr) {
      data = allocator_ptr->allocate(alloc_size);
    } else if (profiling_allocator_ptr != nullptr) {
      data = profiling_allocator_ptr->allocate(alloc_size);
    } else {
      try {
        data = c10::alloc_cpu(alloc_size);
      } catch (c10::Error&) {
        profiledCPUMemoryReporter().OutOfMemory(alloc_size);
        // Rethrow the original exception rather than a copy.
        throw;
      }
      auto allocation_planner = GetThreadLocalAllocationPlanner();
      if (allocation_planner != nullptr) {
        allocation_planner->record_allocation(alloc_size, data);
      }
    }
    profiledCPUMemoryReporter().New(data, alloc_size);
    // Hand out the address past the pre-guard; keep the base pointer as the
    // DataPtr context so deleter() can free the full allocation.
    return {
        reinterpret_cast<uint8_t*>(data) + PreGuardBytes,
        data,
        &deleter,
        at::Device(DeviceType::CPU),
    };
  }

  DeleterFnPtr raw_deleter() const override {
    return deleter;
  }
};

void NoDelete(void*) {}

at::Allocator* GetCPUAllocator() {
  return GetAllocator(DeviceType::CPU);
}

void SetCPUAllocator(at::Allocator* alloc, uint8_t priority) {
  SetAllocator(DeviceType::CPU, alloc, priority);
}
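
// Usage sketch (hypothetical caller, names assumed): a custom CPU allocator
// can be installed process-wide by deriving from at::Allocator and
// registering it with a sufficiently high priority:
//
//   struct MyAllocator final : at::Allocator {
//     at::DataPtr allocate(size_t nbytes) const override { /* ... */ }
//     at::DeleterFnPtr raw_deleter() const override { /* ... */ }
//   };
//   static MyAllocator my_cpu_allocator;
//   c10::SetCPUAllocator(&my_cpu_allocator, /*priority=*/1);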

// The Mobile CPU allocator must always be present even on non-mobile builds
// because QNNPACK and XNNPACK are not mobile-specific.
//
// Pre-guard: 8 bytes for QNNPACK, but extended to gAlignment so that SIMD
//            alignment is guaranteed not for the raw allocation, but for the
//            memory location returned to the user.
// Post-guard: 16 bytes for XNNPACK.

// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,cppcoreguidelines-avoid-non-const-global-variables)
static DefaultMobileCPUAllocator<gAlignment, 16u> g_mobile_cpu_allocator;

at::Allocator* GetDefaultMobileCPUAllocator() {
  return &g_mobile_cpu_allocator;
}

#ifdef C10_MOBILE

at::Allocator* GetDefaultCPUAllocator() {
  return GetDefaultMobileCPUAllocator();
}

REGISTER_ALLOCATOR(DeviceType::CPU, &g_mobile_cpu_allocator);

#else

// Global default CPU Allocator
static DefaultCPUAllocator g_cpu_alloc;

at::Allocator* GetDefaultCPUAllocator() {
  return &g_cpu_alloc;
}

REGISTER_ALLOCATOR(DeviceType::CPU, &g_cpu_alloc);

#endif /* C10_MOBILE */

void ProfiledCPUMemoryReporter::New(void* ptr, size_t nbytes) {
  if (nbytes == 0) {
    return;
  }
  auto profile_memory = memoryProfilingEnabled();
  size_t allocated = 0;
  if (FLAGS_caffe2_report_cpu_memory_usage || profile_memory) {
    std::lock_guard<std::mutex> guard(mutex_);
    size_table_[ptr] = nbytes;
    allocated_ += nbytes;
    allocated = allocated_;
  }
  if (FLAGS_caffe2_report_cpu_memory_usage) {
    LOG(INFO) << "C10 alloc " << nbytes << " bytes, total alloc " << allocated
              << " bytes.";
  }
  if (profile_memory) {
    reportMemoryUsageToProfiler(
        ptr,
        static_cast<int64_t>(nbytes),
        allocated,
        0,
        c10::Device(c10::DeviceType::CPU));
  }
}

void ProfiledCPUMemoryReporter::Delete(void* ptr) {
  size_t nbytes = 0;
  auto profile_memory = memoryProfilingEnabled();
  size_t allocated = 0;
  if (FLAGS_caffe2_report_cpu_memory_usage || profile_memory) {
    std::lock_guard<std::mutex> guard(mutex_);
    auto it = size_table_.find(ptr);
    if (it != size_table_.end()) {
      allocated_ -= it->second;
      allocated = allocated_;
      nbytes = it->second;
      size_table_.erase(it);
    } else {
      // C10_LOG_EVERY_MS might log every time in some builds,
      // so use a simple counter to avoid spammy logs.
      if (log_cnt_++ % 1000 == 0) {
        LOG(WARNING) << "Memory block of unknown size was allocated before "
                     << "profiling started; profiler results will not "
                     << "include the deallocation event.";
      }
    }
  }
  if (nbytes == 0) {
    return;
  }
  if (FLAGS_caffe2_report_cpu_memory_usage) {
    LOG(INFO) << "C10 deleted " << nbytes << " bytes, total alloc " << allocated
              << " bytes.";
  }
  if (profile_memory) {
    reportMemoryUsageToProfiler(
        ptr,
        -static_cast<int64_t>(nbytes),
        allocated,
        0,
        c10::Device(c10::DeviceType::CPU));
  }
}

void ProfiledCPUMemoryReporter::OutOfMemory(size_t nbytes) {
  auto profile_memory = memoryProfilingEnabled();
  size_t allocated = 0;
  if (FLAGS_caffe2_report_cpu_memory_usage || profile_memory) {
    std::lock_guard<std::mutex> guard(mutex_);

    allocated = allocated_;
  }
  if (nbytes == 0) {
    return;
  }
  if (FLAGS_caffe2_report_cpu_memory_usage) {
    LOG(INFO) << "C10 Out of Memory. Trying to allocate " << nbytes
              << " bytes, total alloc " << allocated << " bytes.";
  }
  if (profile_memory) {
    reportOutOfMemoryToProfiler(
        static_cast<int64_t>(nbytes),
        allocated,
        0,
        c10::Device(c10::DeviceType::CPU));
  }
}

C10_API at::Allocator* cpu_caching_alloc = nullptr;
C10_API uint8_t cpu_caching_alloc_priority = 0;

void SetCPUCachingAllocator(Allocator* alloc, uint8_t priority) {
  if (priority >= cpu_caching_alloc_priority) {
    cpu_caching_alloc = alloc;
    cpu_caching_alloc_priority = priority;
  }
}

Allocator* GetCPUCachingAllocator() {
  if (cpu_caching_alloc == nullptr) {
    VLOG(1)
        << "There is no caching allocator registered for CPU; using the default allocator instead.";
    return GetAllocator(DeviceType::CPU);
  }
  return cpu_caching_alloc;
}
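
// Usage sketch (hypothetical allocators, names assumed): registration keeps
// the highest-priority allocator, with later equal-priority registrations
// winning because the comparison above is `>=`:
//
//   c10::SetCPUCachingAllocator(&fast_alloc, /*priority=*/1);
//   c10::SetCPUCachingAllocator(&slow_alloc, /*priority=*/0); // ignored
//   at::Allocator* a = c10::GetCPUCachingAllocator();         // &fast_alloc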

} // namespace c10