1 | #include <c10/core/Allocator.h> |
2 | #include <c10/core/CPUAllocator.h> |
3 | #include <c10/core/DeviceType.h> |
4 | #include <c10/core/alignment.h> |
5 | #include <c10/core/impl/alloc_cpu.h> |
6 | #include <c10/mobile/CPUCachingAllocator.h> |
7 | #include <c10/mobile/CPUProfilingAllocator.h> |
8 | #include <c10/util/Logging.h> |
9 | |
// Command-line flag that enables verbose per-allocation logging in
// ProfiledCPUMemoryReporter below (every New/Delete/OutOfMemory is LOG(INFO)'d
// with a running total).
// TODO: rename flag to C10
C10_DEFINE_bool(
    caffe2_report_cpu_memory_usage,
    false,
    "If set, print out detailed memory usage");
15 | |
16 | namespace c10 { |
17 | |
18 | struct C10_API DefaultCPUAllocator final : at::Allocator { |
19 | DefaultCPUAllocator() = default; |
20 | at::DataPtr allocate(size_t nbytes) const override { |
21 | void* data = nullptr; |
22 | try { |
23 | data = c10::alloc_cpu(nbytes); |
24 | } catch (c10::Error& e) { |
25 | profiledCPUMemoryReporter().OutOfMemory(nbytes); |
26 | throw e; |
27 | } |
28 | profiledCPUMemoryReporter().New(data, nbytes); |
29 | return {data, data, &ReportAndDelete, at::Device(at::DeviceType::CPU)}; |
30 | } |
31 | |
32 | static void ReportAndDelete(void* ptr) { |
33 | if (!ptr) { |
34 | return; |
35 | } |
36 | profiledCPUMemoryReporter().Delete(ptr); |
37 | free_cpu(ptr); |
38 | } |
39 | |
40 | at::DeleterFnPtr raw_deleter() const override { |
41 | return &ReportAndDelete; |
42 | } |
43 | }; |
44 | |
45 | ProfiledCPUMemoryReporter& profiledCPUMemoryReporter() { |
46 | static ProfiledCPUMemoryReporter reporter_; |
47 | return reporter_; |
48 | } |
49 | |
// QNNPACK and XNNPACK may access the input and/or output tensors out of
// bounds. This is by-design, and chosen to make the implementation of
52 | // micro-kernels both simpler and faster as a result of not having to |
53 | // individually handle the corner cases where the number of processed elements |
54 | // is not a multiple of SIMD register width. This behavior will trigger ASAN |
55 | // though, and may result in a segfault if the accessed memory location just so |
56 | // happens to fall on a page the current process has no read access to. Here we |
57 | // define a custom allocator that allocates the extra storage required to keep |
58 | // this behavior safe. This allocator could have been restricted to QNNPACK and |
59 | // XNNPACK only, but that would have negative performance ramifications, as |
60 | // input tensors must now be reallocated, and copied over, if the tensor is not |
61 | // allocated with this allocator to begin with. Making this allocator the |
62 | // default on mobile builds minimizes the probability of unnecessary |
63 | // reallocations and copies, and also enables acceleration of operations where |
64 | // the output tensor is allocated outside of the function doing the |
65 | // implementation, wherein the implementation cannot simply re-allocate the |
66 | // output with the guarding allocator. |
67 | // |
68 | // PreGuardBytes: Number of guard bytes to allocate before the allocation. |
69 | // PostGuardBytes: Number of guard bytes to allocate after the allocation. |
70 | |
71 | template <uint32_t PreGuardBytes, uint32_t PostGuardBytes> |
72 | class DefaultMobileCPUAllocator final : public at::Allocator { |
73 | public: |
74 | DefaultMobileCPUAllocator() = default; |
75 | ~DefaultMobileCPUAllocator() override = default; |
76 | |
77 | static void deleter(void* const pointer) { |
78 | if (C10_UNLIKELY(!pointer)) { |
79 | return; |
80 | } |
81 | // TODO: enable with better TLS support on mobile |
82 | // profiledCPUMemoryReporter().Delete(pointer); |
83 | auto allocator_ptr = GetThreadLocalCachingAllocator(); |
84 | auto profiling_allocator_ptr = GetThreadLocalProfilingAllocator(); |
85 | if (allocator_ptr != nullptr) { |
86 | allocator_ptr->free(pointer); |
87 | } else if (profiling_allocator_ptr != nullptr) { |
88 | profiling_allocator_ptr->free(pointer); |
89 | } else { |
90 | c10::free_cpu(pointer); |
91 | // This adds extra cost to freeing memory to the default case when |
92 | // caching allocator is not enabled. |
93 | // NOLINTNEXTLINE(clang-analyzer-unix.Malloc) |
94 | CPUCachingAllocator::record_free(pointer); |
95 | auto allocation_planner = GetThreadLocalAllocationPlanner(); |
96 | if (allocation_planner != nullptr) { |
97 | allocation_planner->record_free(pointer); |
98 | } |
99 | } |
100 | } |
101 | |
102 | DataPtr allocate(const size_t nbytes) const override { |
103 | if (C10_UNLIKELY(0u == nbytes)) { |
104 | return { |
105 | nullptr, |
106 | nullptr, |
107 | &deleter, |
108 | at::Device(DeviceType::CPU), |
109 | }; |
110 | } |
111 | |
112 | auto alloc_size = PreGuardBytes + nbytes + PostGuardBytes; |
113 | // NOLINTNEXTLINE(cppcoreguidelines-init-variables) |
114 | void* data; |
115 | auto allocator_ptr = GetThreadLocalCachingAllocator(); |
116 | auto profiling_allocator_ptr = GetThreadLocalProfilingAllocator(); |
117 | if (allocator_ptr != nullptr) { |
118 | data = allocator_ptr->allocate(alloc_size); |
119 | } else if (profiling_allocator_ptr != nullptr) { |
120 | data = profiling_allocator_ptr->allocate(alloc_size); |
121 | } else { |
122 | try { |
123 | data = c10::alloc_cpu(alloc_size); |
124 | } catch (c10::Error& e) { |
125 | profiledCPUMemoryReporter().OutOfMemory(alloc_size); |
126 | throw e; |
127 | } |
128 | auto allocation_planner = GetThreadLocalAllocationPlanner(); |
129 | if (allocation_planner != nullptr) { |
130 | allocation_planner->record_allocation(alloc_size, data); |
131 | } |
132 | } |
133 | profiledCPUMemoryReporter().New(data, alloc_size); |
134 | return { |
135 | reinterpret_cast<uint8_t*>(data) + PreGuardBytes, |
136 | data, |
137 | &deleter, |
138 | at::Device(DeviceType::CPU), |
139 | }; |
140 | } |
141 | |
142 | DeleterFnPtr raw_deleter() const override { |
143 | return deleter; |
144 | } |
145 | }; |
146 | |
// Intentional no-op deleter for DataPtrs that wrap memory the tensor does not
// own.
void NoDelete(void*) {}
148 | |
// Returns whichever allocator is currently registered for DeviceType::CPU.
at::Allocator* GetCPUAllocator() {
  return GetAllocator(DeviceType::CPU);
}
152 | |
// Registers `alloc` for DeviceType::CPU with the given priority (priority
// semantics are those of c10::SetAllocator).
void SetCPUAllocator(at::Allocator* alloc, uint8_t priority) {
  SetAllocator(DeviceType::CPU, alloc, priority);
}
156 | |
// The Mobile CPU allocator must always be present even on non-mobile builds
// because QNNPACK and XNNPACK are not mobile specific.
//
// Pre-guard: 8 bytes for QNNPACK, but set to gAlignment to ensure SIMD
//            alignment, not on the allocated memory, but memory location
//            returned to the user.
// Post-guard: 16 bytes for XNNPACK.

// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,cppcoreguidelines-avoid-non-const-global-variables)
static DefaultMobileCPUAllocator<gAlignment, 16u> g_mobile_cpu_allocator;
167 | |
// Accessor for the process-wide guarded mobile CPU allocator above.
at::Allocator* GetDefaultMobileCPUAllocator() {
  return &g_mobile_cpu_allocator;
}
171 | |
#ifdef C10_MOBILE

// On mobile builds the guarded mobile allocator doubles as the default CPU
// allocator.
at::Allocator* GetDefaultCPUAllocator() {
  return GetDefaultMobileCPUAllocator();
}

REGISTER_ALLOCATOR(DeviceType::CPU, &g_mobile_cpu_allocator);

#else

// Global default CPU Allocator
static DefaultCPUAllocator g_cpu_alloc;

at::Allocator* GetDefaultCPUAllocator() {
  return &g_cpu_alloc;
}

REGISTER_ALLOCATOR(DeviceType::CPU, &g_cpu_alloc);

#endif /* C10_MOBILE */
192 | |
193 | void ProfiledCPUMemoryReporter::New(void* ptr, size_t nbytes) { |
194 | if (nbytes == 0) { |
195 | return; |
196 | } |
197 | auto profile_memory = memoryProfilingEnabled(); |
198 | size_t allocated = 0; |
199 | if (FLAGS_caffe2_report_cpu_memory_usage || profile_memory) { |
200 | std::lock_guard<std::mutex> guard(mutex_); |
201 | size_table_[ptr] = nbytes; |
202 | allocated_ += nbytes; |
203 | allocated = allocated_; |
204 | } |
205 | if (FLAGS_caffe2_report_cpu_memory_usage) { |
206 | LOG(INFO) << "C10 alloc " << nbytes << " bytes, total alloc " << allocated |
207 | << " bytes." ; |
208 | } |
209 | if (profile_memory) { |
210 | reportMemoryUsageToProfiler( |
211 | ptr, |
212 | static_cast<int64_t>(nbytes), |
213 | allocated, |
214 | 0, |
215 | c10::Device(c10::DeviceType::CPU)); |
216 | } |
217 | } |
218 | |
219 | void ProfiledCPUMemoryReporter::Delete(void* ptr) { |
220 | size_t nbytes = 0; |
221 | auto profile_memory = memoryProfilingEnabled(); |
222 | size_t allocated = 0; |
223 | if (FLAGS_caffe2_report_cpu_memory_usage || profile_memory) { |
224 | std::lock_guard<std::mutex> guard(mutex_); |
225 | auto it = size_table_.find(ptr); |
226 | if (it != size_table_.end()) { |
227 | allocated_ -= it->second; |
228 | allocated = allocated_; |
229 | nbytes = it->second; |
230 | size_table_.erase(it); |
231 | } else { |
232 | // C10_LOG_EVERY_MS might log every time in some builds, |
233 | // using a simple counter to avoid spammy logs |
234 | if (log_cnt_++ % 1000 == 0) { |
235 | LOG(WARNING) << "Memory block of unknown size was allocated before " |
236 | << "the profiling started, profiler results will not " |
237 | << "include the deallocation event" ; |
238 | } |
239 | } |
240 | } |
241 | if (nbytes == 0) { |
242 | return; |
243 | } |
244 | if (FLAGS_caffe2_report_cpu_memory_usage) { |
245 | LOG(INFO) << "C10 deleted " << nbytes << " bytes, total alloc " << allocated |
246 | << " bytes." ; |
247 | } |
248 | if (profile_memory) { |
249 | reportMemoryUsageToProfiler( |
250 | ptr, |
251 | -static_cast<int64_t>(nbytes), |
252 | allocated, |
253 | 0, |
254 | c10::Device(c10::DeviceType::CPU)); |
255 | } |
256 | } |
257 | |
258 | void ProfiledCPUMemoryReporter::OutOfMemory(size_t nbytes) { |
259 | auto profile_memory = memoryProfilingEnabled(); |
260 | size_t allocated = 0; |
261 | if (FLAGS_caffe2_report_cpu_memory_usage || profile_memory) { |
262 | std::lock_guard<std::mutex> guard(mutex_); |
263 | |
264 | allocated = allocated_; |
265 | } |
266 | if (nbytes == 0) { |
267 | return; |
268 | } |
269 | if (FLAGS_caffe2_report_cpu_memory_usage) { |
270 | LOG(INFO) << "C10 Out of Memory. Trying to allocate " << nbytes |
271 | << " bytes, total alloc " << allocated << " bytes." ; |
272 | } |
273 | if (profile_memory) { |
274 | reportOutOfMemoryToProfiler( |
275 | static_cast<int64_t>(nbytes), |
276 | allocated, |
277 | 0, |
278 | c10::Device(c10::DeviceType::CPU)); |
279 | } |
280 | } |
281 | |
// Optional process-wide caching allocator override for CPU, and the priority
// it was registered with (see SetCPUCachingAllocator below).
C10_API at::Allocator* cpu_caching_alloc = nullptr;
C10_API uint8_t cpu_caching_alloc_priority = 0;
284 | |
285 | void SetCPUCachingAllocator(Allocator* alloc, uint8_t priority) { |
286 | if (priority >= cpu_caching_alloc_priority) { |
287 | cpu_caching_alloc = alloc; |
288 | cpu_caching_alloc_priority = priority; |
289 | } |
290 | } |
291 | |
292 | Allocator* GetCPUCachingAllocator() { |
293 | if (cpu_caching_alloc == nullptr) { |
294 | VLOG(1) |
295 | << "There is not caching allocator registered for CPU, use the default allocator instead." ; |
296 | return GetAllocator(DeviceType::CPU); |
297 | } |
298 | return cpu_caching_alloc; |
299 | } |
300 | |
301 | } // namespace c10 |
302 | |