1 | #include <climits> |
2 | |
3 | #include <c10/core/impl/alloc_cpu.h> |
4 | #include <c10/mobile/CPUProfilingAllocator.h> |
5 | #include <c10/util/irange.h> |
6 | |
#include <algorithm>
#include <cstdint>
#include <limits>
#include <map>
#include <set>
#include <vector>
9 | |
10 | namespace c10 { |
11 | |
12 | namespace { |
13 | thread_local AllocationPlanner* allocation_planner{nullptr}; |
14 | thread_local CPUProfilingAllocator* profiling_allocator{nullptr}; |
15 | |
16 | struct MemBlock { |
17 | uint64_t start_offset, end_offset; |
18 | MemBlock(uint64_t s, uint64_t e) : start_offset(s), end_offset(e) {} |
19 | bool operator<(const MemBlock& other) const { |
20 | return start_offset < other.start_offset; |
21 | } |
22 | }; |
23 | |
24 | enum class EventType { Allocate = 0, Free, Invalid }; |
25 | |
26 | struct MemEvent { |
27 | uint64_t time; |
28 | uint64_t allocation_id; |
29 | uint64_t size; |
30 | EventType type{EventType::Invalid}; |
31 | MemEvent(uint64_t t, uint64_t id, uint64_t s, EventType e) |
32 | : time(t), allocation_id(id), size(s), type(e) {} |
33 | }; |
34 | |
35 | bool overlaps(const MemBlock& a, const MemBlock& b) { |
  // Two blocks don't overlap if one ends at or before the other starts:
  // |---a--------|--------------b--------|
  // start_a   end_a <= start_b         end_b
39 | return !( |
40 | (a.end_offset <= b.start_offset) || (b.end_offset <= a.start_offset)); |
41 | } |
42 | |
43 | bool validate_allocation_plan( |
44 | const std::vector<MemEvent>& alloc_events, |
45 | const std::vector<uint64_t>& allocation_offsets) { |
46 | std::set<MemBlock> allocations; |
47 | for (const auto& event : alloc_events) { |
48 | auto alloc_id = event.allocation_id; |
49 | // Skip allocations not managed by AllocationPlan |
50 | if (allocation_offsets[alloc_id] == std::numeric_limits<uint64_t>::max()) { |
51 | continue; |
52 | } |
53 | auto start_offset = allocation_offsets[alloc_id]; |
54 | auto end_offset = allocation_offsets[alloc_id] + event.size; |
55 | MemBlock mem_block(start_offset, end_offset); |
56 | if (event.type == EventType::Allocate) { |
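      // Blocks in `allocations` are ordered by start_offset and are pairwise
      // non-overlapping, so a newly placed block can only collide with its
      // immediate neighbors: the first block at or after it (lower_bound)
      // and the block just before it. Checking these two is sufficient.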
57 | auto it = allocations.lower_bound(mem_block); |
58 | if (it != allocations.end()) { |
59 | auto next_block = *it; |
60 | if (overlaps(next_block, mem_block)) { |
61 | return false; |
62 | } |
63 | } |
64 | if (it != allocations.begin()) { |
65 | auto prev_block = *(--it); |
66 | if (overlaps(prev_block, mem_block)) { |
67 | return false; |
68 | } |
69 | } |
70 | allocations.emplace(mem_block); |
    } else if (event.type == EventType::Free) {
      auto it = allocations.find(mem_block);
      TORCH_CHECK(
          it != allocations.end(),
          "ProfilingAllocator: Allocate event "
          "must have preceded deallocate event.");
      TORCH_CHECK(
          it->end_offset == end_offset,
          "End offset of allocation being freed must match the one recorded.");
      allocations.erase(it);
81 | } else { |
      TORCH_CHECK(false, "ProfilingAllocator: Invalid event type.");
83 | } |
84 | } |
85 | return true; |
86 | } |
87 | |
88 | std::vector<MemEvent> create_and_sort_mem_events( |
89 | const std::vector<uint64_t>& allocation_sizes, |
90 | const std::vector<uint64_t>& allocation_lifetimes) { |
91 | std::vector<MemEvent> events; |
  for (const auto i : c10::irange(allocation_sizes.size())) {
    // If an observed allocation is freed outside the scope of
    // observation, it is not managed by the AllocationPlan.
96 | if (allocation_lifetimes[i] == std::numeric_limits<uint64_t>::max()) { |
97 | continue; |
98 | } |
99 | events.emplace_back(i, i, allocation_sizes[i], EventType::Allocate); |
100 | events.emplace_back( |
101 | allocation_lifetimes[i], i, allocation_sizes[i], EventType::Free); |
102 | } |
103 | std::sort( |
104 | events.begin(), |
105 | events.end(), |
106 | [](const MemEvent& a, const MemEvent& b) -> bool { |
107 | return a.time < b.time; |
108 | }); |
109 | return events; |
110 | } |
111 | |
112 | std::vector<uint64_t> formulate_greedy_allocation_plan( |
113 | const std::vector<uint64_t>& allocation_sizes, |
114 | const std::vector<uint64_t>& allocation_lifetimes) { |
  // Step 1. Construct all allocation/free events.
  //         Sort these events by timestamp.
  // Step 2. Iterate through all events.
  //  2.1 If allocate event:
  //      Find the candidates in the free_size_to_offset map.
  //      Greedily pick the first one.
  //      Remove the entry from the free_size_to_offset map.
  //      new_offset = offset + request_size
  //      new_size = size - request_size
  //      Add the new entry to both maps.
  //  2.2 If free event:
  //      Check if the returned offset merges with another chunk.
  //      If so, merge until no more merging is possible.
  //      If the returned offset does not merge, then
  //      just return it as a chunk.
130 | |
  // lower_bound on this map finds the smallest free block that is
  // at least as large as the requested allocation.
133 | std::map<uint64_t, uint64_t> free_size_to_offset; |
  // These maps provide fast lookup when we want to insert a freed block
  // back, especially when we want to merge blocks.
136 | ska::flat_hash_map<uint64_t, std::map<uint64_t, uint64_t>::iterator> |
137 | free_start_offset_to_size_iter; |
138 | ska::flat_hash_map<uint64_t, std::map<uint64_t, uint64_t>::iterator> |
139 | free_end_offset_to_size_iter; |
  // Upon free, end_ptr = offset + size.
  // If end_ptr exists as the start of a free block, merge the freed
  // allocation with it: find the corresponding entry in free_size_to_offset,
  // remove it, and re-insert it with the new size and offset.
  // If end_ptr does not exist, just insert (offset, size) into the map and,
  // correspondingly, (size, offset) into the other map.
  // Merging should be repeated until no more mergeable chunks are found.
  // After the last free only one entry should be left in these maps.
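  //
  // For illustration, a hypothetical event trace (not taken from any real
  // plan) and the resulting state of the pool:
  //   alloc A (1024 B) -> no free block, A = [0, 1024),    max_offset = 1024
  //   alloc B (512 B)  -> no free block, B = [1024, 1536), max_offset = 1536
  //   free A           -> free block [0, 1024) is recorded in the maps
  //   alloc C (256 B)  -> carved out of [0, 1024): C = [0, 256), and the
  //                       remainder [256, 1024) goes back into the maps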
149 | |
150 | std::vector<uint64_t> allocation_offsets( |
151 | allocation_sizes.size(), std::numeric_limits<uint64_t>::max()); |
152 | auto mem_events = |
153 | create_and_sort_mem_events(allocation_sizes, allocation_lifetimes); |
154 | uint64_t max_offset{0}; |
155 | for (const auto& mem_event : mem_events) { |
    uint64_t alloc_offset = 0;
    uint64_t new_offset = 0;
    uint64_t new_size = 0;
160 | if (mem_event.type == EventType::Allocate) { |
161 | auto it = free_size_to_offset.lower_bound(mem_event.size); |
162 | if (it == free_size_to_offset.end()) { |
        // If there is no free block of the requested size,
        // allocate a new one at the end.
165 | alloc_offset = max_offset; |
166 | max_offset += mem_event.size; |
167 | } else { |
        // If we have found a block of the size we want:
        // 1. Change the block by allocating out of it.
        //    1.1 Erase the entire block.
        //    1.2 Erase the reverse map entries.
        // 2. If the block still has space left, insert the remainder back in
        //    the map, including the reverse map entries.
        alloc_offset = it->second;
        new_offset = alloc_offset + mem_event.size;
        new_size = it->first - mem_event.size;
        // Erase the reverse map entries before erasing `it`: erasing from
        // free_size_to_offset invalidates the iterator we still read from.
        free_start_offset_to_size_iter.erase(alloc_offset);
        free_end_offset_to_size_iter.erase(alloc_offset + it->first);
        free_size_to_offset.erase(it);
180 | if (new_size > 0) { |
181 | auto ref_it = free_size_to_offset.emplace(new_size, new_offset).first; |
182 | free_start_offset_to_size_iter.emplace(new_offset, ref_it); |
183 | free_end_offset_to_size_iter.emplace(new_offset + new_size, ref_it); |
184 | } |
185 | } |
186 | allocation_offsets[mem_event.allocation_id] = alloc_offset; |
187 | } else { |
188 | // 1. Check if freed block is adjacent to an existing free block |
189 | // at its end boundary. This is done by checking |
190 | // free_end_offset_to_size_iter. |
191 | // If we find such a block, remove it and adjust size of |
192 | // the block being freed. |
193 | // 2. Similarly check if freed block is adjacent to an existing |
194 | // free block at start boundary. This is done by checking |
195 | // free_start_offset_to_size_iter. |
196 | // If we find such a block, remove it and adjust size of |
197 | // the block being freed. |
198 | // 3. Insert the freed block in map. |
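      // For illustration (hypothetical state): with free blocks [0, 256) and
      // [512, 1024) already in the maps, freeing [256, 512) merges with both
      // neighbors, leaving a single free block [0, 1024).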
199 | auto freed_offset = allocation_offsets[mem_event.allocation_id]; |
200 | auto freed_size = mem_event.size; |
201 | auto end_offset = freed_offset + freed_size; |
      // Merge when another free block exists at the end of this block
203 | auto end_it = free_start_offset_to_size_iter.find(end_offset); |
204 | if (end_it != free_start_offset_to_size_iter.end()) { |
205 | auto merge_block_iter = end_it->second; |
206 | auto merge_block_size = merge_block_iter->first; |
207 | freed_size += merge_block_size; |
208 | free_size_to_offset.erase(merge_block_iter); |
209 | free_start_offset_to_size_iter.erase(end_it); |
210 | // If the block is being merged then also remove it from |
211 | // free_end_offset_to_size_iter |
212 | free_end_offset_to_size_iter.erase(end_offset + merge_block_size); |
213 | } |
      // Merge when the freed block exists at the end of another free block
215 | auto start_it = free_end_offset_to_size_iter.find(freed_offset); |
216 | if (start_it != free_end_offset_to_size_iter.end()) { |
217 | auto merge_block_iter = start_it->second; |
218 | auto merge_block_size = merge_block_iter->first; |
219 | freed_size += merge_block_size; |
220 | freed_offset -= merge_block_size; |
221 | free_size_to_offset.erase(merge_block_iter); |
222 | free_end_offset_to_size_iter.erase(start_it); |
223 | // If the block is being merged then also remove it from |
224 | // free_start_offset_to_size_iter |
225 | free_start_offset_to_size_iter.erase(freed_offset); |
226 | } |
227 | auto freed_block_it = |
228 | free_size_to_offset.emplace(freed_size, freed_offset).first; |
229 | free_start_offset_to_size_iter.emplace(freed_offset, freed_block_it); |
230 | free_end_offset_to_size_iter.emplace( |
231 | freed_offset + freed_size, freed_block_it); |
232 | } |
233 | } |
234 | TORCH_CHECK( |
235 | validate_allocation_plan(mem_events, allocation_offsets), |
236 | "ProfilingAllocator: Allocation plan invalid." ); |
237 | return allocation_offsets; |
238 | } |
239 | |
240 | } // namespace |
241 | |
242 | void AllocationPlan::clear() { |
243 | allocation_sizes.clear(); |
244 | allocation_lifetimes.clear(); |
245 | allocation_offsets.clear(); |
246 | } |
247 | |
248 | void AllocationPlanner::record_allocation( |
249 | const uint64_t size, |
250 | const void* ptr) { |
251 | if (validation_mode_) { |
252 | validation_success = validation_success && validate_allocation(size, ptr); |
253 | return; |
254 | } |
255 | allocation_plan_->allocation_sizes.push_back(size); |
256 | allocation_plan_->allocation_lifetimes.push_back( |
257 | std::numeric_limits<uint64_t>::max()); |
258 | allocation_ptr_to_id_[ptr] = allocation_id_; |
259 | allocation_id_++; |
260 | } |
261 | |
262 | void AllocationPlanner::record_free(const void* ptr) { |
263 | if (validation_mode_) { |
264 | validation_success = validation_success && validate_free(ptr); |
265 | return; |
266 | } |
267 | auto it = allocation_ptr_to_id_.find(ptr); |
268 | if (it == allocation_ptr_to_id_.end()) { |
    // The pointer being freed was allocated outside of
    // WithProfileAllocationsGuard.
270 | return; |
271 | } |
272 | auto id = it->second; |
273 | TORCH_CHECK( |
274 | id < allocation_plan_->allocation_lifetimes.size(), |
275 | "Allocation must have been recorded during record_allocation." ); |
276 | allocation_plan_->allocation_lifetimes[id] = allocation_id_; |
277 | } |
278 | |
279 | bool AllocationPlanner::validate_allocation( |
280 | const uint64_t size, |
281 | const void* ptr) { |
  if (allocation_id_ >= allocation_plan_->allocation_sizes.size()) {
    TORCH_WARN(
        "Allocation request does not match plan: allocation id ",
        allocation_id_,
        " exceeds the number of recorded allocations ",
        allocation_plan_->allocation_sizes.size());
    return false;
  }
  if (allocation_plan_->allocation_sizes[allocation_id_] != size) {
    TORCH_WARN(
        "Allocation request does not match plan: allocation id ",
        allocation_id_,
        ", recorded size of the requested allocation: ",
        allocation_plan_->allocation_sizes[allocation_id_],
        ", but got: ",
        size);
    return false;
  }
297 | allocation_ptr_to_id_[ptr] = allocation_id_; |
298 | allocation_id_++; |
299 | return true; |
300 | } |
301 | |
302 | bool AllocationPlanner::validate_free(const void* ptr) { |
303 | auto it = allocation_ptr_to_id_.find(ptr); |
304 | if (it == allocation_ptr_to_id_.end()) { |
305 | // Allocation that was made outside the validation scope is being freed here |
306 | return true; |
307 | } |
  auto id = it->second;
309 | TORCH_CHECK( |
310 | id < allocation_plan_->allocation_lifetimes.size(), |
311 | "Allocation must have been recorded during validate_allocation." ); |
312 | auto lifetime_id = allocation_plan_->allocation_lifetimes[id]; |
313 | return (lifetime_id == allocation_id_); |
314 | } |
315 | |
316 | void AllocationPlanner::formulate_plan() { |
317 | allocation_plan_->allocation_offsets = formulate_greedy_allocation_plan( |
318 | allocation_plan_->allocation_sizes, |
319 | allocation_plan_->allocation_lifetimes); |
320 | allocation_plan_->total_size = 0; |
321 | for (const auto i : c10::irange(allocation_plan_->allocation_sizes.size())) { |
322 | if (allocation_plan_->allocation_lifetimes[i] == |
323 | std::numeric_limits<uint64_t>::max()) { |
324 | continue; |
325 | } |
326 | auto limit = allocation_plan_->allocation_offsets[i] + |
327 | allocation_plan_->allocation_sizes[i]; |
328 | allocation_plan_->total_size = |
329 | std::max(allocation_plan_->total_size, limit); |
330 | } |
331 | } |
332 | |
333 | void AllocationPlanner::clear() { |
334 | allocation_plan_->clear(); |
335 | allocation_ptr_to_id_.clear(); |
336 | } |
337 | |
338 | void CPUProfilingAllocator::set_plan(const AllocationPlan* plan) { |
  TORCH_CHECK(plan != nullptr, "Allocation plan is nullptr.");
340 | plan_ = plan; |
341 | allocation_id_ = 0; |
342 | allocation_ptr_to_id_.clear(); |
343 | if (current_size_ < plan->total_size) { |
344 | // Free existing memory and reallocate for larger size. |
345 | c10::free_cpu(blob_); |
346 | blob_ = c10::alloc_cpu(plan->total_size); |
347 | current_size_ = plan->total_size; |
348 | } |
349 | } |
350 | |
351 | void CPUProfilingAllocator::unset_plan() { |
352 | allocation_id_ = 0; |
353 | allocation_ptr_to_id_.clear(); |
354 | plan_ = nullptr; |
355 | } |
356 | |
357 | void* CPUProfilingAllocator::allocate(const size_t bytes) { |
358 | TORCH_CHECK( |
359 | bytes == plan_->allocation_sizes[allocation_id_], |
360 | "Got allocation request that does not match with the plan." ); |
361 | if (plan_->allocation_lifetimes[allocation_id_] == |
362 | std::numeric_limits<uint64_t>::max()) { |
363 | // This allocation is not managed by ProfilingAllocator. |
364 | allocation_id_++; |
365 | return c10::alloc_cpu(bytes); |
366 | } |
367 | void* ptr = reinterpret_cast<uint8_t*>(blob_) + |
368 | plan_->allocation_offsets[allocation_id_]; |
369 | allocation_ptr_to_id_[ptr] = allocation_id_; |
370 | allocation_id_++; |
371 | return ptr; |
372 | } |
373 | |
374 | void CPUProfilingAllocator::free(void* const ptr) { |
375 | auto it = allocation_ptr_to_id_.find(ptr); |
376 | if (it == allocation_ptr_to_id_.end()) { |
377 | // Either |
378 | // 1. Allocation that was made outside the validation scope is being freed |
379 | // here or |
380 | // 2. Allocation that is not managed by profiling allocator is being freed. |
    //    Example of the second type:
    //    Tensor out;
    //    for (....) {
    //      {
    //        CPUProfilingAllocator
    //        out = ...some op (this also frees previous memory held by out)
    //      }
    //      out is used ...
389 | // } |
390 | c10::free_cpu(ptr); |
391 | return; |
392 | } |
393 | auto id = it->second; |
  TORCH_CHECK(
      id < plan_->allocation_lifetimes.size(),
      "Freeing an allocation that is not according to the plan.");
  auto lifetime_id = plan_->allocation_lifetimes[id];
  TORCH_CHECK(
      lifetime_id == allocation_id_,
      "Lifetime of allocation does not match: allocation_id ",
      id,
      ", expected: ",
      lifetime_id,
      ", got: ",
      allocation_id_);
406 | } |
407 | |
408 | CPUProfilingAllocator::~CPUProfilingAllocator() { |
409 | c10::free_cpu(blob_); |
410 | } |
411 | |
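// A minimal usage sketch of the profiling phase (illustrative only; `m` and
// its forward(...) call are placeholder names, not part of this file):
//
//   AllocationPlan plan;
//   {
//     WithProfileAllocationsGuard profile_guard(&plan);
//     m.forward(inputs); // run the workload once to record allocations
//   }
//   // `plan` now holds the recorded sizes, lifetimes and computed offsets.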
412 | WithProfileAllocationsGuard::WithProfileAllocationsGuard(AllocationPlan* plan) { |
413 | // Nesting of allocation profiling does not seem meaningful. |
414 | TORCH_CHECK( |
415 | allocation_planner == nullptr, |
416 | "Nesting profiling allocations is not supported." ); |
417 | planner_ = std::make_unique<AllocationPlanner>(plan); |
418 | planner_->clear(); |
419 | allocation_planner = planner_.get(); |
420 | } |
421 | |
422 | WithProfileAllocationsGuard::~WithProfileAllocationsGuard() { |
423 | planner_->formulate_plan(); |
424 | allocation_planner = nullptr; |
425 | } |
426 | |
427 | WithValidateAllocationPlanGuard::WithValidateAllocationPlanGuard( |
428 | AllocationPlan* plan, |
429 | bool* success) { |
  // Nesting of allocation plan validation does not seem meaningful.
  TORCH_CHECK(
      allocation_planner == nullptr,
      "Nesting profiling allocations is not supported.");
434 | planner_ = std::make_unique<AllocationPlanner>(plan, true); |
435 | success_ = success; |
436 | allocation_planner = planner_.get(); |
437 | } |
438 | |
439 | WithValidateAllocationPlanGuard::~WithValidateAllocationPlanGuard() { |
440 | *success_ = planner_->validation_success; |
441 | allocation_planner = nullptr; |
442 | } |
443 | |
444 | AllocationPlanner* GetThreadLocalAllocationPlanner() { |
445 | return allocation_planner; |
446 | } |
447 | |
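// A minimal usage sketch of the replay phase (illustrative only; assumes a
// `plan` formulated by WithProfileAllocationsGuard and the same workload):
//
//   CPUProfilingAllocator profiling_allocator;
//   {
//     WithProfilingAllocatorGuard allocator_guard(&profiling_allocator, &plan);
//     m.forward(inputs); // allocations routed through the mobile CPU
//                        // allocator are now served from one blob
//   }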
448 | WithProfilingAllocatorGuard::WithProfilingAllocatorGuard( |
449 | CPUProfilingAllocator* allocator, |
450 | const AllocationPlan* plan) { |
451 | // Nesting of profiling allocator is not supported. |
452 | TORCH_CHECK( |
453 | profiling_allocator == nullptr, |
454 | "Nesting profiling allocators is not supported." ); |
455 | profiling_allocator = allocator; |
456 | profiling_allocator->set_plan(plan); |
457 | } |
458 | |
459 | WithProfilingAllocatorGuard::~WithProfilingAllocatorGuard() { |
460 | profiling_allocator->unset_plan(); |
461 | profiling_allocator = nullptr; |
462 | } |
463 | |
464 | CPUProfilingAllocator* GetThreadLocalProfilingAllocator() { |
465 | return profiling_allocator; |
466 | } |
467 | |
468 | } // namespace c10 |
469 | |