#include <climits>

#include <c10/core/impl/alloc_cpu.h>
#include <c10/mobile/CPUProfilingAllocator.h>
#include <c10/util/irange.h>

#include <algorithm>
#include <cstdint>
#include <limits>
#include <map>
#include <set>

namespace c10 {

namespace {
thread_local AllocationPlanner* allocation_planner{nullptr};
thread_local CPUProfilingAllocator* profiling_allocator{nullptr};

struct MemBlock {
  uint64_t start_offset, end_offset;
  MemBlock(uint64_t s, uint64_t e) : start_offset(s), end_offset(e) {}
  bool operator<(const MemBlock& other) const {
    return start_offset < other.start_offset;
  }
};

enum class EventType { Allocate = 0, Free, Invalid };

struct MemEvent {
  uint64_t time;
  uint64_t allocation_id;
  uint64_t size;
  EventType type{EventType::Invalid};
  MemEvent(uint64_t t, uint64_t id, uint64_t s, EventType e)
      : time(t), allocation_id(id), size(s), type(e) {}
};

bool overlaps(const MemBlock& a, const MemBlock& b) {
  // Two blocks don't overlap if one ends at or before the other starts:
  // |---a--------|--------------b--------|
  // start_a    end_a <= start_b        end_b
  return !(
      (a.end_offset <= b.start_offset) || (b.end_offset <= a.start_offset));
}

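// Checks that the proposed offsets never place two live allocations on
// overlapping byte ranges. Walks the time-ordered events, maintaining the
// set of currently live blocks, and rejects the plan on any overlap.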
bool validate_allocation_plan(
    const std::vector<MemEvent>& alloc_events,
    const std::vector<uint64_t>& allocation_offsets) {
  std::set<MemBlock> allocations;
  for (const auto& event : alloc_events) {
    auto alloc_id = event.allocation_id;
    // Skip allocations not managed by AllocationPlan
    if (allocation_offsets[alloc_id] == std::numeric_limits<uint64_t>::max()) {
      continue;
    }
    auto start_offset = allocation_offsets[alloc_id];
    auto end_offset = allocation_offsets[alloc_id] + event.size;
    MemBlock mem_block(start_offset, end_offset);
    if (event.type == EventType::Allocate) {
      auto it = allocations.lower_bound(mem_block);
      if (it != allocations.end()) {
        auto next_block = *it;
        if (overlaps(next_block, mem_block)) {
          return false;
        }
      }
      if (it != allocations.begin()) {
        auto prev_block = *(--it);
        if (overlaps(prev_block, mem_block)) {
          return false;
        }
      }
      allocations.emplace(mem_block);
    } else if (event.type == EventType::Free) {
      auto it = allocations.find(mem_block);
      TORCH_CHECK(
          it != allocations.end(),
          "ProfilingAllocator: Allocate event "
          "must have preceded deallocate event.");
      TORCH_CHECK(
          (*it).end_offset == end_offset,
          "End offset of allocation being freed must match the one recorded.");
      allocations.erase(it);
    } else {
      TORCH_CHECK(false, "ProfilingAllocator: Invalid event type.");
    }
  }
  return true;
}

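// Builds one Allocate and one Free event per managed allocation and returns
// them sorted by time. Allocation ids double as allocation timestamps, and
// allocation_lifetimes[i] is the timestamp at which allocation i is freed.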
std::vector<MemEvent> create_and_sort_mem_events(
    const std::vector<uint64_t>& allocation_sizes,
    const std::vector<uint64_t>& allocation_lifetimes) {
  std::vector<MemEvent> events;
  for (uint64_t i = 0; i < allocation_sizes.size(); ++i) {
    // If an observed allocation is freed outside the scope of
    // observation, then it is not managed by the AllocationPlan.
    if (allocation_lifetimes[i] == std::numeric_limits<uint64_t>::max()) {
      continue;
    }
    events.emplace_back(i, i, allocation_sizes[i], EventType::Allocate);
    events.emplace_back(
        allocation_lifetimes[i], i, allocation_sizes[i], EventType::Free);
  }
  std::sort(
      events.begin(),
      events.end(),
      [](const MemEvent& a, const MemEvent& b) -> bool {
        return a.time < b.time;
      });
  return events;
}

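// Greedily assigns an offset within a single shared blob to every managed
// allocation. Returns one offset per allocation; allocations that are not
// managed by the plan keep the sentinel std::numeric_limits<uint64_t>::max().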
std::vector<uint64_t> formulate_greedy_allocation_plan(
    const std::vector<uint64_t>& allocation_sizes,
    const std::vector<uint64_t>& allocation_lifetimes) {
  // Step 1. Construct all allocation/free events.
  //         Sort these events by timestamp.
  // Step 2. Iterate through all events.
  //   2.1 If allocate event:
  //       Find all candidates in the free_size_to_offset map.
  //       Greedily pick the first one.
  //       Remove the entry from the free_size_to_offset map.
  //       new_offset = offset + request_size
  //       new_size = size - request_size
  //       Add a new entry to both maps.
  //   2.2 If free event:
  //       Check if the returned offset merges with another chunk.
  //       If so, merge until no more merging is possible.
  //       If the returned offset does not merge, then
  //       just return it as a chunk.

  // lower_bound on this map will get all candidates of
  // the right size for allocation.
  std::map<uint64_t, uint64_t> free_size_to_offset;
  // These provide fast lookup when we want to insert a freed block
  // back, especially when we want to merge blocks.
  ska::flat_hash_map<uint64_t, std::map<uint64_t, uint64_t>::iterator>
      free_start_offset_to_size_iter;
  ska::flat_hash_map<uint64_t, std::map<uint64_t, uint64_t>::iterator>
      free_end_offset_to_size_iter;
  // Upon free, end_ptr = offset + size.
  // If end_ptr exists, merge the freed allocation:
  // find the corresponding offset in free_size_to_offset,
  // remove that entry, and update it with the new size and offset.
  // If end_ptr does not exist, then just insert (offset, size)
  // in the map and, correspondingly, (size, offset) in the other map.
  // Merging should always be done recursively until no more mergeable
  // chunks can be found.
  // After the last free we should have only one entry left in these maps.

  std::vector<uint64_t> allocation_offsets(
      allocation_sizes.size(), std::numeric_limits<uint64_t>::max());
  auto mem_events =
      create_and_sort_mem_events(allocation_sizes, allocation_lifetimes);
  uint64_t max_offset{0};
  for (const auto& mem_event : mem_events) {
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    uint64_t alloc_offset;
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    uint64_t new_offset, new_size;
    if (mem_event.type == EventType::Allocate) {
      auto it = free_size_to_offset.lower_bound(mem_event.size);
      if (it == free_size_to_offset.end()) {
        // If there is no contiguous block of the size requested,
        // allocate a new one.
        alloc_offset = max_offset;
        max_offset += mem_event.size;
      } else {
        // If we have found a block of the size we want:
        // 1. Change the block by allocating out of it.
        //    1.1 Erase the entire block.
        //    1.2 Erase the reverse map entries.
        // 2. If the block still has space left, insert the remainder back in
        //    the map, including reverse map entries.
        alloc_offset = it->second;
        new_offset = alloc_offset + mem_event.size;
        new_size = it->first - mem_event.size;
        // Record the block's end before erasing `it`, which invalidates it.
        auto old_end_offset = alloc_offset + it->first;
        free_size_to_offset.erase(it);
        free_start_offset_to_size_iter.erase(alloc_offset);
        free_end_offset_to_size_iter.erase(old_end_offset);
        if (new_size > 0) {
          auto ref_it = free_size_to_offset.emplace(new_size, new_offset).first;
          free_start_offset_to_size_iter.emplace(new_offset, ref_it);
          free_end_offset_to_size_iter.emplace(new_offset + new_size, ref_it);
        }
      }
      allocation_offsets[mem_event.allocation_id] = alloc_offset;
    } else {
      // 1. Check if the freed block is adjacent to an existing free block
      //    at its end boundary. This is done by checking
      //    free_end_offset_to_size_iter.
      //    If we find such a block, remove it and adjust the size of
      //    the block being freed.
      // 2. Similarly check if the freed block is adjacent to an existing
      //    free block at its start boundary. This is done by checking
      //    free_start_offset_to_size_iter.
      //    If we find such a block, remove it and adjust the size of
      //    the block being freed.
      // 3. Insert the freed block in the map.
      auto freed_offset = allocation_offsets[mem_event.allocation_id];
      auto freed_size = mem_event.size;
      auto end_offset = freed_offset + freed_size;
      // Merge when another free block exists at the end of this block.
      auto end_it = free_start_offset_to_size_iter.find(end_offset);
      if (end_it != free_start_offset_to_size_iter.end()) {
        auto merge_block_iter = end_it->second;
        auto merge_block_size = merge_block_iter->first;
        freed_size += merge_block_size;
        free_size_to_offset.erase(merge_block_iter);
        free_start_offset_to_size_iter.erase(end_it);
        // If the block is being merged then also remove it from
        // free_end_offset_to_size_iter.
        free_end_offset_to_size_iter.erase(end_offset + merge_block_size);
      }
      // Merge when the freed block exists at the end of another free block.
      auto start_it = free_end_offset_to_size_iter.find(freed_offset);
      if (start_it != free_end_offset_to_size_iter.end()) {
        auto merge_block_iter = start_it->second;
        auto merge_block_size = merge_block_iter->first;
        freed_size += merge_block_size;
        freed_offset -= merge_block_size;
        free_size_to_offset.erase(merge_block_iter);
        free_end_offset_to_size_iter.erase(start_it);
        // If the block is being merged then also remove it from
        // free_start_offset_to_size_iter.
        free_start_offset_to_size_iter.erase(freed_offset);
      }
      auto freed_block_it =
          free_size_to_offset.emplace(freed_size, freed_offset).first;
      free_start_offset_to_size_iter.emplace(freed_offset, freed_block_it);
      free_end_offset_to_size_iter.emplace(
          freed_offset + freed_size, freed_block_it);
    }
  }
  TORCH_CHECK(
      validate_allocation_plan(mem_events, allocation_offsets),
      "ProfilingAllocator: Allocation plan invalid.");
  return allocation_offsets;
}

} // namespace

void AllocationPlan::clear() {
  allocation_sizes.clear();
  allocation_lifetimes.clear();
  allocation_offsets.clear();
}

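// Records one allocation observed while profiling. In validation mode the
// request is checked against an existing plan instead of being recorded.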
void AllocationPlanner::record_allocation(
    const uint64_t size,
    const void* ptr) {
  if (validation_mode_) {
    validation_success = validation_success && validate_allocation(size, ptr);
    return;
  }
  allocation_plan_->allocation_sizes.push_back(size);
  allocation_plan_->allocation_lifetimes.push_back(
      std::numeric_limits<uint64_t>::max());
  allocation_ptr_to_id_[ptr] = allocation_id_;
  allocation_id_++;
}

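// Records the free of a previously recorded allocation. The lifetime of an
// allocation is expressed as the allocation id that is current when the free
// happens; frees of unrecorded pointers are ignored.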
void AllocationPlanner::record_free(const void* ptr) {
  if (validation_mode_) {
    validation_success = validation_success && validate_free(ptr);
    return;
  }
  auto it = allocation_ptr_to_id_.find(ptr);
  if (it == allocation_ptr_to_id_.end()) {
    // The free being recorded is for a pointer allocated outside of
    // WithProfileAllocationsGuard.
    return;
  }
  auto id = it->second;
  TORCH_CHECK(
      id < allocation_plan_->allocation_lifetimes.size(),
      "Allocation must have been recorded during record_allocation.");
  allocation_plan_->allocation_lifetimes[id] = allocation_id_;
}

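// Checks that the next allocation matches the recorded plan in both order
// and size. Returns false (with a warning) on any mismatch; the overall
// result is surfaced through WithValidateAllocationPlanGuard.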
bool AllocationPlanner::validate_allocation(
    const uint64_t size,
    const void* ptr) {
  if (allocation_id_ >= allocation_plan_->allocation_sizes.size()) {
    TORCH_WARN(
        "Allocation request does not match plan:",
        "Allocation id:",
        allocation_id_,
        ", Number of recorded allocations:",
        allocation_plan_->allocation_sizes.size());
    return false;
  }
  if (allocation_plan_->allocation_sizes[allocation_id_] != size) {
    TORCH_WARN(
        "Allocation request does not match plan:",
        "Allocation id:",
        allocation_id_,
        ", Recorded size of the requested allocation:",
        allocation_plan_->allocation_sizes[allocation_id_],
        ", but got:",
        size);
    return false;
  }
  allocation_ptr_to_id_[ptr] = allocation_id_;
  allocation_id_++;
  return true;
}

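// Checks that a free observed during validation happens at the same point
// (same current allocation id) as the free recorded in the plan.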
bool AllocationPlanner::validate_free(const void* ptr) {
  auto it = allocation_ptr_to_id_.find(ptr);
  if (it == allocation_ptr_to_id_.end()) {
    // Allocation that was made outside the validation scope is being freed here
    return true;
  }
  auto id = (*it).second;
  TORCH_CHECK(
      id < allocation_plan_->allocation_lifetimes.size(),
      "Allocation must have been recorded during validate_allocation.");
  auto lifetime_id = allocation_plan_->allocation_lifetimes[id];
  return (lifetime_id == allocation_id_);
}

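// Computes offsets for all recorded allocations and the total blob size
// needed to satisfy them (the high-water mark over all managed allocations).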
void AllocationPlanner::formulate_plan() {
  allocation_plan_->allocation_offsets = formulate_greedy_allocation_plan(
      allocation_plan_->allocation_sizes,
      allocation_plan_->allocation_lifetimes);
  allocation_plan_->total_size = 0;
  for (const auto i : c10::irange(allocation_plan_->allocation_sizes.size())) {
    if (allocation_plan_->allocation_lifetimes[i] ==
        std::numeric_limits<uint64_t>::max()) {
      continue;
    }
    auto limit = allocation_plan_->allocation_offsets[i] +
        allocation_plan_->allocation_sizes[i];
    allocation_plan_->total_size =
        std::max(allocation_plan_->total_size, limit);
  }
}

void AllocationPlanner::clear() {
  allocation_plan_->clear();
  allocation_ptr_to_id_.clear();
}

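// Binds the allocator to a finalized plan and grows the backing blob if the
// plan needs more memory than is currently reserved.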
void CPUProfilingAllocator::set_plan(const AllocationPlan* plan) {
  TORCH_CHECK(plan != nullptr, "Allocation plan is nullptr.");
  plan_ = plan;
  allocation_id_ = 0;
  allocation_ptr_to_id_.clear();
  if (current_size_ < plan->total_size) {
    // Free existing memory and reallocate for larger size.
    c10::free_cpu(blob_);
    blob_ = c10::alloc_cpu(plan->total_size);
    current_size_ = plan->total_size;
  }
}

void CPUProfilingAllocator::unset_plan() {
  allocation_id_ = 0;
  allocation_ptr_to_id_.clear();
  plan_ = nullptr;
}

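// Serves the next planned allocation out of the preallocated blob at its
// planned offset. Requests the plan does not manage fall through to
// c10::alloc_cpu.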
void* CPUProfilingAllocator::allocate(const size_t bytes) {
  TORCH_CHECK(
      bytes == plan_->allocation_sizes[allocation_id_],
      "Got allocation request that does not match the plan.");
  if (plan_->allocation_lifetimes[allocation_id_] ==
      std::numeric_limits<uint64_t>::max()) {
    // This allocation is not managed by ProfilingAllocator.
    allocation_id_++;
    return c10::alloc_cpu(bytes);
  }
  void* ptr = reinterpret_cast<uint8_t*>(blob_) +
      plan_->allocation_offsets[allocation_id_];
  allocation_ptr_to_id_[ptr] = allocation_id_;
  allocation_id_++;
  return ptr;
}

void CPUProfilingAllocator::free(void* const ptr) {
  auto it = allocation_ptr_to_id_.find(ptr);
  if (it == allocation_ptr_to_id_.end()) {
    // Either
    // 1. an allocation that was made outside the profiling scope is being
    //    freed here, or
    // 2. an allocation that is not managed by the profiling allocator is
    //    being freed.
    // Example of the second type:
    //   Tensor out;
    //   for (....) {
    //     {
    //       // CPUProfilingAllocator guard scope
    //       out = ...some op (this also frees the previous memory held by out)
    //     }
    //     // out is used...
    //   }
    c10::free_cpu(ptr);
    return;
  }
  auto id = it->second;
  TORCH_CHECK(
      id < plan_->allocation_lifetimes.size(),
      "Freeing an allocation that is not part of the plan.");
  auto lifetime_id = plan_->allocation_lifetimes[id];
  TORCH_CHECK(
      lifetime_id == allocation_id_,
      "Lifetime of allocation does not match: allocation_id ",
      id,
      ", expected:",
      lifetime_id,
      ", got:",
      allocation_id_);
}

CPUProfilingAllocator::~CPUProfilingAllocator() {
  c10::free_cpu(blob_);
}

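// Installs a thread-local AllocationPlanner for the lifetime of the guard so
// that allocations and frees made in this scope are recorded into `plan`.
// The plan is formulated when the guard goes out of scope.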
WithProfileAllocationsGuard::WithProfileAllocationsGuard(AllocationPlan* plan) {
  // Nesting of allocation profiling does not seem meaningful.
  TORCH_CHECK(
      allocation_planner == nullptr,
      "Nesting profiling allocations is not supported.");
  planner_ = std::make_unique<AllocationPlanner>(plan);
  planner_->clear();
  allocation_planner = planner_.get();
}

WithProfileAllocationsGuard::~WithProfileAllocationsGuard() {
  planner_->formulate_plan();
  allocation_planner = nullptr;
}

WithValidateAllocationPlanGuard::WithValidateAllocationPlanGuard(
    AllocationPlan* plan,
    bool* success) {
  // Nesting of allocation profiling does not seem meaningful.
  TORCH_CHECK(
      allocation_planner == nullptr,
      "Nesting profiling allocations is not supported.");
  planner_ = std::make_unique<AllocationPlanner>(plan, true);
  success_ = success;
  allocation_planner = planner_.get();
}

WithValidateAllocationPlanGuard::~WithValidateAllocationPlanGuard() {
  *success_ = planner_->validation_success;
  allocation_planner = nullptr;
}

AllocationPlanner* GetThreadLocalAllocationPlanner() {
  return allocation_planner;
}

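// Installs a thread-local CPUProfilingAllocator bound to a previously
// formulated plan for the lifetime of the guard; the plan is unset when the
// guard goes out of scope.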
WithProfilingAllocatorGuard::WithProfilingAllocatorGuard(
    CPUProfilingAllocator* allocator,
    const AllocationPlan* plan) {
  // Nesting of profiling allocators is not supported.
  TORCH_CHECK(
      profiling_allocator == nullptr,
      "Nesting profiling allocators is not supported.");
  profiling_allocator = allocator;
  profiling_allocator->set_plan(plan);
}

WithProfilingAllocatorGuard::~WithProfilingAllocatorGuard() {
  profiling_allocator->unset_plan();
  profiling_allocator = nullptr;
}

CPUProfilingAllocator* GetThreadLocalProfilingAllocator() {
  return profiling_allocator;
}

} // namespace c10