1 | #include <climits> |
2 | |
3 | #include <c10/core/impl/alloc_cpu.h> |
4 | #include <c10/mobile/CPUProfilingAllocator.h> |
5 | #include <c10/util/irange.h> |
6 | |
#include <algorithm>
#include <cstdint>
#include <limits>
#include <map>
#include <set>
#include <vector>
9 | |
10 | namespace c10 { |
11 | |
12 | namespace { |
13 | thread_local AllocationPlanner* allocation_planner{nullptr}; |
14 | thread_local CPUProfilingAllocator* profiling_allocator{nullptr}; |
15 | |
16 | struct MemBlock { |
17 | uint64_t start_offset, end_offset; |
18 | MemBlock(uint64_t s, uint64_t e) : start_offset(s), end_offset(e) {} |
19 | bool operator<(const MemBlock& other) const { |
20 | return start_offset < other.start_offset; |
21 | } |
22 | }; |
23 | |
24 | enum class EventType { Allocate = 0, Free, Invalid }; |
25 | |
26 | struct MemEvent { |
27 | uint64_t time; |
28 | uint64_t allocation_id; |
29 | uint64_t size; |
30 | EventType type{EventType::Invalid}; |
31 | MemEvent(uint64_t t, uint64_t id, uint64_t s, EventType e) |
32 | : time(t), allocation_id(id), size(s), type(e) {} |
33 | }; |
34 | |
35 | bool overlaps(const MemBlock& a, const MemBlock& b) { |
  // Two blocks don't overlap if one ends at or before the other starts:
  // |---a--------|--------------b--------|
  // start_a   end_a <= start_b         end_b
39 | return !( |
40 | (a.end_offset <= b.start_offset) || (b.end_offset <= a.start_offset)); |
41 | } |
42 | |
43 | bool validate_allocation_plan( |
44 | const std::vector<MemEvent>& alloc_events, |
45 | const std::vector<uint64_t>& allocation_offsets) { |
46 | std::set<MemBlock> allocations; |
47 | for (const auto& event : alloc_events) { |
48 | auto alloc_id = event.allocation_id; |
49 | // Skip allocations not managed by AllocationPlan |
50 | if (allocation_offsets[alloc_id] == std::numeric_limits<uint64_t>::max()) { |
51 | continue; |
52 | } |
53 | auto start_offset = allocation_offsets[alloc_id]; |
54 | auto end_offset = allocation_offsets[alloc_id] + event.size; |
55 | MemBlock mem_block(start_offset, end_offset); |
56 | if (event.type == EventType::Allocate) { |
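      // Blocks in `allocations` are ordered by start_offset and are pairwise
      // non-overlapping, so a newly placed block can only collide with its
      // immediate neighbors: the first block at or after it (lower_bound)
      // and the block just before it. Checking these two is sufficient.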
57 | auto it = allocations.lower_bound(mem_block); |
58 | if (it != allocations.end()) { |
59 | auto next_block = *it; |
60 | if (overlaps(next_block, mem_block)) { |
61 | return false; |
62 | } |
63 | } |
64 | if (it != allocations.begin()) { |
65 | auto prev_block = *(--it); |
66 | if (overlaps(prev_block, mem_block)) { |
67 | return false; |
68 | } |
69 | } |
70 | allocations.emplace(mem_block); |
    } else if (event.type == EventType::Free) {
      auto it = allocations.find(mem_block);
      TORCH_CHECK(
          it != allocations.end(),
          "ProfilingAllocator: Allocate event "
          "must have preceded deallocate event.");
      TORCH_CHECK(
          it->end_offset == end_offset,
          "End offset of allocation being freed must match the one recorded.");
      allocations.erase(it);
81 | } else { |
      TORCH_CHECK(false, "ProfilingAllocator: Invalid event type.");
83 | } |
84 | } |
85 | return true; |
86 | } |
87 | |
88 | std::vector<MemEvent> create_and_sort_mem_events( |
89 | const std::vector<uint64_t>& allocation_sizes, |
90 | const std::vector<uint64_t>& allocation_lifetimes) { |
91 | std::vector<MemEvent> events; |
  for (const auto i : c10::irange(allocation_sizes.size())) {
    // If an observed allocation is freed outside the scope of
    // observation, it is not managed by the AllocationPlan.
96 | if (allocation_lifetimes[i] == std::numeric_limits<uint64_t>::max()) { |
97 | continue; |
98 | } |
99 | events.emplace_back(i, i, allocation_sizes[i], EventType::Allocate); |
100 | events.emplace_back( |
101 | allocation_lifetimes[i], i, allocation_sizes[i], EventType::Free); |
102 | } |
103 | std::sort( |
104 | events.begin(), |
105 | events.end(), |
106 | [](const MemEvent& a, const MemEvent& b) -> bool { |
107 | return a.time < b.time; |
108 | }); |
109 | return events; |
110 | } |
111 | |
112 | std::vector<uint64_t> formulate_greedy_allocation_plan( |
113 | const std::vector<uint64_t>& allocation_sizes, |
114 | const std::vector<uint64_t>& allocation_lifetimes) { |
  // Step 1. Construct all allocation/free events.
  //         Sort these events by timestamp.
  // Step 2. Iterate through all events.
  //  2.1 If allocate event:
  //      Find the candidates in the free_size_to_offset map.
  //      Greedily pick the first one.
  //      Remove the entry from the free_size_to_offset map.
  //      new_offset = offset + request_size
  //      new_size = size - request_size
  //      Add the new entry to both maps.
  //  2.2 If free event:
  //      Check if the returned offset merges with another chunk.
  //      If so, merge until no more merging is possible.
  //      If the returned offset does not merge, then
  //      just return it as a chunk.
130 | |
  // lower_bound on this map finds the smallest free block that is
  // at least as large as the requested allocation.
133 | std::map<uint64_t, uint64_t> free_size_to_offset; |
  // These maps provide fast lookup when we want to insert a freed block
  // back, especially when we want to merge blocks.
136 | ska::flat_hash_map<uint64_t, std::map<uint64_t, uint64_t>::iterator> |
137 | free_start_offset_to_size_iter; |
138 | ska::flat_hash_map<uint64_t, std::map<uint64_t, uint64_t>::iterator> |
139 | free_end_offset_to_size_iter; |
  // Upon free, end_ptr = offset + size.
  // If end_ptr exists as the start of a free block, merge the freed
  // allocation with it: find the corresponding entry in free_size_to_offset,
  // remove it, and re-insert it with the new size and offset.
  // If end_ptr does not exist, just insert (offset, size) into the map and,
  // correspondingly, (size, offset) into the other map.
  // Merging should be repeated until no more mergeable chunks are found.
  // After the last free only one entry should be left in these maps.
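  //
  // For illustration, a hypothetical event trace (not taken from any real
  // plan) and the resulting state of the pool:
  //   alloc A (1024 B) -> no free block, A = [0, 1024),    max_offset = 1024
  //   alloc B (512 B)  -> no free block, B = [1024, 1536), max_offset = 1536
  //   free A           -> free block [0, 1024) is recorded in the maps
  //   alloc C (256 B)  -> carved out of [0, 1024): C = [0, 256), and the
  //                       remainder [256, 1024) goes back into the maps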
149 | |
150 | std::vector<uint64_t> allocation_offsets( |
151 | allocation_sizes.size(), std::numeric_limits<uint64_t>::max()); |
152 | auto mem_events = |
153 | create_and_sort_mem_events(allocation_sizes, allocation_lifetimes); |
154 | uint64_t max_offset{0}; |
155 | for (const auto& mem_event : mem_events) { |
    uint64_t alloc_offset = 0;
    uint64_t new_offset = 0;
    uint64_t new_size = 0;
160 | if (mem_event.type == EventType::Allocate) { |
161 | auto it = free_size_to_offset.lower_bound(mem_event.size); |
162 | if (it == free_size_to_offset.end()) { |
        // If there is no free block of the requested size,
        // allocate a new one at the end.
165 | alloc_offset = max_offset; |
166 | max_offset += mem_event.size; |
167 | } else { |
        // If we have found a block of the size we want:
        // 1. Change the block by allocating out of it.
        //    1.1 Erase the entire block.
        //    1.2 Erase the reverse map entries.
        // 2. If the block still has space left, insert the remainder back in
        //    the map, including the reverse map entries.
        alloc_offset = it->second;
        new_offset = alloc_offset + mem_event.size;
        new_size = it->first - mem_event.size;
        // Erase the reverse map entries before erasing `it`: erasing from
        // free_size_to_offset invalidates the iterator we still read from.
        free_start_offset_to_size_iter.erase(alloc_offset);
        free_end_offset_to_size_iter.erase(alloc_offset + it->first);
        free_size_to_offset.erase(it);
180 | if (new_size > 0) { |
181 | auto ref_it = free_size_to_offset.emplace(new_size, new_offset).first; |
182 | free_start_offset_to_size_iter.emplace(new_offset, ref_it); |
183 | free_end_offset_to_size_iter.emplace(new_offset + new_size, ref_it); |
184 | } |
185 | } |
186 | allocation_offsets[mem_event.allocation_id] = alloc_offset; |
187 | } else { |
188 | // 1. Check if freed block is adjacent to an existing free block |
189 | // at its end boundary. This is done by checking |
190 | // free_end_offset_to_size_iter. |
191 | // If we find such a block, remove it and adjust size of |
192 | // the block being freed. |
193 | // 2. Similarly check if freed block is adjacent to an existing |
194 | // free block at start boundary. This is done by checking |
195 | // free_start_offset_to_size_iter. |
196 | // If we find such a block, remove it and adjust size of |
197 | // the block being freed. |
198 | // 3. Insert the freed block in map. |
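      // For illustration (hypothetical state): with free blocks [0, 256) and
      // [512, 1024) already in the maps, freeing [256, 512) merges with both
      // neighbors, leaving a single free block [0, 1024).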
199 | auto freed_offset = allocation_offsets[mem_event.allocation_id]; |
200 | auto freed_size = mem_event.size; |
201 | auto end_offset = freed_offset + freed_size; |
      // Merge when another free block exists at the end of this block
203 | auto end_it = free_start_offset_to_size_iter.find(end_offset); |
204 | if (end_it != free_start_offset_to_size_iter.end()) { |
205 | auto merge_block_iter = end_it->second; |
206 | auto merge_block_size = merge_block_iter->first; |
207 | freed_size += merge_block_size; |
208 | free_size_to_offset.erase(merge_block_iter); |
209 | free_start_offset_to_size_iter.erase(end_it); |
210 | // If the block is being merged then also remove it from |
211 | // free_end_offset_to_size_iter |
212 | free_end_offset_to_size_iter.erase(end_offset + merge_block_size); |
213 | } |
      // Merge when the freed block exists at the end of another free block
215 | auto start_it = free_end_offset_to_size_iter.find(freed_offset); |
216 | if (start_it != free_end_offset_to_size_iter.end()) { |
217 | auto merge_block_iter = start_it->second; |
218 | auto merge_block_size = merge_block_iter->first; |
219 | freed_size += merge_block_size; |
220 | freed_offset -= merge_block_size; |
221 | free_size_to_offset.erase(merge_block_iter); |
222 | free_end_offset_to_size_iter.erase(start_it); |
223 | // If the block is being merged then also remove it from |
224 | // free_start_offset_to_size_iter |
225 | free_start_offset_to_size_iter.erase(freed_offset); |
226 | } |
227 | auto freed_block_it = |
228 | free_size_to_offset.emplace(freed_size, freed_offset).first; |
229 | free_start_offset_to_size_iter.emplace(freed_offset, freed_block_it); |
230 | free_end_offset_to_size_iter.emplace( |
231 | freed_offset + freed_size, freed_block_it); |
232 | } |
233 | } |
234 | TORCH_CHECK( |
235 | validate_allocation_plan(mem_events, allocation_offsets), |
236 | "ProfilingAllocator: Allocation plan invalid." ); |
237 | return allocation_offsets; |
238 | } |
239 | |
240 | } // namespace |
241 | |
242 | void AllocationPlan::clear() { |
243 | allocation_sizes.clear(); |
244 | allocation_lifetimes.clear(); |
245 | allocation_offsets.clear(); |
246 | } |
247 | |
248 | void AllocationPlanner::record_allocation( |
249 | const uint64_t size, |
250 | const void* ptr) { |
251 | if (validation_mode_) { |
252 | validation_success = validation_success && validate_allocation(size, ptr); |
253 | return; |
254 | } |
255 | allocation_plan_->allocation_sizes.push_back(size); |
256 | allocation_plan_->allocation_lifetimes.push_back( |
257 | std::numeric_limits<uint64_t>::max()); |
258 | allocation_ptr_to_id_[ptr] = allocation_id_; |
259 | allocation_id_++; |
260 | } |
261 | |
262 | void AllocationPlanner::record_free(const void* ptr) { |
263 | if (validation_mode_) { |
264 | validation_success = validation_success && validate_free(ptr); |
265 | return; |
266 | } |
267 | auto it = allocation_ptr_to_id_.find(ptr); |
268 | if (it == allocation_ptr_to_id_.end()) { |
    // The pointer being freed was allocated outside of
    // WithProfileAllocationsGuard.
270 | return; |
271 | } |
272 | auto id = it->second; |
273 | TORCH_CHECK( |
274 | id < allocation_plan_->allocation_lifetimes.size(), |
275 | "Allocation must have been recorded during record_allocation." ); |
276 | allocation_plan_->allocation_lifetimes[id] = allocation_id_; |
277 | } |
278 | |
279 | bool AllocationPlanner::validate_allocation( |
280 | const uint64_t size, |
281 | const void* ptr) { |
  if (allocation_id_ >= allocation_plan_->allocation_sizes.size()) {
    TORCH_WARN(
        "Allocation request does not match plan: allocation id ",
        allocation_id_,
        " exceeds the number of recorded allocations ",
        allocation_plan_->allocation_sizes.size());
    return false;
  }
  if (allocation_plan_->allocation_sizes[allocation_id_] != size) {
    TORCH_WARN(
        "Allocation request does not match plan: allocation id ",
        allocation_id_,
        ", recorded size of the requested allocation: ",
        allocation_plan_->allocation_sizes[allocation_id_],
        ", but got: ",
        size);
    return false;
  }
297 | allocation_ptr_to_id_[ptr] = allocation_id_; |
298 | allocation_id_++; |
299 | return true; |
300 | } |
301 | |
302 | bool AllocationPlanner::validate_free(const void* ptr) { |
303 | auto it = allocation_ptr_to_id_.find(ptr); |
304 | if (it == allocation_ptr_to_id_.end()) { |
305 | // Allocation that was made outside the validation scope is being freed here |
306 | return true; |
307 | } |
  auto id = it->second;
309 | TORCH_CHECK( |
310 | id < allocation_plan_->allocation_lifetimes.size(), |
311 | "Allocation must have been recorded during validate_allocation." ); |
312 | auto lifetime_id = allocation_plan_->allocation_lifetimes[id]; |
313 | return (lifetime_id == allocation_id_); |
314 | } |
315 | |
316 | void AllocationPlanner::formulate_plan() { |
317 | allocation_plan_->allocation_offsets = formulate_greedy_allocation_plan( |
318 | allocation_plan_->allocation_sizes, |
319 | allocation_plan_->allocation_lifetimes); |
320 | allocation_plan_->total_size = 0; |
321 | for (const auto i : c10::irange(allocation_plan_->allocation_sizes.size())) { |
322 | if (allocation_plan_->allocation_lifetimes[i] == |
323 | std::numeric_limits<uint64_t>::max()) { |
324 | continue; |
325 | } |
326 | auto limit = allocation_plan_->allocation_offsets[i] + |
327 | allocation_plan_->allocation_sizes[i]; |
328 | allocation_plan_->total_size = |
329 | std::max(allocation_plan_->total_size, limit); |
330 | } |
331 | } |
332 | |
333 | void AllocationPlanner::clear() { |
334 | allocation_plan_->clear(); |
335 | allocation_ptr_to_id_.clear(); |
336 | } |
337 | |
338 | void CPUProfilingAllocator::set_plan(const AllocationPlan* plan) { |
  TORCH_CHECK(plan != nullptr, "Allocation plan is nullptr.");
340 | plan_ = plan; |
341 | allocation_id_ = 0; |
342 | allocation_ptr_to_id_.clear(); |
343 | if (current_size_ < plan->total_size) { |
344 | // Free existing memory and reallocate for larger size. |
345 | c10::free_cpu(blob_); |
346 | blob_ = c10::alloc_cpu(plan->total_size); |
347 | current_size_ = plan->total_size; |
348 | } |
349 | } |
350 | |
351 | void CPUProfilingAllocator::unset_plan() { |
352 | allocation_id_ = 0; |
353 | allocation_ptr_to_id_.clear(); |
354 | plan_ = nullptr; |
355 | } |
356 | |
357 | void* CPUProfilingAllocator::allocate(const size_t bytes) { |
358 | TORCH_CHECK( |
359 | bytes == plan_->allocation_sizes[allocation_id_], |
360 | "Got allocation request that does not match with the plan." ); |
361 | if (plan_->allocation_lifetimes[allocation_id_] == |
362 | std::numeric_limits<uint64_t>::max()) { |
363 | // This allocation is not managed by ProfilingAllocator. |
364 | allocation_id_++; |
365 | return c10::alloc_cpu(bytes); |
366 | } |
367 | void* ptr = reinterpret_cast<uint8_t*>(blob_) + |
368 | plan_->allocation_offsets[allocation_id_]; |
369 | allocation_ptr_to_id_[ptr] = allocation_id_; |
370 | allocation_id_++; |
371 | return ptr; |
372 | } |
373 | |
374 | void CPUProfilingAllocator::free(void* const ptr) { |
375 | auto it = allocation_ptr_to_id_.find(ptr); |
376 | if (it == allocation_ptr_to_id_.end()) { |
377 | // Either |
378 | // 1. Allocation that was made outside the validation scope is being freed |
379 | // here or |
380 | // 2. Allocation that is not managed by profiling allocator is being freed. |
    //    Example of the second type:
    //    Tensor out;
    //    for (....) {
    //      {
    //        CPUProfilingAllocator
    //        out = ...some op (this also frees previous memory held by out)
    //      }
    //      out is used ...
389 | // } |
390 | c10::free_cpu(ptr); |
391 | return; |
392 | } |
393 | auto id = it->second; |
  TORCH_CHECK(
      id < plan_->allocation_lifetimes.size(),
      "Freeing an allocation that is not according to the plan.");
  auto lifetime_id = plan_->allocation_lifetimes[id];
  TORCH_CHECK(
      lifetime_id == allocation_id_,
      "Lifetime of allocation does not match: allocation_id ",
      id,
      ", expected: ",
      lifetime_id,
      ", got: ",
      allocation_id_);
406 | } |
407 | |
408 | CPUProfilingAllocator::~CPUProfilingAllocator() { |
409 | c10::free_cpu(blob_); |
410 | } |
411 | |
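// A minimal usage sketch of the profiling phase (illustrative only; `m` and
// its forward(...) call are placeholder names, not part of this file):
//
//   AllocationPlan plan;
//   {
//     WithProfileAllocationsGuard profile_guard(&plan);
//     m.forward(inputs); // run the workload once to record allocations
//   }
//   // `plan` now holds the recorded sizes, lifetimes and computed offsets.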
412 | WithProfileAllocationsGuard::WithProfileAllocationsGuard(AllocationPlan* plan) { |
413 | // Nesting of allocation profiling does not seem meaningful. |
414 | TORCH_CHECK( |
415 | allocation_planner == nullptr, |
416 | "Nesting profiling allocations is not supported." ); |
417 | planner_ = std::make_unique<AllocationPlanner>(plan); |
418 | planner_->clear(); |
419 | allocation_planner = planner_.get(); |
420 | } |
421 | |
422 | WithProfileAllocationsGuard::~WithProfileAllocationsGuard() { |
423 | planner_->formulate_plan(); |
424 | allocation_planner = nullptr; |
425 | } |
426 | |
427 | WithValidateAllocationPlanGuard::WithValidateAllocationPlanGuard( |
428 | AllocationPlan* plan, |
429 | bool* success) { |
  // Nesting of allocation plan validation does not seem meaningful.
  TORCH_CHECK(
      allocation_planner == nullptr,
      "Nesting profiling allocations is not supported.");
434 | planner_ = std::make_unique<AllocationPlanner>(plan, true); |
435 | success_ = success; |
436 | allocation_planner = planner_.get(); |
437 | } |
438 | |
439 | WithValidateAllocationPlanGuard::~WithValidateAllocationPlanGuard() { |
440 | *success_ = planner_->validation_success; |
441 | allocation_planner = nullptr; |
442 | } |
443 | |
444 | AllocationPlanner* GetThreadLocalAllocationPlanner() { |
445 | return allocation_planner; |
446 | } |
447 | |
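// A minimal usage sketch of the replay phase (illustrative only; assumes a
// `plan` formulated by WithProfileAllocationsGuard and the same workload):
//
//   CPUProfilingAllocator profiling_allocator;
//   {
//     WithProfilingAllocatorGuard allocator_guard(&profiling_allocator, &plan);
//     m.forward(inputs); // allocations routed through the mobile CPU
//                        // allocator are now served from one blob
//   }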
448 | WithProfilingAllocatorGuard::WithProfilingAllocatorGuard( |
449 | CPUProfilingAllocator* allocator, |
450 | const AllocationPlan* plan) { |
451 | // Nesting of profiling allocator is not supported. |
452 | TORCH_CHECK( |
453 | profiling_allocator == nullptr, |
454 | "Nesting profiling allocators is not supported." ); |
455 | profiling_allocator = allocator; |
456 | profiling_allocator->set_plan(plan); |
457 | } |
458 | |
459 | WithProfilingAllocatorGuard::~WithProfilingAllocatorGuard() { |
460 | profiling_allocator->unset_plan(); |
461 | profiling_allocator = nullptr; |
462 | } |
463 | |
464 | CPUProfilingAllocator* GetThreadLocalProfilingAllocator() { |
465 | return profiling_allocator; |
466 | } |
467 | |
468 | } // namespace c10 |
469 | |