1 | #pragma once |
2 | |
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <deque>
#include <memory>
#include <mutex>
#include <vector>

#include <c10/util/Exception.h>
#include <c10/util/SmallVector.h>
#include <c10/util/flat_hash_map.h>
11 | |
12 | namespace c10 { |
13 | |
14 | /* |
15 | * Given a sequence of allocations in a thread, AllocationPlan records |
16 | * 1. size of each allocation |
17 | * 2. Lifetime of each allocation. |
18 | * 3. allocation offsets: Memory offset for each allocation in a single blob of |
19 | * memory |
20 | * 4. Total size of a blob of memory required to satisfy all the allocations. |
21 | */ |
22 | class C10_API AllocationPlan { |
23 | private: |
24 | // Records size of each allocation by their sequential allocation ids. |
25 | std::vector<uint64_t> allocation_sizes; |
26 | // This maps one allocation id (X) to another allocation id (Y). |
27 | // Allocation X is alive until allocation Y. From allocation Y onwards |
28 | // allocation X is not referenced. |
29 | // Thus Y is the id of the first allocation after X is freed. |
30 | // NB: When an allocation is recorded, along with recording its size, |
31 | // we also set the lifetime to be numeric_limits::max() |
32 | // This is to track allocations that are made during the scope of |
33 | // profiling but were not freed until after the scope ended. |
34 | // Such allocations are not managed by profiling allocator. |
35 | std::vector<uint64_t> allocation_lifetimes; |
36 | // Maps an allocation to some offset in a blob of memory. |
37 | std::vector<uint64_t> allocation_offsets; |
38 | uint64_t total_size{0}; |
39 | void clear(); |
40 | friend class AllocationPlanner; |
41 | friend class CPUProfilingAllocator; |
42 | }; |
43 | |
44 | /* |
45 | * Map of memory ptr to allocation id. This is auxiliary information only |
46 | * used to establish lifetime of allocations. |
47 | */ |
48 | class C10_API AllocationPlanner { |
49 | private: |
50 | AllocationPlan* allocation_plan_{nullptr}; |
51 | // Maps allocated ptr to its allocation id. |
52 | // This is used when freeing the memory to look up the allocation id |
53 | // in order to establish the lifetime of a particular allocation. |
54 | ska::flat_hash_map<const void*, uint64_t> allocation_ptr_to_id_; |
55 | uint64_t allocation_id_{0}; |
56 | bool validation_mode_{false}; |
57 | |
58 | bool validate_allocation(const uint64_t size, const void* ptr); |
59 | bool validate_free(const void* ptr); |
60 | |
61 | public: |
62 | bool validation_success{true}; |
63 | |
64 | AllocationPlanner() = delete; |
65 | AllocationPlanner(AllocationPlan* plan, bool validate = false) |
66 | : allocation_plan_(plan), validation_mode_(validate) {} |
67 | void record_allocation(const uint64_t size, const void* ptr); |
68 | void record_free(const void* ptr); |
69 | void formulate_plan(); |
70 | void clear(); |
71 | }; |
72 | |
73 | // NOT THREAD SAFE profiling allocator. |
74 | class C10_API CPUProfilingAllocator { |
75 | private: |
76 | const AllocationPlan* plan_{nullptr}; |
77 | uint64_t allocation_id_{0}; |
78 | uint64_t current_size_{0}; |
79 | void* blob_{nullptr}; |
80 | ska::flat_hash_map<const void*, uint64_t> allocation_ptr_to_id_; |
81 | |
82 | public: |
83 | ~CPUProfilingAllocator(); |
84 | void set_plan(const AllocationPlan* plan); |
85 | void unset_plan(); |
86 | void* allocate(const size_t bytes); |
87 | void free(void* const ptr); |
88 | }; |
89 | |
90 | /* |
91 | * Usage: Profile allocations made by one run of the model. |
92 | * AllocationPlan plan; |
93 | * { |
94 | * WithProfileAllocationGuard profile_guard(&plan); |
95 | * module.forward(...); |
96 | * } |
97 | * plan now contains allocation plan. |
98 | */ |
99 | class C10_API WithProfileAllocationsGuard { |
100 | public: |
101 | WithProfileAllocationsGuard(AllocationPlan* plan); |
102 | ~WithProfileAllocationsGuard(); |
103 | |
104 | private: |
105 | std::unique_ptr<AllocationPlanner> planner_; |
106 | }; |
107 | |
108 | /* |
109 | * Usage: Validate allocation plan made with WithProfileAllocationGuard |
110 | * bool plan_validation_success, success = true; |
111 | * for (some number of representative inputs) |
112 | * { |
113 | * WithValidateAllocationPlanGuard(&plan, &plan_validation_success); |
114 | * module.forward(...); |
115 | * success = success && plan_validation_success; |
116 | * } |
117 | * success == true means allocations are according to plan |
118 | * else for some inputs allocation pattern changed. |
119 | */ |
120 | class C10_API WithValidateAllocationPlanGuard { |
121 | public: |
122 | WithValidateAllocationPlanGuard(AllocationPlan* plan, bool* success); |
123 | ~WithValidateAllocationPlanGuard(); |
124 | |
125 | private: |
126 | std::unique_ptr<AllocationPlanner> planner_; |
127 | bool* success_; |
128 | }; |
129 | |
// Accessor for the thread-local AllocationPlanner (per its name); defined
// out of line.
// NOTE(review): presumably returns nullptr when no profiling or validation
// guard is active on the calling thread - confirm against the definition.
// NOTE(review): unlike the classes in this header, this declaration carries
// no C10_API annotation - confirm that is intentional.
AllocationPlanner* GetThreadLocalAllocationPlanner();
131 | |
132 | /* |
133 | * Usage: Allocate tensors accordingly to allocation plan |
134 | * First make allocation plan. |
135 | * See WithProfileAllocationsGuard usage. |
136 | * Second validate allocation plan. |
137 | * See WithValidateAllocationPlanGuard usage. |
138 | * CPUProfilingAllocator profiling_allocator; |
139 | * { |
140 | * WithProfilingAllocatorGuard allocator_guard(&profiling_allocator, &plan); |
141 | * module.forward(...); |
142 | * } |
143 | */ |
144 | class C10_API WithProfilingAllocatorGuard { |
145 | public: |
146 | WithProfilingAllocatorGuard( |
147 | CPUProfilingAllocator* allocator, |
148 | const AllocationPlan* plan); |
149 | ~WithProfilingAllocatorGuard(); |
150 | }; |
151 | |
// Accessor for the thread-local CPUProfilingAllocator (per its name); defined
// out of line.
// NOTE(review): presumably returns nullptr when no WithProfilingAllocatorGuard
// is active on the calling thread - confirm against the definition.
// NOTE(review): unlike the classes in this header, this declaration carries
// no C10_API annotation - confirm that is intentional.
CPUProfilingAllocator* GetThreadLocalProfilingAllocator();
153 | |
154 | } // namespace c10 |
155 | |