1 | #pragma once |
2 | |
#include <c10/core/Allocator.h>
#include <c10/cuda/CUDAGraphsC10Utils.h>
#include <c10/cuda/CUDAMacros.h>
#include <c10/cuda/CUDAStream.h>
#include <c10/util/Registry.h>

#include <array>
#include <mutex>
#include <utility>
11 | |
12 | namespace c10 { |
13 | |
// The caching allocator will execute every registered callback when it is
// unable to find a free block within the already allocated area.
// Hook interface: implementations are registered in
// FreeCudaMemoryCallbacksRegistry (below) and invoked by the allocator to
// try to release memory held elsewhere.
// Execute() returns a bool — presumably whether any memory was actually
// freed, so the allocator knows a retry is worthwhile; TODO(review):
// confirm against the call site in CUDACachingAllocator.cpp.
class C10_CUDA_API FreeMemoryCallback {
 public:
  virtual ~FreeMemoryCallback() = default;
  virtual bool Execute() = 0;
};
21 | |
// Registry holding every FreeMemoryCallback implementation.
C10_DECLARE_REGISTRY(FreeCudaMemoryCallbacksRegistry, FreeMemoryCallback);
// Convenience macro to register a FreeMemoryCallback subclass by name.
#define REGISTER_FREE_MEMORY_CALLBACK(name, ...) \
  C10_REGISTER_CLASS(FreeCudaMemoryCallbacksRegistry, name, __VA_ARGS__);
25 | |
26 | namespace cuda { |
27 | |
28 | // TODO: Turn this into an honest to goodness class. I briefly attempted to do |
29 | // this, but it was a bit irritating to figure out how to also correctly |
30 | // apply pimpl pattern so I didn't have to leak any internal implementation |
31 | // details in the header (CUDACachingAllocator could be made a pimpl, but |
32 | // you also need to appropriately define a class which is a subclass |
33 | // of Allocator. Not impossible, but required a bit more surgery than |
34 | // I wanted to do at the time.) |
35 | // |
36 | // Why is this using a namespace rather than old-style THCCachingAllocator_ |
37 | // prefix? Mostly because it made the HIPify rules easier to write; _ is |
38 | // not counted as a word boundary, so you would otherwise have to list each |
39 | // of these functions. |
40 | |
41 | namespace CUDACachingAllocator { |
42 | |
// A single allocator statistic. NOTE(review): semantics inferred from the
// field names and maintained by the backend — current/peak track a live
// value and its high-water mark; allocated/freed are lifetime totals.
struct Stat {
  int64_t current = 0;
  int64_t peak = 0;
  int64_t allocated = 0;
  int64_t freed = 0;
};
49 | |
// Buckets used to index a StatArray: one aggregate entry plus one per pool.
enum struct StatType : uint64_t {
  AGGREGATE = 0,
  SMALL_POOL = 1,
  LARGE_POOL = 2,
  NUM_TYPES = 3 // remember to update this whenever a new stat type is added
};
56 | |
57 | typedef std::array<Stat, static_cast<size_t>(StatType::NUM_TYPES)> StatArray; |
58 | |
// Struct containing memory allocator summary statistics for a device.
struct DeviceStats {
  // COUNT: allocations requested by client code
  StatArray allocation;
  // COUNT: number of allocated segments from cudaMalloc().
  StatArray segment;
  // COUNT: number of active memory blocks (allocated or used by stream)
  StatArray active;
  // COUNT: number of inactive, split memory blocks (unallocated but can't be
  // released via cudaFree)
  StatArray inactive_split;

  // SUM: bytes allocated by this memory allocator
  StatArray allocated_bytes;
  // SUM: bytes reserved by this memory allocator (both free and used)
  StatArray reserved_bytes;
  // SUM: bytes within active memory blocks
  StatArray active_bytes;
  // SUM: bytes within inactive, split memory blocks
  StatArray inactive_split_bytes;
  // SUM: bytes requested by client code
  StatArray requested_bytes;

  // COUNT: total number of failed calls to CUDA malloc necessitating cache
  // flushes.
  int64_t num_alloc_retries = 0;

  // COUNT: total number of OOMs (i.e. failed calls to CUDA after cache flush)
  int64_t num_ooms = 0;

  // COUNT: total number of oversize blocks allocated from pool
  Stat oversize_allocations;

  // COUNT: total number of oversize blocks requiring malloc
  Stat oversize_segments;

  // SIZE: maximum block size that is allowed to be split.
  int64_t max_split_size = 0;
};
98 | |
// Opaque base for per-watcher state attached to history/trace records;
// concrete subclasses are produced by a CreateContextFn (below).
struct Context {
  virtual ~Context() = default;
};
102 | |
103 | typedef std::shared_ptr<Context> (*CreateContextFn)(void); |
104 | |
// One record in a block's allocation history (see BlockInfo::history).
struct History {
  void* addr; // device address of the recorded allocation
  size_t real_size; // unrounded, actually requested size
  std::shared_ptr<Context> context; // per-watcher context
};
110 | |
// Struct containing info of an allocation block (i.e. a fractional part of a
// cudaMalloc).
struct BlockInfo {
  int64_t size = 0; // block size in bytes (cf. requested_size)
  int64_t requested_size = 0; // bytes originally requested by the client
  int32_t gc_counter = 0; // garbage-collection counter (semantics in backend)
  bool allocated = false; // currently handed out to client code
  bool active = false; // allocated or still in use by a stream
  std::vector<History> history; // per-allocation history records
};
121 | |
// Struct containing info of a memory segment (i.e. one contiguous cudaMalloc).
struct SegmentInfo {
  int64_t device = 0; // device ordinal owning this segment
  int64_t address = 0; // base device address of the segment
  int64_t total_size = 0; // full size of the segment in bytes
  int64_t requested_size = 0; // bytes requested by client code
  int64_t allocated_size = 0; // bytes currently allocated out of the segment
  int64_t active_size = 0; // bytes within active blocks
  cudaStream_t stream = 0; // stream the segment is associated with
  bool is_large = false; // true if the segment came from the large pool
  std::vector<BlockInfo> blocks; // constituent blocks of this segment
};
134 | |
135 | struct TraceEntry { |
136 | enum Action { |
137 | ALLOC, // API made to the caching allocator for new memory |
138 | FREE_REQUESTED, // API call made to the caching allocator to free memory |
139 | FREE_COMPLETED, // The allocator might have to delay a free because |
140 | // it is still in use on another stream via record_stream |
141 | // This event is generated when a free actually completes. |
142 | SEGMENT_ALLOC, // a call to cudaMalloc to get more memory from the OS |
143 | SEGMENT_FREE, // a call to cudaFree to return memory to the OS (e.g. to |
144 | // defragement or empty_caches) |
145 | SNAPSHOT, // a call to snapshot, used to correlate memory snapshots to trace |
146 | // events |
147 | OOM // the allocator threw an OutOfMemoryError (addr_ is the amount of free |
148 | // bytes reported by cuda) |
149 | }; |
150 | TraceEntry( |
151 | Action action, |
152 | int64_t addr, |
153 | size_t size, |
154 | cudaStream_t stream, |
155 | std::shared_ptr<Context> context = nullptr) |
156 | : action_(action), |
157 | addr_(addr), |
158 | context_(context), |
159 | stream_(stream), |
160 | size_(size) {} |
161 | Action action_; |
162 | int64_t addr_; // for OOM, this is the amount of free bytes reported by cuda |
163 | std::shared_ptr<Context> context_; |
164 | cudaStream_t stream_; |
165 | int64_t size_; |
166 | }; |
167 | |
// Full picture of allocator state: all segments plus recorded trace events
// (outer index of device_traces is presumably the device ordinal — confirm
// against the backend).
struct SnapshotInfo {
  std::vector<SegmentInfo> segments;
  std::vector<std::vector<TraceEntry>> device_traces;
};
172 | |
// Apply allocator settings parsed from `env` (the accepted key/value format
// is defined by the backend implementation).
C10_CUDA_API void setAllocatorSettings(const std::string& env);

// Size pretty-printer
std::string format_size(uint64_t size);

// Observer type registered via attachOutOfMemoryObserver(); presumably
// invoked when an allocation fails with OOM, carrying the device's memory
// figures at that point — confirm against the backend.
using OutOfMemoryObserver = std::function<void(
    int64_t device,
    int64_t allocated,
    int64_t device_total,
    int64_t device_free)>;
183 | |
// Abstract interface implemented by each caching-allocator backend. The
// active implementation is installed in the `allocator` atomic declared
// below and reached through the inline wrappers in this namespace.
class CUDAAllocator : public Allocator {
 public:
  // Raw malloc/free-style entry points (no DataPtr wrapping).
  virtual void* raw_alloc(size_t nbytes) = 0;
  virtual void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) = 0;
  virtual void raw_delete(void* ptr) = 0;
  // One-time setup with the number of visible devices; initialized()
  // reports whether init() has already run.
  virtual void init(int device_count) = 0;
  virtual bool initialized() = 0;
  // Limit this allocator's use of `device` memory to `fraction` (per the
  // name; exact semantics live in the backend).
  virtual void setMemoryFraction(double fraction, int device) = 0;
  // Release cached, unused memory back to the driver (cf. SEGMENT_FREE).
  virtual void emptyCache() = 0;
  virtual void cacheInfo(int dev_id, size_t* largestBlock) = 0;
  virtual void* getBaseAllocation(void* ptr, size_t* size) = 0;
  // Mark the allocation as in use on `stream`, so its free is delayed until
  // that stream's outstanding work completes (cf. FREE_COMPLETED).
  virtual void recordStream(const DataPtr&, CUDAStream stream) = 0;
  // Statistics and introspection.
  virtual DeviceStats getDeviceStats(int device) = 0;
  virtual void resetAccumulatedStats(int device) = 0;
  virtual void resetPeakStats(int device) = 0;
  virtual SnapshotInfo snapshot() = 0;
  // CUDAGraph capture lifecycle hooks.
  virtual void notifyCaptureBegin(
      int device,
      CaptureId_t graph_id,
      MempoolId_t mempool_id) = 0;
  virtual void notifyCaptureAboutToEnd(int device, CaptureId_t graph_id) = 0;
  virtual void notifyCaptureEnded(int device, CaptureId_t graph_id) = 0;
  virtual void notifyCaptureDestroy(int device, MempoolId_t mempool_id) = 0;
  virtual std::shared_ptr<void> getIpcDevPtr(std::string handle) = 0;
  // Enable/disable allocation-history recording.
  virtual void recordHistory(
      bool enabled,
      CreateContextFn context_recorder,
      size_t alloc_trace_max_entries,
      bool alloc_trace_record_context) = 0;
  virtual void attachOutOfMemoryObserver(OutOfMemoryObserver observer) = 0;
  virtual bool needsPoolSpecificPeerAccess() = 0;
  virtual std::string name() = 0;
};
217 | |
// Allocator object, statically initialized
// See BackendInitializer in CUDACachingAllocator.cpp.
// Atomic loads on x86 are just normal loads,
// (atomic stores are different), so reading this value
// is no different than loading a pointer.
// Points at the active backend; read via get() below.
C10_CUDA_API extern std::atomic<CUDAAllocator*> allocator;
224 | |
225 | inline CUDAAllocator* get() { |
226 | return allocator.load(); |
227 | } |
228 | |
229 | // Called directly by clients. |
230 | inline void* raw_alloc(size_t nbytes) { |
231 | return get()->raw_alloc(nbytes); |
232 | } |
233 | |
234 | inline void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) { |
235 | return get()->raw_alloc_with_stream(nbytes, stream); |
236 | } |
237 | |
238 | inline void raw_delete(void* ptr) { |
239 | return get()->raw_delete(ptr); |
240 | } |
241 | |
242 | inline void init(int device_count) { |
243 | return get()->init(device_count); |
244 | } |
245 | |
246 | inline void setMemoryFraction(double fraction, int device) { |
247 | return get()->setMemoryFraction(fraction, device); |
248 | } |
249 | |
250 | inline void emptyCache() { |
251 | return get()->emptyCache(); |
252 | } |
253 | |
254 | inline void cacheInfo(int dev_id, size_t* largestBlock) { |
255 | return get()->cacheInfo(dev_id, largestBlock); |
256 | } |
257 | |
258 | inline void* getBaseAllocation(void* ptr, size_t* size) { |
259 | return get()->getBaseAllocation(ptr, size); |
260 | } |
261 | |
262 | inline void recordStream(const DataPtr& dataPtr, CUDAStream stream) { |
263 | return get()->recordStream(dataPtr, stream); |
264 | } |
265 | |
266 | inline DeviceStats getDeviceStats(int device) { |
267 | return get()->getDeviceStats(device); |
268 | } |
269 | |
270 | inline void resetAccumulatedStats(int device) { |
271 | return get()->resetAccumulatedStats(device); |
272 | } |
273 | |
274 | inline void resetPeakStats(int device) { |
275 | return get()->resetPeakStats(device); |
276 | } |
277 | |
278 | inline SnapshotInfo snapshot() { |
279 | return get()->snapshot(); |
280 | } |
281 | |
282 | // CUDAGraph interactions |
283 | inline void notifyCaptureBegin( |
284 | int device, |
285 | CaptureId_t graph_id, |
286 | MempoolId_t mempool_id) { |
287 | return get()->notifyCaptureBegin(device, graph_id, mempool_id); |
288 | } |
289 | |
290 | inline void notifyCaptureAboutToEnd(int device, CaptureId_t graph_id) { |
291 | return get()->notifyCaptureAboutToEnd(device, graph_id); |
292 | } |
293 | |
294 | inline void recordHistory( |
295 | bool enabled, |
296 | CreateContextFn context_recorder, |
297 | size_t alloc_trace_max_entries, |
298 | bool alloc_trace_record_context) { |
299 | return get()->recordHistory( |
300 | enabled, |
301 | context_recorder, |
302 | alloc_trace_max_entries, |
303 | alloc_trace_record_context); |
304 | } |
305 | |
306 | inline void attachOutOfMemoryObserver(OutOfMemoryObserver observer) { |
307 | return get()->attachOutOfMemoryObserver(observer); |
308 | } |
309 | |
310 | inline void notifyCaptureEnded(int device, CaptureId_t graph_id) { |
311 | return get()->notifyCaptureEnded(device, graph_id); |
312 | } |
313 | |
314 | inline void notifyCaptureDestroy(int device, MempoolId_t mempool_id) { |
315 | return get()->notifyCaptureDestroy(device, mempool_id); |
316 | } |
317 | // Not part of CUDA_ALLOCATOR_BACKEND_INTERFACE |
318 | inline std::shared_ptr<void> getIpcDevPtr(std::string handle) { |
319 | return get()->getIpcDevPtr(handle); |
320 | } |
321 | |
322 | inline std::string name() { |
323 | return get()->name(); |
324 | } |
325 | |
326 | } // namespace CUDACachingAllocator |
327 | } // namespace cuda |
328 | } // namespace c10 |
329 | |