1#include <c10/cuda/CUDACachingAllocator.h>
2
3#include <c10/core/impl/GPUTrace.h>
4#include <c10/cuda/CUDAException.h>
5#include <c10/cuda/CUDAFunctions.h>
6#include <c10/cuda/CUDAGuard.h>
7#include <c10/util/UniqueVoidPtr.h>
8#include <c10/util/flat_hash_map.h>
9#include <c10/util/irange.h>
10#include <c10/util/llvmMathExtras.h>
11
12#include <cuda_runtime_api.h>
13#include <algorithm>
14#include <bitset>
15#include <cstdint>
16#include <deque>
17#include <iterator>
18#include <map>
19#include <memory>
20#include <mutex>
21#include <regex>
22#include <set>
23#include <utility>
24#include <vector>
25
26namespace c10 {
27
28C10_DEFINE_REGISTRY(FreeCudaMemoryCallbacksRegistry, FreeMemoryCallback);
29
30namespace cuda {
31namespace CUDACachingAllocator {
32namespace Native {
33
34//
35// Yet another caching allocator for CUDA device allocations.
36//
37// - Allocations are associated with a stream. Once freed, blocks can be
38// re-allocated on the same stream, but not on any other stream.
39// - The allocator attempts to find the smallest cached block that will fit the
40// requested size. If the block is larger than the requested size, it may be
41// split. If no block is found, the allocator will delegate to cudaMalloc.
42// - If the cudaMalloc fails, the allocator will attempt to free one cached
43// block of sufficient size that is not split and retry the allocation.
44// If this also fails, the allocator will attempt to free all cached blocks
45// that are not split and retry the allocation.
46// - Large (>1MB) and small allocations are stored in separate pools.
47// Small requests are packed into 2MB buffers. Large requests will use the
48// smallest available free block or allocate a new block using cudaMalloc.
49// - To reduce fragmentation, requests between 1MB and 10MB will allocate and
50// split a 20MB block, if no free block of sufficient size is available.
51// - To further reduce fragmentation, blocks >= 200MB are not allowed to be
52// split. These oversize cached blocks will still satisfy requests within
53// 20MB of the oversize cached block size.
54//
55// With this allocator, allocations and frees should logically be considered
56// "usages" of the memory segment associated with streams, just like kernel
57// launches. The programmer must insert the proper synchronization if memory
58// segments are used from multiple streams.
59//
60// The library provides a recordStream() function to help insert the correct
61// synchronization when allocations are used on multiple streams. This will
62// ensure that the block is not reused before each recorded stream completes
63// work.
64//
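// For illustration only (a sketch, not code used by this file; `t` and
// `consume` are hypothetical): a caller that allocates a tensor on the
// current stream but consumes it on a side stream would typically do
//
//   at::cuda::CUDAStream side = at::cuda::getStreamFromPool();
//   {
//     at::cuda::CUDAStreamGuard guard(side);
//     consume(t);  // kernel launched on `side`
//     c10::cuda::CUDACachingAllocator::recordStream(
//         t.storage().data_ptr(), side);
//   }
//
// so that when `t` is freed, its block is not reused until the work queued on
// `side` at that point has completed.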
65
66/**
67 * Note [Interaction with CUDA graph capture]
68 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
69 * Graph capture performs a dry run of a region of execution, freezing all CUDA
70 * work (and virtual addresses used during that work) into a "graph." The graph
71 * may be "replayed" like a single giant kernel, with greatly reduced CPU
72 * overhead as well as modestly improved GPU performance.
73 *
74 * Because capture bakes in memory addresses, the memory used during capture
75 * must be available for the graph to use during replay. DeviceCachingAllocator
76 * assigns and frees memory eagerly and dynamically, so if we're not careful
77 * about managing graphs' memory, at replay time those memory addresses could be
78 * used by other tensors.
79 *
80 * To guarantee a graph's baked in addresses are safe to reuse in replay,
81 * DeviceAllocator satisfies allocations from a graph-private memory pool during
82 * capture, and doesn't begin cudaFreeing those addresses until the graph is
83 * destroyed.
84 *
85 * Within the private pool, allocations are freed and reassigned as usual during
86 * capture. Memory regions will be used in a consistent order during replay. So
87 * a private pool doesn't use memory more wastefully than the default pools
88 * during capture, but it does reserve its high-water mark of used memory away
89 * from the default pools as long as the capture(s) it served survive
 * (regardless of whether those captures are idle or replaying).
91 *
92 * CUDAGraph's requests for private pools are mediated by
93 * DeviceAllocator::notifyCaptureBegin,
94 * notifyCaptureAboutToEnd,
95 * notifyCaptureEnded,
96 * notifyCaptureDestroy.
97 */
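
// A minimal sketch of the capture flow that triggers those notifications
// (orientation only; `graph` and `run_workload` are assumptions, not code
// used by this file):
//
//   at::cuda::CUDAGraph graph;
//   {
//     at::cuda::CUDAStreamGuard guard(at::cuda::getStreamFromPool());
//     graph.capture_begin();  // allocator starts serving from a private pool
//     run_workload();         // captured work; allocations stay in that pool
//     graph.capture_end();    // pool remains reserved for replays
//   }
//   graph.replay();           // reuses the baked-in addresses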
98
99constexpr size_t kMinBlockSize =
100 512; // all sizes are rounded to at least 512 bytes
101constexpr size_t kSmallSize = 1048576; // largest "small" allocation is 1 MiB
102constexpr size_t kSmallBuffer =
103 2097152; // "small" allocations are packed in 2 MiB blocks
104constexpr size_t kLargeBuffer =
105 20971520; // "large" allocations may be packed in 20 MiB blocks
106constexpr size_t kMinLargeAlloc =
107 10485760; // allocations between 1 and 10 MiB may use kLargeBuffer
108constexpr size_t kRoundLarge = 2097152; // round up large allocations to 2 MiB
109constexpr size_t kRoundUpPowerOfTwoIntervals = 16;
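
// A few illustrative sanity checks on how the constants above relate to each
// other (added purely as documentation; derived from the values above).
static_assert(
    kSmallBuffer == 2 * kSmallSize,
    "a small buffer holds two maximal small allocations");
static_assert(
    kLargeBuffer == 2 * kMinLargeAlloc,
    "the 20 MiB buffer covers twice the largest request it is used for");
static_assert(
    kRoundLarge == kSmallBuffer,
    "large allocations round to the same 2 MiB granularity as small buffers");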
110
111namespace {
112
113using stream_set = ska::flat_hash_set<cuda::CUDAStream>;
114
115using StatTypes = std::array<bool, static_cast<size_t>(StatType::NUM_TYPES)>;
116
117void update_stat(Stat& stat, int64_t amount) {
118 stat.current += amount;
119
120 TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
121 stat.current >= 0,
122 "Negative tracked stat in CUDA allocator (likely logic error).");
123
124 stat.peak = std::max(stat.current, stat.peak);
125 if (amount > 0) {
126 stat.allocated += amount;
127 }
128 if (amount < 0) {
129 stat.freed += -amount;
130 }
131}
132
133void reset_accumulated_stat(Stat& stat) {
134 stat.allocated = 0;
135 stat.freed = 0;
136}
137
138void reset_peak_stat(Stat& stat) {
139 stat.peak = stat.current;
140}
141
142template <typename Func>
143void for_each_selected_stat_type(const StatTypes& stat_types, Func f) {
144 for (const auto stat_type : c10::irange(stat_types.size())) {
145 if (stat_types[stat_type]) {
146 f(stat_type);
147 }
148 }
149}
150
151void update_stat_array(
152 StatArray& stat_array,
153 int64_t amount,
154 const StatTypes& stat_types) {
155 for_each_selected_stat_type(
156 stat_types, [&stat_array, amount](size_t stat_type) {
157 update_stat(stat_array[stat_type], amount);
158 });
159}
160
161struct Block;
162struct PrivatePool;
163typedef bool (*Comparison)(const Block*, const Block*);
164
165struct BlockPool {
166 BlockPool(
167 Comparison comparator,
168 bool small,
169 PrivatePool* private_pool = nullptr)
170 : blocks(comparator), is_small(small), owner_PrivatePool(private_pool) {}
171 std::set<Block*, Comparison> blocks;
172 const bool is_small;
173 PrivatePool* owner_PrivatePool;
174};
175
176struct HistoryChain {
177 History h;
178 std::unique_ptr<HistoryChain> next; // when blocks are merged we keep records
179 // of what used to be in the block
180};
181
182struct Block {
183 int device; // gpu
184 cudaStream_t stream; // allocation stream
185 stream_set stream_uses; // streams on which the block was used
186 size_t size; // block size in bytes
187 size_t requested_size; // memory originally requested
188 BlockPool* pool{nullptr}; // owning memory pool
189 void* ptr{nullptr}; // memory address
190 bool allocated{false}; // in-use flag
191 Block* prev{nullptr}; // prev block if split from a larger allocation
192 Block* next{nullptr}; // next block if split from a larger allocation
193 int event_count{0}; // number of outstanding CUDA events
194 int gc_count{0}; // counter for prioritizing older / less useful blocks for
195 // garbage collection
196 std::unique_ptr<HistoryChain> history;
197 HistoryChain* history_last{nullptr};
198
199 Block(
200 int device,
201 cudaStream_t stream,
202 size_t size,
203 BlockPool* pool,
204 void* ptr)
205 : device(device),
206 stream(stream),
207 stream_uses(),
208 size(size),
209 requested_size(0),
210 pool(pool),
211 ptr(ptr) {}
212
213 // constructor for search key
214 Block(int device, cudaStream_t stream, size_t size)
215 : device(device),
216 stream(stream),
217 stream_uses(),
218 size(size),
219 requested_size(0) {}
220
221 bool is_split() const {
222 return (prev != nullptr) || (next != nullptr);
223 }
224};
225
226static bool BlockComparator(const Block* a, const Block* b) {
227 if (a->stream != b->stream) {
228 return (uintptr_t)a->stream < (uintptr_t)b->stream;
229 }
230 if (a->size != b->size) {
231 return a->size < b->size;
232 }
233 return (uintptr_t)a->ptr < (uintptr_t)b->ptr;
234}
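
// Consequence of this ordering (noted here for readability): within a
// BlockPool, blocks sort by stream first, then by size, then by address, so a
// lower_bound() lookup with a (device, stream, size) search key lands on the
// smallest cached block on that stream whose size is at least the request
// (see get_free_block below).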
235
236struct AllocParams {
237 AllocParams(
238 int device,
239 size_t size,
240 cudaStream_t stream,
241 BlockPool* pool,
242 size_t alloc_size,
243 DeviceStats& stats)
244 : search_key(device, stream, size),
245 pool(pool),
246 alloc_size(alloc_size),
247 block(nullptr),
248 err(cudaSuccess) {}
249
250 int device() const {
251 return search_key.device;
252 }
253 cudaStream_t stream() const {
254 return search_key.stream;
255 }
256 size_t size() const {
257 return search_key.size;
258 }
259
260 Block search_key;
261 BlockPool* pool;
262 size_t alloc_size;
263 Block* block;
264 StatTypes stat_types = {false};
265 cudaError_t err;
266};
267
268int trimHistoryBefore(Block* block, void* point) {
269 int n = 0;
270 while (block->history && block->history->h.addr < point) {
271 block->history = std::move(block->history->next);
272 ++n;
273 }
274 if (!block->history) {
275 block->history_last = nullptr;
276 }
277 return n;
278}
279
280// Note: cudaEventCreate when concurrently invoked from multiple threads can be
281// very expensive (at least on certain device/driver combinations). Thus, we a)
282// serialize event creation at a per-device level, and b) pool the events to
283// avoid constantly calling cudaEventCreate/cudaEventDestroy. This results in
284// significant improvements in multithreaded workloads with high allocation
285// rates.
286class EventPool {
287 public:
288 using Event = std::unique_ptr<cudaEvent_t, std::function<void(cudaEvent_t*)>>;
289 // TODO: Explicit device count
290 EventPool() : pools_(at::cuda::device_count()) {}
291
292 Event get(int device) {
293 TORCH_INTERNAL_ASSERT(0 <= device);
294 TORCH_INTERNAL_ASSERT(device < static_cast<int>(pools_.size()));
295 auto& pool = pools_[device];
296 auto destructor = [&pool](cudaEvent_t* event) {
297 std::lock_guard<std::mutex> g(pool.mutex_);
298 pool.event_pool_.push_back(std::unique_ptr<cudaEvent_t>(event));
299 };
300
301 // Try to acquire an event from the per-device pool.
302 {
303 std::lock_guard<std::mutex> g(pool.mutex_);
304 if (!pool.event_pool_.empty()) {
305 auto* event = pool.event_pool_.back().release();
306 pool.event_pool_.pop_back();
307 return Event(event, destructor);
308 }
309 }
310 // otherwise, allocate a new event that will be returned to the pool on
311 // destruction.
312 auto new_ptr = std::make_unique<cudaEvent_t>();
313 C10_CUDA_CHECK(
314 cudaEventCreateWithFlags(new_ptr.get(), cudaEventDisableTiming));
315
316 return Event(new_ptr.release(), destructor);
317 }
318
319 void empty_cache() {
320 for (auto& pool : pools_) {
321 std::lock_guard<std::mutex> g(pool.mutex_);
322 pool.event_pool_.clear();
323 }
324 }
325
326 private:
327 struct PerDevicePool {
328 alignas(64) std::mutex mutex_;
329 std::vector<std::unique_ptr<cudaEvent_t>> event_pool_;
330 };
331 std::vector<PerDevicePool> pools_;
332};
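
// Sketch of the intended usage (names here are illustrative):
//
//   EventPool::Event ev = event_pool.get(device);
//   C10_CUDA_CHECK(cudaEventRecord(*ev, stream));
//   // ... later, poll with cudaEventQuery(*ev) ...
//   // When `ev` goes out of scope, its custom deleter pushes the cudaEvent_t
//   // back onto the per-device free list instead of destroying it.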
333
334// CUDA graphs helper
335struct PrivatePool {
336 PrivatePool()
337 : use_count(1),
338 cudaMalloc_count(0),
339 large_blocks(BlockComparator, /*is_small=*/false, this),
340 small_blocks(BlockComparator, /*is_small=*/true, this) {}
341 PrivatePool(const PrivatePool&) = delete;
342 PrivatePool(PrivatePool&&) = delete;
343 PrivatePool& operator=(const PrivatePool&) = delete;
344 // Number of live graphs using this pool
345 int use_count;
346 // Number of unfreed cudaMallocs made for this pool. When use_count and
347 // cudaMalloc_count drop to zero, we can delete this PrivatePool from
348 // graph_pools.
349 int cudaMalloc_count;
  // Instead of maintaining private BlockPools here, I could stuff all blocks
  // (private or not) into the top-level large_blocks and small_blocks, and
  // distinguish private blocks by adding a "pool id" check above the stream
  // check in BlockComparator. BlockComparator is performance-critical though,
  // so I'd rather not add more logic to it.
355 BlockPool large_blocks;
356 BlockPool small_blocks;
357};
358
359struct MempoolIdHash {
360 std::size_t operator()(const MempoolId_t& mempool_id) const noexcept {
361 return mempool_id.first != 0 ? mempool_id.first : mempool_id.second;
362 }
363};
364
365cudaError_t cudaMallocMaybeCapturing(void** p, size_t size) {
366// TODO: ideally we'd replace this with something like
367// !defined(TORCH_HIP_VERSION) as CUDA <= 10 support was dropped and really
368// this is only a workaround for TORCH_HIP_VERSION not being a sufficient guard
369// to prevent ROCM build breakage.
370#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
371 if (at::cuda::currentStreamCaptureStatusMayInitCtx() ==
372 at::cuda::CaptureStatus::None) {
373#endif
374 return C10_CUDA_ERROR_HANDLED(cudaMalloc(p, size));
375#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
376 } else {
377 // It's ok to capture cudaMallocs, as long as we never cudaFree those
378 // addresses before replay.
379 // Capturing cudaMalloc behaves nicely: it gives the graph new VA,
380 // but is ignored (won't leakily allocate new memory) in replays.
381 at::cuda::CUDAStreamCaptureModeGuard g{cudaStreamCaptureModeRelaxed};
382 return C10_CUDA_ERROR_HANDLED(cudaMalloc(p, size));
383 }
384#endif
385}
386
387} // anonymous namespace
388} // namespace Native
389
390// Environment config parser
391// Defined here, rather than its own .cpp file,
392// because parseArgs needs to know kLargeBuffer.
393// Defined outside namespace Native because it's not Native-specific.
394class CachingAllocatorConfig {
395 public:
396 static size_t max_split_size() {
397 return instance().m_max_split_size;
398 }
399 static double garbage_collection_threshold() {
400 return instance().m_garbage_collection_threshold;
401 }
402
  // This is used to round up the allocation size to the nearest power-of-2
  // division. More detail below in roundup_power2_next_division.
  // As an example, if we want 4 divisions between successive powers of 2, this
  // can be done using the env variable:
  // PYTORCH_CUDA_ALLOC_CONF=roundup_power2_divisions:4
407 static size_t roundup_power2_divisions(size_t size) {
408 size_t log_size = (63 - llvm::countLeadingZeros(size));
409
410 // Our intervals start at 1MB and end at 64GB
411 const size_t interval_start =
412 63 - llvm::countLeadingZeros(static_cast<size_t>(1048576));
413 const size_t interval_end =
414 63 - llvm::countLeadingZeros(static_cast<size_t>(68719476736));
415 TORCH_CHECK(
416 (interval_end - interval_start == Native::kRoundUpPowerOfTwoIntervals),
417 "kRoundUpPowerOfTwoIntervals mismatch");
418
419 int index = static_cast<int>(log_size) - static_cast<int>(interval_start);
420
421 index = std::max(0, index);
422 index = std::min(
423 index, static_cast<int>(Native::kRoundUpPowerOfTwoIntervals) - 1);
424 return instance().m_roundup_power2_divisions[index];
425 }
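
  // Worked example (a sketch of the math above, not additional behavior): with
  // PYTORCH_CUDA_ALLOC_CONF=roundup_power2_divisions:4, a request of 1,200,000
  // bytes falls in the [1 MiB, 2 MiB) interval, whose division step is
  // 1 MiB / 4 = 262,144 bytes, so round_size() returns
  // roundup_power2_next_division(1200000, 4) == 1,310,720 (1.25 MiB).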
426
427 static CachingAllocatorConfig& instance() {
428 static CachingAllocatorConfig* s_instance = ([]() {
429 auto inst = new CachingAllocatorConfig();
430 const char* env = getenv("PYTORCH_CUDA_ALLOC_CONF");
431 inst->parseArgs(env);
432 return inst;
433 })();
434 return *s_instance;
435 }
436
437 void parseArgs(const char* env);
438
439 private:
440 CachingAllocatorConfig()
441 : m_max_split_size(std::numeric_limits<size_t>::max()),
442 m_garbage_collection_threshold(0) {
443 m_roundup_power2_divisions.assign(Native::kRoundUpPowerOfTwoIntervals, 0);
444 }
445
446 void lexArgs(const char* env, std::vector<std::string>& config);
447 void consumeToken(
448 const std::vector<std::string>& config,
449 size_t i,
450 const char c);
451 size_t parseMaxSplitSize(const std::vector<std::string>& config, size_t i);
452 size_t parseGarbageCollectionThreshold(
453 const std::vector<std::string>& config,
454 size_t i);
455 size_t parseRoundUpPower2Divisions(
456 const std::vector<std::string>& config,
457 size_t i);
458 size_t parseAllocatorConfig(
459 const std::vector<std::string>& config,
460 size_t i,
461 bool& used_cudaMallocAsync);
462
463 std::atomic<size_t> m_max_split_size;
464 std::vector<size_t> m_roundup_power2_divisions;
465 std::atomic<double> m_garbage_collection_threshold;
466};
467
468void CachingAllocatorConfig::lexArgs(
469 const char* env,
470 std::vector<std::string>& config) {
471 std::vector<char> buf;
472
473 size_t env_length = strlen(env);
474 for (size_t i = 0; i < env_length; i++) {
475 if (env[i] == ',' || env[i] == ':' || env[i] == '[' || env[i] == ']') {
476 if (buf.size() != 0) {
477 config.emplace_back(buf.begin(), buf.end());
478 buf.clear();
479 }
480 config.emplace_back(1, env[i]);
481 } else if (env[i] != ' ') {
482 buf.emplace_back(static_cast<char>(env[i]));
483 }
484 }
485 if (!buf.empty()) {
486 config.emplace_back(buf.begin(), buf.end());
487 }
488}
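
// For example (illustrative): the string
//   "max_split_size_mb:512, garbage_collection_threshold:0.6"
// is lexed into the tokens
//   {"max_split_size_mb", ":", "512", ",", "garbage_collection_threshold",
//    ":", "0.6"}
// with whitespace discarded and each delimiter kept as its own token.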
489
490void CachingAllocatorConfig::consumeToken(
491 const std::vector<std::string>& config,
492 size_t i,
493 const char c) {
494 TORCH_CHECK(
495 i < config.size() && config[i].compare(std::string(1, c)) == 0,
496 "Error parsing CachingAllocator settings, expected ",
497 c,
498 "");
499}
500
501size_t CachingAllocatorConfig::parseMaxSplitSize(
502 const std::vector<std::string>& config,
503 size_t i) {
504 consumeToken(config, ++i, ':');
505 if (++i < config.size()) {
506 size_t val1 = stoi(config[i]);
507 TORCH_CHECK(
508 val1 > Native::kLargeBuffer / (1024 * 1024),
509 "CachingAllocator option max_split_size_mb too small, must be > ",
510 Native::kLargeBuffer / (1024 * 1024),
511 "");
512 val1 = std::max(val1, Native::kLargeBuffer / (1024 * 1024));
513 val1 = std::min(val1, (std::numeric_limits<size_t>::max() / (1024 * 1024)));
514 m_max_split_size = val1 * 1024 * 1024;
515 } else {
516 TORCH_CHECK(false, "Error, expecting max_split_size_mb value", "");
517 }
518 return i;
519}
520
521size_t CachingAllocatorConfig::parseGarbageCollectionThreshold(
522 const std::vector<std::string>& config,
523 size_t i) {
524 consumeToken(config, ++i, ':');
525 if (++i < config.size()) {
526 double val1 = stod(config[i]);
    TORCH_CHECK(
        val1 > 0,
        "garbage_collection_threshold too small, expected a value in (0.0, 1.0)",
        "");
    TORCH_CHECK(
        val1 < 1.0,
        "garbage_collection_threshold too big, expected a value in (0.0, 1.0)",
        "");
531 m_garbage_collection_threshold = val1;
532 } else {
533 TORCH_CHECK(
534 false, "Error, expecting garbage_collection_threshold value", "");
535 }
536 return i;
537}
538
539size_t CachingAllocatorConfig::parseRoundUpPower2Divisions(
540 const std::vector<std::string>& config,
541 size_t i) {
542 consumeToken(config, ++i, ':');
543 bool first_value = true;
544
545 if (++i < config.size()) {
546 if (config[i].compare("[") == 0) {
547 size_t last_index = 0;
548 while (++i < config.size() && config[i].compare("]") != 0) {
549 std::string val1 = config[i];
550 size_t val2 = 0;
551
552 consumeToken(config, ++i, ':');
553 if (++i < config.size()) {
554 val2 = stoi(config[i]);
555 } else {
556 TORCH_CHECK(
557 false, "Error parsing roundup_power2_divisions value", "");
558 }
559 TORCH_CHECK(
560 llvm::isPowerOf2_64(val2),
561 "For roundups, the divisons has to be power of 2 ",
562 "");
563
564 if (val1.compare(">") == 0) {
565 std::fill(
566 std::next(
567 m_roundup_power2_divisions.begin(),
568 static_cast<std::vector<unsigned long>::difference_type>(
569 last_index)),
570 m_roundup_power2_divisions.end(),
571 val2);
572 } else {
573 size_t val1_long = stoul(val1);
574 TORCH_CHECK(
575 llvm::isPowerOf2_64(val1_long),
576 "For roundups, the intervals have to be power of 2 ",
577 "");
578
579 size_t index = 63 - llvm::countLeadingZeros(val1_long);
580 index = std::max((size_t)0, index);
581 index = std::min(index, m_roundup_power2_divisions.size() - 1);
582
583 if (first_value) {
584 std::fill(
585 m_roundup_power2_divisions.begin(),
586 std::next(
587 m_roundup_power2_divisions.begin(),
588 static_cast<std::vector<unsigned long>::difference_type>(
589 index)),
590 val2);
591 first_value = false;
592 }
593 if (index < m_roundup_power2_divisions.size()) {
594 m_roundup_power2_divisions[index] = val2;
595 }
596 last_index = index;
597 }
598
599 if (config[i + 1].compare("]") != 0) {
600 consumeToken(config, ++i, ',');
601 }
602 }
603 } else { // Keep this for backwards compatibility
604 size_t val1 = stoi(config[i]);
605 TORCH_CHECK(
606 llvm::isPowerOf2_64(val1),
607 "For roundups, the divisons has to be power of 2 ",
608 "");
609 std::fill(
610 m_roundup_power2_divisions.begin(),
611 m_roundup_power2_divisions.end(),
612 val1);
613 }
614 } else {
615 TORCH_CHECK(false, "Error, expecting roundup_power2_divisions value", "");
616 }
617 return i;
618}
619
620size_t CachingAllocatorConfig::parseAllocatorConfig(
621 const std::vector<std::string>& config,
622 size_t i,
623 bool& used_cudaMallocAsync) {
624 consumeToken(config, ++i, ':');
625 if (++i < config.size()) {
626 TORCH_CHECK(
627 ((config[i] == "native") || (config[i] == "cudaMallocAsync")),
628 "Unknown allocator backend, "
629 "options are native and cudaMallocAsync");
630 used_cudaMallocAsync = (config[i] == "cudaMallocAsync");
631 if (used_cudaMallocAsync) {
632#if CUDA_VERSION >= 11040
633 int version;
634 C10_CUDA_CHECK(cudaDriverGetVersion(&version));
635 TORCH_CHECK(
636 version >= 11040,
637 "backend:cudaMallocAsync requires CUDA runtime "
638 "11.4 or newer, but cudaDriverGetVersion returned ",
639 version);
640#else
641 TORCH_CHECK(
642 false,
643 "backend:cudaMallocAsync requires PyTorch to be built with "
644 "CUDA 11.4 or newer, but CUDA_VERSION is ",
645 CUDA_VERSION);
646#endif
647 }
648 TORCH_INTERNAL_ASSERT(
649 config[i] == get()->name(),
650 "Allocator backend parsed at runtime != "
651 "allocator backend parsed at load time");
652 } else {
653 TORCH_CHECK(false, "Error parsing backend value", "");
654 }
655 return i;
656}
657
658void CachingAllocatorConfig::parseArgs(const char* env) {
  // Set the default values first; they are kept if the env var is empty.
660 m_max_split_size = std::numeric_limits<size_t>::max();
661 m_roundup_power2_divisions.assign(Native::kRoundUpPowerOfTwoIntervals, 0);
662 m_garbage_collection_threshold = 0;
663 bool used_cudaMallocAsync = false;
664 bool used_native_specific_option = false;
665
666 if (env == nullptr) {
667 return;
668 }
669
670 std::vector<std::string> config;
671 lexArgs(env, config);
672
673 for (size_t i = 0; i < config.size(); i++) {
674 if (config[i].compare("max_split_size_mb") == 0) {
675 i = parseMaxSplitSize(config, i);
676 used_native_specific_option = true;
677 } else if (config[i].compare("garbage_collection_threshold") == 0) {
678 i = parseGarbageCollectionThreshold(config, i);
679 used_native_specific_option = true;
680 } else if (config[i].compare("roundup_power2_divisions") == 0) {
681 i = parseRoundUpPower2Divisions(config, i);
682 used_native_specific_option = true;
683 } else if (config[i].compare("backend") == 0) {
684 i = parseAllocatorConfig(config, i, used_cudaMallocAsync);
685 } else {
686 TORCH_CHECK(false, "Unrecognized CachingAllocator option: ", config[i]);
687 }
688
689 if (i + 1 < config.size()) {
690 consumeToken(config, ++i, ',');
691 }
692 }
693
694 if (used_cudaMallocAsync && used_native_specific_option) {
695 TORCH_WARN(
696 "backend:cudaMallocAsync ignores max_split_size_mb, roundup_bypass_threshold_mb,"
697 "roundup_power2_divisions, and garbage_collect_threshold.");
698 }
699}
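
// A representative configuration (a sketch; the values are arbitrary):
//
//   PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:256,garbage_collection_threshold:0.8,backend:native"
//
// parses to m_max_split_size = 256 MiB, m_garbage_collection_threshold = 0.8,
// and keeps the native backend. Options are comma-separated, and each option
// is a name:value pair, as enforced by consumeToken() above.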
700
701namespace Native {
702
703class DeviceCachingAllocator {
704 private:
705 // lock around all operations
706 mutable std::recursive_mutex mutex;
707
708 // device statistics
709 DeviceStats stats;
710
711 // unallocated cached blocks larger than 1 MB
712 BlockPool large_blocks;
713
714 // unallocated cached blocks 1 MB or smaller
715 BlockPool small_blocks;
716
717 // allocated or in use by a stream. Holds all active allocations,
718 // whether they came from graph_pools or one of the BlockPools above.
719 ska::flat_hash_set<Block*> active_blocks;
720
721 // captures_underway tracks if a capture might be underway on any stream.
722 // Most of the time it's zero, in which case malloc can avoid calling
723 // cudaStreamGetCaptureInfo in the hot path.
724 int captures_underway = 0;
725 // See free() for this thing's purpose
726 std::vector<Block*> needs_events_deferred_until_no_capture;
727 // outstanding cuda events
728 ska::flat_hash_map<
729 cuda::CUDAStream,
730 std::deque<std::pair<EventPool::Event, Block*>>>
731 cuda_events;
732
733 // record used memory.
734 size_t total_allocated_memory = 0;
735
736 size_t allowed_memory_maximum = 0;
737
738 bool set_fraction = false;
739
740 bool record_history = false;
741 std::atomic<CreateContextFn> context_recorder_;
742 size_t alloc_trace_next = 0;
743 bool alloc_trace_record_context_ = false;
744 size_t alloc_trace_max_entries_ = 1;
745 std::vector<TraceEntry>*
      alloc_trace; // pointer because we need to intentionally leak this on
                   // deallocation: it can hold references to Python state
                   // which will already be destroyed when we are in exit
                   // handlers
749
750 // Members specific to CUDA graphs
751
752 // Private pools for CUDA graphs
753 ska::flat_hash_map<MempoolId_t, std::unique_ptr<PrivatePool>, MempoolIdHash>
754 graph_pools;
755 // Pools no longer referenced by any graph. Their BlockPools are eligible for
756 // free_blocks. Can't be a vector or deque because we might erase entries in
757 // any order. Could be an std::list, but we don't care much, access and
758 // insert/erase are rare.
759 ska::flat_hash_map<MempoolId_t, PrivatePool*, MempoolIdHash>
760 graph_pools_freeable;
761
762 // Maps a capturing stream to its assigned private pool,
763 // in case we want multiple captures to share the same pool
764 ska::flat_hash_map<CaptureId_t, MempoolId_t> capture_to_pool_map;
765
766 // XXX - maybe we should generalize and have multiple events
767 std::vector<OutOfMemoryObserver> oom_observers_;
768
769 public:
770 DeviceCachingAllocator()
771 : large_blocks(BlockComparator, /*is_small=*/false),
772 small_blocks(BlockComparator, /*is_small=*/true),
773 alloc_trace(new std::vector<TraceEntry>()) {
774 stats.max_split_size = CachingAllocatorConfig::max_split_size();
775 context_recorder_.store(nullptr);
776 }
777
778 void recordHistory(
779 bool enabled,
780 CreateContextFn context_recorder,
781 size_t alloc_trace_max_entries,
782 bool alloc_trace_record_context) {
783 std::unique_lock<std::recursive_mutex> lock(mutex);
784 record_history = enabled;
785 context_recorder_.store(context_recorder);
786 alloc_trace_max_entries_ = std::max(size_t(1), alloc_trace_max_entries);
787 alloc_trace_record_context_ = alloc_trace_record_context;
788 alloc_trace_next = 0;
789 alloc_trace->clear();
790 }
791
792 void attachOutOfMemoryObserver(OutOfMemoryObserver observer) {
793 oom_observers_.emplace_back(std::move(observer));
794 }
795
796 // All public methods (except the above) acquire the allocator mutex.
797 // Thus, do not call a public method from another public method.
798
799 Block* malloc(int device, size_t orig_size, cudaStream_t stream) {
800 // done outside the lock because we don't know what locks the recorder needs
801 // to have...
802 CreateContextFn context_recorder = context_recorder_.load();
803 std::shared_ptr<Context> context =
804 context_recorder ? context_recorder() : nullptr;
805
806 std::unique_lock<std::recursive_mutex> lock(mutex);
807
808 if (C10_LIKELY(captures_underway == 0)) {
809 // Processes end-of-life events for outstanding allocations used on
810 // multiple streams (checks if their GPU-side uses are complete and
811 // recycles their memory if so)
812 //
813 // Q. Why skip process_events if a capture might be underway?
814 // A. process_events involves cudaEventQueries, illegal during CUDA graph
815 // capture.
816 // Dumb simple solution: defer reclaiming these allocations until after
817 // capture. Cross-stream memory use is uncommon, so the deferral's
818 // effect on memory use during capture should be small.
819 process_events();
820 }
821 size_t size = round_size(orig_size);
822 auto& pool = get_pool(size, stream);
823 const size_t alloc_size = get_allocation_size(size);
824 AllocParams params(device, size, stream, &pool, alloc_size, stats);
825 params.stat_types[static_cast<size_t>(StatType::AGGREGATE)] = true;
826 params.stat_types[static_cast<size_t>(get_stat_type_for_pool(pool))] = true;
827
828 // First, try to get a block from the existing pool.
829 bool block_found =
830 // Search pool
831 get_free_block(params)
832 // Trigger callbacks and retry search
833 || (trigger_free_memory_callbacks(params) && get_free_block(params));
834
835 // Can't reuse an existing block; try to get a new one.
836 if (!block_found) {
837 // Do garbage collection if the flag is set.
838 if (C10_UNLIKELY(
839 set_fraction &&
840 CachingAllocatorConfig::garbage_collection_threshold() > 0.0)) {
841 garbage_collect_cached_blocks();
842 }
      // Attempt to allocate
844 block_found = alloc_block(params, false)
845 // Free enough available cached blocks to satisfy alloc and retry
846 // alloc.
847 || (release_available_cached_blocks(params) &&
848 alloc_block(params, false))
849 // Free all non-split cached blocks and retry alloc.
850 || (C10_LIKELY(captures_underway == 0) && release_cached_blocks() &&
851 alloc_block(params, true));
852 if (record_history && block_found) {
853 record_trace(
854 TraceEntry::SEGMENT_ALLOC,
855 int64_t(params.block->ptr),
856 params.block->size,
857 params.stream(),
858 context);
859 }
860 }
861
862 if (!block_found) {
863 // For any error code other than cudaErrorMemoryAllocation,
864 // alloc_block should have thrown an exception already.
865 TORCH_INTERNAL_ASSERT(params.err == cudaErrorMemoryAllocation);
866
867 size_t device_free;
868 size_t device_total;
869 C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total));
870 std::string allowed_info;
871
872 if (set_fraction) {
873 allowed_info = format_size(allowed_memory_maximum) + " allowed; ";
874 }
875
876 if (record_history) {
877 record_trace(
878 TraceEntry::OOM,
879 device_free,
880 params.size(),
881 params.stream(),
882 std::move(context));
883 }
884 stats.num_ooms += 1;
885
886 c10::reportOutOfMemoryToProfiler(
887 size,
888 stats.allocated_bytes[static_cast<int64_t>(StatType::AGGREGATE)]
889 .current,
890 stats.reserved_bytes[static_cast<int64_t>(StatType::AGGREGATE)]
891 .current,
892 c10::Device(c10::DeviceType::CUDA, static_cast<DeviceIndex>(device)));
893 for (const auto& obs : oom_observers_) {
894 obs(device,
895 alloc_size,
896 set_fraction ? allowed_memory_maximum : device_total,
897 device_free);
898 }
899 // "total capacity": total global memory on GPU
900 // "allowed": memory is allowed to use, which set by fraction.
901 // "already allocated": memory allocated by the program using the
902 // caching allocator
903 // "free": free memory as reported by the CUDA API
904 // "cached": memory held by the allocator but not used by the program
905 //
906 // The "allocated" amount does not include memory allocated outside
907 // of the caching allocator, such as memory allocated by other programs
908 // or memory held by the driver.
909 //
910 // The sum of "allocated" + "free" + "cached" may be less than the
911 // total capacity due to memory held by the driver and usage by other
912 // programs.
913 //
      // Note that at this point release_cached_blocks has already returned
      // all possible "cached" memory to the driver. The only remaining
      // "cached" memory is split from a larger block that is partially
      // in-use.
917 TORCH_CHECK_WITH(
918 OutOfMemoryError,
919 false,
920 "CUDA out of memory. Tried to allocate ",
921 format_size(alloc_size),
922 " (GPU ",
923 device,
924 "; ",
925 format_size(device_total),
926 " total capacity; ",
927 format_size(
928 stats.allocated_bytes[static_cast<size_t>(StatType::AGGREGATE)]
929 .current),
930 " already allocated; ",
931 format_size(device_free),
932 " free; ",
933 allowed_info,
934 format_size(
935 stats.reserved_bytes[static_cast<size_t>(StatType::AGGREGATE)]
936 .current),
937 " reserved in total by PyTorch)",
938 " If reserved memory is >> allocated memory try setting max_split_size_mb to avoid"
939 " fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF",
940 "");
941 }
942
943 TORCH_INTERNAL_ASSERT(
944 params.err == cudaSuccess && params.block != nullptr &&
945 params.block->ptr != nullptr);
946 Block* block = params.block;
947 Block* remaining = nullptr;
948
949 const bool already_split = block->is_split();
950 if (should_split(block, size)) {
951 remaining = block;
952
953 block = new Block(device, stream, size, &pool, block->ptr);
954 block->prev = remaining->prev;
955 if (block->prev) {
956 block->prev->next = block;
957 }
958 block->next = remaining;
959
960 remaining->prev = block;
961 remaining->ptr = static_cast<char*>(remaining->ptr) + size;
962 remaining->size -= size;
963 bool inserted = pool.blocks.insert(remaining).second;
964 TORCH_INTERNAL_ASSERT_DEBUG_ONLY(inserted);
965
966 if (record_history) {
967 trimHistoryBefore(remaining, (char*)block->ptr + size);
968 }
969
970 if (already_split) {
971 // An already-split inactive block is being shrunk by size bytes.
972 update_stat_array(
973 stats.inactive_split_bytes,
974 -static_cast<std::int64_t>(block->size),
975 params.stat_types);
976 } else {
977 // A new split inactive block is being created from a previously unsplit
978 // block, size remaining->size bytes.
979 for_each_selected_stat_type(params.stat_types, [&](size_t stat_type) {
980 update_stat(
981 stats.inactive_split_bytes[stat_type],
982 static_cast<std::int64_t>(remaining->size));
983 update_stat(stats.inactive_split[stat_type], 1);
984 });
985 }
986
987 } else if (already_split) {
988 // An already-split block is becoming active
989 for_each_selected_stat_type(params.stat_types, [&](size_t stat_type) {
990 update_stat(
991 stats.inactive_split_bytes[stat_type],
992 -static_cast<std::int64_t>(block->size));
993 update_stat(stats.inactive_split[stat_type], -1);
994 });
995 }
996
997 block->allocated = true;
998 block->requested_size = orig_size;
999 if (record_history) {
1000 trimHistoryBefore(block, (char*)block->ptr + size);
1001 block->history = std::make_unique<HistoryChain>(HistoryChain{
1002 History{block->ptr, orig_size, std::move(context)},
1003 std::move(block->history)});
1004 if (!block->history_last) {
1005 block->history_last = block->history.get();
1006 }
1007 record_trace(
1008 TraceEntry::ALLOC,
1009 int64_t(block->ptr),
1010 orig_size,
1011 block->stream,
1012 block->history->h.context);
1013 }
1014
1015 bool inserted = active_blocks.insert(block).second;
1016 TORCH_INTERNAL_ASSERT_DEBUG_ONLY(inserted);
1017
1018 for_each_selected_stat_type(params.stat_types, [&](size_t stat_type) {
1019 update_stat(stats.allocation[stat_type], 1);
1020 update_stat(
1021 stats.allocated_bytes[stat_type],
1022 static_cast<std::int64_t>(block->size));
1023 update_stat(stats.active[stat_type], 1);
1024 update_stat(
1025 stats.active_bytes[stat_type],
1026 static_cast<std::int64_t>(block->size));
1027 update_stat(
1028 stats.requested_bytes[stat_type],
1029 static_cast<std::int64_t>(block->requested_size));
1030 });
1031 if (block->size >= CachingAllocatorConfig::max_split_size())
1032 update_stat(stats.oversize_allocations, 1);
1033
1034 c10::reportMemoryUsageToProfiler(
1035 block->ptr,
1036 block->size,
1037 stats.allocated_bytes[static_cast<size_t>(StatType::AGGREGATE)].current,
1038 stats.reserved_bytes[static_cast<size_t>(StatType::AGGREGATE)].current,
1039 c10::Device(c10::DeviceType::CUDA, device));
1040
1041 return block;
1042 }
1043
1044 void free(Block* block) {
1045 std::lock_guard<std::recursive_mutex> lock(mutex);
1046
1047 block->allocated = false;
1048
    // The following logic might modify the underlying Block and change its
    // size. We store the original pointer and size ahead of time for
    // reporting.
1051 auto orig_block_ptr = block->ptr;
1052 auto orig_block_size = block->size;
1053
1054 StatTypes stat_types = {false};
1055 stat_types[static_cast<size_t>(StatType::AGGREGATE)] = true;
1056 stat_types[static_cast<size_t>(get_stat_type_for_pool(*(block->pool)))] =
1057 true;
1058 for_each_selected_stat_type(stat_types, [&](size_t stat_type) {
1059 update_stat(stats.allocation[stat_type], -1);
1060 update_stat(
1061 stats.allocated_bytes[stat_type],
1062 -static_cast<std::int64_t>(block->size));
1063 });
1064 if (block->history) {
1065 record_trace(
1066 TraceEntry::FREE_REQUESTED,
1067 int64_t(block->ptr),
1068 block->history->h.real_size,
1069 block->stream,
1070 block->history->h.context);
1071 }
1072 if (block->size >= CachingAllocatorConfig::max_split_size())
1073 update_stat(stats.oversize_allocations, -1);
1074
1075 if (!block->stream_uses.empty()) {
1076 if (C10_UNLIKELY(captures_underway)) {
1077 // It's forbidden to cudaEventQuery an event recorded during CUDA graph
1078 // capture. We conservatively defer recording end-of-life events until
1079 // the next call to process_events() (which won't happen until no
1080 // captures are underway)
1081 needs_events_deferred_until_no_capture.push_back(block);
1082 } else {
1083 insert_events(block);
1084 }
1085 } else {
1086 free_block(block);
1087 }
1088
1089 c10::reportMemoryUsageToProfiler(
1090 orig_block_ptr,
1091 -orig_block_size,
1092 stats.allocated_bytes[static_cast<size_t>(StatType::AGGREGATE)].current,
1093 stats.reserved_bytes[static_cast<size_t>(StatType::AGGREGATE)].current,
1094 c10::Device(c10::DeviceType::CUDA, block->device));
1095 }
1096
1097 void* getBaseAllocation(Block* block, size_t* outSize) {
1098 std::lock_guard<std::recursive_mutex> lock(mutex);
1099 while (block->prev) {
1100 block = block->prev;
1101 }
1102 void* basePtr = block->ptr;
1103 if (outSize) {
1104 size_t size = 0;
1105 while (block) {
1106 size += block->size;
1107 block = block->next;
1108 }
1109 *outSize = size;
1110 }
1111 return basePtr;
1112 }
1113
1114 void recordStream(Block* block, cuda::CUDAStream stream) {
1115 std::lock_guard<std::recursive_mutex> lock(mutex);
1116 if (stream.stream() == block->stream) {
1117 // ignore uses on the allocation stream, since those don't require any
1118 // special synchronization
1119 return;
1120 }
1121 block->stream_uses.insert(stream);
1122 }
1123
1124 /** set memory fraction to limit maximum allocated memory **/
1125 void setMemoryFraction(double fraction) {
1126 size_t device_free;
1127 size_t device_total;
1128 C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total));
1129 allowed_memory_maximum = static_cast<size_t>(fraction * device_total);
1130 set_fraction = true;
1131 }
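
  // For instance (illustrative numbers): calling
  // c10::cuda::CUDACachingAllocator::setMemoryFraction(0.5, 0) on a device
  // reporting 16 GiB of total memory caps this allocator at roughly 8 GiB;
  // requests beyond that fail with cudaErrorMemoryAllocation in alloc_block()
  // rather than reaching cudaMalloc.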
1132
1133 /** returns cached blocks to the system allocator **/
1134 void emptyCache() {
1135 std::lock_guard<std::recursive_mutex> lock(mutex);
1136 release_cached_blocks();
1137 }
1138
1139 /** Retrieves size of largest unused block held by the memory cache **/
1140 void cacheInfo(size_t* largest) {
1141 std::lock_guard<std::recursive_mutex> lock(mutex);
1142 if (*largest ==
1143 0) { // make an initial guess if a zero *largest is passed in
1144 size_t tmp_bytes;
1145 C10_CUDA_CHECK(cudaMemGetInfo(
1146 largest, // Use free memory as an optimistic initial guess of *largest
1147 &tmp_bytes));
1148 }
1149 cache_info_aux(large_blocks, largest);
1150 cache_info_aux(small_blocks, largest);
1151 for (const auto& gp : graph_pools) {
1152 cache_info_aux(gp.second->large_blocks, largest);
1153 cache_info_aux(gp.second->small_blocks, largest);
1154 }
1155 }
1156
1157 /** Returns a copy of the memory allocator stats **/
1158 DeviceStats getStats() {
1159 std::lock_guard<std::recursive_mutex> lock(mutex);
1160 return stats;
1161 }
1162
1163 /** Resets the historical accumulation stats for the device **/
1164 void resetAccumulatedStats() {
1165 std::lock_guard<std::recursive_mutex> lock(mutex);
1166
1167 for (const auto statType :
1168 c10::irange(static_cast<size_t>(StatType::NUM_TYPES))) {
1169 reset_accumulated_stat(stats.allocation[statType]);
1170 reset_accumulated_stat(stats.segment[statType]);
1171 reset_accumulated_stat(stats.active[statType]);
1172 reset_accumulated_stat(stats.inactive_split[statType]);
1173 reset_accumulated_stat(stats.allocated_bytes[statType]);
1174 reset_accumulated_stat(stats.reserved_bytes[statType]);
1175 reset_accumulated_stat(stats.active_bytes[statType]);
1176 reset_accumulated_stat(stats.inactive_split_bytes[statType]);
1177 reset_accumulated_stat(stats.requested_bytes[statType]);
1178 }
1179
1180 stats.num_alloc_retries = 0;
1181 stats.num_ooms = 0;
1182 reset_accumulated_stat(stats.oversize_allocations);
1183 reset_accumulated_stat(stats.oversize_segments);
1184 }
1185
1186 /** Resets the historical peak stats for the device **/
1187 void resetPeakStats() {
1188 std::lock_guard<std::recursive_mutex> lock(mutex);
1189
1190 for (const auto statType :
1191 c10::irange(static_cast<size_t>(StatType::NUM_TYPES))) {
1192 reset_peak_stat(stats.allocation[statType]);
1193 reset_peak_stat(stats.segment[statType]);
1194 reset_peak_stat(stats.active[statType]);
1195 reset_peak_stat(stats.inactive_split[statType]);
1196 reset_peak_stat(stats.allocated_bytes[statType]);
1197 reset_peak_stat(stats.reserved_bytes[statType]);
1198 reset_peak_stat(stats.active_bytes[statType]);
1199 reset_peak_stat(stats.inactive_split_bytes[statType]);
1200 reset_peak_stat(stats.requested_bytes[statType]);
1201 }
1202 reset_peak_stat(stats.oversize_allocations);
1203 reset_peak_stat(stats.oversize_segments);
1204 }
1205
1206 /** Dump a complete snapshot of the memory held by the allocator. Potentially
1207 * VERY expensive. **/
1208 std::vector<SegmentInfo> snapshot() {
1209 std::lock_guard<std::recursive_mutex> lock(mutex);
1210
1211 size_t total_active = 0;
1212 std::vector<SegmentInfo> result;
1213 const auto all_blocks = get_all_blocks();
1214 for (const Block* const head_block : all_blocks) {
1215 if (head_block->prev != nullptr) {
1216 continue;
1217 }
1218 result.emplace_back();
1219 SegmentInfo& segment_info = result.back();
1220 segment_info.device = head_block->device;
1221 segment_info.address = reinterpret_cast<int64_t>(head_block->ptr);
1222 segment_info.stream = head_block->stream;
1223 segment_info.is_large = (!head_block->pool->is_small);
1224
1225 const Block* block = head_block;
1226 while (block != nullptr) {
1227 segment_info.blocks.emplace_back();
1228 BlockInfo& block_info = segment_info.blocks.back();
1229
1230 block_info.size = block->size;
1231 block_info.requested_size = block->requested_size;
1232 block_info.allocated = block->allocated;
1233 block_info.active = block->allocated || (block->event_count > 0) ||
1234 !block->stream_uses.empty();
1235
1236 segment_info.total_size += block_info.size;
1237 if (block_info.allocated) {
1238 segment_info.allocated_size += block_info.size;
1239 }
1240 if (block_info.active) {
1241 segment_info.active_size += block_info.size;
1242 segment_info.requested_size += block_info.requested_size;
1243 }
1244 HistoryChain* h = block->history.get();
1245 while (h) {
1246 block_info.history.push_back(h->h);
1247 h = h->next.get();
1248 }
1249 block = block->next;
1250 }
1251 total_active += segment_info.active_size;
1252 }
1253
1254 std::sort(
1255 result.begin(),
1256 result.end(),
1257 [](const SegmentInfo& a, const SegmentInfo& b) {
1258 return a.address < b.address;
1259 });
1260
1261 if (record_history) {
1262 record_trace(TraceEntry::SNAPSHOT, 0, total_active, 0, nullptr);
1263 }
1264 return result;
1265 }
1266
1267 std::vector<TraceEntry> trace() {
1268 std::lock_guard<std::recursive_mutex> lock(mutex);
1269 std::vector<TraceEntry> result;
1270 result.reserve(alloc_trace->size());
1271 result.insert(
1272 result.end(),
1273 alloc_trace->begin() + alloc_trace_next,
1274 alloc_trace->end());
1275 result.insert(
1276 result.end(),
1277 alloc_trace->begin(),
1278 alloc_trace->begin() + alloc_trace_next);
1279 return result;
1280 }
1281
1282 // This function takes the size and number of divisions argument and rounds
1283 // up the size argument for the nearest power-of-2 division.
1284 // For example, if we need to round-up 1200 and number of divisions is 4,
1285 // the size 1200 lies between 1024 and 2048 and if we do 4 divisions between
1286 // them, the values are 1024, 1280, 1536, and 1792. So the function will
  // return 1280 as the nearest power-of-2 division ceiling.
1288 static size_t roundup_power2_next_division(size_t size, size_t divisions) {
1289 if (C10_UNLIKELY(size <= 4 || divisions <= 1)) {
1290 return size;
1291 }
1292 if (llvm::isPowerOf2_64(size)) {
1293 return size;
1294 }
1295
    // Divide the space between the surrounding powers of 2 into equal
    // divisions. If the division size is zero, return the power-of-2 ceiling.
1298 size_t power2_floor = llvm::PowerOf2Floor(size);
1299 size_t power2_divison =
1300 power2_floor >> (63 - llvm::countLeadingZeros(divisions));
1301 if (C10_UNLIKELY(power2_divison == 0)) {
1302 return (power2_floor << 1);
1303 }
1304 size_t round_size_floor = size & (~(power2_divison - 1));
1305 return (round_size_floor == size) ? size
1306 : round_size_floor + power2_divison;
1307 }
1308
1309 static size_t round_size(size_t size) {
1310 if (size < kMinBlockSize) {
1311 return kMinBlockSize;
1312 } else {
1313 auto divisions = CachingAllocatorConfig::roundup_power2_divisions(size);
1314 if (divisions > 0 && size > (kMinBlockSize * divisions)) {
1315 return roundup_power2_next_division(size, divisions);
1316 } else {
1317 return kMinBlockSize * ((size + kMinBlockSize - 1) / kMinBlockSize);
1318 }
1319 }
1320 }
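
  // A few concrete values (assuming no roundup_power2_divisions configured):
  // round_size(1) == 512, round_size(513) == 1024, and
  // round_size(1000000) == 1000448, i.e. sizes are rounded up to the next
  // multiple of kMinBlockSize (512 bytes).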
1321
1322 // See Note [Interaction with CUDA graph capture]
1323
1324 // Called by CUDAGraph::capture_begin
1325 void notifyCaptureBegin(CaptureId_t graph_id, MempoolId_t mempool_id) {
1326 std::lock_guard<std::recursive_mutex> lock(mutex);
1327 captures_underway++;
1328 auto it = graph_pools.find(mempool_id);
1329 if (it == graph_pools.end()) {
1330 // mempool_id does not reference an existing pool. Make a new pool for
1331 // this capture.
1332 graph_pools.emplace(mempool_id, std::make_unique<PrivatePool>());
1333 } else {
1334 // mempool_id references an existing pool, which the current capture will
1335 // share. Check this pool is live (at least one other capture already
1336 // references it).
1337 TORCH_INTERNAL_ASSERT(it->second->use_count > 0);
1338 it->second->use_count++;
1339 }
1340 // Maps this graph_id to mempool_id and makes sure this graph_id wasn't
1341 // somehow assigned a mempool_id already. Keeps essential effect (insert)
1342 // out of macro.
1343 bool inserted = capture_to_pool_map.insert({graph_id, mempool_id}).second;
1344 TORCH_INTERNAL_ASSERT(inserted);
1345 }
1346
1347 // Called by CUDAGraph::capture_end
1348 void notifyCaptureAboutToEnd(CaptureId_t graph_id) {
1349 std::lock_guard<std::recursive_mutex> lock(mutex);
1350 captures_underway--;
1351 auto it = capture_to_pool_map.find(graph_id);
1352 TORCH_INTERNAL_ASSERT(it != capture_to_pool_map.end());
1353 capture_to_pool_map.erase(it);
1354 }
1355
1356 // Called by CUDAGraph::reset
1357 void notifyCaptureDestroy(MempoolId_t mempool_id) {
1358 std::lock_guard<std::recursive_mutex> lock(mutex);
1359 // The instantiated cudaGraphExec_t has been destroyed. We can't blindly
1360 // delete and cudaFree the mempool its capture used, because
1361 // 1. other graph(s) might share the same pool
1362 // 2. the user might still hold references to output tensors allocated
1363 // during capture.
1364 // To handle 1 and 2, we track the number of graphs using this particular
1365 // mempool. When the count reaches 0, we tell free_cached_blocks it may now
1366 // cudaFree blocks from this graph's pool when it discovers they're unused
1367 // (unsplit).
1368 auto it = graph_pools.find(mempool_id);
1369 TORCH_INTERNAL_ASSERT(it != graph_pools.end());
1370 auto uc = --(it->second->use_count);
1371 TORCH_INTERNAL_ASSERT(uc >= 0);
1372 if (uc == 0) {
1373 // Allows free_cached_blocks to begin cudaFreeing this pool's memory,
1374 // and makes sure this pool wasn't somehow made freeable already.
1375 bool inserted =
1376 graph_pools_freeable.insert({mempool_id, it->second.get()}).second;
1377 TORCH_INTERNAL_ASSERT(inserted);
1378 }
1379 }
1380
1381 private:
1382 // All private methods do not acquire the allocator mutex.
1383
1384 std::vector<const Block*> get_all_blocks() const {
1385 std::vector<const Block*> blocks;
1386 blocks.insert(
1387 blocks.end(), small_blocks.blocks.begin(), small_blocks.blocks.end());
1388 blocks.insert(
1389 blocks.end(), large_blocks.blocks.begin(), large_blocks.blocks.end());
1390 for (const auto& gp : graph_pools) {
1391 blocks.insert(
1392 blocks.end(),
1393 gp.second->small_blocks.blocks.begin(),
1394 gp.second->small_blocks.blocks.end());
1395 blocks.insert(
1396 blocks.end(),
1397 gp.second->large_blocks.blocks.begin(),
1398 gp.second->large_blocks.blocks.end());
1399 }
1400 blocks.insert(blocks.end(), active_blocks.begin(), active_blocks.end());
1401 return blocks;
1402 }
1403
1404 /** moves a block into a pool of cached free blocks */
1405 void free_block(Block* block) {
1406 TORCH_INTERNAL_ASSERT(
1407 !block->allocated && block->event_count == 0 &&
1408 block->stream_uses.empty());
1409 if (block->history) {
1410 record_trace(
1411 TraceEntry::FREE_COMPLETED,
1412 int64_t(block->ptr),
1413 block->history->h.real_size,
1414 block->stream,
1415 block->history->h.context);
1416 }
1417 size_t original_block_size = block->size;
1418 size_t requested_size = block->requested_size;
1419
1420 auto& pool = *block->pool;
1421 int64_t net_change_inactive_split_blocks = 0;
1422 int64_t net_change_inactive_split_size = 0;
1423
1424 const std::array<Block*, 2> merge_candidates = {block->prev, block->next};
1425 for (Block* merge_candidate : merge_candidates) {
1426 const int64_t subsumed_size =
1427 try_merge_blocks(block, merge_candidate, pool);
1428 if (subsumed_size > 0) {
1429 net_change_inactive_split_blocks -= 1;
1430 net_change_inactive_split_size -= subsumed_size;
1431 }
1432 }
1433
1434 active_blocks.erase(block);
1435 // Makes sure the Block* isn't already present in the pool we're freeing it
1436 // back into.
1437 bool inserted = pool.blocks.insert(block).second;
1438 TORCH_INTERNAL_ASSERT(inserted);
1439
1440 if (block->is_split()) {
1441 net_change_inactive_split_blocks += 1;
1442 net_change_inactive_split_size += block->size;
1443 }
1444
1445 StatTypes stat_types = {false};
1446 stat_types[static_cast<size_t>(StatType::AGGREGATE)] = true;
1447 stat_types[static_cast<size_t>(get_stat_type_for_pool(pool))] = true;
1448 for_each_selected_stat_type(stat_types, [&](size_t stat_type) {
1449 update_stat(
1450 stats.inactive_split[stat_type], net_change_inactive_split_blocks);
1451 update_stat(
1452 stats.inactive_split_bytes[stat_type],
1453 net_change_inactive_split_size);
1454 update_stat(stats.active[stat_type], -1);
1455 update_stat(
1456 stats.active_bytes[stat_type],
1457 -static_cast<std::int64_t>(original_block_size));
1458 update_stat(
1459 stats.requested_bytes[stat_type],
1460 -static_cast<std::int64_t>(requested_size));
1461 });
1462 }
1463
1464 /** combine previously split blocks. returns the size of the subsumed block,
1465 * or 0 on failure. */
1466 size_t try_merge_blocks(Block* dst, Block* src, BlockPool& pool) {
1467 if (!src || src->allocated || src->event_count > 0 ||
1468 !src->stream_uses.empty()) {
1469 return 0;
1470 }
1471
1472 AT_ASSERT(dst->is_split() && src->is_split());
1473
1474 if (dst->prev == src) { // [src dst]
1475 dst->ptr = src->ptr;
1476 dst->prev = src->prev;
1477 if (dst->prev) {
1478 dst->prev->next = dst;
1479 }
1480 if (!dst->history) {
1481 dst->history = std::move(src->history);
1482 dst->history_last = src->history_last;
1483 } else if (src->history) {
1484 src->history_last->next = std::move(dst->history);
1485 dst->history = std::move(src->history);
1486 }
1487 src->history_last = nullptr;
1488 } else { // [dest src]
1489 dst->next = src->next;
1490 if (dst->next) {
1491 dst->next->prev = dst;
1492 }
1493
1494 if (!dst->history) {
1495 dst->history = std::move(src->history);
1496 dst->history_last = src->history_last;
1497 } else if (src->history) {
1498 dst->history_last->next = std::move(src->history);
1499 dst->history_last = src->history_last;
1500 }
1501 src->history_last = nullptr;
1502 }
1503 const size_t subsumed_size = src->size;
1504 dst->size += subsumed_size;
1505 auto erased = pool.blocks.erase(src);
1506 TORCH_INTERNAL_ASSERT_DEBUG_ONLY(erased == 1);
1507 delete src;
1508
1509 return subsumed_size;
1510 }
1511
1512 BlockPool& get_pool(size_t size, cudaStream_t stream) {
1513#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
1514 // captures_underway is a conservative guess that the current stream may be
1515 // capturing. It's only > 0 if some thread has begun and not yet ended a
1516 // capture, so it's usually 0, and we can short-circuit
1517 // cudaStreamCaptureStatus (which does a TLS lookup).
1518 if (C10_UNLIKELY(captures_underway)) {
1519 CaptureId_t id;
1520 cudaStreamCaptureStatus status;
1521 C10_CUDA_CHECK(cudaStreamGetCaptureInfo(stream, &status, &id));
1522 if (status != cudaStreamCaptureStatus::cudaStreamCaptureStatusNone) {
1523 TORCH_INTERNAL_ASSERT(
1524 status !=
1525 cudaStreamCaptureStatus::cudaStreamCaptureStatusInvalidated);
1526 // Retrieves the private pool assigned to this capture.
1527 auto it0 = capture_to_pool_map.find(id);
1528 TORCH_INTERNAL_ASSERT(it0 != capture_to_pool_map.end());
1529 auto it1 = graph_pools.find(it0->second);
1530 TORCH_INTERNAL_ASSERT(it1 != graph_pools.end());
1531 if (size <= kSmallSize) {
1532 return it1->second->small_blocks;
1533 } else {
1534 return it1->second->large_blocks;
1535 }
1536 }
1537 }
1538#endif
1539 if (size <= kSmallSize) {
1540 return small_blocks;
1541 } else {
1542 return large_blocks;
1543 }
1544 }
1545
1546 StatType get_stat_type_for_pool(const BlockPool& pool) {
1547 return pool.is_small ? StatType::SMALL_POOL : StatType::LARGE_POOL;
1548 }
1549
1550 bool should_split(const Block* block, size_t size) {
1551 size_t remaining = block->size - size;
1552 if (block->pool->is_small) {
1553 return remaining >= kMinBlockSize;
1554 } else {
1555 return (size < CachingAllocatorConfig::max_split_size()) &&
1556 (remaining > kSmallSize);
1557 }
1558 }
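
  // For example (illustrative sizes): a 2 MiB request served from a fresh
  // 20 MiB kLargeBuffer leaves 18 MiB > kSmallSize, so the block is split;
  // a 19.5 MiB request served from the same 20 MiB block leaves only 0.5 MiB
  // and the block is handed out whole.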
1559
1560 static size_t get_allocation_size(size_t size) {
1561 if (size <= kSmallSize) {
1562 return kSmallBuffer;
1563 } else if (size < kMinLargeAlloc) {
1564 return kLargeBuffer;
1565 } else {
1566 return kRoundLarge * ((size + kRoundLarge - 1) / kRoundLarge);
1567 }
1568 }
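
  // Example mappings (restating the policy above): a 4 KiB request reserves a
  // 2 MiB kSmallBuffer segment, a 3 MiB request reserves a 20 MiB kLargeBuffer
  // segment, and a 21 MiB request reserves 22 MiB (rounded up to the next
  // multiple of kRoundLarge).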
1569
1570 bool get_free_block(AllocParams& p) {
1571 BlockPool& pool = *p.pool;
1572
1573 if (C10_UNLIKELY(
1574 set_fraction &&
1575 CachingAllocatorConfig::garbage_collection_threshold() > 0.0)) {
1576 // Track block reuse interval only when garbage collection is enabled.
1577 for (auto& b : pool.blocks) {
1578 ++b->gc_count;
1579 }
1580 }
1581 auto it = pool.blocks.lower_bound(&p.search_key);
1582 if (it == pool.blocks.end() || (*it)->stream != p.stream())
1583 return false;
1584 // Do not return an oversized block for a large request
1585 if ((p.size() < CachingAllocatorConfig::max_split_size()) &&
1586 ((*it)->size >= CachingAllocatorConfig::max_split_size()))
1587 return false;
1588 // Allow oversized block size to be rounded up but within a limit
1589 if ((p.size() >= CachingAllocatorConfig::max_split_size()) &&
1590 ((*it)->size >= p.size() + kLargeBuffer))
1591 return false;
1592 p.block = *it;
1593 (*it)->gc_count = 0; // Denote this block has been used
1594 pool.blocks.erase(it);
1595 return true;
1596 }
1597
1598 bool trigger_free_memory_callbacks(AllocParams& p) {
1599 bool freed_memory = false;
1600 for (const auto& name : FreeCudaMemoryCallbacksRegistry()->Keys()) {
1601 freed_memory |=
1602 FreeCudaMemoryCallbacksRegistry()->Create(name)->Execute();
1603 }
1604 return freed_memory;
1605 }
1606
1607 void garbage_collect_cached_blocks() {
1608 // Free unused cached blocks to reclaim GPU memory.
1609 // Unlike release_cached_blocks(), this does not enforce synchronization and
    // therefore has lower overhead.
1611
1612 size_t gc_threshold = static_cast<size_t>(
1613 CachingAllocatorConfig::garbage_collection_threshold() *
1614 allowed_memory_maximum);
1615 // No need to trigger GC yet
1616 if (total_allocated_memory <= gc_threshold) {
1617 return;
1618 }
1619 const auto target_size = total_allocated_memory - gc_threshold;
1620 size_t gc_reclaimed = 0;
1621
1622 // Calculate the total age of the free-able blocks. We'll use it later to
1623 // get "avg age" threshold.
1624 double total_age = 0.0;
1625 int freeable_block_count = 0;
1626 for (auto& b : large_blocks.blocks) {
1627 if (!b->is_split()) {
1628 total_age += b->gc_count;
1629 ++freeable_block_count;
1630 }
1631 }
1632 // No free-able blocks?
1633 if (freeable_block_count == 0) {
1634 return;
1635 }
1636
    // Repeat GC until the reclaimed amount reaches the target size.
1638 bool block_freed = true;
1639 while (gc_reclaimed < target_size && block_freed == true &&
1640 freeable_block_count > 0) {
1641 // Free blocks exceeding this age threshold first.
1642 double age_threshold = total_age / freeable_block_count;
1643 // Stop iteration if we can no longer free a block.
1644 block_freed = false;
1645
      // Free blocks whose age exceeds the average. Don't stop upon reaching
      // target_size; we don't want this GC to be triggered frequently.
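      // (For example, freeable blocks with gc_count 1, 3, and 8 give an
      // average age of 4, so only the age-8 block is released in this pass;
      // the threshold is then recomputed over the remaining blocks.)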
1648 auto it = large_blocks.blocks.begin();
1649 while (it != large_blocks.blocks.end()) {
1650 Block* block = *it;
1651 ++it;
1652 if (!block->is_split() && block->gc_count >= age_threshold) {
1653 block_freed = true;
1654 gc_reclaimed += block->size;
1655 total_age -= block->gc_count; // Decrement the age
1656 freeable_block_count--; // One less block that can be freed
1657 release_block(block);
1658 }
1659 }
1660 }
1661 }
1662
1663 bool alloc_block(AllocParams& p, bool isRetry) {
1664 // Defensively checks for preexisting CUDA error state.
1665 C10_CUDA_CHECK(cudaGetLastError());
1666
1667 size_t size = p.alloc_size;
1668 void* ptr;
1669
1670 if (isRetry) {
1671 stats.num_alloc_retries += 1;
1672 }
1673
1674 if (set_fraction &&
1675 total_allocated_memory + size > allowed_memory_maximum) {
1676 p.err = cudaErrorMemoryAllocation;
1677 return false;
1678 } else {
1679 p.err = cudaMallocMaybeCapturing(&ptr, size);
1680 if (p.err != cudaSuccess) {
1681 if (p.err == cudaErrorMemoryAllocation) {
1682 // If this is the first attempt (!isRetry), we can forgive and clear
1683 // CUDA's internal error state.
1684 //
1685 // If this is the second attempt (isRetry), malloc's TORCH_CHECK_WITH
1686 // will take over to throw a helpful exception. The user can choose
1687 // to catch the exception, free some stuff in their script, and
1688 // attempt the allocation again. In this case, we can also forgive and
1689 // clear CUDA's internal error state.
1690 cudaGetLastError();
1691 } else {
1692 // If the error's unrelated to memory allocation, we should throw
1693 // immediately.
1694 C10_CUDA_CHECK(p.err);
1695 }
1696 return false;
1697 }
1698 }
1699
1700 if (p.pool->owner_PrivatePool) {
1701 // The block is for a CUDA graph's PrivatePool.
1702 p.pool->owner_PrivatePool->cudaMalloc_count++;
1703 }
1704
1705 total_allocated_memory += size;
1706 p.block = new Block(p.device(), p.stream(), size, p.pool, (char*)ptr);
1707 for_each_selected_stat_type(p.stat_types, [&](size_t stat_type) {
1708 update_stat(stats.segment[stat_type], 1);
1709 update_stat(stats.reserved_bytes[stat_type], size);
1710 });
1711 if (size >= CachingAllocatorConfig::max_split_size())
1712 update_stat(stats.oversize_segments, 1);
1713
1714 // p.block came from new, not cudaMalloc. It should not be nullptr here.
1715 TORCH_INTERNAL_ASSERT(p.block != nullptr && p.block->ptr != nullptr);
1716 return true;
1717 }
1718
  /** Free one or more oversize blocks to the system allocator, but only
   ** enough to satisfy the target size. **/
1722 bool release_available_cached_blocks(const AllocParams& p) {
1723 if (CachingAllocatorConfig::max_split_size() ==
1724 std::numeric_limits<size_t>::max())
1725 return false;
1726 BlockPool& pool = *p.pool;
1727
    // Because of the std::unique_ptr member, a Block cannot be trivially
    // copied; construct an equivalent search key from its fields instead.
1729 Block key(
1730 p.search_key.device,
1731 p.search_key.stream,
1732 p.search_key.size,
1733 p.search_key.pool,
1734 p.search_key.ptr);
1735 key.size = (key.size < CachingAllocatorConfig::max_split_size())
1736 ? CachingAllocatorConfig::max_split_size()
1737 : key.size;
1738 auto it = pool.blocks.lower_bound(&key);
1739 if (it == pool.blocks.end() || (*it)->stream != p.stream()) {
1740 // No single block is large enough; free multiple oversize blocks,
1741 // starting with the largest
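      // (Illustration, assuming max_split_size_mb:200: a 300 MiB request can
      // be satisfied by releasing unsplit oversize blocks of 250 MiB and
      // 220 MiB cached on the same stream, making room for the subsequent
      // cudaMalloc retry.)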
1742 if (it == pool.blocks.begin())
1743 return false;
1744 size_t totalReleased = 0;
1745 --it; // Back up one item. Now on the largest block for the correct
1746 // stream
1747 while ((totalReleased < key.size) &&
1748 ((*it)->size >= CachingAllocatorConfig::max_split_size()) &&
1749 ((*it)->stream == p.stream())) {
1750 auto cur = it;
1751 totalReleased += (*it)->size;
1752 if (it != pool.blocks.begin()) {
1753 --it;
1754 release_block(*cur);
1755 } else {
1756 release_block(*cur);
1757 break;
1758 }
1759 }
1760 if (totalReleased < key.size)
1761 return false;
1762 } else {
1763 release_block(*it);
1764 }
1765 return true;
1766 }
1767
1768 bool release_cached_blocks() {
1769 // First ensure that all blocks that can't currently be allocated due to
1770 // outstanding events are returned to the pool.
1771 synchronize_and_free_events();
1772
1773 // Free all non-split cached blocks to system allocator
1774 release_blocks(large_blocks);
1775 release_blocks(small_blocks);
1776
1777 for (auto it = graph_pools_freeable.begin();
1778 it != graph_pools_freeable.end();) {
1779 // See notifyCaptureDestroy for the strategy here.
1780 TORCH_INTERNAL_ASSERT(it->second->use_count == 0);
1781 release_blocks(it->second->small_blocks);
1782 release_blocks(it->second->large_blocks);
1783 if (it->second->cudaMalloc_count == 0) {
1784 auto erase_count = graph_pools.erase(it->first);
1785 TORCH_INTERNAL_ASSERT(erase_count == 1);
1786 it = graph_pools_freeable.erase(it);
1787 } else {
1788 ++it;
1789 }
1790 }
1791
1792 return true;
1793 }
1794
1795 void release_block(Block* block) {
1796 C10_CUDA_CHECK(cudaFree((void*)block->ptr));
1797 total_allocated_memory -= block->size;
1798
1799 auto* pool = block->pool;
1800 if (pool->owner_PrivatePool) {
1801 // The cudaFreed block belonged to a CUDA graph's PrivatePool.
1802 TORCH_INTERNAL_ASSERT(pool->owner_PrivatePool->cudaMalloc_count > 0);
1803 pool->owner_PrivatePool->cudaMalloc_count--;
1804 }
1805
1806 StatTypes stat_types = {false};
1807 stat_types[static_cast<size_t>(StatType::AGGREGATE)] = true;
1808 stat_types[static_cast<size_t>(get_stat_type_for_pool(*pool))] = true;
1809 for_each_selected_stat_type(stat_types, [&](size_t stat_type) {
1810 update_stat(stats.segment[stat_type], -1);
1811 update_stat(
1812 stats.reserved_bytes[stat_type],
1813 -static_cast<std::int64_t>(block->size));
1814 });
1815 if (block->size >= CachingAllocatorConfig::max_split_size())
1816 update_stat(stats.oversize_segments, -1);
1817 if (block->history) {
1818 record_trace(
1819 TraceEntry::SEGMENT_FREE,
1820 int64_t(block->ptr),
1821 block->size,
1822 block->stream,
1823 block->history->h.context);
1824 }
1825 pool->blocks.erase(block);
1826 delete block;
1827 }
1828
1829 void release_blocks(BlockPool& pool) {
1830 // Frees all non-split blocks
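    // (A block with neither a prev nor a next neighbor was never split, or its
    // neighbors have already been merged back, so the whole cudaMalloc'ed
    // segment can be returned to the driver.)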
1831 auto it = pool.blocks.begin();
1832 while (it != pool.blocks.end()) {
1833 Block* block = *it;
1834 ++it;
1835 if (!block->prev && !block->next) {
1836 release_block(block);
1837 }
1838 }
1839 }
1840
1841 EventPool::Event create_event_internal(int idx) {
1842 // Leak the event pool to avoid shutdown issues.
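    // (Otherwise a static destructor could run after the CUDA runtime has
    // already been torn down during process exit.)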
1843 static auto* event_pool = new EventPool();
1844 return event_pool->get(idx);
1845 }
1846
1847 void synchronize_and_free_events() {
1848 // Synchronize on outstanding events and then free associated blocks.
1849
    // This function syncs, so capture should not be underway. Might as well
    // make sure capture-deferred end-of-life events get processed too.
1852 TORCH_INTERNAL_ASSERT(captures_underway == 0);
1853 insert_events_deferred_until_no_capture();
1854
1855 for (auto& st : cuda_events) {
1856 for (auto& e : st.second) {
1857 EventPool::Event event = std::move(e.first);
1858 Block* block = e.second;
1859
1860 C10_CUDA_CHECK(cudaEventSynchronize(*event));
1861
1862 block->event_count--;
1863 if (block->event_count == 0) {
1864 free_block(block);
1865 }
1866 }
1867 }
1868
1869 cuda_events.clear();
1870 }
1871
1872 void insert_events(Block* block) {
1873 int prev_device;
1874 C10_CUDA_CHECK(cudaGetDevice(&prev_device));
1875
1876 stream_set streams(std::move(block->stream_uses));
1877 AT_ASSERT(block->stream_uses.empty());
1878 for (auto& stream : streams) {
1879 C10_CUDA_CHECK(cudaSetDevice(stream.device_index()));
1880
1881 EventPool::Event event =
1882 create_event_internal(static_cast<int>(stream.device_index()));
1883 C10_CUDA_CHECK(cudaEventRecord(*event, stream.stream()));
1884
1885 block->event_count++;
1886 cuda_events[stream].emplace_back(std::move(event), block);
1887 }
1888
1889 C10_CUDA_CHECK(cudaSetDevice(prev_device));
1890 }
1891
1892 void insert_events_deferred_until_no_capture() {
1893 if (C10_UNLIKELY(needs_events_deferred_until_no_capture.size() > 0)) {
1894 for (auto* block : needs_events_deferred_until_no_capture) {
1895 TORCH_INTERNAL_ASSERT(!block->stream_uses.empty());
1896 insert_events(block);
1897 }
1898 needs_events_deferred_until_no_capture.clear();
1899 }
1900 }
1901
1902 void process_events() {
1903 insert_events_deferred_until_no_capture();
1904
    // Process outstanding cudaEvents. Events that have completed are
    // removed from the queue, and the 'event_count' for the
    // corresponding allocation is decremented. We maintain a separate
    // list of events per stream to avoid head-of-line delays if one
    // or more streams have long-running operations.
1910
1911 // Iterate over different streams.
1912 for (auto it = cuda_events.begin(); it != cuda_events.end();) {
1913 // Iterate over this stream's (event, block) pairs.
1914 while (!it->second.empty()) {
1915 auto& e = it->second.front();
1916 EventPool::Event event = std::move(e.first);
1917 Block* block = e.second;
1918
1919 cudaError_t err = C10_CUDA_ERROR_HANDLED(cudaEventQuery(*event));
1920 if (err == cudaErrorNotReady) {
1921 // ignore and clear the error if not ready
1922 cudaGetLastError();
1923 // Return the ownership of the Event (unique ptr)
1924 e.first = std::move(event);
1925 break;
1926 } else if (err != cudaSuccess) {
1927 C10_CUDA_CHECK(err);
1928 }
1929
1930 block->event_count--;
1931 if (block->event_count == 0) {
1932 free_block(block);
1933 }
1934 it->second.pop_front();
1935 }
1936
1937 if (it->second.empty()) {
1938 it = cuda_events.erase(it);
1939 } else {
1940 it++;
1941 }
1942 }
1943 }
1944
  // Iterates over all memory blocks for the given device in the given pool,
  // recording the largest block size
1946 void cache_info_aux(const BlockPool& pool, size_t* largest) {
1947 for (const auto& block : pool.blocks) {
1948 const auto blocksize = block->size;
1949 if (blocksize > *largest) {
1950 *largest = blocksize;
1951 }
1952 }
1953 }
1954
1955 void record_trace(
1956 TraceEntry::Action action,
1957 int64_t addr,
1958 size_t size,
1959 cudaStream_t stream,
1960 std::shared_ptr<Context> context) {
1961 auto te = TraceEntry(
1962 action,
1963 addr,
1964 size,
1965 stream,
1966 alloc_trace_record_context_ ? std::move(context) : nullptr);
1967 if (alloc_trace->size() < alloc_trace_max_entries_) {
1968 alloc_trace->emplace_back(te);
1969 } else {
1970 (*alloc_trace)[alloc_trace_next++] = te;
1971 if (alloc_trace_next == alloc_trace_max_entries_) {
1972 alloc_trace_next = 0;
1973 }
1974 }
1975 }
1976};
1977
1978// Returns whether to force all allocations to bypass the caching allocator and
1979// go straight to cudaMalloc. This setting is useful when debugging GPU memory
1980// errors, since the caching allocator foils cuda-memcheck.
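// A debugging run might look like this (illustrative command line):
//   PYTORCH_NO_CUDA_MEMORY_CACHING=1 compute-sanitizer python repro.py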
1981bool forceUncachedAllocator() {
1982 static bool force_uncached =
1983 getenv("PYTORCH_NO_CUDA_MEMORY_CACHING") != nullptr;
1984 return force_uncached;
1985}
1986
1987static void uncached_delete(void* ptr) {
1988 const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
1989 if (C10_UNLIKELY(interp)) {
1990 (*interp)->trace_gpu_memory_deallocation(reinterpret_cast<uintptr_t>(ptr));
1991 }
1992 C10_CUDA_CHECK(cudaFree(ptr));
1993}
1994
1995void local_raw_delete(void* ptr);
1996
1997class NativeCachingAllocator : public CUDAAllocator {
1998 private:
1999 std::mutex mutex;
2000
2001 // allocated blocks by device pointer
2002 ska::flat_hash_map<void*, Block*> allocated_blocks;
2003
2004 void add_allocated_block(Block* block) {
2005 std::lock_guard<std::mutex> lock(mutex);
2006 allocated_blocks[block->ptr] = block;
2007 }
2008
2009 public:
2010 std::vector<std::unique_ptr<DeviceCachingAllocator>> device_allocator;
2011
2012 Block* get_allocated_block(void* ptr, bool remove = false) {
2013 std::lock_guard<std::mutex> lock(mutex);
2014 auto it = allocated_blocks.find(ptr);
2015 if (it == allocated_blocks.end()) {
2016 return nullptr;
2017 }
2018 Block* block = it->second;
2019 if (remove) {
2020 allocated_blocks.erase(it);
2021 }
2022 return block;
2023 }
2024
2025 void init(int device_count) override {
2026 const auto size = static_cast<int64_t>(device_allocator.size());
2027 if (size < device_count) {
2028 device_allocator.resize(device_count);
2029 for (const auto i : c10::irange(size, device_count)) {
2030 device_allocator[i] = std::make_unique<DeviceCachingAllocator>();
2031 }
2032 }
2033 }
2034
2035 bool initialized() override {
2036 return device_allocator.size() > 0;
2037 }
2038
2039 /** allocates a block which is safe to use from the provided stream */
2040 void malloc(void** devPtr, int device, size_t size, cudaStream_t stream) {
2041 TORCH_INTERNAL_ASSERT(
2042 0 <= device && static_cast<size_t>(device) < device_allocator.size(),
2043 "Allocator not initialized for device ",
2044 device,
2045 ": did you call init?");
2046 Block* block = device_allocator[device]->malloc(device, size, stream);
2047 add_allocated_block(block);
2048 *devPtr = (void*)block->ptr;
2049 const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
2050 if (C10_UNLIKELY(interp)) {
2051 (*interp)->trace_gpu_memory_allocation(
2052 reinterpret_cast<uintptr_t>(*devPtr));
2053 }
2054 }
2055
2056 void free(void* ptr) {
2057 if (!ptr) {
2058 return;
2059 }
2060 Block* block = get_allocated_block(ptr, true /* remove */);
2061 if (!block) {
2062 TORCH_CHECK(false, "invalid device pointer: ", ptr);
2063 }
2064 const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
2065 if (C10_UNLIKELY(interp)) {
2066 (*interp)->trace_gpu_memory_deallocation(
2067 reinterpret_cast<uintptr_t>(block->ptr));
2068 }
2069 device_allocator[block->device]->free(block);
2070 }
2071
2072 void setMemoryFraction(double fraction, int device) override {
2073 TORCH_INTERNAL_ASSERT(
2074 0 <= device && static_cast<size_t>(device) < device_allocator.size(),
2075 "Allocator not initialized for device ",
2076 device,
2077 ": did you call init?");
2078 TORCH_INTERNAL_ASSERT(
2079 0 <= fraction && fraction <= 1,
2080 "invalid fraction:",
2081 fraction,
2082 ". Please set within (0, 1).");
2083 int activated_device;
2084 C10_CUDA_CHECK(cudaGetDevice(&activated_device));
2085 if (activated_device != device) {
2086 C10_CUDA_CHECK(cudaSetDevice(device));
2087 }
2088 device_allocator[device]->setMemoryFraction(fraction);
2089 }
2090
2091 void recordHistory(
2092 bool enabled,
2093 CreateContextFn context_recorder,
2094 size_t alloc_trace_max_entries,
2095 bool alloc_trace_record_context) override {
2096 int device;
2097 C10_CUDA_CHECK(cudaGetDevice(&device));
2098 device_allocator[device]->recordHistory(
2099 enabled,
2100 std::move(context_recorder),
2101 alloc_trace_max_entries,
2102 alloc_trace_record_context);
2103 }
2104
2105 void attachOutOfMemoryObserver(OutOfMemoryObserver observer) override {
2106 int device;
2107 C10_CUDA_CHECK(cudaGetDevice(&device));
2108 device_allocator[device]->attachOutOfMemoryObserver(std::move(observer));
2109 }
2110
2111 void emptyCache() override {
2112 for (auto& da : device_allocator)
2113 da->emptyCache();
2114 }
2115
2116 void* getBaseAllocation(void* ptr, size_t* outSize) override {
2117 Block* block = get_allocated_block(ptr);
2118 if (!block) {
2119 TORCH_CHECK(false, "invalid device pointer: ", ptr);
2120 }
2121 return device_allocator[block->device]->getBaseAllocation(block, outSize);
2122 }
2123
2124 void recordStream(const DataPtr& ptr, cuda::CUDAStream stream) override {
    // An empty tensor's storage().data() might be a null ptr. As there are no
    // blocks associated with such tensors, it is fine to do nothing here.
2127 if (!ptr.get()) {
2128 return;
2129 }
2130
    // If a tensor was not allocated by this instance, simply skip it.
    // This usually happens when CUDA tensors are shared across processes;
    // we have implemented a reference-counting-based sharing mechanism to
    // guarantee tensors won't be accidentally freed by one process while
    // they are still being used in another.
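    // (Illustrative user-side sketch: a tensor `t` whose block was allocated
    // on the current stream but is consumed on `side_stream` would typically
    // be tagged with something like
    //   c10::cuda::CUDACachingAllocator::recordStream(
    //       t.storage().data_ptr(), side_stream);
    // so the block is not reused until side_stream's pending work completes.)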
2136 if (ptr.get_deleter() != &local_raw_delete)
2137 return;
2138
2139 Block* block = get_allocated_block(ptr.get());
    // The block must not be null by the time we reach here
2141 TORCH_INTERNAL_ASSERT(block != nullptr, "No allocated block can be found");
2142 device_allocator[block->device]->recordStream(block, stream);
2143 }
2144
2145 SnapshotInfo snapshot() override {
2146 SnapshotInfo result;
2147 for (auto& da : device_allocator) {
2148 result.device_traces.emplace_back(da->trace());
2149 auto snap = da->snapshot();
2150 result.segments.insert(result.segments.end(), snap.begin(), snap.end());
2151 }
2152 return result;
2153 }
2154 DataPtr allocate(size_t size) const override {
2155 constexpr size_t one_exa_bytes = 1152921504606846976ULL;
2156 TORCH_CHECK_WITH(
2157 OutOfMemoryError,
2158 size < one_exa_bytes,
2159 "CUDA out of memory. Tried to allocate more than 1EB memory.");
2160 int device;
2161 C10_CUDA_CHECK(cudaGetDevice(&device));
2162 void* r = nullptr;
2163 if (forceUncachedAllocator()) {
2164 // Deliberately don't use cudaMallocMaybeCapturing here, to force an error
2165 // if someone tries to use forceUncachedAllocator while capturing.
2166 C10_CUDA_CHECK(cudaMalloc(&r, size));
2167 const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
2168 if (C10_UNLIKELY(interp)) {
2169 (*interp)->trace_gpu_memory_allocation(reinterpret_cast<uintptr_t>(r));
2170 }
2171 return {r, r, &uncached_delete, Device(DeviceType::CUDA, device)};
2172 }
2173 if (size != 0) {
      // Allocator declares allocate() const!?
2175 const_cast<NativeCachingAllocator*>(this)->malloc(
2176 &r, device, size, cuda::getCurrentCUDAStream(device));
2177 }
2178 return {r, r, &local_raw_delete, Device(DeviceType::CUDA, device)};
2179 }
2180 DeleterFnPtr raw_deleter() const override {
2181 if (forceUncachedAllocator()) {
2182 return &uncached_delete;
2183 } else {
2184 return &local_raw_delete;
2185 }
2186 }
2187 void cacheInfo(int dev_id, size_t* largestBlock) override {
2188 device_allocator[dev_id]->cacheInfo(largestBlock);
2189 }
2190 void assertValidDevice(int device) {
2191 const auto device_num = device_allocator.size();
2192 TORCH_CHECK(
2193 0 <= device && device < static_cast<int64_t>(device_num),
2194 "Invalid device argument ",
2195 device,
2196 ": did you call init?");
2197 }
2198
2199 DeviceStats getDeviceStats(int device) override {
2200 assertValidDevice(device);
2201 return device_allocator[device]->getStats();
2202 }
2203
2204 void resetAccumulatedStats(int device) override {
2205 assertValidDevice(device);
2206 device_allocator[device]->resetAccumulatedStats();
2207 }
2208
2209 void resetPeakStats(int device) override {
2210 assertValidDevice(device);
2211 device_allocator[device]->resetPeakStats();
2212 }
2213 // CUDAGraph interactions
2214 void notifyCaptureBegin(
2215 int device,
2216 CaptureId_t graph_id,
2217 MempoolId_t mempool_id) override {
2218 assertValidDevice(device);
2219 device_allocator[device]->notifyCaptureBegin(
2220 graph_id, std::move(mempool_id));
2221 }
2222
2223 void notifyCaptureAboutToEnd(int device, CaptureId_t graph_id) override {
2224 assertValidDevice(device);
2225 device_allocator[device]->notifyCaptureAboutToEnd(graph_id);
2226 }
2227
2228 void notifyCaptureEnded(int device, CaptureId_t graph_id) override {} // no-op
2229
2230 void notifyCaptureDestroy(int device, MempoolId_t mempool_id) override {
2231 assertValidDevice(device);
2232 device_allocator[device]->notifyCaptureDestroy(std::move(mempool_id));
2233 }
2234
2235 void* raw_alloc(size_t nbytes) override {
2236 if (nbytes == 0) {
2237 return nullptr;
2238 }
2239 int device;
2240 C10_CUDA_CHECK(cudaGetDevice(&device));
2241 void* r = nullptr;
2242 malloc(&r, device, nbytes, cuda::getCurrentCUDAStream(device));
2243 return r;
2244 }
2245
2246 void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) override {
2247 if (nbytes == 0) {
2248 return nullptr;
2249 }
2250 int device;
2251 C10_CUDA_CHECK(cudaGetDevice(&device));
2252 void* r = nullptr;
2253 malloc(&r, device, nbytes, stream);
2254 return r;
2255 }
2256 bool needsPoolSpecificPeerAccess() override {
2257 return false;
2258 }
2259 void raw_delete(void* ptr) override {
2260 this->free(ptr);
2261 }
2262
  // In CUDA IPC, the sender sends a tensor to the receiver; getIpcDevPtr
  // is called by the receiving process to map the CUDA memory from the sending
  // process into its own address space.
  //
  // CUDA IPC only allows sharing a whole memory block associated with a
  // cudaIpcMemHandle_t, and the handle can be opened only **once** per context
  // per process. There can be multiple types of storage in the same IPC memory
  // block, so we must cache the device pointer to construct typed storages as
  // they come.
  //
  // ipcMemHandle_to_devptr maps a cudaIpcMemHandle_t to a device pointer in
  // this process that can be used to access the memory block in the sender
  // process. It only saves a weak_ptr of the device pointer in the map; the
  // shared_ptr will be used to reconstruct all storages in this cudaMalloc
  // allocation, and the mapping is closed via cudaIpcCloseMemHandle when the
  // reference count drops to 0.
2277 //
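  // (Illustrative flow: the sending process obtains the handle via
  // cudaIpcGetMemHandle on its allocation and ships the raw handle bytes to
  // the receiver, which passes them here as `handle`; every storage that lives
  // in that allocation then shares the cached shared_ptr returned below.)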
2278 std::mutex IpcMutex;
2279 ska::flat_hash_map<std::string, std::weak_ptr<void>> ipcMemHandle_to_devptr;
2280 std::shared_ptr<void> getIpcDevPtr(std::string handle) override {
2281 std::lock_guard<std::mutex> lock(IpcMutex);
2282
2283 auto iter = ipcMemHandle_to_devptr.find(handle);
2284 if (iter != ipcMemHandle_to_devptr.end()) {
2285 auto devptr = iter->second.lock();
2286 if (devptr)
2287 return devptr;
2288 }
    // This ipcMemHandle hasn't been opened yet, or it has already expired;
    // open it to enable IPC access to that memory block.
2291 void* dev = nullptr;
2292 auto ipc_handle =
2293 reinterpret_cast<const cudaIpcMemHandle_t*>(handle.c_str());
2294 C10_CUDA_CHECK(cudaIpcOpenMemHandle(
2295 &dev, *ipc_handle, cudaIpcMemLazyEnablePeerAccess));
    // devPtr has to be deleted on the same device it was created on.
2297 int curr_device;
2298 C10_CUDA_CHECK(cudaGetDevice(&curr_device));
2299 auto sp =
2300 std::shared_ptr<void>(dev, [handle, curr_device, this](void* ptr) {
2301 cuda::CUDAGuard device_guard(curr_device);
2302 std::lock_guard<std::mutex> deleter_lock(IpcMutex);
2303 C10_CUDA_CHECK(cudaIpcCloseMemHandle(ptr));
2304 ipcMemHandle_to_devptr.erase(handle);
2305 });
2306 std::weak_ptr<void> wp = sp;
    // To eliminate an additional search, we can use insert() with the cached
    // iterator as a hint. insert() doesn't overwrite when the key already
    // exists (i.e. the old ptr has expired), but since the deleter for sp
    // erases the entry, this should be safe to do now.
2311 ipcMemHandle_to_devptr.insert(iter, {handle, wp});
2312
2313 return sp;
2314 }
2315 std::string name() override {
2316 return "native";
2317 }
2318};
2319
2320NativeCachingAllocator allocator;
2321
2322void local_raw_delete(void* ptr) {
2323 allocator.free(ptr);
2324}
2325
2330} // namespace Native
2331
2332// General caching allocator utilities
2333void setAllocatorSettings(const std::string& env) {
2334 CachingAllocatorConfig::instance().parseArgs(env.c_str());
2335}
2336
2337// Size pretty-printer
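// For example, format_size(512) yields "512 bytes" and format_size(2097152)
// yields "2.00 MiB".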
2338inline std::string format_size(uint64_t size) {
2339 std::ostringstream os;
2340 os.precision(2);
2341 os << std::fixed;
2342 if (size <= 1024) {
2343 os << size << " bytes";
2344 } else if (size <= 1048576) {
2345 os << (size / 1024.0);
2346 os << " KiB";
2347 } else if (size <= 1073741824ULL) {
2348 os << size / 1048576.0;
2349 os << " MiB";
2350 } else {
2351 os << size / 1073741824.0;
2352 os << " GiB";
2353 }
2354 return os.str();
2355}
2356
2357namespace CudaMallocAsync {
2358// If this is put in its own header file, it gets incorrectly renamed in HIPify.
2359CUDAAllocator* allocator();
2360
2361} // namespace CudaMallocAsync
2362
2363struct BackendStaticInitializer {
  // Parses the env for the backend at load time, duplicating some logic from
  // CachingAllocatorConfig. CachingAllocatorConfig double-checks it later (at
  // runtime). Defers verbose exceptions and error checks, including CUDA
  // version checks, to CachingAllocatorConfig's runtime double-check. If this
  // works, maybe we should move all of CachingAllocatorConfig here?
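  // For example, setting
  //   PYTORCH_CUDA_ALLOC_CONF=backend:cudaMallocAsync
  // selects the CudaMallocAsync allocator here; any other key:value pairs in
  // the string are left for CachingAllocatorConfig to parse at runtime.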
2369 CUDAAllocator* parseEnvForBackend() {
2370 const char* val = getenv("PYTORCH_CUDA_ALLOC_CONF");
2371 if (val != nullptr) {
2372 const std::string config(val);
2373
2374 std::regex exp("[\\s,]+");
2375 std::sregex_token_iterator it(config.begin(), config.end(), exp, -1);
2376 std::sregex_token_iterator end;
2377 std::vector<std::string> options(it, end);
2378
2379 for (auto option : options) {
2380 std::regex exp2("[:]+");
2381 std::sregex_token_iterator it2(option.begin(), option.end(), exp2, -1);
2382 std::sregex_token_iterator end2;
2383 std::vector<std::string> kv(it2, end2);
2384 if (kv.size() >= 2) {
2385 if (kv[0] == "backend") {
2386 if (kv[1] == "cudaMallocAsync")
2387 return CudaMallocAsync::allocator();
2388 if (kv[1] == "native")
2389 return &Native::allocator;
2390 }
2391 }
2392 }
2393 }
2394 return &Native::allocator;
2395 }
2396
2397 BackendStaticInitializer() {
2398 auto r = parseEnvForBackend();
2399 allocator.store(r);
2400 }
2401};
2402
2403std::atomic<CUDAAllocator*> allocator{};
2404BackendStaticInitializer backend_static_initializer;
2405
2406} // namespace CUDACachingAllocator
2407} // namespace cuda
2408} // namespace c10
2409