1 | #pragma once |
2 | |
3 | #include <c10/core/Allocator.h> |
4 | #include <c10/cuda/CUDAStream.h> |
5 | |
6 | namespace at { |
7 | namespace cuda { |
8 | |
9 | // |
10 | // A caching allocator for CUDA host allocations (pinned memory). |
11 | // |
12 | // This provides a drop-in replacement for THCudaHostAllocator, which re-uses |
13 | // freed pinned (page-locked) memory allocations. This avoids device |
14 | // synchronizations due to cudaFreeHost calls. |
15 | // |
16 | // To ensure correct behavior, CachingHostAllocator_recordEvent must be |
17 | // called anytime a pointer from this allocator is used in a cudaMemcpyAsync |
18 | // call between host and device, and passed the corresponding context from the |
19 | // allocation. This is currently invoked by at::native::copy_kernel_cuda. |
20 | // |
21 | // Note that this allocator does not split larger allocations into smaller |
22 | // blocks, unlike the caching device allocator. |
23 | // |
24 | TORCH_CUDA_CPP_API c10::Allocator* getCachingHostAllocator(); |
25 | |
26 | // Records an event in the specified stream. The allocation corresponding to the |
27 | // input `ptr`/`ctx` will not be re-used until the event has occurred. |
28 | TORCH_CUDA_CPP_API bool |
29 | CachingHostAllocator_recordEvent(void* ptr, void* ctx, c10::cuda::CUDAStream stream); |
30 | |
31 | // Releases cached pinned memory allocations via cudaFreeHost |
32 | TORCH_CUDA_CPP_API void CachingHostAllocator_emptyCache(); |
33 | |
34 | inline TORCH_CUDA_CPP_API at::DataPtr HostAlloc(size_t size) { |
35 | return getCachingHostAllocator()->allocate(size); |
36 | } |
37 | |
38 | } // namespace cuda |
39 | } // namespace at |
40 | |