1 | #include "memory_pool.h" |
2 | #include "taichi/system/timer.h" |
3 | #include "taichi/rhi/cuda/cuda_driver.h" |
4 | #include "taichi/rhi/cuda/cuda_device.h" |
5 | |
6 | namespace taichi::lang { |
7 | |
// In the future we wish to move the MemoryPool inside each Device
// so that the memory allocated from each Device can be used as-is.
//
// Constructs the pool for `arch`, allocating through `device`, and spawns
// the daemon thread that services asynchronous allocation requests posted
// to the MemRequestQueue (see set_queue() / daemon()).
MemoryPool::MemoryPool(Arch arch, Device *device)
    : arch_(arch), device_(device) {
  TI_TRACE("Memory pool created. Default buffer size per allocator = {} MB" ,
           default_allocator_size / 1024 / 1024);
  terminating = false;
  killed = false;
  processed_tail = 0;
  // The queue is attached later via set_queue(); daemon() idles until then.
  queue = nullptr;
#if defined(TI_WITH_CUDA)
  // http://on-demand.gputechconf.com/gtc/2014/presentations/S4158-cuda-streams-best-practices-common-pitfalls.pdf
  // Stream 0 has special synchronization rules: Operations in stream 0 cannot
  // overlap other streams except for those streams with cudaStreamNonBlocking
  // Do not use cudaCreateStream (with no flags) here!
  if (use_cuda_stream && arch_ == Arch::cuda) {
    CUDADriver::get_instance().stream_create(&cuda_stream,
                                             CU_STREAM_NON_BLOCKING);
  }
#endif
  // Start the daemon thread last so it never observes a partially
  // initialized pool (flags, queue pointer, CUDA stream are all set above).
  th = std::make_unique<std::thread>([this] { this->daemon(); });
}
30 | |
31 | void MemoryPool::set_queue(MemRequestQueue *queue) { |
32 | std::lock_guard<std::mutex> _(mut); |
33 | this->queue = queue; |
34 | } |
35 | |
36 | void *MemoryPool::allocate(std::size_t size, std::size_t alignment) { |
37 | std::lock_guard<std::mutex> _(mut_allocators); |
38 | void *ret = nullptr; |
39 | if (!allocators.empty()) { |
40 | ret = allocators.back()->allocate(size, alignment); |
41 | } |
42 | if (!ret) { |
43 | // allocation have failed |
44 | auto new_buffer_size = std::max(size, default_allocator_size); |
45 | allocators.emplace_back( |
46 | std::make_unique<UnifiedAllocator>(new_buffer_size, arch_, device_)); |
47 | ret = allocators.back()->allocate(size, alignment); |
48 | } |
49 | TI_ASSERT(ret); |
50 | return ret; |
51 | } |
52 | |
// Reads a value of type T from `ptr`, which may live in CUDA device memory.
// On CUDA the read goes through the pool's non-blocking stream; elsewhere it
// is a plain host read through the (volatile) pointer.
template <typename T>
T MemoryPool::fetch(volatile void *ptr) {
  T ret;
  if (use_cuda_stream && arch_ == Arch::cuda) {
#if TI_WITH_CUDA
    // Drain work already enqueued on the stream so the device-side value is
    // final before we copy it out.
    CUDADriver::get_instance().stream_synchronize(cuda_stream);
    CUDADriver::get_instance().memcpy_device_to_host_async(
        &ret, (void *)ptr, sizeof(T), cuda_stream);
    // Wait for the async copy to land in `ret` before returning it.
    CUDADriver::get_instance().stream_synchronize(cuda_stream);
#else
    TI_NOT_IMPLEMENTED
#endif
  } else {
    // Host memory: a direct read suffices.
    ret = *(T *)ptr;
  }
  return ret;
}
70 | |
// Writes `val` to `dest`, which may live in CUDA device memory. On CUDA the
// write is an async host-to-device copy on the pool's stream followed by a
// synchronize, so the value is visible on the device before returning.
template <typename T>
void MemoryPool::push(volatile T *dest, const T &val) {
  if (use_cuda_stream && arch_ == Arch::cuda) {
#if TI_WITH_CUDA
    CUDADriver::get_instance().memcpy_host_to_device_async(
        (void *)(dest), (void *)&val, sizeof(T), cuda_stream);
    // Block until the copy completes; `val` is a stack reference and must not
    // be read by the driver after this function returns.
    CUDADriver::get_instance().stream_synchronize(cuda_stream);
#else
    TI_NOT_IMPLEMENTED
#endif
  } else {
    // Host memory: a direct write suffices.
    *(T *)dest = val;
  }
}
85 | |
// Daemon thread body: polls the request queue roughly every millisecond and
// fulfills at most one pending allocation per iteration, writing the
// resulting pointer back into the request slot via push(). Exits — setting
// `killed` as an acknowledgement — once terminate() raises `terminating`.
void MemoryPool::daemon() {
  while (true) {
    Time::usleep(1000);
    // Hold `mut` for the whole iteration: it orders us against set_queue()
    // and the terminate() shutdown handshake.
    std::lock_guard<std::mutex> _(mut);
    if (terminating) {
      killed = true;
      break;
    }
    if (!queue) {
      // No queue attached yet (set_queue() not called); keep idling.
      continue;
    }

    // poll allocation requests.
    using tail_type = decltype(MemRequestQueue::tail);
    auto tail = fetch<tail_type>(&queue->tail);
    if (tail > processed_tail) {
      // allocate new buffer
      auto i = processed_tail;
      TI_DEBUG("Processing memory alloc request {}" , i);
      auto req = fetch<MemRequest>(&queue->requests[i]);
      if (req.size == 0 || req.alignment == 0) {
        // The producer bumped `tail` before finishing the request body;
        // leave `processed_tail` untouched and retry on the next poll rather
        // than acting on a half-written entry.
        TI_DEBUG(" Incomplete memory alloc request {} fetched. Skipping" , i);
        continue;
      }
      TI_DEBUG(" Allocating memory {} B (alignment {}B) " , req.size,
               req.alignment);
      auto ptr = allocate(req.size, req.alignment);
      TI_DEBUG(" Allocated. Ptr = {:p}" , ptr);
      // Publish the pointer back into the request slot for the requester.
      push(&queue->requests[i].ptr, (uint8 *)ptr);
      processed_tail += 1;
    }
  }
}
119 | |
// Stops the daemon thread: raises `terminating` under the mutex, joins the
// thread, and asserts it acknowledged shutdown by setting `killed`.
void MemoryPool::terminate() {
  {
    std::lock_guard<std::mutex> _(mut);
    terminating = true;
  }
  // Join outside the lock so daemon() can acquire `mut` to observe the flag.
  th->join();
  TI_ASSERT(killed);
#if 0 && defined(TI_WITH_CUDA)
  // NOTE(review): this destruction path is compiled out (#if 0), so the CUDA
  // stream created in the constructor is never destroyed — presumably a
  // deliberate workaround; confirm before re-enabling.
  if (arch_ == Arch::cuda)
    CUDADriver::get_instance().cudaStreamDestroy(cuda_stream);
#endif
}
132 | |
133 | MemoryPool::~MemoryPool() { |
134 | if (!killed) { |
135 | terminate(); |
136 | } |
137 | } |
138 | |
139 | } // namespace taichi::lang |
140 | |