#include "memory_pool.h"
#include "taichi/system/timer.h"
#include "taichi/rhi/cuda/cuda_driver.h"
#include "taichi/rhi/cuda/cuda_device.h"

namespace taichi::lang {

// In the future we wish to move the MemoryPool inside each Device
// so that the memory allocated from each Device can be used as-is.
MemoryPool::MemoryPool(Arch arch, Device *device)
    : arch_(arch), device_(device) {
  TI_TRACE("Memory pool created. Default buffer size per allocator = {} MB",
           default_allocator_size / 1024 / 1024);
  terminating = false;
  killed = false;
  processed_tail = 0;
  queue = nullptr;
#if defined(TI_WITH_CUDA)
  // http://on-demand.gputechconf.com/gtc/2014/presentations/S4158-cuda-streams-best-practices-common-pitfalls.pdf
  // Stream 0 has special synchronization rules: operations in stream 0 cannot
  // overlap with other streams, except for streams created with the
  // cudaStreamNonBlocking flag (CU_STREAM_NON_BLOCKING in the driver API).
  // Do not create the stream with default flags (a blocking stream) here!
  if (use_cuda_stream && arch_ == Arch::cuda) {
    CUDADriver::get_instance().stream_create(&cuda_stream,
                                             CU_STREAM_NON_BLOCKING);
  }
#endif
  th = std::make_unique<std::thread>([this] { this->daemon(); });
}

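// Registers the host-side request queue that the daemon thread polls.
// Guarded by `mut` so the daemon never observes a half-set pointer.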
void MemoryPool::set_queue(MemRequestQueue *queue) {
  std::lock_guard<std::mutex> _(mut);
  this->queue = queue;
}

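// Allocates `size` bytes with the given alignment from the most recently
// created UnifiedAllocator. If that allocator cannot satisfy the request,
// a new allocator of max(size, default_allocator_size) bytes is appended
// and the allocation is retried from it.
//
// Usage sketch (hypothetical caller):
//   void *buf = pool->allocate(/*size=*/4096, /*alignment=*/64);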
void *MemoryPool::allocate(std::size_t size, std::size_t alignment) {
  std::lock_guard<std::mutex> _(mut_allocators);
  void *ret = nullptr;
  if (!allocators.empty()) {
    ret = allocators.back()->allocate(size, alignment);
  }
  if (!ret) {
    // The allocation failed; grow the pool with a new allocator that is
    // large enough for this request.
    auto new_buffer_size = std::max(size, default_allocator_size);
    allocators.emplace_back(
        std::make_unique<UnifiedAllocator>(new_buffer_size, arch_, device_));
    ret = allocators.back()->allocate(size, alignment);
  }
  TI_ASSERT(ret);
  return ret;
}

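// Reads a single T from `ptr`. When a CUDA stream is in use, `ptr` may live
// in device memory, so the value is copied back asynchronously on
// `cuda_stream` and the stream is synchronized before and after the copy.
// Otherwise the pointer is dereferenced directly on the host.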
template <typename T>
T MemoryPool::fetch(volatile void *ptr) {
  T ret;
  if (use_cuda_stream && arch_ == Arch::cuda) {
#if TI_WITH_CUDA
    CUDADriver::get_instance().stream_synchronize(cuda_stream);
    CUDADriver::get_instance().memcpy_device_to_host_async(
        &ret, (void *)ptr, sizeof(T), cuda_stream);
    CUDADriver::get_instance().stream_synchronize(cuda_stream);
#else
    TI_NOT_IMPLEMENTED
#endif
  } else {
    ret = *(T *)ptr;
  }
  return ret;
}

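// Writes `val` to `dest`, mirroring fetch(): an asynchronous host-to-device
// copy on `cuda_stream` followed by a stream synchronization when CUDA is
// used, or a plain host store otherwise.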
template <typename T>
void MemoryPool::push(volatile T *dest, const T &val) {
  if (use_cuda_stream && arch_ == Arch::cuda) {
#if TI_WITH_CUDA
    CUDADriver::get_instance().memcpy_host_to_device_async(
        (void *)(dest), (void *)&val, sizeof(T), cuda_stream);
    CUDADriver::get_instance().stream_synchronize(cuda_stream);
#else
    TI_NOT_IMPLEMENTED
#endif
  } else {
    *(T *)dest = val;
  }
}

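// Daemon thread body: wakes up roughly every millisecond, compares the
// queue's tail against `processed_tail`, and serves at most one pending
// MemRequest per iteration by allocating the requested buffer and writing
// the resulting pointer back into the request slot.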
void MemoryPool::daemon() {
  while (true) {
    Time::usleep(1000);
    std::lock_guard<std::mutex> _(mut);
    if (terminating) {
      killed = true;
      break;
    }
    if (!queue) {
      continue;
    }

    // Poll allocation requests.
    using tail_type = decltype(MemRequestQueue::tail);
    auto tail = fetch<tail_type>(&queue->tail);
    if (tail > processed_tail) {
      // Serve the next pending request by allocating a new buffer.
      auto i = processed_tail;
      TI_DEBUG("Processing memory alloc request {}", i);
      auto req = fetch<MemRequest>(&queue->requests[i]);
      if (req.size == 0 || req.alignment == 0) {
        TI_DEBUG("  Incomplete memory alloc request {} fetched. Skipping.", i);
        continue;
      }
      TI_DEBUG("  Allocating memory {} B (alignment {} B)", req.size,
               req.alignment);
      auto ptr = allocate(req.size, req.alignment);
      TI_DEBUG("  Allocated. Ptr = {:p}", ptr);
      push(&queue->requests[i].ptr, (uint8 *)ptr);
      processed_tail += 1;
    }
  }
}

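// Signals the daemon thread to exit and joins it. The CUDA stream is not
// destroyed here; the stream-destruction code below is currently compiled
// out (`#if 0`).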
void MemoryPool::terminate() {
  {
    std::lock_guard<std::mutex> _(mut);
    terminating = true;
  }
  th->join();
  TI_ASSERT(killed);
#if 0 && defined(TI_WITH_CUDA)
  if (arch_ == Arch::cuda)
    CUDADriver::get_instance().cudaStreamDestroy(cuda_stream);
#endif
}

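// Joins the daemon thread via terminate() if it has not been stopped yet.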
MemoryPool::~MemoryPool() {
  if (!killed) {
    terminate();
  }
}

}  // namespace taichi::lang