#include <taichi/rhi/device.h>

#include <cstring>  // memcpy, used by upload_data / readback_data

#if TI_WITH_VULKAN
#include <taichi/rhi/vulkan/vulkan_device.h>
#include <taichi/rhi/interop/vulkan_cpu_interop.h>
#if TI_WITH_LLVM
#include <taichi/rhi/cpu/cpu_device.h>
#endif
#if TI_WITH_CUDA
#include <taichi/rhi/cuda/cuda_device.h>
#include <taichi/rhi/interop/vulkan_cuda_interop.h>
#endif  // TI_WITH_CUDA
#endif  // TI_WITH_VULKAN

namespace taichi::lang {

DeviceAllocationGuard::~DeviceAllocationGuard() {
  device->dealloc_memory(*this);
}

DeviceImageGuard::~DeviceImageGuard() {
  dynamic_cast<GraphicsDevice *>(device)->destroy_image(*this);
}

DevicePtr DeviceAllocation::get_ptr(uint64_t offset) const {
  return DevicePtr{{device, alloc_id}, offset};
}

Device::MemcpyCapability Device::check_memcpy_capability(DevicePtr dst,
                                                         DevicePtr src,
                                                         uint64_t size) {
  if (dst.device == src.device) {
    return Device::MemcpyCapability::Direct;
  }

#if TI_WITH_VULKAN
#if TI_WITH_LLVM
  if (dynamic_cast<vulkan::VulkanDevice *>(dst.device) &&
      dynamic_cast<cpu::CpuDevice *>(src.device)) {
    // TODO: support direct copy if dst itself supports host write.
    return Device::MemcpyCapability::RequiresStagingBuffer;
  } else if (dynamic_cast<cpu::CpuDevice *>(dst.device) &&
             dynamic_cast<vulkan::VulkanDevice *>(src.device)) {
    return Device::MemcpyCapability::RequiresStagingBuffer;
  }
#endif
#if TI_WITH_CUDA
  if (dynamic_cast<vulkan::VulkanDevice *>(dst.device) &&
      dynamic_cast<cuda::CudaDevice *>(src.device)) {
    // FIXME: Direct copy is not always possible: the Vulkan buffer must be
    // allocated with export_sharing enabled; otherwise a staging buffer is
    // required.
    return Device::MemcpyCapability::Direct;
  } else if (dynamic_cast<cuda::CudaDevice *>(dst.device) &&
             dynamic_cast<vulkan::VulkanDevice *>(src.device)) {
    return Device::MemcpyCapability::Direct;
  }
#endif  // TI_WITH_CUDA
#endif  // TI_WITH_VULKAN
  return Device::MemcpyCapability::RequiresHost;
}
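
// Usage sketch (editor's illustration, not part of the upstream file): the
// capability query above is meant to select one of the copy entry points
// defined below. `staging` and `host_buffer` are assumed to be provisioned by
// the caller, and the entry points are treated as static members of `Device`,
// consistent with how they are defined here (none of them touch `this`).
//
//   void copy_between_devices(DevicePtr dst,
//                             DevicePtr src,
//                             uint64_t size,
//                             DevicePtr staging,
//                             void *host_buffer) {
//     switch (Device::check_memcpy_capability(dst, src, size)) {
//       case Device::MemcpyCapability::Direct:
//         Device::memcpy_direct(dst, src, size);
//         break;
//       case Device::MemcpyCapability::RequiresStagingBuffer:
//         Device::memcpy_via_staging(dst, staging, src, size);
//         break;
//       case Device::MemcpyCapability::RequiresHost:
//         Device::memcpy_via_host(dst, host_buffer, src, size);
//         break;
//     }
//   }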

void Device::memcpy_direct(DevicePtr dst, DevicePtr src, uint64_t size) {
  // Intra-device copy
  if (dst.device == src.device) {
    dst.device->memcpy_internal(dst, src, size);
    return;
  }
#if TI_WITH_VULKAN && TI_WITH_LLVM
  // Direct cross-device copy (CPU -> Vulkan)
  else if (dynamic_cast<vulkan::VulkanDevice *>(dst.device) &&
           dynamic_cast<cpu::CpuDevice *>(src.device)) {
    memcpy_cpu_to_vulkan(dst, src, size);
    return;
  }
#endif
#if TI_WITH_VULKAN && TI_WITH_CUDA
  if (dynamic_cast<vulkan::VulkanDevice *>(dst.device) &&
      dynamic_cast<cuda::CudaDevice *>(src.device)) {
    memcpy_cuda_to_vulkan(dst, src, size);
    return;
  } else if (dynamic_cast<cuda::CudaDevice *>(dst.device) &&
             dynamic_cast<vulkan::VulkanDevice *>(src.device)) {
    memcpy_vulkan_to_cuda(dst, src, size);
    return;
  }
#endif
  TI_NOT_IMPLEMENTED;
}

void Device::memcpy_via_staging(DevicePtr dst,
                                DevicePtr staging,
                                DevicePtr src,
                                uint64_t size) {
  // Inter-device copy through a caller-provided staging buffer
#if defined(TI_WITH_VULKAN) && defined(TI_WITH_LLVM)
  if (dynamic_cast<vulkan::VulkanDevice *>(dst.device) &&
      dynamic_cast<cpu::CpuDevice *>(src.device)) {
    memcpy_cpu_to_vulkan_via_staging(dst, staging, src, size);
    return;
  }
#endif

  TI_NOT_IMPLEMENTED;
}

void Device::memcpy_via_host(DevicePtr dst,
                             void *host_buffer,
                             DevicePtr src,
                             uint64_t size) {
  TI_NOT_IMPLEMENTED;
}

void GraphicsDevice::image_transition(DeviceAllocation img,
                                      ImageLayout old_layout,
                                      ImageLayout new_layout) {
  Stream *stream = get_graphics_stream();
  auto [cmd_list, res] = stream->new_command_list_unique();
  TI_ASSERT(res == RhiResult::success);
  cmd_list->image_transition(img, old_layout, new_layout);
  stream->submit_synced(cmd_list.get());
}

void GraphicsDevice::buffer_to_image(DeviceAllocation dst_img,
                                     DevicePtr src_buf,
                                     ImageLayout img_layout,
                                     const BufferImageCopyParams &params) {
  Stream *stream = get_graphics_stream();
  auto [cmd_list, res] = stream->new_command_list_unique();
  TI_ASSERT(res == RhiResult::success);
  cmd_list->buffer_to_image(dst_img, src_buf, img_layout, params);
  stream->submit_synced(cmd_list.get());
}

void GraphicsDevice::image_to_buffer(DevicePtr dst_buf,
                                     DeviceAllocation src_img,
                                     ImageLayout img_layout,
                                     const BufferImageCopyParams &params) {
  Stream *stream = get_graphics_stream();
  auto [cmd_list, res] = stream->new_command_list_unique();
  TI_ASSERT(res == RhiResult::success);
  cmd_list->image_to_buffer(dst_buf, src_img, img_layout, params);
  stream->submit_synced(cmd_list.get());
}

RhiResult Device::upload_data(DevicePtr *device_ptr,
                              const void **data,
                              size_t *size,
                              int num_alloc) noexcept {
  if (!device_ptr || !data || !size) {
    return RhiResult::invalid_usage;
  }

  // Copy each source buffer into a host-visible staging allocation.
  std::vector<DeviceAllocationUnique> stagings;
  for (int i = 0; i < num_alloc; i++) {
    if (device_ptr[i].device != this || !data[i]) {
      return RhiResult::invalid_usage;
    }
    DeviceAllocationUnique staging = this->allocate_memory_unique(
        {size[i], /*host_write=*/true, /*host_read=*/false,
         /*export_sharing=*/false, AllocUsage::Upload});

    void *mapped{nullptr};
    RhiResult res = this->map(*staging, &mapped);
    if (res != RhiResult::success) {
      return res;
    }
    memcpy(mapped, data[i], size[i]);
    this->unmap(*staging);

    stagings.push_back(std::move(staging));
  }

  // Record one buffer copy per allocation and submit synchronously, so the
  // staging buffers can be released when this function returns.
  Stream *s = this->get_compute_stream();
  auto [cmdlist, res] = s->new_command_list_unique();
  if (res != RhiResult::success) {
    return res;
  }
  for (int i = 0; i < num_alloc; i++) {
    cmdlist->buffer_copy(device_ptr[i], stagings[i]->get_ptr(0), size[i]);
  }
  s->submit_synced(cmdlist.get());

  return RhiResult::success;
}
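
// Usage sketch (editor's illustration): uploading a small host array into an
// existing allocation. `device` and `alloc` are assumed to be a valid
// Device * and a DeviceAllocation of at least sizeof(data) bytes owned by the
// caller.
//
//   float data[4] = {0.f, 1.f, 2.f, 3.f};
//   DevicePtr dst = alloc.get_ptr(0);
//   const void *src = data;
//   size_t size = sizeof(data);
//   RhiResult res = device->upload_data(&dst, &src, &size, /*num_alloc=*/1);
//   TI_ASSERT(res == RhiResult::success);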

RhiResult Device::readback_data(
    DevicePtr *device_ptr,
    void **data,
    size_t *size,
    int num_alloc,
    const std::vector<StreamSemaphore> &wait_sema) noexcept {
  if (!device_ptr || !data || !size) {
    return RhiResult::invalid_usage;
  }

  Stream *s = this->get_compute_stream();
  auto [cmdlist, res] = s->new_command_list_unique();
  if (res != RhiResult::success) {
    return res;
  }

  // Record a copy from each source allocation into a host-readable staging
  // buffer.
  std::vector<DeviceAllocationUnique> stagings;
  for (int i = 0; i < num_alloc; i++) {
    if (device_ptr[i].device != this || !data[i]) {
      return RhiResult::invalid_usage;
    }
    DeviceAllocationUnique staging = this->allocate_memory_unique(
        {size[i], /*host_write=*/false, /*host_read=*/true,
         /*export_sharing=*/false, AllocUsage::None});

    cmdlist->buffer_copy(staging->get_ptr(0), device_ptr[i], size[i]);
    stagings.push_back(std::move(staging));
  }
  // Wait on the provided semaphores (if any) and block until the copies
  // complete before mapping the staging buffers.
  s->submit_synced(cmdlist.get(), wait_sema);

  for (int i = 0; i < num_alloc; i++) {
    void *mapped{nullptr};
    RhiResult res = this->map(*stagings[i], &mapped);
    if (res != RhiResult::success) {
      return res;
    }
    memcpy(data[i], mapped, size[i]);
    this->unmap(*stagings[i]);
  }

  return RhiResult::success;
}
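
// Usage sketch (editor's illustration): reading the same allocation back into
// host memory, mirroring the upload sketch above. An empty semaphore list is
// passed, i.e. the readback does not wait on any other stream.
//
//   float out[4] = {};
//   DevicePtr src_ptr = alloc.get_ptr(0);
//   void *dst_host = out;
//   size_t size = sizeof(out);
//   RhiResult res =
//       device->readback_data(&src_ptr, &dst_host, &size, /*num_alloc=*/1, {});
//   TI_ASSERT(res == RhiResult::success);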

}  // namespace taichi::lang