#include <taichi/rhi/device.h>

#include <cstring>

#if TI_WITH_VULKAN
#include <taichi/rhi/vulkan/vulkan_device.h>
#include <taichi/rhi/interop/vulkan_cpu_interop.h>
#if TI_WITH_LLVM
#include <taichi/rhi/cpu/cpu_device.h>
#endif
#if TI_WITH_CUDA
#include <taichi/rhi/cuda/cuda_device.h>
#include <taichi/rhi/interop/vulkan_cuda_interop.h>
#endif  // TI_WITH_CUDA
#endif  // TI_WITH_VULKAN

namespace taichi::lang {

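// RAII guards: each guard releases its underlying resource through the
// owning Device when it goes out of scope.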
DeviceAllocationGuard::~DeviceAllocationGuard() {
  device->dealloc_memory(*this);
}

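// Image allocations are created through GraphicsDevice, so the owning device
// is expected to be a GraphicsDevice here.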
DeviceImageGuard::~DeviceImageGuard() {
  dynamic_cast<GraphicsDevice *>(device)->destroy_image(*this);
}

DevicePtr DeviceAllocation::get_ptr(uint64_t offset) const {
  return DevicePtr{{device, alloc_id}, offset};
}

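// Classifies how a copy between the two device pointers can be performed:
// Direct (memcpy_direct works), RequiresStagingBuffer (use
// memcpy_via_staging), or RequiresHost (round-trip through host memory via
// memcpy_via_host).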
Device::MemcpyCapability Device::check_memcpy_capability(DevicePtr dst,
                                                         DevicePtr src,
                                                         uint64_t size) {
  if (dst.device == src.device) {
    return Device::MemcpyCapability::Direct;
  }

#if TI_WITH_VULKAN
#if TI_WITH_LLVM
  if (dynamic_cast<vulkan::VulkanDevice *>(dst.device) &&
      dynamic_cast<cpu::CpuDevice *>(src.device)) {
    // TODO: support direct copy if dst itself supports host write.
    return Device::MemcpyCapability::RequiresStagingBuffer;
  } else if (dynamic_cast<cpu::CpuDevice *>(dst.device) &&
             dynamic_cast<vulkan::VulkanDevice *>(src.device)) {
    return Device::MemcpyCapability::RequiresStagingBuffer;
  }
#endif
#if TI_WITH_CUDA
  if (dynamic_cast<vulkan::VulkanDevice *>(dst.device) &&
      dynamic_cast<cuda::CudaDevice *>(src.device)) {
    // FIXME: direct copy isn't always possible: the Vulkan buffer must be
    // allocated with export_sharing enabled; otherwise a staging buffer is
    // required.
    return Device::MemcpyCapability::Direct;
  } else if (dynamic_cast<cuda::CudaDevice *>(dst.device) &&
             dynamic_cast<vulkan::VulkanDevice *>(src.device)) {
    return Device::MemcpyCapability::Direct;
  }
#endif  // TI_WITH_CUDA
#endif  // TI_WITH_VULKAN
  return Device::MemcpyCapability::RequiresHost;
}

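// Performs the copy in one step: an internal copy when both pointers live on
// the same device, or a backend interop copy (CPU/CUDA <-> Vulkan) otherwise.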
void Device::memcpy_direct(DevicePtr dst, DevicePtr src, uint64_t size) {
  // Intra-device copy
  if (dst.device == src.device) {
    dst.device->memcpy_internal(dst, src, size);
    return;
  }
#if TI_WITH_VULKAN && TI_WITH_LLVM
  // Direct cross-device copy (CPU -> Vulkan)
  else if (dynamic_cast<vulkan::VulkanDevice *>(dst.device) &&
           dynamic_cast<cpu::CpuDevice *>(src.device)) {
    memcpy_cpu_to_vulkan(dst, src, size);
    return;
  }
#endif
#if TI_WITH_VULKAN && TI_WITH_CUDA
  if (dynamic_cast<vulkan::VulkanDevice *>(dst.device) &&
      dynamic_cast<cuda::CudaDevice *>(src.device)) {
    memcpy_cuda_to_vulkan(dst, src, size);
    return;
  } else if (dynamic_cast<cuda::CudaDevice *>(dst.device) &&
             dynamic_cast<vulkan::VulkanDevice *>(src.device)) {
    memcpy_vulkan_to_cuda(dst, src, size);
    return;
  }
#endif
  TI_NOT_IMPLEMENTED;
}

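// Copies between devices through a caller-provided staging buffer; currently
// only the CPU -> Vulkan path is implemented.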
void Device::memcpy_via_staging(DevicePtr dst,
                                DevicePtr staging,
                                DevicePtr src,
                                uint64_t size) {
  // Inter-device copy
#if TI_WITH_VULKAN && TI_WITH_LLVM
  if (dynamic_cast<vulkan::VulkanDevice *>(dst.device) &&
      dynamic_cast<cpu::CpuDevice *>(src.device)) {
    memcpy_cpu_to_vulkan_via_staging(dst, staging, src, size);
    return;
  }
#endif

  TI_NOT_IMPLEMENTED;
}

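// Fallback for device pairs with no direct or staging path; no backend
// implements this yet.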
void Device::memcpy_via_host(DevicePtr dst,
                             void *host_buffer,
                             DevicePtr src,
                             uint64_t size) {
  TI_NOT_IMPLEMENTED;
}

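// The helpers below each record a single command into a fresh command list on
// the graphics stream and submit it synchronously.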
void GraphicsDevice::image_transition(DeviceAllocation img,
                                      ImageLayout old_layout,
                                      ImageLayout new_layout) {
  Stream *stream = get_graphics_stream();
  auto [cmd_list, res] = stream->new_command_list_unique();
  TI_ASSERT(res == RhiResult::success);
  cmd_list->image_transition(img, old_layout, new_layout);
  stream->submit_synced(cmd_list.get());
}

void GraphicsDevice::buffer_to_image(DeviceAllocation dst_img,
                                     DevicePtr src_buf,
                                     ImageLayout img_layout,
                                     const BufferImageCopyParams &params) {
  Stream *stream = get_graphics_stream();
  auto [cmd_list, res] = stream->new_command_list_unique();
  TI_ASSERT(res == RhiResult::success);
  cmd_list->buffer_to_image(dst_img, src_buf, img_layout, params);
  stream->submit_synced(cmd_list.get());
}

void GraphicsDevice::image_to_buffer(DevicePtr dst_buf,
                                     DeviceAllocation src_img,
                                     ImageLayout img_layout,
                                     const BufferImageCopyParams &params) {
  Stream *stream = get_graphics_stream();
  auto [cmd_list, res] = stream->new_command_list_unique();
  TI_ASSERT(res == RhiResult::success);
  cmd_list->image_to_buffer(dst_buf, src_img, img_layout, params);
  stream->submit_synced(cmd_list.get());
}

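// Uploads `num_alloc` host buffers into device memory: each source is copied
// into a host-writable staging allocation, then all device-side copies are
// recorded into one command list and submitted synchronously on the compute
// stream.
//
// Illustrative usage (a sketch; `device`, `dst_ptr`, `host_data`, and
// `num_bytes` are hypothetical caller-side names):
//
//   const void *data[] = {host_data};
//   size_t sizes[] = {num_bytes};
//   DevicePtr ptrs[] = {dst_ptr};
//   RhiResult res = device->upload_data(ptrs, data, sizes, /*num_alloc=*/1);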
RhiResult Device::upload_data(DevicePtr *device_ptr,
                              const void **data,
                              size_t *size,
                              int num_alloc) noexcept {
  if (!device_ptr || !data || !size) {
    return RhiResult::invalid_usage;
  }

  std::vector<DeviceAllocationUnique> stagings;
  for (int i = 0; i < num_alloc; i++) {
    if (device_ptr[i].device != this || !data[i]) {
      return RhiResult::invalid_usage;
    }
    DeviceAllocationUnique staging = this->allocate_memory_unique(
        {size[i], /*host_write=*/true, /*host_read=*/false,
         /*export_sharing=*/false, AllocUsage::Upload});

    void *mapped{nullptr};
    RhiResult res = this->map(*staging, &mapped);
    if (res != RhiResult::success) {
      return res;
    }
    memcpy(mapped, data[i], size[i]);
    this->unmap(*staging);

    stagings.push_back(std::move(staging));
  }

  Stream *s = this->get_compute_stream();
  auto [cmdlist, res] = s->new_command_list_unique();
  if (res != RhiResult::success) {
    return res;
  }
  for (int i = 0; i < num_alloc; i++) {
    cmdlist->buffer_copy(device_ptr[i], stagings[i]->get_ptr(0), size[i]);
  }
  s->submit_synced(cmdlist.get());

  return RhiResult::success;
}

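// Reverse of upload_data: records copies from `device_ptr[i]` into
// host-readable staging allocations, submits them synchronously (after
// waiting on `wait_sema`), then maps each staging allocation and copies the
// contents into the caller's `data[i]` buffers.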
RhiResult Device::readback_data(
    DevicePtr *device_ptr,
    void **data,
    size_t *size,
    int num_alloc,
    const std::vector<StreamSemaphore> &wait_sema) noexcept {
  if (!device_ptr || !data || !size) {
    return RhiResult::invalid_usage;
  }

  Stream *s = this->get_compute_stream();
  auto [cmdlist, res] = s->new_command_list_unique();
  if (res != RhiResult::success) {
    return res;
  }

  std::vector<DeviceAllocationUnique> stagings;
  for (int i = 0; i < num_alloc; i++) {
    if (device_ptr[i].device != this || !data[i]) {
      return RhiResult::invalid_usage;
    }
    DeviceAllocationUnique staging = this->allocate_memory_unique(
        {size[i], /*host_write=*/false, /*host_read=*/true,
         /*export_sharing=*/false, AllocUsage::None});

    cmdlist->buffer_copy(staging->get_ptr(0), device_ptr[i], size[i]);
    stagings.push_back(std::move(staging));
  }
  s->submit_synced(cmdlist.get(), wait_sema);

  for (int i = 0; i < num_alloc; i++) {
    void *mapped{nullptr};
    RhiResult res = this->map(*stagings[i], &mapped);
    if (res != RhiResult::success) {
      return res;
    }
    memcpy(data[i], mapped, size[i]);
    this->unmap(*stagings[i]);
  }

  return RhiResult::success;
}

}  // namespace taichi::lang