1 | #pragma once |
2 | #include <vector> |
3 | #include <set> |
4 | |
5 | #include "taichi/common/core.h" |
6 | #include "taichi/rhi/cuda/cuda_driver.h" |
7 | #include "taichi/rhi/cuda/cuda_caching_allocator.h" |
8 | #include "taichi/rhi/cuda/cuda_context.h" |
9 | #include "taichi/rhi/llvm/llvm_device.h" |
10 | |
11 | namespace taichi::lang { |
12 | namespace cuda { |
13 | |
14 | class CudaPipeline : public Pipeline { |
15 | public: |
16 | ~CudaPipeline() override { |
17 | } |
18 | }; |
19 | |
20 | class CudaCommandList : public CommandList { |
21 | public: |
22 | ~CudaCommandList() override { |
23 | } |
24 | |
25 | void bind_pipeline(Pipeline *p) noexcept override{TI_NOT_IMPLEMENTED}; |
26 | RhiResult bind_shader_resources(ShaderResourceSet *res, |
27 | int set_index = 0) noexcept final{ |
28 | TI_NOT_IMPLEMENTED}; |
29 | RhiResult bind_raster_resources(RasterResources *res) noexcept final{ |
30 | TI_NOT_IMPLEMENTED}; |
31 | void buffer_barrier(DevicePtr ptr, |
32 | size_t size) noexcept override{TI_NOT_IMPLEMENTED}; |
33 | void buffer_barrier(DeviceAllocation alloc) noexcept override{ |
34 | TI_NOT_IMPLEMENTED}; |
35 | void memory_barrier() noexcept override{TI_NOT_IMPLEMENTED}; |
36 | void buffer_copy(DevicePtr dst, DevicePtr src, size_t size) noexcept override{ |
37 | TI_NOT_IMPLEMENTED}; |
38 | void buffer_fill(DevicePtr ptr, size_t size, uint32_t data) noexcept override{ |
39 | TI_NOT_IMPLEMENTED}; |
40 | RhiResult dispatch(uint32_t x, |
41 | uint32_t y = 1, |
42 | uint32_t z = 1) noexcept override{TI_NOT_IMPLEMENTED}; |
43 | }; |
44 | |
45 | class CudaStream : public Stream { |
46 | public: |
47 | ~CudaStream() override{}; |
48 | |
49 | RhiResult new_command_list(CommandList **out_cmdlist) noexcept final{ |
50 | TI_NOT_IMPLEMENTED}; |
51 | StreamSemaphore submit(CommandList *cmdlist, |
52 | const std::vector<StreamSemaphore> &wait_semaphores = |
53 | {}) override{TI_NOT_IMPLEMENTED}; |
54 | StreamSemaphore submit_synced( |
55 | CommandList *cmdlist, |
56 | const std::vector<StreamSemaphore> &wait_semaphores = {}) override{ |
57 | TI_NOT_IMPLEMENTED}; |
58 | |
59 | void command_sync() override{TI_NOT_IMPLEMENTED}; |
60 | }; |
61 | |
// RHI device implementation for the CUDA backend, layered on LlvmDevice.
// The memory-management surface (allocate/dealloc, upload/readback, map/unmap
// by allocation, import, memcpy) is implemented out-of-line; the graphics-style
// entry points (resource sets, pipelines, streams, map-by-pointer) are stubs
// that trip TI_NOT_IMPLEMENTED.
class CudaDevice : public LlvmDevice {
 public:
  // Book-keeping record for one allocation made through this device.
  // Stored in `allocations_`, indexed by DeviceAllocation::alloc_id.
  struct AllocInfo {
    void *ptr{nullptr};       // raw device pointer of the allocation
    size_t size{0};           // size of the allocation in bytes
    bool is_imported{false};  // set for memory wrapped via import_memory()
    /* Note: Memory allocation in CUDA device.
     * CudaDevice can use either its own cuda malloc mechanism via
     * `allocate_memory` or the preallocated memory managed by Llvmprogramimpl
     * via `allocate_memory_runtime`. The `use_preallocated` is used to track
     * this option. For now, we keep both options and the preallocated method is
     * used by default for CUDA backend. The `use_cached` is to enable/disable
     * the caching behavior in `allocate_memory_runtime`. Later it should be
     * always enabled, for now we keep both options to allow a scenario when
     * using preallocated memory while disabling the caching behavior.
     * */
    bool use_preallocated{true};
    bool use_cached{false};
    void *mapped{nullptr};  // host-visible pointer while mapped — presumably
                            // set by map() and cleared by unmap(); confirm in
                            // the .cpp
  };

  // Looks up the AllocInfo record for `handle`.
  AllocInfo get_alloc_info(const DeviceAllocation handle);

  ~CudaDevice() override{};

  // Allocates device memory directly (see AllocInfo note above).
  DeviceAllocation allocate_memory(const AllocParams &params) override;
  // Allocates from the runtime's preallocated pool (optionally cached).
  DeviceAllocation allocate_memory_runtime(
      const LlvmRuntimeAllocParams &params) override;
  void dealloc_memory(DeviceAllocation handle) override;

  // Copies `num_alloc` host buffers (`data`/`size` arrays) to device pointers.
  RhiResult upload_data(DevicePtr *device_ptr,
                        const void **data,
                        size_t *size,
                        int num_alloc = 1) noexcept override;

  // Copies `num_alloc` device regions back to host buffers, optionally
  // waiting on `wait_sema` first.
  RhiResult readback_data(
      DevicePtr *device_ptr,
      void **data,
      size_t *size,
      int num_alloc = 1,
      const std::vector<StreamSemaphore> &wait_sema = {}) noexcept override;

  ShaderResourceSet *create_resource_set() final{TI_NOT_IMPLEMENTED};

  RhiResult create_pipeline(Pipeline **out_pipeline,
                            const PipelineSourceDesc &src,
                            std::string name,
                            PipelineCache *cache) noexcept final {
    TI_NOT_IMPLEMENTED;
  }

  // Reads back slot `i` of the kernel result buffer as a uint64.
  uint64 fetch_result_uint64(int i, uint64 *result_buffer) override;

  // Sub-range mapping is not supported; use map(DeviceAllocation, ...).
  RhiResult map_range(DevicePtr ptr, uint64_t size, void **mapped_ptr) final {
    TI_NOT_IMPLEMENTED;
  }
  RhiResult map(DeviceAllocation alloc, void **mapped_ptr) final;

  void unmap(DevicePtr ptr) final{TI_NOT_IMPLEMENTED};
  void unmap(DeviceAllocation alloc) final;

  // Device-to-device copy of `size` bytes.
  void memcpy_internal(DevicePtr dst, DevicePtr src, uint64_t size) override;

  // Wraps externally-owned device memory as a DeviceAllocation
  // (marked is_imported; see AllocInfo).
  DeviceAllocation import_memory(void *ptr, size_t size);

  Stream *get_compute_stream() override{TI_NOT_IMPLEMENTED};

  void wait_idle() override{TI_NOT_IMPLEMENTED};

 private:
  // One AllocInfo per allocation ever made; alloc_id indexes this vector.
  std::vector<AllocInfo> allocations_;
  // Rejects handles whose id falls outside `allocations_`. Note this only
  // checks the id range, not whether the allocation is still live.
  void validate_device_alloc(const DeviceAllocation alloc) {
    if (allocations_.size() <= alloc.alloc_id) {
      TI_ERROR("invalid DeviceAllocation" );
    }
  }
  // Lazily-created allocator backing the `use_cached` path.
  std::unique_ptr<CudaCachingAllocator> caching_allocator_{nullptr};
};
140 | |
141 | } // namespace cuda |
142 | |
143 | } // namespace taichi::lang |
144 | |