1#pragma once
2#include <vector>
3#include <set>
4
5#include "taichi/common/core.h"
6#include "taichi/rhi/cuda/cuda_driver.h"
7#include "taichi/rhi/cuda/cuda_caching_allocator.h"
8#include "taichi/rhi/cuda/cuda_context.h"
9#include "taichi/rhi/llvm/llvm_device.h"
10
11namespace taichi::lang {
12namespace cuda {
13
14class CudaPipeline : public Pipeline {
15 public:
16 ~CudaPipeline() override {
17 }
18};
19
20class CudaCommandList : public CommandList {
21 public:
22 ~CudaCommandList() override {
23 }
24
25 void bind_pipeline(Pipeline *p) noexcept override{TI_NOT_IMPLEMENTED};
26 RhiResult bind_shader_resources(ShaderResourceSet *res,
27 int set_index = 0) noexcept final{
28 TI_NOT_IMPLEMENTED};
29 RhiResult bind_raster_resources(RasterResources *res) noexcept final{
30 TI_NOT_IMPLEMENTED};
31 void buffer_barrier(DevicePtr ptr,
32 size_t size) noexcept override{TI_NOT_IMPLEMENTED};
33 void buffer_barrier(DeviceAllocation alloc) noexcept override{
34 TI_NOT_IMPLEMENTED};
35 void memory_barrier() noexcept override{TI_NOT_IMPLEMENTED};
36 void buffer_copy(DevicePtr dst, DevicePtr src, size_t size) noexcept override{
37 TI_NOT_IMPLEMENTED};
38 void buffer_fill(DevicePtr ptr, size_t size, uint32_t data) noexcept override{
39 TI_NOT_IMPLEMENTED};
40 RhiResult dispatch(uint32_t x,
41 uint32_t y = 1,
42 uint32_t z = 1) noexcept override{TI_NOT_IMPLEMENTED};
43};
44
45class CudaStream : public Stream {
46 public:
47 ~CudaStream() override{};
48
49 RhiResult new_command_list(CommandList **out_cmdlist) noexcept final{
50 TI_NOT_IMPLEMENTED};
51 StreamSemaphore submit(CommandList *cmdlist,
52 const std::vector<StreamSemaphore> &wait_semaphores =
53 {}) override{TI_NOT_IMPLEMENTED};
54 StreamSemaphore submit_synced(
55 CommandList *cmdlist,
56 const std::vector<StreamSemaphore> &wait_semaphores = {}) override{
57 TI_NOT_IMPLEMENTED};
58
59 void command_sync() override{TI_NOT_IMPLEMENTED};
60};
61
// CUDA implementation of the LLVM-backed RHI device. Tracks its allocations
// in `allocations_` (indexed by DeviceAllocation::alloc_id) and optionally
// routes runtime allocations through a caching allocator.
class CudaDevice : public LlvmDevice {
 public:
  // Book-keeping record for one device allocation owned or imported by
  // this device.
  struct AllocInfo {
    void *ptr{nullptr};       // device pointer (or externally imported pointer)
    size_t size{0};           // allocation size in bytes
    bool is_imported{false};  // set for pointers adopted via import_memory()
    /* Note: Memory allocation in CUDA device.
     * CudaDevice can use either its own cuda malloc mechanism via
     * `allocate_memory` or the preallocated memory managed by Llvmprogramimpl
     * via `allocate_memory_runtime`. The `use_preallocated` is used to track
     * this option. For now, we keep both options and the preallocated method is
     * used by default for CUDA backend. The `use_cached` is to enable/disable
     * the caching behavior in `allocate_memory_runtime`. Later it should be
     * always enabled, for now we keep both options to allow a scenario when
     * using preallocated memory while disabling the caching behavior.
     * */
    bool use_preallocated{true};
    bool use_cached{false};
    // NOTE(review): presumably the host-visible pointer produced by map();
    // semantics live in the .cpp — confirm there.
    void *mapped{nullptr};
  };

  // Looks up the AllocInfo record for `handle` (defined in the .cpp).
  AllocInfo get_alloc_info(const DeviceAllocation handle);

  ~CudaDevice() override{};

  // Allocation entry points; see the AllocInfo note above for how the
  // direct-malloc vs. preallocated/cached paths are selected.
  DeviceAllocation allocate_memory(const AllocParams &params) override;
  DeviceAllocation allocate_memory_runtime(
      const LlvmRuntimeAllocParams &params) override;
  void dealloc_memory(DeviceAllocation handle) override;

  // Batched host-to-device upload of `num_alloc` (pointer, data, size)
  // triples.
  RhiResult upload_data(DevicePtr *device_ptr,
                        const void **data,
                        size_t *size,
                        int num_alloc = 1) noexcept override;

  // Batched device-to-host readback; optionally waits on `wait_sema` first.
  RhiResult readback_data(
      DevicePtr *device_ptr,
      void **data,
      size_t *size,
      int num_alloc = 1,
      const std::vector<StreamSemaphore> &wait_sema = {}) noexcept override;

  // Graphics-style resource sets/pipelines are not supported by the CUDA
  // backend.
  ShaderResourceSet *create_resource_set() final{TI_NOT_IMPLEMENTED};

  RhiResult create_pipeline(Pipeline **out_pipeline,
                            const PipelineSourceDesc &src,
                            std::string name,
                            PipelineCache *cache) noexcept final {
    TI_NOT_IMPLEMENTED;
  }

  // Reads back the i-th 64-bit result from the runtime's result buffer
  // (defined in the .cpp).
  uint64 fetch_result_uint64(int i, uint64 *result_buffer) override;

  // Whole-allocation mapping is supported; sub-range mapping is not.
  RhiResult map_range(DevicePtr ptr, uint64_t size, void **mapped_ptr) final {
    TI_NOT_IMPLEMENTED;
  }
  RhiResult map(DeviceAllocation alloc, void **mapped_ptr) final;

  void unmap(DevicePtr ptr) final{TI_NOT_IMPLEMENTED};
  void unmap(DeviceAllocation alloc) final;

  void memcpy_internal(DevicePtr dst, DevicePtr src, uint64_t size) override;

  // Adopts an externally allocated device pointer; the resulting AllocInfo
  // has is_imported == true.
  DeviceAllocation import_memory(void *ptr, size_t size);

  Stream *get_compute_stream() override{TI_NOT_IMPLEMENTED};

  void wait_idle() override{TI_NOT_IMPLEMENTED};

 private:
  // Indexed by DeviceAllocation::alloc_id; entries are never removed, so
  // ids stay stable for the device's lifetime.
  std::vector<AllocInfo> allocations_;
  // Errors out when `alloc`'s id is out of range for this device.
  // NOTE(review): only the upper bound is checked — an id of a previously
  // deallocated entry still passes.
  void validate_device_alloc(const DeviceAllocation alloc) {
    if (allocations_.size() <= alloc.alloc_id) {
      TI_ERROR("invalid DeviceAllocation");
    }
  }
  // Lazily created; used by allocate_memory_runtime when use_cached is set.
  std::unique_ptr<CudaCachingAllocator> caching_allocator_{nullptr};
};
140
141} // namespace cuda
142
143} // namespace taichi::lang
144