#pragma once

#include <cassert>
#include <cstdint>
#include <initializer_list>
#include <limits>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "taichi/common/logging.h"

#include "taichi/rhi/device_capability.h"
#include "taichi/rhi/arch.h"
namespace taichi::lang {

enum class RhiResult {
  success = 0,
  error = -1,
  invalid_usage = -2,
  not_supported = -3,
  out_of_memory = -4,
};

constexpr size_t kBufferSizeEntireSize = std::numeric_limits<size_t>::max();

#define MAKE_ENUM_FLAGS(name)                                               \
  inline name operator|(name a, name b) {                                   \
    return static_cast<name>(int(a) | int(b));                              \
  }                                                                         \
  inline name operator&(name a, name b) {                                   \
    return static_cast<name>(int(a) & int(b));                              \
  }                                                                         \
  inline bool operator&&(name a, name b) { return (int(a) & int(b)) != 0; }

enum class BlendOp : uint32_t { add, subtract, reverse_subtract, min, max };

enum class BlendFactor : uint32_t {
  zero,
  one,
  src_color,
  one_minus_src_color,
  dst_color,
  one_minus_dst_color,
  src_alpha,
  one_minus_src_alpha,
  dst_alpha,
  one_minus_dst_alpha
};

class Device;
struct DeviceAllocation;
struct DevicePtr;

// TODO: Figure out how to support images. A temporary solution is to have all
// opaque types such as images work as an allocation.
using DeviceAllocationId = uint64_t;

struct TI_DLL_EXPORT DeviceAllocation {
  Device *device{nullptr};
  DeviceAllocationId alloc_id{0};
  // TODO: Shall we include size here?

  DevicePtr get_ptr(uint64_t offset = 0) const;

  bool operator==(const DeviceAllocation &other) const {
    return other.device == device && other.alloc_id == alloc_id;
  }

  bool operator!=(const DeviceAllocation &other) const {
    return !(*this == other);
  }
};

struct TI_DLL_EXPORT DeviceAllocationGuard : public DeviceAllocation {
  explicit DeviceAllocationGuard(DeviceAllocation alloc)
      : DeviceAllocation(alloc) {
  }
  DeviceAllocationGuard(const DeviceAllocationGuard &) = delete;
  ~DeviceAllocationGuard();
};

using DeviceAllocationUnique = std::unique_ptr<DeviceAllocationGuard>;

struct TI_DLL_EXPORT DeviceImageGuard : public DeviceAllocation {
  explicit DeviceImageGuard(DeviceAllocation alloc) : DeviceAllocation(alloc) {
  }
  DeviceImageGuard(const DeviceImageGuard &) = delete;
  ~DeviceImageGuard();
};

using DeviceImageUnique = std::unique_ptr<DeviceImageGuard>;

struct TI_DLL_EXPORT DevicePtr : public DeviceAllocation {
  uint64_t offset{0};

  bool operator==(const DevicePtr &other) const {
    return other.device == device && other.alloc_id == alloc_id &&
           other.offset == offset;
  }

  bool operator!=(const DevicePtr &other) const {
    return !(*this == other);
  }
};

constexpr DeviceAllocation kDeviceNullAllocation{};
constexpr DevicePtr kDeviceNullPtr{};

// TODO: fill this with the required options
struct ImageSamplerConfig {};

// A set of shader resources (that is bound at once)
class TI_DLL_EXPORT ShaderResourceSet {
 public:
  virtual ~ShaderResourceSet() = default;

  /**
   * Bind a RW subregion of a buffer resource (StorageBuffer / SSBO)
   * @param[in] binding The binding index of the resource
   * @param[in] ptr The Device Pointer that is going to be bound
   * @param[in] size The size of the bound region of the buffer
   */
  virtual ShaderResourceSet &rw_buffer(uint32_t binding,
                                       DevicePtr ptr,
                                       size_t size) = 0;

  /**
   * Bind an entire RW buffer resource (StorageBuffer / SSBO)
   * @param[in] binding The binding index of the resource
   * @param[in] alloc The Device Allocation that is going to be bound
   */
  virtual ShaderResourceSet &rw_buffer(uint32_t binding,
                                       DeviceAllocation alloc) = 0;

  /**
   * Bind a read-only subregion of a buffer resource (Constants / UBO)
   * @param[in] binding The binding index of the resource
   * @param[in] ptr The Device Pointer that is going to be bound
   * @param[in] size The size of the bound region of the buffer
   */
  virtual ShaderResourceSet &buffer(uint32_t binding,
                                    DevicePtr ptr,
                                    size_t size) = 0;

  /**
   * Bind an entire read-only buffer resource (Constants / UBO)
   * @param[in] binding The binding index of the resource
   * @param[in] alloc The Device Allocation that is going to be bound
   */
  virtual ShaderResourceSet &buffer(uint32_t binding,
                                    DeviceAllocation alloc) = 0;

  /**
   * Bind a read-only image resource (SRV / Texture)
   * @param[in] binding The binding index of the resource
   * @param[in] alloc The Device Allocation that is going to be bound
   * @param[in] sampler_config The texture sampling configuration
   */
  virtual ShaderResourceSet &image(uint32_t binding,
                                   DeviceAllocation alloc,
                                   ImageSamplerConfig sampler_config) {
    TI_NOT_IMPLEMENTED
  }

  /**
   * Bind a RW image resource (UAV / Storage Image)
   * @param[in] binding The binding index of the resource
   * @param[in] alloc The Device Allocation that is going to be bound
   * @param[in] lod The mip level (level of detail) to bind
   */
  virtual ShaderResourceSet &rw_image(uint32_t binding,
                                      DeviceAllocation alloc,
                                      int lod) {
    TI_NOT_IMPLEMENTED
  }
};
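
// Example (sketch): building a resource set for a compute kernel. Assumes a
// valid `device` and two existing DeviceAllocations `args_buf` (constants)
// and `data_buf` (storage); all names are illustrative.
//
//   std::unique_ptr<ShaderResourceSet> set =
//       device->create_resource_set_unique();
//   set->buffer(/*binding=*/0, args_buf.get_ptr(0), /*size=*/64)  // UBO
//       .rw_buffer(/*binding=*/1, data_buf);                      // SSBO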

// A set of states / resources for rasterization
class TI_DLL_EXPORT RasterResources {
 public:
  virtual ~RasterResources() = default;

  /**
   * Set a vertex buffer for the rasterization
   * @param[in] ptr The Device Pointer to the vertex data
   * @param[in] binding The binding index of the vertex buffer
   */
  virtual RasterResources &vertex_buffer(DevicePtr ptr, uint32_t binding = 0) {
    TI_NOT_IMPLEMENTED
  }

  /**
   * Set an index buffer for the rasterization
   * @param[in] ptr The Device Pointer to the index data
   * @param[in] index_width The index data width (in bits).
   *            index_width = 32 -> uint32 index
   *            index_width = 16 -> uint16 index
   */
  virtual RasterResources &index_buffer(DevicePtr ptr, size_t index_width) {
    TI_NOT_IMPLEMENTED
  }
};

enum class PipelineSourceType {
  spirv_binary,
  spirv_src,
  glsl_src,
  hlsl_src,
  dxil_binary,
  llvm_ir_src,
  llvm_ir_binary,
  metal_src,
  metal_ir
};

enum class PipelineStageType {
  compute,
  fragment,
  vertex,
  tesselation_control,
  tesselation_eval,
  geometry,
  raytracing
};

// FIXME: Drop the plural form?
enum class TopologyType : int { Triangles = 0, Lines = 1, Points = 2 };

enum class PolygonMode : int {
  Fill = 0,
  Line = 1,
  Point = 2,
};

enum class BufferFormat : uint32_t {
#define PER_BUFFER_FORMAT(x) x,
#include "taichi/inc/rhi_constants.inc.h"
#undef PER_BUFFER_FORMAT
};

class TI_DLL_EXPORT Pipeline {
 public:
  virtual ~Pipeline() {
  }
};

using UPipeline = std::unique_ptr<Pipeline>;

enum class ImageDimension {
#define PER_IMAGE_DIMENSION(x) x,
#include "taichi/inc/rhi_constants.inc.h"
#undef PER_IMAGE_DIMENSION
};

enum class ImageLayout {
#define PER_IMAGE_LAYOUT(x) x,
#include "taichi/inc/rhi_constants.inc.h"
#undef PER_IMAGE_LAYOUT
};

struct BufferImageCopyParams {
  uint32_t buffer_row_length{0};
  uint32_t buffer_image_height{0};
  uint32_t image_mip_level{0};
  struct {
    uint32_t x{0};
    uint32_t y{0};
    uint32_t z{0};
  } image_offset;
  struct {
    uint32_t x{1};
    uint32_t y{1};
    uint32_t z{1};
  } image_extent;
  uint32_t image_base_layer{0};
  uint32_t image_layer_count{1};
  uint32_t image_aspect_flag{1};
};

struct ImageCopyParams {
  uint32_t width{1};
  uint32_t height{1};
  uint32_t depth{1};
};

class TI_DLL_EXPORT CommandList {
 public:
  virtual ~CommandList() {
  }

  /**
   * Bind a pipeline to the command list.
   * Doing so resets all bound resources.
   * @param[in] p The pipeline to be bound
   */
  virtual void bind_pipeline(Pipeline *p) noexcept = 0;

  /**
   * Bind a ShaderResourceSet to a set index.
   * - If the set index is already bound, the previous binding will be
   *   overwritten.
   * - A set index can only be bound with a single ShaderResourceSet.
   * - If the input set is empty, this command is a no-op.
   * @param[in] res The ShaderResourceSet to be bound.
   * @param[in] set_index The index the resources will be bound to.
   * @return The binding result code
   *         `success` If the binding succeeded
   *         `invalid_usage` If `res` is incompatible with the current pipeline
   *         `not_supported` If some bindings are not supported by the backend
   *         `out_of_memory` If binding failed due to OOM conditions
   *         `error` If binding failed due to other reasons
   */
  virtual RhiResult bind_shader_resources(ShaderResourceSet *res,
                                          int set_index = 0) noexcept = 0;

  /**
   * Bind RasterResources to the command list.
   * - If the input resource is empty, this command is a no-op.
   * @param[in] res The RasterResources to be bound.
   * @return The binding result code
   *         `success` If the binding succeeded
   *         `invalid_usage` If `res` is incompatible with the current pipeline
   *         `not_supported` If some bindings are not supported by the backend
   *         `error` If binding failed due to other reasons
   */
  virtual RhiResult bind_raster_resources(RasterResources *res) noexcept = 0;

  /**
   * Insert a memory barrier into the command list.
   * The barrier affects a contiguous region of memory.
   * Changes to memory before the barrier will be visible to accesses after
   * the barrier (API command ordering), i.e. commands recorded after this
   * barrier will see the changes made by commands before it.
   * This barrier is limited in scope to the Stream that the command list is
   * submitted to. Other Streams or Devices may not observe this barrier.
   * @param[in] ptr The pointer to the start of the region
   * @param[in] size The size of the memory region.
   *            Size is clamped to the underlying buffer size.
   */
  virtual void buffer_barrier(DevicePtr ptr, size_t size) noexcept = 0;

  /**
   * Insert a memory barrier into the command list.
   * The barrier affects an entire buffer.
   * Behaviour is the same as `buffer_barrier(DevicePtr, size_t)`
   * @param[in] alloc The memory allocation of this barrier
   */
  virtual void buffer_barrier(DeviceAllocation alloc) noexcept = 0;

  /**
   * Insert a memory barrier into the command list.
   * The barrier affects all global memory.
   * Behaviour is the same as `buffer_barrier(DevicePtr, size_t)`
   */
  virtual void memory_barrier() noexcept = 0;

  /**
   * Insert a buffer copy operation into the command list.
   * @param[in] dst The destination Device Pointer
   * @param[in] src The source Device Pointer
   * @param[in] size The size of the region to be copied.
   *            The size will be clamped to the minimum between
   *            `dst.size - dst.offset` and `src.size - src.offset`
   */
  virtual void buffer_copy(DevicePtr dst,
                           DevicePtr src,
                           size_t size) noexcept = 0;

  /**
   * Insert a memory region fill operation into the command list.
   * The memory region will be filled with the given (bit precise) value.
   * - (Encouraged behavior) If `data` is 0, the underlying API might
   *   provide a faster code path.
   * - (Encouraged behavior) If `size` is -1 (max of size_t), the underlying
   *   API might provide a faster code path.
   * @param[in] ptr The start of the memory region.
   *            - ptr.offset will be aligned down to a multiple of 4 bytes.
   * @param[in] size The size of the region.
   *            - The size will be clamped to the underlying buffer's size.
   * @param[in] data The 32-bit value to fill with.
   */
  virtual void buffer_fill(DevicePtr ptr,
                           size_t size,
                           uint32_t data) noexcept = 0;

  /**
   * Enqueues a compute operation with {X, Y, Z} amount of workgroups.
   * The block size / workgroup size is pre-determined within the pipeline.
   * - This is only valid if the pipeline has a predetermined block size
   * - This API has device-dependent maximum values for X, Y, Z
   * - The currently bound pipeline will be dispatched
   * - The enqueued operation starts in CommandList API ordering.
   * - The enqueued operation may end out-of-order, but it respects barriers
   * @param[in] x The number of workgroups in the X dimension
   * @param[in] y The number of workgroups in the Y dimension
   * @param[in] z The number of workgroups in the Z dimension
   * @return The status of this operation
   * - `success` if the operation is successful
   * - `invalid_usage` if the current pipeline has a variable block size
   * - `not_supported` if the requested X, Y, or Z is not supported
   */
  virtual RhiResult dispatch(uint32_t x,
                             uint32_t y = 1,
                             uint32_t z = 1) noexcept = 0;

  struct ComputeSize {
    uint32_t x{0};
    uint32_t y{0};
    uint32_t z{0};
  };

  /**
   * Enqueues a compute operation with `grid_size` amount of threads.
   * The workgroup size is dynamic and specified through `block_size`.
   * - This is only valid if the pipeline has a variable block size
   * - This API has device-dependent maximum values for `grid_size`
   * - This API has device-dependent supported values for `block_size`
   * - The currently bound pipeline will be dispatched
   * - The enqueued operation starts in CommandList API ordering.
   * - The enqueued operation may end out-of-order, but it respects barriers
   * @param[in] grid_size The total number of threads to dispatch
   * @param[in] block_size The shape of each block / workgroup / threadgroup
   * @return The status of this operation
   * - `success` if the operation is successful
   * - `invalid_usage` if the current pipeline has a predetermined block size
   * - `not_supported` if the requested sizes are not supported
   * - `error` if the operation failed due to other reasons
   */
  virtual RhiResult dispatch(ComputeSize grid_size,
                             ComputeSize block_size) noexcept {
    return RhiResult::not_supported;
  }

  // Profiler support
  virtual void begin_profiler_scope(const std::string &kernel_name) {
  }

  virtual void end_profiler_scope() {
  }

  // These are not implemented in compute-only devices
  virtual void begin_renderpass(int x0,
                                int y0,
                                int x1,
                                int y1,
                                uint32_t num_color_attachments,
                                DeviceAllocation *color_attachments,
                                bool *color_clear,
                                std::vector<float> *clear_colors,
                                DeviceAllocation *depth_attachment,
                                bool depth_clear) {
    TI_NOT_IMPLEMENTED
  }
  virtual void end_renderpass() {
    TI_NOT_IMPLEMENTED
  }
  virtual void draw(uint32_t num_vertices, uint32_t start_vertex = 0) {
    TI_NOT_IMPLEMENTED
  }
  virtual void draw_instance(uint32_t num_vertices,
                             uint32_t num_instances,
                             uint32_t start_vertex = 0,
                             uint32_t start_instance = 0) {
    TI_NOT_IMPLEMENTED
  }
  virtual void set_line_width(float width) {
    TI_NOT_IMPLEMENTED
  }
  virtual void draw_indexed(uint32_t num_indices,
                            uint32_t start_vertex = 0,
                            uint32_t start_index = 0) {
    TI_NOT_IMPLEMENTED
  }
  virtual void draw_indexed_instance(uint32_t num_indices,
                                     uint32_t num_instances,
                                     uint32_t start_vertex = 0,
                                     uint32_t start_index = 0,
                                     uint32_t start_instance = 0) {
    TI_NOT_IMPLEMENTED
  }
  virtual void image_transition(DeviceAllocation img,
                                ImageLayout old_layout,
                                ImageLayout new_layout) {
    TI_NOT_IMPLEMENTED
  }
  virtual void buffer_to_image(DeviceAllocation dst_img,
                               DevicePtr src_buf,
                               ImageLayout img_layout,
                               const BufferImageCopyParams &params) {
    TI_NOT_IMPLEMENTED
  }
  virtual void image_to_buffer(DevicePtr dst_buf,
                               DeviceAllocation src_img,
                               ImageLayout img_layout,
                               const BufferImageCopyParams &params) {
    TI_NOT_IMPLEMENTED
  }
  virtual void copy_image(DeviceAllocation dst_img,
                          DeviceAllocation src_img,
                          ImageLayout dst_img_layout,
                          ImageLayout src_img_layout,
                          const ImageCopyParams &params) {
    TI_NOT_IMPLEMENTED
  }
  virtual void blit_image(DeviceAllocation dst_img,
                          DeviceAllocation src_img,
                          ImageLayout dst_img_layout,
                          ImageLayout src_img_layout,
                          const ImageCopyParams &params) {
    TI_NOT_IMPLEMENTED
  }
};
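
// Example (sketch): recording and submitting a compute dispatch. Assumes a
// valid `device`, a compiled compute `pipeline` (UPipeline), and a populated
// resource set `set`; all names are illustrative.
//
//   Stream *stream = device->get_compute_stream();
//   auto [cmdlist, res] = stream->new_command_list_unique();
//   assert(res == RhiResult::success);
//   cmdlist->bind_pipeline(pipeline.get());
//   cmdlist->bind_shader_resources(set.get());
//   cmdlist->dispatch(/*x=*/64);  // 64 workgroups along X
//   cmdlist->memory_barrier();    // make results visible to later commands
//   stream->submit_synced(cmdlist.get());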

struct PipelineSourceDesc {
  PipelineSourceType type;
  const void *data{nullptr};
  size_t size{0};
  PipelineStageType stage{PipelineStageType::compute};
};

// FIXME: this probably isn't backend-neutral enough
enum class AllocUsage : int {
  None = 0,
  Storage = 1,
  Uniform = 2,
  Vertex = 4,
  Index = 8,
  Upload = 16,
};

MAKE_ENUM_FLAGS(AllocUsage)

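// Example (sketch): `MAKE_ENUM_FLAGS(AllocUsage)` makes the flag operators
// available, so usages compose and test naturally:
//
//   AllocUsage usage = AllocUsage::Storage | AllocUsage::Upload;
//   if (usage && AllocUsage::Upload) { /* pick a host-visible heap */ }
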
class TI_DLL_EXPORT StreamSemaphoreObject {
 public:
  virtual ~StreamSemaphoreObject() {
  }
};

using StreamSemaphore = std::shared_ptr<StreamSemaphoreObject>;

class TI_DLL_EXPORT Stream {
 public:
  virtual ~Stream() {
  }

  /**
   * Allocates a new CommandList object from the stream.
   * @param[out] out_cmdlist The allocated command list.
   * @return The status of this operation.
   * - `success` If allocation succeeded.
   * - `out_of_memory` If allocation failed due to lack of device or host
   *   memory.
   */
  virtual RhiResult new_command_list(CommandList **out_cmdlist) noexcept = 0;

  inline std::pair<std::unique_ptr<CommandList>, RhiResult>
  new_command_list_unique() {
    CommandList *cmdlist{nullptr};
    RhiResult res = this->new_command_list(&cmdlist);
    return std::make_pair(std::unique_ptr<CommandList>(cmdlist), res);
  }

  virtual StreamSemaphore submit(
      CommandList *cmdlist,
      const std::vector<StreamSemaphore> &wait_semaphores = {}) = 0;
  virtual StreamSemaphore submit_synced(
      CommandList *cmdlist,
      const std::vector<StreamSemaphore> &wait_semaphores = {}) = 0;

  virtual void command_sync() = 0;
};
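
// Example (sketch): ordering work across two streams with semaphores.
// `compute_cl` and `graphics_cl` are illustrative, already-recorded command
// lists.
//
//   auto sema = compute_stream->submit(compute_cl);       // asynchronous
//   graphics_stream->submit_synced(graphics_cl, {sema});  // waits on compute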

class TI_DLL_EXPORT PipelineCache {
 public:
  virtual ~PipelineCache() = default;

  /**
   * Get the pointer to the raw data of the cache.
   * - Can return `nullptr` if the cache is invalid or empty.
   */
  virtual void *data() noexcept {
    return nullptr;
  }

  /**
   * Get the size of the cache (in bytes).
   */
  virtual size_t size() const noexcept {
    return 0;
  }
};

using UPipelineCache = std::unique_ptr<PipelineCache>;

class TI_DLL_EXPORT Device {
  DeviceCapabilityConfig caps_{};

 public:
  virtual ~Device() {
  }

  struct AllocParams {
    uint64_t size{0};
    bool host_write{false};
    bool host_read{false};
    bool export_sharing{false};
    AllocUsage usage{AllocUsage::Storage};
  };

  virtual DeviceAllocation allocate_memory(const AllocParams &params) = 0;

  virtual void dealloc_memory(DeviceAllocation handle) = 0;

  virtual uint64_t get_memory_physical_pointer(DeviceAllocation handle) {
    // FIXME: (penguinliong) This method reports the actual device memory
    // address. It's used for bindless (like argument buffers on Metal). If
    // the backend doesn't have access to the physical memory address, it
    // should return null, and it's up to the backend implementation to use
    // the address in argument binders.
    return 0;
  }

  /**
   * Create a Pipeline Cache, which accelerates the backend API's pipeline
   * creation.
   * @param[out] out_cache The created pipeline cache.
   * - If the operation failed, this will be set to `nullptr`
   * @param[in] initial_size Size of the initial data, can be 0.
   * @param[in] initial_data The initial data, can be nullptr.
   * - This data can be used to load back the cache from previous invocations.
   * - The backend API may ignore this data or deem it incompatible.
   * @return The status of this operation.
   * - `success` if the pipeline cache is created successfully.
   * - `out_of_memory` if the operation failed due to lack of device or host
   *   memory.
   * - `error` if the operation failed due to other errors.
   */
  virtual RhiResult create_pipeline_cache(
      PipelineCache **out_cache,
      size_t initial_size = 0,
      const void *initial_data = nullptr) noexcept {
    *out_cache = nullptr;
    return RhiResult::not_supported;
  }

  inline std::pair<UPipelineCache, RhiResult> create_pipeline_cache_unique(
      size_t initial_size = 0,
      const void *initial_data = nullptr) noexcept {
    PipelineCache *cache{nullptr};
    RhiResult res =
        this->create_pipeline_cache(&cache, initial_size, initial_data);
    return std::make_pair(UPipelineCache(cache), res);
  }
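
  // Example (sketch): seeding a cache with bytes persisted by a previous run.
  // `saved` is an illustrative std::vector<char> of prior cache data; the
  // backend may ignore it if incompatible.
  //
  //   auto [cache, res] =
  //       device->create_pipeline_cache_unique(saved.size(), saved.data());
  //   if (res == RhiResult::success) {
  //     // Create pipelines with `cache.get()`, then persist cache->data()
  //     // and cache->size() for the next run.
  //   }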

  /**
   * Create a Pipeline. A Pipeline is a program that can be dispatched into a
   * stream through a command list.
   * @param[out] out_pipeline The created pipeline.
   * @param[in] src The source description of the pipeline.
   * @param[in] name The name of the pipeline, for debug purposes.
   * @param[in] cache The pipeline cache to use, can be nullptr.
   * @return The status of this operation.
   * - `success` if the pipeline is created successfully.
   * - `out_of_memory` if the operation failed due to lack of device or host
   *   memory.
   * - `invalid_usage` if the specified source is incompatible or invalid.
   * - `not_supported` if the pipeline uses features the device can't support.
   * - `error` if the operation failed due to other reasons.
   */
  virtual RhiResult create_pipeline(
      Pipeline **out_pipeline,
      const PipelineSourceDesc &src,
      std::string name = "Pipeline",
      PipelineCache *cache = nullptr) noexcept = 0;

  inline std::pair<UPipeline, RhiResult> create_pipeline_unique(
      const PipelineSourceDesc &src,
      std::string name = "Pipeline",
      PipelineCache *cache = nullptr) noexcept {
    Pipeline *pipeline{nullptr};
    RhiResult res = this->create_pipeline(&pipeline, src, name, cache);
    return std::make_pair(UPipeline(pipeline), res);
  }
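
  // Example (sketch): building a compute pipeline from a SPIR-V blob. `spv`
  // is an illustrative std::vector<uint32_t> holding compiled SPIR-V.
  //
  //   PipelineSourceDesc desc{PipelineSourceType::spirv_binary, spv.data(),
  //                           spv.size() * sizeof(uint32_t),
  //                           PipelineStageType::compute};
  //   auto [pipeline, res] =
  //       device->create_pipeline_unique(desc, "my_kernel", cache.get());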

  std::unique_ptr<DeviceAllocationGuard> allocate_memory_unique(
      const AllocParams &params) {
    return std::make_unique<DeviceAllocationGuard>(
        this->allocate_memory(params));
  }
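
  // Example (sketch): a host-writable staging buffer; the guard frees the
  // allocation when it goes out of scope.
  //
  //   Device::AllocParams params;
  //   params.size = 1024;
  //   params.host_write = true;
  //   params.usage = AllocUsage::Storage | AllocUsage::Upload;
  //   DeviceAllocationUnique staging = device->allocate_memory_unique(params);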

  /**
   * Upload data to device allocations immediately.
   * - This is a synchronous operation; the function returns when the upload
   *   is complete.
   * - The host data pointers must be valid and large enough for the size of
   *   the copy, otherwise this function might segfault.
   * - `device_ptr`, `data`, and `size` must contain `num_alloc` valid values.
   * @param[in] device_ptr The array of destination device pointers.
   * @param[in] data The array of source host pointers.
   * @param[in] size The array of copy sizes.
   * @param[in] num_alloc The number of uploads to perform.
   * @return The status of this operation
   * - `success` if the upload is successful.
   * - `out_of_memory` if the operation failed due to lack of device or host
   *   memory.
   * - `invalid_usage` if the specified source is incompatible or invalid.
   * - `error` if the operation failed due to other reasons.
   */
  virtual RhiResult upload_data(DevicePtr *device_ptr,
                                const void **data,
                                size_t *size,
                                int num_alloc = 1) noexcept;

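  // Example (sketch): uploading one host buffer into the staging allocation
  // from the example above; names are illustrative.
  //
  //   std::vector<float> host(256, 1.0f);
  //   DevicePtr dst = staging->get_ptr(0);
  //   const void *src = host.data();
  //   size_t bytes = host.size() * sizeof(float);
  //   RhiResult res = device->upload_data(&dst, &src, &bytes);
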
  /**
   * Read data from device allocations back to the host immediately.
   * - This is a synchronous operation; the function returns when the
   *   readback is complete.
   * - The host data pointers must be valid and large enough for the size of
   *   the copy, otherwise this function might segfault.
   * - `device_ptr`, `data`, and `size` must contain `num_alloc` valid values.
   * @param[in] device_ptr The array of source device pointers.
   * @param[in] data The array of destination host pointers.
   * @param[in] size The array of copy sizes.
   * @param[in] num_alloc The number of readbacks to perform.
   * @param[in] wait_sema The semaphores to wait for before the copy is
   *            initiated.
   * @return The status of this operation
   * - `success` if the readback is successful.
   * - `out_of_memory` if the operation failed due to lack of device or host
   *   memory.
   * - `invalid_usage` if the specified source is incompatible or invalid.
   * - `error` if the operation failed due to other reasons.
   */
  virtual RhiResult readback_data(
      DevicePtr *device_ptr,
      void **data,
      size_t *size,
      int num_alloc = 1,
      const std::vector<StreamSemaphore> &wait_sema = {}) noexcept;

  virtual uint64_t fetch_result_uint64(int i, uint64_t *result_buffer) {
    TI_NOT_IMPLEMENTED
  }

  // Each thread will acquire its own stream
  virtual Stream *get_compute_stream() = 0;

  // Wait for all tasks to complete (tasks from all streams)
  virtual void wait_idle() = 0;

  /**
   * Create a new shader resource set
   * @return The new shader resource set pointer
   */
  virtual ShaderResourceSet *create_resource_set() = 0;

  /**
   * Create a new shader resource set (wrapped in a unique pointer)
   * @return The new shader resource set unique pointer
   */
  inline std::unique_ptr<ShaderResourceSet> create_resource_set_unique() {
    return std::unique_ptr<ShaderResourceSet>(this->create_resource_set());
  }

  /**
   * Map a range within a DeviceAllocation memory into host address space.
   *
   * @param[in] ptr The Device Pointer to map.
   * @param[in] size The size of the mapped region.
   * @param[out] mapped_ptr Outputs the pointer to the mapped region.
   * @return The result status.
   *         `success` when the mapping is successful.
   *         `invalid_usage` when the memory is not host visible.
   *         `invalid_usage` when trying to map the memory multiple times.
   *         `invalid_usage` when `ptr.offset + size` is out-of-bounds.
   *         `error` when the mapping failed for other reasons.
   */
  virtual RhiResult map_range(DevicePtr ptr,
                              uint64_t size,
                              void **mapped_ptr) = 0;

  /**
   * Map an entire DeviceAllocation into host address space.
   * @param[in] alloc The Device Allocation to map.
   * @param[out] mapped_ptr Outputs the pointer to the mapped region.
   * @return The result status.
   *         `success` when the mapping is successful.
   *         `invalid_usage` when the memory is not host visible.
   *         `invalid_usage` when trying to map the memory multiple times.
   *         `error` when the mapping failed for other reasons.
   */
  virtual RhiResult map(DeviceAllocation alloc, void **mapped_ptr) = 0;

  /**
   * Unmap a previously mapped DevicePtr.
   * @param[in] ptr The DevicePtr to unmap.
   */
  virtual void unmap(DevicePtr ptr) = 0;

  /**
   * Unmap a previously mapped DeviceAllocation.
   * @param[in] alloc The DeviceAllocation to unmap.
   */
  virtual void unmap(DeviceAllocation alloc) = 0;
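
  // Example (sketch): filling a host-visible allocation through a mapping.
  // Assumes `staging` was created with host_write = true (see above).
  //
  //   void *mapped{nullptr};
  //   if (device->map(*staging, &mapped) == RhiResult::success) {
  //     std::memset(mapped, 0, 1024);  // requires <cstring>
  //     device->unmap(*staging);
  //   }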

  // Directly share memory in the form of an alias
  static DeviceAllocation share_to(DeviceAllocation *alloc, Device *target);

  // Strictly intra-device copy (synced)
  virtual void memcpy_internal(DevicePtr dst, DevicePtr src, uint64_t size) = 0;

  // Copy memory inter or intra devices (synced)
  enum class MemcpyCapability { Direct, RequiresStagingBuffer, RequiresHost };

  static MemcpyCapability check_memcpy_capability(DevicePtr dst,
                                                  DevicePtr src,
                                                  uint64_t size);

  static void memcpy_direct(DevicePtr dst, DevicePtr src, uint64_t size);

  static void memcpy_via_staging(DevicePtr dst,
                                 DevicePtr staging,
                                 DevicePtr src,
                                 uint64_t size);

  static void memcpy_via_host(DevicePtr dst,
                              void *host_buffer,
                              DevicePtr src,
                              uint64_t size);

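  // Example (sketch): choosing a copy path between two device pointers.
  // `staging_ptr` and `host_buffer` are illustrative caller-provided
  // resources.
  //
  //   switch (Device::check_memcpy_capability(dst, src, size)) {
  //     case Device::MemcpyCapability::Direct:
  //       Device::memcpy_direct(dst, src, size);
  //       break;
  //     case Device::MemcpyCapability::RequiresStagingBuffer:
  //       Device::memcpy_via_staging(dst, staging_ptr, src, size);
  //       break;
  //     case Device::MemcpyCapability::RequiresHost:
  //       Device::memcpy_via_host(dst, host_buffer, src, size);
  //       break;
  //   }
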
  // Get the architecture and all supported capabilities of the created
  // device.
  virtual Arch arch() const = 0;
  inline const DeviceCapabilityConfig &get_caps() const {
    return caps_;
  }
  inline void set_caps(DeviceCapabilityConfig &&caps) {
    caps_ = std::move(caps);
  }

  // Profiler support
  virtual void profiler_sync() {
  }

  virtual size_t profiler_get_sampler_count() {
    return 0;
  }

  virtual std::vector<std::pair<std::string, double>>
  profiler_flush_sampled_time() {
    return std::vector<std::pair<std::string, double>>();
  }
};

class TI_DLL_EXPORT Surface {
 public:
  virtual ~Surface() {
  }

  virtual StreamSemaphore acquire_next_image() = 0;
  virtual DeviceAllocation get_target_image() = 0;
  virtual void present_image(
      const std::vector<StreamSemaphore> &wait_semaphores = {}) = 0;
  virtual std::pair<uint32_t, uint32_t> get_size() = 0;
  virtual int get_image_count() = 0;
  virtual BufferFormat image_format() = 0;
  virtual void resize(uint32_t width, uint32_t height) = 0;
  virtual DeviceAllocation get_depth_data(DeviceAllocation &depth_alloc) = 0;
  virtual DeviceAllocation get_image_data() {
    TI_NOT_IMPLEMENTED
  }
};
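
// Example (sketch): a minimal present loop. Assumes `surface` was created
// via GraphicsDevice::create_surface and `graphics_cl` is a recorded
// graphics command list; names are illustrative.
//
//   StreamSemaphore image_ready = surface->acquire_next_image();
//   DeviceAllocation target = surface->get_target_image();
//   // ... render into `target` ...
//   auto done = graphics_stream->submit(graphics_cl, {image_ready});
//   surface->present_image({done});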

struct VertexInputBinding {
  uint32_t binding{0};
  size_t stride{0};
  bool instance{false};
};

struct VertexInputAttribute {
  uint32_t location{0};
  uint32_t binding{0};
  BufferFormat format;
  uint32_t offset{0};
};

struct SurfaceConfig {
  // VSync:
  // - true: will attempt to wait for V-Blank
  // - when adaptive is true: if supported, a missed V-Blank tears instead of
  //   waiting for the next one, which reduces overall latency
  bool vsync{false};
  bool adaptive{true};
  void *window_handle{nullptr};
  uint32_t width{1};
  uint32_t height{1};
  void *native_surface_handle{nullptr};
};

enum class ImageAllocUsage : int {
  None = 0,
  Storage = 1,
  Sampled = 2,
  Attachment = 4,
};
inline ImageAllocUsage operator|(ImageAllocUsage a, ImageAllocUsage b) {
  return static_cast<ImageAllocUsage>(static_cast<int>(a) |
                                      static_cast<int>(b));
}
inline bool operator&(ImageAllocUsage a, ImageAllocUsage b) {
  return static_cast<int>(a) & static_cast<int>(b);
}

struct ImageParams {
  ImageDimension dimension;
  BufferFormat format;
  ImageLayout initial_layout{ImageLayout::undefined};
  uint32_t x{1};
  uint32_t y{1};
  uint32_t z{1};
  bool export_sharing{false};
  ImageAllocUsage usage{ImageAllocUsage::Storage | ImageAllocUsage::Sampled |
                        ImageAllocUsage::Attachment};
};

struct BlendFunc {
  BlendOp op{BlendOp::add};
  BlendFactor src_factor{BlendFactor::src_alpha};
  BlendFactor dst_factor{BlendFactor::one_minus_src_alpha};
};

struct BlendingParams {
  bool enable{true};
  BlendFunc color;
  BlendFunc alpha;
};

struct RasterParams {
  TopologyType prim_topology{TopologyType::Triangles};
  PolygonMode polygon_mode{PolygonMode::Fill};
  bool front_face_cull{false};
  bool back_face_cull{false};
  bool depth_test{false};
  bool depth_write{false};
  std::vector<BlendingParams> blending{};
};

class TI_DLL_EXPORT GraphicsDevice : public Device {
 public:
  virtual std::unique_ptr<Pipeline> create_raster_pipeline(
      const std::vector<PipelineSourceDesc> &src,
      const RasterParams &raster_params,
      const std::vector<VertexInputBinding> &vertex_inputs,
      const std::vector<VertexInputAttribute> &vertex_attrs,
      std::string name = "Pipeline") = 0;

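  // Example (sketch): a two-stage raster pipeline with one interleaved
  // vertex buffer (position + UV). `vert_src`/`frag_src` are illustrative
  // PipelineSourceDesc values; the format names assume the generated
  // BufferFormat list includes rgb32f / rg32f.
  //
  //   RasterParams rp;  // triangles, fill, no depth test by default
  //   std::vector<VertexInputBinding> bindings = {
  //       {/*binding=*/0, /*stride=*/sizeof(float) * 5, /*instance=*/false}};
  //   std::vector<VertexInputAttribute> attrs = {
  //       {/*location=*/0, /*binding=*/0, BufferFormat::rgb32f, /*offset=*/0},
  //       {/*location=*/1, /*binding=*/0, BufferFormat::rg32f,
  //        /*offset=*/sizeof(float) * 3}};
  //   auto pipeline = gfx_device->create_raster_pipeline(
  //       {vert_src, frag_src}, rp, bindings, attrs, "textured_tri");
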
  virtual Stream *get_graphics_stream() = 0;

  /**
   * Create a new raster resources set
   * @return The new RasterResources pointer
   */
  virtual RasterResources *create_raster_resources() = 0;

  /**
   * Create a new raster resources set (wrapped in a unique pointer)
   * @return The new RasterResources unique pointer
   */
  inline std::unique_ptr<RasterResources> create_raster_resources_unique() {
    return std::unique_ptr<RasterResources>(this->create_raster_resources());
  }

  virtual std::unique_ptr<Surface> create_surface(
      const SurfaceConfig &config) = 0;
  // You are not expected to call this directly. If you want to use this image
  // in a taichi kernel, you usually want to create the image via
  // `GfxRuntime::create_image`. `GfxRuntime` is available in `ProgramImpl`
  // of GPU backends.
  virtual DeviceAllocation create_image(const ImageParams &params) = 0;
  inline DeviceImageUnique create_image_unique(const ImageParams &params) {
    return std::make_unique<DeviceImageGuard>(this->create_image(params));
  }
  virtual void destroy_image(DeviceAllocation handle) = 0;

  virtual void image_transition(DeviceAllocation img,
                                ImageLayout old_layout,
                                ImageLayout new_layout);
  virtual void buffer_to_image(DeviceAllocation dst_img,
                               DevicePtr src_buf,
                               ImageLayout img_layout,
                               const BufferImageCopyParams &params);
  virtual void image_to_buffer(DevicePtr dst_buf,
                               DeviceAllocation src_img,
                               ImageLayout img_layout,
                               const BufferImageCopyParams &params);
};

}  // namespace taichi::lang