1 | /*! |
2 | * Copyright (c) 2017 by Contributors |
3 | * \file dlpack.h |
4 | * \brief The common header of DLPack. |
5 | */ |
6 | #ifndef DLPACK_DLPACK_H_ |
7 | #define DLPACK_DLPACK_H_ |
8 | |
9 | /** |
10 | * \brief Compatibility with C++ |
11 | */ |
12 | #ifdef __cplusplus |
13 | #define DLPACK_EXTERN_C extern "C" |
14 | #else |
15 | #define DLPACK_EXTERN_C |
16 | #endif |
17 | |
18 | /*! \brief The current version of dlpack */ |
19 | #define DLPACK_VERSION 70 |
20 | |
21 | /*! \brief The current ABI version of dlpack */ |
22 | #define DLPACK_ABI_VERSION 1 |
23 | |
24 | /*! \brief DLPACK_DLL prefix for windows */ |
25 | #ifdef _WIN32 |
26 | #ifdef DLPACK_EXPORTS |
27 | #define DLPACK_DLL __declspec(dllexport) |
28 | #else |
29 | #define DLPACK_DLL __declspec(dllimport) |
30 | #endif |
31 | #else |
32 | #define DLPACK_DLL |
33 | #endif |
34 | |
35 | #include <stdint.h> |
36 | #include <stddef.h> |
37 | |
38 | #ifdef __cplusplus |
39 | extern "C" { |
40 | #endif |
/*!
 * \brief The device type in DLDevice.
 */
#ifdef __cplusplus
typedef enum : int32_t {
#else
typedef enum {
#endif
  /*! \brief CPU device */
  kDLCPU = 1,
  /*! \brief CUDA GPU device */
  kDLCUDA = 2,
  /*!
   * \brief Pinned CUDA CPU memory allocated by cudaMallocHost
   */
  kDLCUDAHost = 3,
  /*! \brief OpenCL devices. */
  kDLOpenCL = 4,
  /*! \brief Vulkan buffer for next generation graphics. */
  kDLVulkan = 7,
  /*! \brief Metal for Apple GPU. */
  kDLMetal = 8,
  /*! \brief Verilog simulator buffer */
  kDLVPI = 9,
  /*! \brief ROCm GPUs for AMD GPUs */
  kDLROCM = 10,
  /*!
   * \brief Pinned ROCm CPU memory allocated by hipMallocHost
   */
  kDLROCMHost = 11,
  /*!
   * \brief Reserved extension device type,
   * used for quickly testing an extension device.
   * The semantics can differ depending on the implementation.
   */
  kDLExtDev = 12,
  /*!
   * \brief CUDA managed/unified memory allocated by cudaMallocManaged
   */
  kDLCUDAManaged = 13,
  /*!
   * \brief Unified shared memory allocated on a oneAPI non-partitioned
   * device. A call to the oneAPI runtime is required to determine the device
   * type, the USM allocation type, and the sycl context it is bound to.
   *
   */
  kDLOneAPI = 14,
  /*! \brief GPU support for next generation WebGPU standard. */
  kDLWebGPU = 15,
  /*! \brief Qualcomm Hexagon DSP */
  kDLHexagon = 16,
} DLDeviceType;
93 | |
/*!
 * \brief A Device for Tensor and operator.
 *
 * A device is identified by a (device_type, device_id) pair.
 */
typedef struct {
  /*! \brief The device type used in the device. */
  DLDeviceType device_type;
  /*!
   * \brief The device index.
   * For vanilla CPU memory, pinned memory, or managed memory, this is set to 0.
   */
  int32_t device_id;
} DLDevice;
106 | |
/*!
 * \brief The type code options of DLDataType.
 *
 * The value is stored in the uint8_t `code` field of DLDataType.
 */
typedef enum {
  /*! \brief signed integer */
  kDLInt = 0U,
  /*! \brief unsigned integer */
  kDLUInt = 1U,
  /*! \brief IEEE floating point */
  kDLFloat = 2U,
  /*!
   * \brief Opaque handle type, reserved for testing purposes.
   * Frameworks need to agree on the handle data type for the exchange to be well-defined.
   */
  kDLOpaqueHandle = 3U,
  /*! \brief bfloat16 */
  kDLBfloat = 4U,
  /*!
   * \brief complex number
   * (C/C++/Python layout: compact struct per complex number)
   */
  kDLComplex = 5U,
} DLDataTypeCode;
130 | |
/*!
 * \brief The data type the tensor can hold. The data type is assumed to follow
 * the native endian-ness. An explicit error message should be raised when
 * attempting to export an array with non-native endianness.
 *
 * Examples
 *  - float: type_code = 2, bits = 32, lanes = 1
 *  - float4(vectorized 4 float): type_code = 2, bits = 32, lanes = 4
 *  - int8: type_code = 0, bits = 8, lanes = 1
 *  - std::complex<float>: type_code = 5, bits = 64, lanes = 1
 */
typedef struct {
  /*!
   * \brief Type code of base types.
   * We keep it uint8_t instead of DLDataTypeCode for minimal memory
   * footprint, but the value should be one of DLDataTypeCode enum values.
   * */
  uint8_t code;
  /*!
   * \brief Number of bits, common choices are 8, 16, 32.
   */
  uint8_t bits;
  /*! \brief Number of lanes in the type, used for vector types. */
  uint16_t lanes;
} DLDataType;
156 | |
/*!
 * \brief Plain C Tensor object, does not manage memory.
 */
typedef struct {
  /*!
   * \brief The data pointer points to the allocated data. This will be CUDA
   * device pointer or cl_mem handle in OpenCL. It may be opaque on some device
   * types. This pointer is always aligned to 256 bytes as in CUDA. The
   * `byte_offset` field should be used to point to the beginning of the data.
   *
   * Note that as of Nov 2021, multiple libraries (CuPy, PyTorch, TensorFlow,
   * TVM, perhaps others) do not adhere to this 256 byte alignment requirement
   * on CPU/CUDA/ROCm, and always use `byte_offset=0`. This must be fixed
   * (after which this note will be updated); at the moment it is recommended
   * to not rely on the data pointer being correctly aligned.
   *
   * For given DLTensor, the size of memory required to store the contents of
   * data is calculated as follows:
   *
   * \code{.c}
   * static inline size_t GetDataSize(const DLTensor* t) {
   *   size_t size = 1;
   *   for (tvm_index_t i = 0; i < t->ndim; ++i) {
   *     size *= t->shape[i];
   *   }
   *   size *= (t->dtype.bits * t->dtype.lanes + 7) / 8;
   *   return size;
   * }
   * \endcode
   */
  void* data;
  /*! \brief The device of the tensor */
  DLDevice device;
  /*! \brief Number of dimensions */
  int32_t ndim;
  /*! \brief The data type of the pointer */
  DLDataType dtype;
  /*! \brief The shape of the tensor, as an array of `ndim` dimension sizes */
  int64_t* shape;
  /*!
   * \brief strides of the tensor (in number of elements, not bytes)
   * can be NULL, indicating tensor is compact and row-majored.
   */
  int64_t* strides;
  /*! \brief The offset in bytes to the beginning pointer to data */
  uint64_t byte_offset;
} DLTensor;
204 | |
/*!
 * \brief C Tensor object, manage memory of DLTensor. This data structure is
 * intended to facilitate the borrowing of DLTensor by another framework. It is
 * not meant to transfer the tensor. When the borrowing framework doesn't need
 * the tensor, it should call the deleter to notify the host that the resource
 * is no longer needed.
 */
typedef struct DLManagedTensor {
  /*! \brief DLTensor which is being memory managed */
  DLTensor dl_tensor;
  /*! \brief the context of the original host framework of DLManagedTensor in
   *  which DLManagedTensor is used in the framework. It can also be NULL.
   */
  void * manager_ctx;
  /*! \brief Destructor signature void (*)(void*) - this should be called
   *  to destruct manager_ctx which holds the DLManagedTensor. It can be NULL
   *  if there is no way for the caller to provide a reasonable destructor.
   *  The destructor deletes the argument self as well.
   */
  void (*deleter)(struct DLManagedTensor * self);
} DLManagedTensor;
226 | #ifdef __cplusplus |
227 | } // DLPACK_EXTERN_C |
228 | #endif |
229 | #endif // DLPACK_DLPACK_H_ |
230 | |