1/*!
2 * Copyright (c) 2017 by Contributors
3 * \file dlpack.h
4 * \brief The common header of DLPack.
5 */
6#ifndef DLPACK_DLPACK_H_
7#define DLPACK_DLPACK_H_
8
9/**
10 * \brief Compatibility with C++
11 */
12#ifdef __cplusplus
13#define DLPACK_EXTERN_C extern "C"
14#else
15#define DLPACK_EXTERN_C
16#endif
17
18/*! \brief The current version of dlpack */
19#define DLPACK_VERSION 70
20
21/*! \brief The current ABI version of dlpack */
22#define DLPACK_ABI_VERSION 1
23
24/*! \brief DLPACK_DLL prefix for windows */
25#ifdef _WIN32
26#ifdef DLPACK_EXPORTS
27#define DLPACK_DLL __declspec(dllexport)
28#else
29#define DLPACK_DLL __declspec(dllimport)
30#endif
31#else
32#define DLPACK_DLL
33#endif
34
35#include <stdint.h>
36#include <stddef.h>
37
38#ifdef __cplusplus
39extern "C" {
40#endif
41/*!
42 * \brief The device type in DLDevice.
43 */
44#ifdef __cplusplus
45typedef enum : int32_t {
46#else
47typedef enum {
48#endif
49 /*! \brief CPU device */
50 kDLCPU = 1,
51 /*! \brief CUDA GPU device */
52 kDLCUDA = 2,
53 /*!
54 * \brief Pinned CUDA CPU memory by cudaMallocHost
55 */
56 kDLCUDAHost = 3,
57 /*! \brief OpenCL devices. */
58 kDLOpenCL = 4,
59 /*! \brief Vulkan buffer for next generation graphics. */
60 kDLVulkan = 7,
61 /*! \brief Metal for Apple GPU. */
62 kDLMetal = 8,
63 /*! \brief Verilog simulator buffer */
64 kDLVPI = 9,
65 /*! \brief ROCm GPUs for AMD GPUs */
66 kDLROCM = 10,
67 /*!
68 * \brief Pinned ROCm CPU memory allocated by hipMallocHost
69 */
70 kDLROCMHost = 11,
71 /*!
72 * \brief Reserved extension device type,
73 * used for quickly test extension device
74 * The semantics can differ depending on the implementation.
75 */
76 kDLExtDev = 12,
77 /*!
78 * \brief CUDA managed/unified memory allocated by cudaMallocManaged
79 */
80 kDLCUDAManaged = 13,
81 /*!
82 * \brief Unified shared memory allocated on a oneAPI non-partititioned
83 * device. Call to oneAPI runtime is required to determine the device
84 * type, the USM allocation type and the sycl context it is bound to.
85 *
86 */
87 kDLOneAPI = 14,
88 /*! \brief GPU support for next generation WebGPU standard. */
89 kDLWebGPU = 15,
90 /*! \brief Qualcomm Hexagon DSP */
91 kDLHexagon = 16,
92} DLDeviceType;
93
94/*!
95 * \brief A Device for Tensor and operator.
96 */
97typedef struct {
98 /*! \brief The device type used in the device. */
99 DLDeviceType device_type;
100 /*!
101 * \brief The device index.
102 * For vanilla CPU memory, pinned memory, or managed memory, this is set to 0.
103 */
104 int32_t device_id;
105} DLDevice;
106
107/*!
108 * \brief The type code options DLDataType.
109 */
110typedef enum {
111 /*! \brief signed integer */
112 kDLInt = 0U,
113 /*! \brief unsigned integer */
114 kDLUInt = 1U,
115 /*! \brief IEEE floating point */
116 kDLFloat = 2U,
117 /*!
118 * \brief Opaque handle type, reserved for testing purposes.
119 * Frameworks need to agree on the handle data type for the exchange to be well-defined.
120 */
121 kDLOpaqueHandle = 3U,
122 /*! \brief bfloat16 */
123 kDLBfloat = 4U,
124 /*!
125 * \brief complex number
126 * (C/C++/Python layout: compact struct per complex number)
127 */
128 kDLComplex = 5U,
129} DLDataTypeCode;
130
131/*!
132 * \brief The data type the tensor can hold. The data type is assumed to follow the
133 * native endian-ness. An explicit error message should be raised when attempting to
134 * export an array with non-native endianness
135 *
136 * Examples
137 * - float: type_code = 2, bits = 32, lanes=1
138 * - float4(vectorized 4 float): type_code = 2, bits = 32, lanes=4
139 * - int8: type_code = 0, bits = 8, lanes=1
140 * - std::complex<float>: type_code = 5, bits = 64, lanes = 1
141 */
142typedef struct {
143 /*!
144 * \brief Type code of base types.
145 * We keep it uint8_t instead of DLDataTypeCode for minimal memory
146 * footprint, but the value should be one of DLDataTypeCode enum values.
147 * */
148 uint8_t code;
149 /*!
150 * \brief Number of bits, common choices are 8, 16, 32.
151 */
152 uint8_t bits;
153 /*! \brief Number of lanes in the type, used for vector types. */
154 uint16_t lanes;
155} DLDataType;
156
157/*!
158 * \brief Plain C Tensor object, does not manage memory.
159 */
160typedef struct {
161 /*!
162 * \brief The data pointer points to the allocated data. This will be CUDA
163 * device pointer or cl_mem handle in OpenCL. It may be opaque on some device
164 * types. This pointer is always aligned to 256 bytes as in CUDA. The
165 * `byte_offset` field should be used to point to the beginning of the data.
166 *
167 * Note that as of Nov 2021, multiply libraries (CuPy, PyTorch, TensorFlow,
168 * TVM, perhaps others) do not adhere to this 256 byte aligment requirement
169 * on CPU/CUDA/ROCm, and always use `byte_offset=0`. This must be fixed
170 * (after which this note will be updated); at the moment it is recommended
171 * to not rely on the data pointer being correctly aligned.
172 *
173 * For given DLTensor, the size of memory required to store the contents of
174 * data is calculated as follows:
175 *
176 * \code{.c}
177 * static inline size_t GetDataSize(const DLTensor* t) {
178 * size_t size = 1;
179 * for (tvm_index_t i = 0; i < t->ndim; ++i) {
180 * size *= t->shape[i];
181 * }
182 * size *= (t->dtype.bits * t->dtype.lanes + 7) / 8;
183 * return size;
184 * }
185 * \endcode
186 */
187 void* data;
188 /*! \brief The device of the tensor */
189 DLDevice device;
190 /*! \brief Number of dimensions */
191 int32_t ndim;
192 /*! \brief The data type of the pointer*/
193 DLDataType dtype;
194 /*! \brief The shape of the tensor */
195 int64_t* shape;
196 /*!
197 * \brief strides of the tensor (in number of elements, not bytes)
198 * can be NULL, indicating tensor is compact and row-majored.
199 */
200 int64_t* strides;
201 /*! \brief The offset in bytes to the beginning pointer to data */
202 uint64_t byte_offset;
203} DLTensor;
204
205/*!
206 * \brief C Tensor object, manage memory of DLTensor. This data structure is
207 * intended to facilitate the borrowing of DLTensor by another framework. It is
208 * not meant to transfer the tensor. When the borrowing framework doesn't need
209 * the tensor, it should call the deleter to notify the host that the resource
210 * is no longer needed.
211 */
212typedef struct DLManagedTensor {
213 /*! \brief DLTensor which is being memory managed */
214 DLTensor dl_tensor;
215 /*! \brief the context of the original host framework of DLManagedTensor in
216 * which DLManagedTensor is used in the framework. It can also be NULL.
217 */
218 void * manager_ctx;
219 /*! \brief Destructor signature void (*)(void*) - this should be called
220 * to destruct manager_ctx which holds the DLManagedTensor. It can be NULL
221 * if there is no way for the caller to provide a reasonable destructor.
222 * The destructors deletes the argument self as well.
223 */
224 void (*deleter)(struct DLManagedTensor * self);
225} DLManagedTensor;
226#ifdef __cplusplus
227} // DLPACK_EXTERN_C
228#endif
229#endif // DLPACK_DLPACK_H_
230