/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*!
 * \file tvm/runtime/ndarray.h
 * \brief A device-independent managed NDArray abstraction.
 */
#ifndef TVM_RUNTIME_NDARRAY_H_
#define TVM_RUNTIME_NDARRAY_H_

#include <tvm/runtime/c_runtime_api.h>
#include <tvm/runtime/container/optional.h>
#include <tvm/runtime/container/shape_tuple.h>
#include <tvm/runtime/container/string.h>
#include <tvm/runtime/data_type.h>
#include <tvm/runtime/object.h>
#include <tvm/runtime/serializer.h>

#include <atomic>
#include <functional>
#include <utility>
#include <vector>

namespace tvm {

// alias DLDevice
using Device = DLDevice;

// A 'null' device type that does not correspond to any DLDeviceType enum.
// TODO(mbs): This is to help us as we transition away from representing the 'homogeneous' case
// as a singleton target map indexed by the invalid DLDeviceType '0'.
constexpr DLDeviceType kNullDeviceType = static_cast<DLDeviceType>(0);

// An 'invalid' device type that does not correspond to any DLDeviceType enum.
constexpr DLDeviceType kInvalidDeviceType = static_cast<DLDeviceType>(-1);

namespace runtime {

/*!
 * \brief Managed NDArray.
 *  The array is backed by reference counted blocks.
 */
class NDArray : public ObjectRef {
 public:
  /*! \brief ContainerBase used to back the TVMArrayHandle */
  class ContainerBase;
  /*! \brief NDArray internal container type */
  class Container;
  /*! \brief Container type for Object system. */
  using ContainerType = Container;
  /*! \brief default constructor */
  NDArray() {}
  /*!
   * \brief constructor.
   * \param data ObjectPtr to the data container.
   */
  explicit NDArray(ObjectPtr<Object> data) : ObjectRef(data) {}

  /*! \brief reset the content of NDArray to be nullptr */
  inline void reset();
  /*!
   * \return the reference counter
   * \note this number is approximate in a multi-threaded setting.
   */
  inline int use_count() const;
  /*! \return Pointer to content of DLTensor */
  inline const DLTensor* operator->() const;
  /*! \return Whether the tensor is contiguous */
  inline bool IsContiguous() const;
  /*!
   * \brief Copy data content from another array.
   * \param other The source array to be copied from.
   * \note The copy may happen asynchronously if it involves a GPU context.
   *       Call TVMSynchronize to wait for the copy to finish.
   */
  inline void CopyFrom(const DLTensor* other);
  inline void CopyFrom(const NDArray& other);
  /*!
   * \brief Copy data content from a byte buffer.
   * \param data The source bytes to be copied from.
   * \param nbytes The size of the buffer in bytes.
   *        Must be equal to the size of the NDArray.
   * \note The copy always triggers a TVMSynchronize.
   */
  TVM_DLL void CopyFromBytes(const void* data, size_t nbytes);
  /*!
   * \brief Copy data content into another array.
   * \param other The target array to be copied to.
   * \note The copy may happen asynchronously if it involves a GPU context.
   *       Call TVMSynchronize to wait for the copy to finish.
   */
  inline void CopyTo(DLTensor* other) const;
  inline void CopyTo(const NDArray& other) const;
  /*!
   * \brief Copy the data content into a byte buffer.
   * \param data The destination buffer the data is copied to.
   * \param nbytes The size of the data buffer.
   *        Must be equal to the size of the NDArray.
   * \note The copy always triggers a TVMSynchronize.
   */
  TVM_DLL void CopyToBytes(void* data, size_t nbytes) const;
  /*!
   * \brief Copy the data to another device.
   * \param dev The target device.
   * \return The array under another device.
   */
  inline NDArray CopyTo(const Device& dev) const;
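  //
  // A minimal usage sketch of the copy APIs (illustrative only; assumes a
  // CUDA-enabled runtime build and uses hypothetical local variable names):
  //
  //   std::vector<float> host(6, 1.0f);
  //   NDArray cpu_arr = NDArray::Empty(ShapeTuple({2, 3}), DataType::Float(32), Device{kDLCPU, 0});
  //   cpu_arr.CopyFromBytes(host.data(), host.size() * sizeof(float));
  //   NDArray gpu_arr = cpu_arr.CopyTo(Device{kDLCUDA, 0});
  //   // Device copies may be asynchronous; wait before reading the result.
  //   TVMSynchronize(kDLCUDA, 0, nullptr);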
  /*!
   * \brief Load NDArray from stream
   * \param stream The input data stream
   * \return Whether load is successful
   */
  inline bool Load(dmlc::Stream* stream);
  /*!
   * \brief Save NDArray to stream
   * \param stream The output data stream
   */
  inline void Save(dmlc::Stream* stream) const;
  /*!
   * \brief Create an NDArray that shares the data memory with the current one.
   * \param shape The shape of the new array.
   * \param dtype The data type of the new array.
   * \note The memory size of the new array must not exceed that of the current one.
   */
  TVM_DLL NDArray CreateView(ShapeTuple shape, DLDataType dtype);
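  //
  // Illustrative sketch: reinterpreting the same storage with a new shape
  // (hypothetical variable names; the view must not need more bytes than the source):
  //
  //   NDArray mat = NDArray::Empty(ShapeTuple({2, 3}), DataType::Float(32), Device{kDLCPU, 0});
  //   NDArray flat = mat.CreateView(ShapeTuple({6}), mat->dtype);  // shares the same memory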
  /*!
   * \brief Create a reference view of the NDArray,
   *  represented as a DLManagedTensor.
   * \return A DLManagedTensor
   */
  TVM_DLL DLManagedTensor* ToDLPack() const;
  /*!
   * \brief Create an empty NDArray.
   * \param shape The shape of the new array.
   * \param dtype The data type of the new array.
   * \param dev The device of the array.
   * \param mem_scope The memory scope of the array.
   * \return The created Array
   */
  TVM_DLL static NDArray Empty(ShapeTuple shape, DLDataType dtype, Device dev,
                               Optional<String> mem_scope = NullOpt);
  /*!
   * \brief Create an NDArray backed by an external DLTensor without memory copying.
   *
   * If the DLTensor is not contiguous or its data is not properly aligned, the call fails.
   * This allows us to create an NDArray using the memory
   * allocated by an external source. Responsibility for retaining
   * the memory lies with the external source.
   * \param dl_tensor The DLTensor for the NDArray base.
   * \return The created NDArray view.
   */
  TVM_DLL static NDArray FromExternalDLTensor(const DLTensor& dl_tensor);
  /*!
   * \brief Create a new NDArray, with data copied from the DLTensor.
   *
   * \param dl_tensor The DLTensor to copy from.
   * \param dev device location of the created NDArray.
   * \return The created NDArray.
   */
  TVM_DLL static NDArray NewFromDLTensor(DLTensor* dl_tensor, const Device& dev);
  /*!
   * \brief Create an NDArray backed by a dlpack tensor.
   *
   * This allows us to create an NDArray using the memory
   * allocated by an external deep learning framework
   * that is DLPack compatible.
   *
   * The memory is retained until the NDArray goes out of scope.
   * \param tensor The DLPack tensor to wrap. The NDArray takes ownership of it.
   * \return The created NDArray view.
   */
  TVM_DLL static NDArray FromDLPack(DLManagedTensor* tensor);
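  //
  // Illustrative DLPack interop sketch (hypothetical variable names):
  //
  //   NDArray arr = NDArray::Empty(ShapeTuple({4}), DataType::Int(32), Device{kDLCPU, 0});
  //   DLManagedTensor* dlm = arr.ToDLPack();      // shares memory, bumps the refcount
  //   NDArray again = NDArray::FromDLPack(dlm);   // takes ownership of dlm, still no copy
  //   // If dlm is handed to another framework instead, that framework is responsible
  //   // for eventually calling dlm->deleter(dlm).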
  /*!
   * \brief Function to copy data from one array to another.
   * \param from The source array.
   * \param to The target array.
   * \param stream The stream used in copy.
   */
  TVM_DLL static void CopyFromTo(const DLTensor* from, DLTensor* to,
                                 TVMStreamHandle stream = nullptr);

  TVM_DLL ShapeTuple Shape() const;
  TVM_DLL runtime::DataType DataType() const;
  /*!
   * \brief Check the conditions for constructing an NDArray over a DLTensor without copying.
   * There are three conditions to check:
   * 1. The destination device type is the same as the DLTensor device type
   * 2. The destination device id is the same as the DLTensor device id
   * 3. The memory in the DLTensor is aligned as expected for NDArray
   * \param tensor the DLTensor.
   * \param dev destination device.
   * \return true if all conditions are satisfied.
   */
  TVM_DLL static bool AbilityOfZeroCopyForDLTensor(DLTensor* tensor, const Device& dev);
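  //
  // Illustrative zero-copy sketch (hypothetical names; `ext` is a DLTensor owned by
  // external code and must outlive the resulting NDArray):
  //
  //   Device cpu{kDLCPU, 0};
  //   if (NDArray::AbilityOfZeroCopyForDLTensor(&ext, cpu)) {
  //     NDArray view = NDArray::FromExternalDLTensor(ext);   // no copy, borrows ext.data
  //   } else {
  //     NDArray copy = NDArray::NewFromDLTensor(&ext, cpu);  // falls back to a copy
  //   }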
  // internal namespace
  struct Internal;

 private:
  TVM_DLL static bool IsAligned(const DLTensor& tensor);

 protected:
  friend class TVMPODValue_;
  friend class TVMRetValue;
  friend class TVMArgsSetter;
  /*!
   * \brief Get mutable internal container pointer.
   * \return a mutable container pointer.
   */
  inline Container* get_mutable() const;
  // Helper functions for FFI handling.
  /*!
   * \brief Construct NDArray's Data field from array handle in FFI.
   * \param handle The array handle.
   * \return The corresponding ObjectPtr to the constructed container object.
   *
   * \note We keep a special calling convention for NDArray by passing a
   *       ContainerBase pointer in FFI.
   *       As a result, the argument is compatible with DLTensor*.
   */
  inline static ObjectPtr<Object> FFIDataFromHandle(TVMArrayHandle handle);
  /*!
   * \brief DecRef resource managed by an FFI array handle.
   * \param handle The array handle.
   */
  inline static void FFIDecRef(TVMArrayHandle handle);
  /*!
   * \brief Get FFI Array handle from ndarray.
   * \param nd The object with ndarray type.
   * \return The result array handle.
   */
  inline static TVMArrayHandle FFIGetHandle(const ObjectRef& nd);
};

/*!
 * \brief Save a DLTensor to stream
 * \param strm The output stream
 * \param tensor The tensor to be saved.
 * \return Whether the save is successful.
 */
inline bool SaveDLTensor(dmlc::Stream* strm, const DLTensor* tensor);

/*!
 * \brief The container base structure
 *  contains all the fields except for the Object header.
 *
 * \note We explicitly declare this structure in order to pass
 *       PackedFunc arguments using ContainerBase*.
 */
class NDArray::ContainerBase {
 public:
  /*!
   * \brief The corresponding dl_tensor field.
   * \note it is important that the first field is DLTensor
   *  so that this data structure is DLTensor compatible.
   *  The head ptr of this struct can be viewed as DLTensor*.
   */
  DLTensor dl_tensor;

  /*!
   * \brief additional context, reserved for recycling
   * \note We can attach additional content here
   *  which the current container depends on
   *  (e.g. reference to original memory when creating views).
   */
  void* manager_ctx{nullptr};

 protected:
  /*!
   * \brief The shape container,
   *  can be used for shape data.
   */
  ShapeTuple shape_;
};

/*!
 * \brief Object container class that backs NDArray.
 * \note do not use this class directly, use NDArray instead.
 */
class NDArray::Container : public Object, public NDArray::ContainerBase {
 public:
  /*! \brief default constructor */
  Container() {
    // Initialize the type index.
    type_index_ = Container::RuntimeTypeIndex();
    dl_tensor.data = nullptr;
    dl_tensor.ndim = 0;
    dl_tensor.shape = nullptr;
    dl_tensor.strides = nullptr;
    dl_tensor.byte_offset = 0;
  }

  Container(void* data, ShapeTuple shape, DLDataType dtype, Device dev) {
    // Initialize the type index.
    type_index_ = Container::RuntimeTypeIndex();
    dl_tensor.data = data;
    shape_ = std::move(shape);
    dl_tensor.ndim = static_cast<int>(shape_.size());
    dl_tensor.shape = const_cast<ShapeTuple::index_type*>(shape_.data());
    dl_tensor.dtype = dtype;
    dl_tensor.strides = nullptr;
    dl_tensor.byte_offset = 0;
    dl_tensor.device = dev;
  }
  /*!
   * \brief Set the deleter field.
   * \param deleter The deleter.
   */
  void SetDeleter(FDeleter deleter) { deleter_ = deleter; }

  // Expose DecRef and IncRef as public functions.
  // NOTE: they are for developer purposes only.
  using Object::DecRef;
  using Object::IncRef;

  // Information for object protocol.
  static constexpr const uint32_t _type_index = TypeIndex::kRuntimeNDArray;
  static constexpr const uint32_t _type_child_slots = 0;
  static constexpr const uint32_t _type_child_slots_can_overflow = true;
  static constexpr const char* _type_key = "runtime.NDArray";
  TVM_DECLARE_BASE_OBJECT_INFO(NDArray::Container, Object);

 protected:
  friend class RPCWrappedFunc;
  friend class NDArray;
};

// implementations of inline functions
/*!
 * \brief return the size of data the DLTensor holds, in terms of number of bytes
 *
 * \param arr the input DLTensor
 * \return number of bytes of data in the DLTensor.
 */
inline size_t GetDataSize(const DLTensor& arr) {
  size_t size = 1;
  for (tvm_index_t i = 0; i < arr.ndim; ++i) {
    size *= static_cast<size_t>(arr.shape[i]);
  }
  size *= (arr.dtype.bits * arr.dtype.lanes + 7) / 8;
  return size;
}
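
// Worked example (illustrative): for a float32 tensor with shape {2, 3}
// (bits = 32, lanes = 1), GetDataSize returns 2 * 3 * (32 * 1 + 7) / 8 = 24 bytes.
// Sub-byte types round up per element under this formula, e.g. an int4 tensor with
// shape {3} takes 3 * (4 * 1 + 7) / 8 = 3 bytes.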

/*!
 * \brief check if a DLTensor is contiguous.
 * \param arr The input DLTensor.
 * \return The check result.
 */
static inline bool IsContiguous(const DLTensor& arr) {
  if (arr.strides == nullptr) return true;
  int64_t expected_stride = 1;
  for (int32_t i = arr.ndim; i != 0; --i) {
    int32_t k = i - 1;
    if (arr.shape[k] == 1) {
      // Skip stride check if shape[k] is 1, where the dimension is contiguous
      // regardless of the value of stride.
      //
      // For example, PyTorch will normalize stride to 1 if shape is 1 when exporting
      // to DLPack.
      // More context: https://github.com/pytorch/pytorch/pull/83158
      continue;
    }
    if (arr.strides[k] != expected_stride) return false;
    expected_stride *= arr.shape[k];
  }
  return true;
}
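
// Illustrative examples of the check above (hypothetical tensors):
//   shape = {2, 3}, strides = {3, 1}  -> contiguous (row-major, innermost stride 1)
//   shape = {2, 3}, strides = {1, 2}  -> not contiguous (column-major layout)
//   shape = {1, 3}, strides = {7, 1}  -> contiguous (stride of a size-1 axis is ignored)
//   strides == nullptr                -> treated as compact row-major, hence contiguous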

inline bool NDArray::IsContiguous() const {
  return ::tvm::runtime::IsContiguous(get_mutable()->dl_tensor);
}

inline void NDArray::CopyFrom(const DLTensor* other) {
  ICHECK(data_ != nullptr);
  CopyFromTo(other, &(get_mutable()->dl_tensor));
}

inline void NDArray::CopyFrom(const NDArray& other) {
  ICHECK(data_ != nullptr);
  ICHECK(other.data_ != nullptr);
  CopyFromTo(&(other.get_mutable()->dl_tensor), &(get_mutable()->dl_tensor));
}

inline void NDArray::CopyTo(DLTensor* other) const {
  ICHECK(data_ != nullptr);
  CopyFromTo(&(get_mutable()->dl_tensor), other);
}

inline void NDArray::CopyTo(const NDArray& other) const {
  ICHECK(data_ != nullptr);
  ICHECK(other.data_ != nullptr);
  CopyFromTo(&(get_mutable()->dl_tensor), &(other.get_mutable()->dl_tensor));
}

inline NDArray NDArray::CopyTo(const Device& dev) const {
  ICHECK(data_ != nullptr);
  const DLTensor* dptr = operator->();
  NDArray ret = Empty(ShapeTuple(dptr->shape, dptr->shape + dptr->ndim), dptr->dtype, dev);
  this->CopyTo(ret);
  return ret;
}

inline int NDArray::use_count() const { return data_.use_count(); }

inline const DLTensor* NDArray::operator->() const { return &(get_mutable()->dl_tensor); }

inline NDArray::Container* NDArray::get_mutable() const {
  return static_cast<NDArray::Container*>(data_.get());
}

inline ObjectPtr<Object> NDArray::FFIDataFromHandle(TVMArrayHandle handle) {
  return GetObjectPtr<Object>(
      static_cast<NDArray::Container*>(reinterpret_cast<NDArray::ContainerBase*>(handle)));
}

inline TVMArrayHandle NDArray::FFIGetHandle(const ObjectRef& nd) {
  // NOTE: it is necessary to cast to container then to base
  // so that the FFI handle uses the ContainerBase address.
  auto ptr = reinterpret_cast<TVMArrayHandle>(static_cast<NDArray::ContainerBase*>(
      static_cast<NDArray::Container*>(const_cast<Object*>(nd.get()))));
  return ptr;
}

inline void NDArray::FFIDecRef(TVMArrayHandle handle) {
  static_cast<NDArray::Container*>(reinterpret_cast<NDArray::ContainerBase*>(handle))->DecRef();
}

inline Object* TVMArrayHandleToObjectHandle(TVMArrayHandle handle) {
  return static_cast<NDArray::Container*>(reinterpret_cast<NDArray::ContainerBase*>(handle));
}

/*! \brief Magic number for NDArray file */
constexpr uint64_t kTVMNDArrayMagic = 0xDD5E40F096B4A13F;

inline bool SaveDLTensor(dmlc::Stream* strm, const DLTensor* tensor) {
  uint64_t header = kTVMNDArrayMagic, reserved = 0;
  strm->Write(header);
  strm->Write(reserved);
  // Always save data in the CPU context.
  //
  // Parameters that get serialized should be in CPU by default.
  // So even if the array's context is GPU, it will be stored as a CPU array.
  // This prevents the case where another user loads the parameters back
  // on a machine that does not have a GPU or the related context.
  //
  // We can always do array.CopyTo(target_dev) to get a corresponding
  // array in the target context.
  Device cpu_dev;
  cpu_dev.device_type = kDLCPU;
  cpu_dev.device_id = 0;
  strm->Write(cpu_dev);
  strm->Write(tensor->ndim);
  strm->Write(tensor->dtype);
  int ndim = tensor->ndim;
  strm->WriteArray(tensor->shape, ndim);
  int type_bytes = (tensor->dtype.bits + 7) / 8;
  int64_t num_elems = 1;
  for (int i = 0; i < ndim; ++i) {
    num_elems *= tensor->shape[i];
  }
  int64_t data_byte_size = type_bytes * num_elems;
  strm->Write(data_byte_size);

  if (DMLC_IO_NO_ENDIAN_SWAP && tensor->device.device_type == kDLCPU &&
      tensor->strides == nullptr && tensor->byte_offset == 0) {
    // quick path
    strm->Write(tensor->data, data_byte_size);
  } else {
    std::vector<uint8_t> bytes(data_byte_size);
    ICHECK_EQ(
        TVMArrayCopyToBytes(const_cast<DLTensor*>(tensor), dmlc::BeginPtr(bytes), data_byte_size),
        0)
        << TVMGetLastError();
    if (!DMLC_IO_NO_ENDIAN_SWAP) {
      dmlc::ByteSwap(dmlc::BeginPtr(bytes), type_bytes, num_elems);
    }
    strm->Write(dmlc::BeginPtr(bytes), data_byte_size);
  }
  return true;
}

inline void NDArray::Save(dmlc::Stream* strm) const { SaveDLTensor(strm, operator->()); }

inline bool NDArray::Load(dmlc::Stream* strm) {
  uint64_t header, reserved;
  ICHECK(strm->Read(&header)) << "Invalid DLTensor file format";
  ICHECK(strm->Read(&reserved)) << "Invalid DLTensor file format";
  ICHECK(header == kTVMNDArrayMagic) << "Invalid DLTensor file format";
  Device dev;
  int ndim;
  DLDataType dtype;
  ICHECK(strm->Read(&dev)) << "Invalid DLTensor file format";
  ICHECK(strm->Read(&ndim)) << "Invalid DLTensor file format";
  ICHECK(strm->Read(&dtype)) << "Invalid DLTensor file format";
  ICHECK_EQ(dev.device_type, kDLCPU) << "Invalid DLTensor device: can only save as CPU tensor";
  std::vector<int64_t> shape(ndim);
  if (ndim != 0) {
    ICHECK(strm->ReadArray(&shape[0], ndim)) << "Invalid DLTensor file format";
  }
  NDArray ret = NDArray::Empty(ShapeTuple(shape), dtype, dev);
  int64_t num_elems = 1;
  int elem_bytes = (ret->dtype.bits + 7) / 8;
  for (int i = 0; i < ret->ndim; ++i) {
    num_elems *= ret->shape[i];
  }
  int64_t data_byte_size;
  ICHECK(strm->Read(&data_byte_size)) << "Invalid DLTensor file format";
  ICHECK(data_byte_size == num_elems * elem_bytes) << "Invalid DLTensor file format";
  auto read_ret = strm->Read(ret->data, data_byte_size);
  // Only check non-empty data
  if (ndim > 0 && shape[0] != 0) {
    ICHECK(read_ret) << "Invalid DLTensor file format";
  }
  if (!DMLC_IO_NO_ENDIAN_SWAP) {
    dmlc::ByteSwap(ret->data, elem_bytes, num_elems);
  }
  *this = ret;
  return true;
}
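
// Illustrative save/load round-trip sketch (assumes dmlc/memory_io.h is available;
// variable names are hypothetical and not part of this header):
//
//   std::string blob;
//   dmlc::MemoryStringStream writer(&blob);
//   arr.Save(&writer);                          // serialized as a CPU tensor
//
//   dmlc::MemoryStringStream reader(&blob);
//   NDArray restored;
//   ICHECK(restored.Load(&reader));             // restored always lives on the CPU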

}  // namespace runtime
}  // namespace tvm

namespace std {
template <>
struct hash<tvm::Device> {
  std::size_t operator()(const tvm::Device& dev) const {
    return ((dev.device_id << 8) | dev.device_type);
  }
};

template <>
struct equal_to<tvm::Device> {
  bool operator()(const tvm::Device& lhs, const tvm::Device& rhs) const {
    return (lhs.device_type == rhs.device_type && lhs.device_id == rhs.device_id);
  }
};
}  // namespace std
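
// These specializations let tvm::Device be used directly as a key in standard
// containers. An illustrative sketch (hypothetical variable names):
//
//   std::unordered_map<tvm::Device, std::string> device_names;
//   device_names[tvm::Device{kDLCPU, 0}] = "host";
//   device_names[tvm::Device{kDLCUDA, 0}] = "gpu0";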

#endif  // TVM_RUNTIME_NDARRAY_H_