1/**
2 * Copyright (c) Glow Contributors. See CONTRIBUTORS file.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16#ifndef GLOW_BASE_TENSOR_H
17#define GLOW_BASE_TENSOR_H
18
19#include <algorithm>
20#include <cassert>
21#include <vector>
22
23#include "glow/Base/DeviceTensorTransferManager.h"
24#include "glow/Base/Type.h"
25#include "glow/Support/Compiler.h"
26#include "glow/Support/Memory.h"
27#include "glow/Support/Random.h"
28
29#include "llvm/ADT/ArrayRef.h"
30#include "llvm/Support/raw_ostream.h"
31
32namespace glow {
33
34//===----------------------------------------------------------------------===//
35// Tensor
36//===----------------------------------------------------------------------===//
37
38template <class ElemTy> class Handle;
39
40class Tensor;
41class TensorPool;
42
43void genericTranspose(const Tensor *src, Tensor *dest,
44 llvm::ArrayRef<unsigned_t> shuffle);
45
/// Helper function that \returns a ShapeVector of those dimensions in \p
/// currDims expanded with dimension = 1 until the maximum tensor dimension is
/// reached. The total number of tensor elements (the product of the dims) is
/// unchanged. For example, input {2,1,4} would result in {2,1,4,1,1,1}.
50ShapeVector expandDimsToMax(llvm::ArrayRef<dim_t> currDims);
51
/// Helper function that \returns a ShapeVector obtained from \p dims by
/// reducing (setting to 1) the dimensions given by \p axes. If the flag
/// \p keepDims is set then the reduced dimensions are kept, otherwise they
/// are pruned. For example, given the dimensions [2,3,4] and axes [0,2] the
/// returned shape will be [1,3,1] for keepDims true and [3] for keepDims false.
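/// An illustrative sketch of the two modes (call syntax only; a minimal
/// example, not taken from the implementation):
///   reduceDims({2, 3, 4}, {0, 2}, /* keepDims */ true);  // -> {1, 3, 1}
///   reduceDims({2, 3, 4}, {0, 2}, /* keepDims */ false); // -> {3}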
57ShapeVector reduceDims(llvm::ArrayRef<dim_t> dims,
58 llvm::ArrayRef<unsigned_t> axes, bool keepDims);
59
60/// Helper function that \returns the transpose shuffle that would undo the
61/// given \p shuffle so that if two transposes were composed with the given
62/// shuffle and the result of this function, it would result in the identity
63/// shuffle.
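/// For example (an illustrative sketch): the inverse of the shuffle {2, 0, 1}
/// is {1, 2, 0}, since composing the two maps every axis back to itself.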
64std::vector<unsigned_t> getInverseTranspose(llvm::ArrayRef<unsigned_t> shuffle);
65
66namespace runtime {
67class DeviceManager;
68}
69
/// Holds information about whether this Tensor exists in a device-specific
/// form (i.e. is resident on, or specific to, a device), and which device
/// holds it.
72class DeviceResidencyInfo final {
73 enum class TensorResidency {
74 Host,
75 Device,
76 };
77
  /// A pointer to the device manager of the device on which the tensor
  /// resides.
80 DeviceTensorTransferManager *deviceManager_{nullptr};
81 /// The residency status of the tensor.
82 TensorResidency tensorResidency_{TensorResidency::Host};
  /// A pointer to a context structure, containing the required info to access
  /// tensor data and perform transfers.
85 void *locationContext_{nullptr};
86
87public:
88 DeviceResidencyInfo()
89 : deviceManager_(nullptr), tensorResidency_(TensorResidency::Host),
90 locationContext_(nullptr) {}
91
92 /// Move ctor.
93 DeviceResidencyInfo(DeviceResidencyInfo &&other) = delete;
94
95 /// Move assignment operator.
96 DeviceResidencyInfo &operator=(DeviceResidencyInfo &&other) = delete;
97
98 ~DeviceResidencyInfo() {
99 // If a tensor is device resident, let its device manager free the device
100 // buffer.
101 if (isDeviceResident()) {
102 deviceManager_->releaseDeviceTensor(locationContext_);
103 }
104 }
105
106 /// Removes all device specific state.
107 void clear() {
108 deviceManager_ = nullptr;
109 locationContext_ = nullptr;
110 tensorResidency_ = TensorResidency::Host;
111 }
112
  /// \returns true if this Tensor is resident on, or specific to, a device.
114 bool isDeviceResident() const {
115 assert((tensorResidency_ == TensorResidency::Host || deviceManager_) &&
116 "Device resident tensor must have an assigned device manager.");
117 return tensorResidency_ == TensorResidency::Device;
118 }
119
120 /// \returns the DeviceManager this tensor is resident on, if any.
121 DeviceTensorTransferManager *getDeviceManager() const {
122 return deviceManager_;
123 }
124
125 /// \returns the device specific location context for a resident Tensor.
126 void *getLocationContext() const { return locationContext_; }
127
128 friend class Tensor;
129};
130
131/// A class that represents a contiguous n-dimensional array (a tensor).
132class Tensor final {
133public:
  /// Specifies the kind of initialization for the tensor.
135 enum class InitKind {
136 Zero, // The tensor is initialized to zero.
137 Broadcast, // Broadcast a single value to all elements.
138 Xavier, // Init the tensor with random values using the Xavier method.
139 };
140
141private:
142 /// A pointer to the tensor data.
143 char *data_{nullptr};
144
145 /// The type of the tensor.
146 Type type_;
147
148 /// If the tensor is unowned.
149 bool isUnowned_{false};
150
151 /// The TensorPool that is managing this Tensor (if any).
152 TensorPool *tensorPool_{nullptr};
153
  /// The device residency info associated with the tensor.
155 DeviceResidencyInfo *deviceResidency_{nullptr};
156
157 /// If this tensor owns the DeviceResidencyInfo.
158 bool ownsDeviceResidency_{false};
159
  /// Size in bytes of the unpadded region of memory. This is useful for
  /// communicating the actual size of the data; it allows copying only the
  /// inputs, and not the padding, to the device.
163 size_t unpaddedSize_{0};
164
165 template <class ElemTy> friend class Handle;
166
167 /// \returns a pointer to the tensor data buffer.
168 char *getData() const { return data_; }
169
170public:
171 /// \returns true if it is an unowned tensor.
172 bool isUnowned() const { return isUnowned_; }
173
174 /// \returns the number of allocated bytes pointed to by \ref data_.
175 size_t getUnpaddedSizeInBytes() const { return unpaddedSize_; }
176
  /// \returns the number of real elements in a Tensor, i.e. excluding any
  /// alignment padding and excluding elements that fall outside of a partial
  /// tensor's shape. Note that Tensors cannot be both custom aligned and
  /// partial.
181 size_t getRealNumElements() const {
    // If the Tensor is custom aligned then return size().
183 if (size() < actualSize()) {
184 return size();
185 }
186 // Else assume no custom alignment, so return number of elements based on
187 // unpaddedSize_, i.e. accounts for partial Tensors.
188 return unpaddedSize_ / type_.getElementSize();
189 }
190
191 /// \returns the type of the tensor.
192 const Type &getType() const { return type_; }
193
194 /// Set the type of the Tensor to \p t.
195 void setType(const TypeRef t) {
196 assert(type_.dims() == t->dims() && "New type must retain the same shape.");
197 assert(((type_.getElementType() == t->getElementType() &&
198 type_.size() == t->size()) ||
199 type_.getSizeInBytes() == t->getSizeInBytes()) &&
200 "New type must retain the same size in bytes.");
201 type_ = *t;
202 }
203
204 /// \return the element type of the tensor.
205 ElemKind getElementType() const { return type_.getElementType(); }
206
207 /// \returns True if the coordinate is within the array.
208 bool isInBounds(llvm::ArrayRef<dim_t> indices) const {
209 assert(type_.numSizes_ == indices.size() && "Invalid number of indices");
210 for (size_t i = 0u, e = indices.size(); i < e; i++) {
211 if (indices[i] >= type_.sizes_[i]) {
212 return false;
213 }
214 }
215 return true;
216 }
217
218 /// Set the content of the tensor to zero. If \p resetFusedScalesOffsets, then
219 /// fused scales/offsets will be set to 1.0/0.0 as well.
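  /// An illustrative sketch of the quantized behavior described above (the
  /// scale and offset values are assumptions):
  ///   Tensor q(ElemKind::Int8QTy, {4}, /* scale */ 0.1f, /* offset */ -3);
  ///   q.zero(); // raw payload becomes {-3, -3, -3, -3}, i.e. 0.0 dequantized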
220 void zero(bool resetFusedScalesOffsets = false) {
221 assert(!isDeviceResident() && "Tensor must reside on host to access data.");
222 size_t size = actualSize();
223 // Quantized tensors should go to their offset.
224 switch (type_.getElementType()) {
225 case ElemKind::Int8QTy: {
226 auto *data = reinterpret_cast<int8_t *>(getData());
227 std::fill(&data[0], &data[0] + size, (int8_t)type_.getOffset());
228 break;
229 }
230 case ElemKind::UInt8QTy: {
231 auto *data = reinterpret_cast<uint8_t *>(getData());
232 std::fill(&data[0], &data[0] + size, (uint8_t)type_.getOffset());
233 break;
234 }
235 case ElemKind::Int16QTy: {
236 auto *data = reinterpret_cast<int16_t *>(getData());
237 std::fill(&data[0], &data[0] + size, (int16_t)type_.getOffset());
238 break;
239 }
240 case ElemKind::Int32QTy: {
241 auto *data = reinterpret_cast<int32_t *>(getData());
242 std::fill(&data[0], &data[0] + size, (int32_t)type_.getOffset());
243 break;
244 }
245#define FUSED_CASE(ELEM_KIND, DATA_TYPE) \
246 case ElemKind::ELEM_KIND: { \
247 assert(dims().size() == 2 && "Fused tensor must be 2-dimensional."); \
    assert(dims()[1] > 2 * sizeof(DATA_TYPE) &&                                \
           "Fused tensor must have space for scale and offset.");              \
250 const size_t dataWidth = dims()[1]; \
251 const size_t alignedLength = type_.strides()[0]; \
252 auto *data = reinterpret_cast<uint8_t *>(getData()); \
253 for (size_t i = 0, e = dims()[0]; i < e; i++) { \
254 uint8_t *scaleOffsetPtr = \
255 data + i * alignedLength + dataWidth - 2 * sizeof(DATA_TYPE); \
256 DATA_TYPE scale, offset; \
257 if (resetFusedScalesOffsets) { \
258 /* Use these as defaults, and copy them into each row. */ \
259 scale = 1.0; \
260 offset = 0.0; \
261 memcpy(scaleOffsetPtr, &scale, sizeof(DATA_TYPE)); \
262 memcpy(scaleOffsetPtr + sizeof(DATA_TYPE), &offset, \
263 sizeof(DATA_TYPE)); \
264 } else { \
265 memcpy(&scale, scaleOffsetPtr, sizeof(DATA_TYPE)); \
266 memcpy(&offset, scaleOffsetPtr + sizeof(DATA_TYPE), \
267 sizeof(DATA_TYPE)); \
268 } \
269 DCHECK_NE(static_cast<float>(scale), 0.0) \
270 << "Disallow scale = 0.0 for Fused ElemKinds; causes div by zero."; \
271 float zero = nearbyintf(-1 * static_cast<float>(offset / scale)); \
272 std::fill(data + i * alignedLength, scaleOffsetPtr, \
273 static_cast<uint8_t>(zero)); \
274 } \
275 break; \
276 }
277 FUSED_CASE(UInt8FusedQTy, float);
278 FUSED_CASE(UInt8FusedFP16QTy, float16_t);
279#undef FUSED_CASE
280
281 default:
282 // Non-quantized tensors are set to 0.
283 std::fill(&getData()[0], &getData()[0] + size * type_.getElementSize(),
284 0);
285 break;
286 }
287 }
288
289 /// \returns the shape of the tensor.
290 llvm::ArrayRef<dim_t> dims() const { return type_.dims(); }
291
292 /// \returns the number of real meaningful elements in the tensor. Does not
293 /// take strides into account.
294 dim_t size() const { return type_.size(); }
295
296 /// \returns the actual number of elements in the tensor taking striding into
297 /// account. Since size() does not take striding into account, size() is
298 /// always <= actualSize().
299 dim_t actualSize() const { return type_.actualSize(); }
300
301 /// \returns the number of bytes required to store the tensor based on its
302 /// Type. Note that this includes the size required for padding.
303 uint64_t getSizeInBytes() const { return type_.getSizeInBytes(); }
304
305 /// \returns the TensorPool managing this object, or nullptr if it is
306 /// unmanaged.
307 TensorPool *getOwningPool() { return tensorPool_; }
308
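  /// An illustrative sketch of building small test tensors with \ref fromData
  /// below (the shapes and values are assumptions):
  ///   Tensor a = Tensor::fromData<float>(ElemKind::FloatTy, {2, 2},
  ///                                      {1.0f, 2.0f, 3.0f, 4.0f});
  ///   Tensor q = Tensor::fromData<int8_t>(ElemKind::Int8QTy, /* scale */ 0.5f,
  ///                                       /* offset */ 0, {4}, {1, 2, 3, 4});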
309 template <typename DataType>
310 static Tensor fromData(ElemKind elemKind, llvm::ArrayRef<dim_t> dims,
311 const std::initializer_list<DataType> &data) {
312 Tensor tensor(elemKind, dims);
313 tensor.getHandle<DataType>() = data;
314 return tensor;
315 }
316
317 template <typename DataType>
318 static Tensor fromData(ElemKind elemKind, float scale, int32_t offset,
319 llvm::ArrayRef<dim_t> dims,
320 const std::initializer_list<DataType> &data) {
321 Tensor tensor(elemKind, dims, scale, offset);
322 tensor.getHandle<DataType>() = data;
323 return tensor;
324 }
325
326 /// Initialize an empty tensor.
327 Tensor() = default;
328
329 /// Initialize from a list of float literals.
330 Tensor(const std::initializer_list<float> &vec) {
331 reset(ElemKind::FloatTy, {(dim_t)vec.size()});
332 auto *data = getRawDataPointer<float>();
333 int i = 0;
334 for (auto &f : vec) {
335 data[i++] = f;
336 }
337 }
338
339 /// Allocate and initialize a new tensor.
340 explicit Tensor(TypeRef ty) : data_(nullptr), type_(*ty), isUnowned_{false} {
341 reset(*ty);
342 }
343
344 /// Allocate and initialize a new tensor.
345 explicit Tensor(const Type &ty)
346 : data_(nullptr), type_(ty), isUnowned_{false} {
347 reset(ty);
348 }
349
  /// Allocate and initialize a new tensor with element kind \p elemTy.
351 Tensor(ElemKind elemTy, llvm::ArrayRef<dim_t> dims)
352 : data_(nullptr), type_(elemTy, dims), isUnowned_{false} {
353 reset(elemTy, dims);
354 }
355
  /// Construct an unowned tensor provided an existing payload buffer.
  /// This constructor can be used when there is a need to work with
  /// "externally" managed payload buffers using Tensor APIs. Additionally,
  /// \p unpaddedSize can be set to indicate the actual size of the inputs. If
  /// negative then it defaults back to the size of the input type.
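  /// An illustrative sketch of wrapping an externally managed buffer (the
  /// buffer and shape here are assumptions):
  ///   float backing[6] = {0};
  ///   Type ty(ElemKind::FloatTy, {2, 3});
  ///   Tensor view(backing, &ty); // unowned: the Tensor will not free backing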
361 Tensor(void *data, TypeRef ty, ssize_t unpaddedSize = -1)
362 : data_(reinterpret_cast<char *>(data)), type_(*ty) {
363 // Mark as unowned.
364 isUnowned_ = true;
365 // We do want DeviceResidency however, since there is no owning Glow Tensor.
366 resetDeviceInfo();
367 if (unpaddedSize < 0) {
368 unpaddedSize_ = type_.getSizeInBytes();
369 } else {
370 unpaddedSize_ = static_cast<size_t>(unpaddedSize);
371 }
372 }
373
374 /// Allocate and initialize a new integer tensor with \p scale and \p offset.
375 Tensor(ElemKind elemTy, llvm::ArrayRef<dim_t> dims, float scale,
376 int32_t offset)
377 : data_(nullptr), type_(elemTy, dims, scale, offset), isUnowned_{false} {
378 reset(type_);
379 }
380
381 /// Allocate a new Tensor managed by the \p tensorPool.
382 explicit Tensor(TypeRef ty, TensorPool *tensorPool)
383 : data_(nullptr), type_(*ty), tensorPool_(tensorPool) {
384 reset(*ty);
385 }
386
387 Tensor(const Tensor &other) = delete;
388 Tensor &operator=(const Tensor &other) = delete;
389
390 /// Initialize the content of the tensor using the \p init method. The value
391 /// \p val is the initialization parameter. \p PRNG is used to generate random
392 /// numbers. Note that if the tensor's kind is Fused, then the fused
393 /// scaled/offsets will not be modified.
394 void init(InitKind init, float val, PseudoRNG &PRNG);
395
396 /// \returns an unowned tensor with the exact same dimensions as this.
397 Tensor getUnowned() const { return getUnowned(dims()); }
398
399 /// \returns unowned tensor using the same data buffer as the current tensor
400 /// but having different dimensions \p dims. \p offsets represents an optional
401 /// offset into the tensor representing the location of the first element to
  /// start a subview from. The returned unowned tensor is essentially a
403 /// different view or subview on the same data.
404 ///
405 /// The lifetime of the returned unowned tensor should be always within
406 /// the lifetime of its parent tensor, i.e. the unowned tensor should not
407 /// outlive its parent tensor.
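  /// An illustrative subview sketch (the shapes and offsets are assumptions):
  ///   Tensor base(ElemKind::FloatTy, {4, 10});
  ///   // A 2x10 view starting at row 1, sharing base's storage.
  ///   Tensor view = base.getUnowned({2, 10}, {1, 0});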
408 Tensor getUnowned(llvm::ArrayRef<dim_t> dims,
409 llvm::ArrayRef<dim_t> offsets = {}) const {
410 Tensor unownedTensor;
411
412 auto *firstElemPtr = getData();
413 if (offsets.size()) {
414 assert(offsets.size() == this->dims().size() &&
415 "Number of dims of tensor must equal number of dims in offsets");
416 // Find the index of the first element and use it to find the pointer to
417 // the first element.
418 size_t index = 0;
419 for (size_t i = 0; i < this->dims().size(); i++) {
420 index += type_.strides()[i] * offsets[i];
421 }
422 firstElemPtr = &firstElemPtr[index * type_.getElementSize()];
423 }
424
425 unownedTensor.data_ = firstElemPtr;
426 unownedTensor.isUnowned_ = true;
427 unownedTensor.type_ = Type::newShape(getType(), dims);
428 unownedTensor.deviceResidency_ = deviceResidency_;
429
430 // If the original base Tensor is padded, then we only allow the unowned
431 // Tensor to be padded if there are no offsets. Otherwise assert that the
432 // base Tensor is not padded, and set unpaddedSize to that of the new
433 // unowned type.
434 if (offsets.size() == 0) {
435 unownedTensor.unpaddedSize_ = unpaddedSize_;
436 assert(actualSize() == unownedTensor.actualSize() &&
437 "The size of the unowned tensor "
438 "should be the same as the size of "
439 "the original tensor");
440
441 } else {
442 unownedTensor.unpaddedSize_ = unownedTensor.type_.getSizeInBytes();
443 assert(getSizeInBytes() == getUnpaddedSizeInBytes() &&
444 "Problematic to get unowned offsetted view of a padded tensor");
445 assert(actualSize() >= unownedTensor.actualSize() &&
446 "The size of the unowned tensor "
447 "should be no greater than the "
448 "size of the original tensor");
449 }
450 return unownedTensor;
451 }
452
453 /// This is the same as \ref getUnowned() but it produces an owned tensor
454 /// instead. \returns owned tensor copied from the data buffer of the current
455 /// tensor but having different dimensions \p dims. \p offsets represents an
456 /// optional offset into the tensor representing the location of the first
457 /// element to start a subview from.
458 Tensor getOwnedSlice(llvm::ArrayRef<dim_t> dims,
459 llvm::ArrayRef<dim_t> offsets = {}) const {
460 assert(!isDeviceResident() && "Tensor must reside on host to access data.");
461 return getUnowned(dims, offsets).clone();
462 }
463
464 /// Reset the shape and type of this tensor to match the shape and type of
465 /// \p other. The size of the buffer is set to \p unpaddedSize unless it is
466 /// negative, which will instead default back to the number of bytes needed
467 /// for the type of \p other.
468 void reset(const Tensor *other, ssize_t unpaddedSize = -1) {
469 reset(other->getType(), unpaddedSize);
470 }
471
472 void reset(ElemKind elemTy, llvm::ArrayRef<dim_t> shape) {
473 Type t(elemTy, shape);
474 reset(t);
475 }
476
477 void reset(ElemKind elemTy, llvm::ArrayRef<dim_t> shape, float scale,
478 int32_t offset) {
479 Type t(elemTy, shape, scale, offset);
480 reset(t);
481 }
482
483 /// Assigns a new shape to the tensor and allocates a new buffer. The size of
484 /// the buffer is set to \p unpaddedSize unless it is negative, which will
485 /// instead default back to the number of bytes needed for \p T.
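  /// An illustrative sketch of re-using a Tensor object across shapes (the
  /// shapes are assumptions):
  ///   Tensor t(ElemKind::FloatTy, {4});
  ///   t.reset(ElemKind::FloatTy, {2, 8}); // re-allocates for the new type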
486 void reset(const Type &T, ssize_t unpaddedSize = -1) {
487 assert(!isDeviceResident() && "Tensor must reside on host to access data.");
488
489 // If negative then fall back to the passed in Type's padded size.
490 if (unpaddedSize < 0) {
491 unpaddedSize = T.getSizeInBytes();
492 }
493
494 // If the new size is identical to the allocated size then there is no need
495 // to re-allocate the buffer.
496 const bool isOrigPadded =
497 getSizeInBytes() != uint64_t(getUnpaddedSizeInBytes());
498 const bool isNewPadded = T.getSizeInBytes() != size_t(unpaddedSize);
499 const bool isBufReuseAllowed =
500 (isOrigPadded == isNewPadded) &&
501 (getUnpaddedSizeInBytes() == size_t(unpaddedSize));
502 if (type_ == T && getData() && isBufReuseAllowed) {
503#ifdef GLOW_DEBUG_TENSOR_INIT
504 PseudoRNG rng;
505 init(InitKind::Broadcast, GLOW_DEBUG_TENSOR_INIT, rng);
506#endif
507 resetDeviceInfo();
508 return;
509 }
510
511 // Delete the old buffer, update the shape, and allocate a new one.
512 if (!isUnowned())
513 alignedFree(getData());
514 type_ = T;
515
516 // We are allocating memory specifically for this tensor, thus, it owns it.
517 isUnowned_ = false;
518
519 // We are allocating memory on the host so it is not device resident.
520 resetDeviceInfo();
521
522 // Note: zero-dimensional tensors (i.e. {}) have size 1. However, Tensors
523 // may have 0 for some dimension, meaning they have size of 0, and so we do
524 // not allocate anything for them.
525 data_ = unpaddedSize == 0 ? nullptr
526 : reinterpret_cast<char *>(alignedAlloc(
527 unpaddedSize, TensorAlignment));
528
529 // Set unpaddedSize_ to the actual number of bytes.
530 unpaddedSize_ = unpaddedSize;
531
532 assert(!(size() < actualSize() &&
533 getSizeInBytes() != getUnpaddedSizeInBytes()) &&
534 "Custom aligned Tensors cannot also be partial");
535
536#ifdef GLOW_DEBUG_TENSOR_INIT
537 PseudoRNG rng;
538 init(InitKind::Broadcast, GLOW_DEBUG_TENSOR_INIT, rng);
539#endif
540 }

  /// Releases the data buffer and sets the unowned flag to true. This is
  /// useful for keeping metadata around but not the actual contents.
543 void release() {
544 if (!isUnowned()) {
545 alignedFree(getData());
546 }
547 if (ownsDeviceResidency_) {
548 delete deviceResidency_;
549 ownsDeviceResidency_ = false;
550 }
551
552 isUnowned_ = true;
553 }
554 ~Tensor() {
555 if (!isUnowned()) {
556 alignedFree(getData());
557 }
558
559 if (ownsDeviceResidency_) {
560 delete deviceResidency_;
561 ownsDeviceResidency_ = false;
562 }
563 }
564
  /// Move ctor.
566 Tensor(Tensor &&other) noexcept {
567 if (!isUnowned()) {
568 alignedFree(getData());
569 }
570 if (ownsDeviceResidency_) {
571 delete deviceResidency_;
572 }
573 data_ = other.data_;
574 type_ = other.type_;
575 isUnowned_ = other.isUnowned_;
576 tensorPool_ = other.tensorPool_;
577 unpaddedSize_ = other.unpaddedSize_;
578 deviceResidency_ = other.deviceResidency_;
579 ownsDeviceResidency_ = other.ownsDeviceResidency_;
580 other.data_ = nullptr;
581 other.isUnowned_ = true;
582 other.tensorPool_ = nullptr;
583 other.deviceResidency_ = nullptr;
584 other.ownsDeviceResidency_ = false;
585 }
586
587 /// Move assignment operator.
588 Tensor &operator=(Tensor &&other) {
589 if (!isUnowned()) {
590 alignedFree(getData());
591 }
592 if (ownsDeviceResidency_) {
593 delete deviceResidency_;
594 }
595 data_ = other.data_;
596 type_ = other.type_;
597 isUnowned_ = other.isUnowned_;
598 tensorPool_ = other.tensorPool_;
599 unpaddedSize_ = other.unpaddedSize_;
600 deviceResidency_ = other.deviceResidency_;
601 ownsDeviceResidency_ = other.ownsDeviceResidency_;
602 other.data_ = nullptr;
603 other.isUnowned_ = true;
604 other.tensorPool_ = nullptr;
605 other.deviceResidency_ = nullptr;
606 other.ownsDeviceResidency_ = false;
607 return *this;
608 }
609
610 /// Dump a textual representation of the Tensor into provided output stream.
611 void dump(llvm::raw_ostream &os) const;
612
613 /// Dump a textual representation of the Tensor into default output stream.
614 void dump() const;
615
616 /// Dump a textual representation of a specific number of elements in the
617 /// Tensor into provided output stream.
618 void dump(llvm::raw_ostream &os, unsigned maxNumElem) const;
619
620 /// Dump a textual representation of a specific number of elements in the
621 /// Tensor into default output stream.
622 void dump(unsigned maxNumElem) const;
623
624 /// Dump a textual representation of the Tensor to std::string.
625 std::string toString() const;
626
627 /// Dump a textual representation of a specific number of elements in the
628 /// Tensor to std::string.
629 std::string toString(unsigned maxNumElem) const;
630
631 /// Dump a textual representation of the shape of this Tensor to std::string.
632 std::string getShapeToString() const;
633
634 /// \returns true if the content of the other tensor \p other is identical to
635 /// this one, given some \p allowedError. If \p verbose and the tensors are
636 /// not equal, then we will log information about the mismatch (number of
637 /// elements exceeding allowed error; maximum error and location found; etc.).
638 bool isEqual(const Tensor &other, float allowedError = 0.0001,
639 bool verbose = true) const {
640 if (isDeviceResident()) {
641 if (!other.isDeviceResident()) {
642 if (verbose) {
643 LOG(INFO) << "Tensors cannot be compared as they are not resident in "
644 "the same location.";
645 }
646 return false;
647 }
648
649 return getDeviceManager() == other.getDeviceManager() &&
650 getLocationContext() == other.getLocationContext();
651 }
652 return isEqualImpl(other, /*isBitwise=*/false, allowedError, verbose);
653 }
654
655 /// \returns true if the content of the other tensor \p other is bitwise
656 /// identical to this one.
657 bool isBitwiseEqual(const Tensor &other, bool verbose = false) const {
658 return isEqualImpl(other, /*isBitwise=*/true, /*allowedError=*/0.0,
659 verbose);
660 }
661
662 bool isEqualImpl(const Tensor &other, bool isBitwise, float allowedError,
663 bool verbose) const {
664 if (other.dims() != dims()) {
665 if (verbose) {
666 LOG(INFO) << "Tensors are not equal as they have different shapes: "
667 << this->getShapeToString() << " vs. "
668 << other.getShapeToString();
669 }
670 return false;
671 }
672
    // For now, make sure that either both or neither of the tensors have
    // UInt8FusedQTy or UInt8FusedFP16QTy. While it is possible for an Int8QTy
    // tensor to equal a fused tensor if the fused tensor has the same
    // scale/offset on all of its rows, and that scale/offset match that of the
    // Int8QTy, we do not support checking this for now.
    assert(((getElementType() == ElemKind::UInt8FusedQTy &&
             other.getElementType() == ElemKind::UInt8FusedQTy) ||
            (getElementType() == ElemKind::UInt8FusedFP16QTy &&
             other.getElementType() == ElemKind::UInt8FusedFP16QTy) ||
            (getElementType() != ElemKind::UInt8FusedQTy &&
             getElementType() != ElemKind::UInt8FusedFP16QTy &&
             other.getElementType() != ElemKind::UInt8FusedQTy &&
             other.getElementType() != ElemKind::UInt8FusedFP16QTy)) &&
           "Fused ElemKinds only support comparing against the same ElemKind.");
685
686 // Assert that the scale and offset match for the quantized types.
687 switch (getElementType()) {
688 default:
689 break;
690 case ElemKind::Int8QTy:
691 case ElemKind::UInt8QTy:
692 case ElemKind::Int16QTy:
693 case ElemKind::Int32QTy:
694 assert(getType().getScale() == other.getType().getScale() &&
695 "Scales must match.");
696 assert(getType().getOffset() == other.getType().getOffset() &&
697 "Offsets must match.");
698 }
699
700 // Bitwise compare.
701 if (isBitwise) {
702 return isBitwiseEqualImpl(other, verbose);
703 }
704
705 switch (getElementType()) {
706 case ElemKind::FloatTy:
707 return isEqualImpl<float>(other, allowedError, verbose);
708 case ElemKind::Float16Ty:
709 return isEqualImpl<float16_t>(other, allowedError, verbose);
710 case ElemKind::BFloat16Ty:
711 return isEqualImpl<bfloat16_t>(other, allowedError, verbose);
712 case ElemKind::Float64Ty:
713 return isEqualImpl<double>(other, allowedError, verbose);
714 case ElemKind::Int8QTy:
715 return isEqualImpl<int8_t>(other, allowedError, verbose);
716 case ElemKind::UInt8QTy:
717 return isEqualImpl<uint8_t>(other, allowedError, verbose);
718 case ElemKind::Int16QTy:
719 return isEqualImpl<int16_t>(other, allowedError, verbose);
720 case ElemKind::Int32QTy:
721 return isEqualImpl<int32_t>(other, allowedError, verbose);
722 case ElemKind::Int64QTy:
723 return isEqualImpl<int64_t>(other, allowedError, verbose);
724 case ElemKind::UInt8ITy:
725 return isEqualImpl<uint8_t>(other, allowedError, verbose);
726 case ElemKind::Int32ITy:
727 return isEqualImpl<int32_t>(other, allowedError, verbose);
728 case ElemKind::Int64ITy:
729 return isEqualImpl<int64_t>(other, allowedError, verbose);
730 // Note: We can use isEqualImpl() here because the scales/offsets will be
731 // compared as if they were data, so we will return false if any rowwise
732 // scale/offset do not match.
733 case ElemKind::UInt8FusedQTy:
734 return isEqualImpl<uint8_t>(other, allowedError, verbose);
735 case ElemKind::UInt8FusedFP16QTy:
736 return isEqualImpl<uint8_t>(other, allowedError, verbose);
737 case ElemKind::UInt4FusedFP16QTy:
738 return isEqualImpl<uint8_t>(other, allowedError, verbose);
739 case ElemKind::UInt4FusedQTy:
740 return isEqualImpl<uint8_t>(other, allowedError, verbose);
741 case ElemKind::BoolTy:
742 return isEqualImpl<bool>(other, allowedError, verbose);
743 }
744
745 // This is to make compiler happy. It can never reach this point as switch
746 // always covers all possible values.
747 llvm_unreachable("unreachable");
748 }
749
750 /// \returns whether this Tensor is tiled (repeated) along \p axis for the
751 /// given tile size \p size. Some examples:
752 /// - A Tensor with size [2, 3] equal to [[1,2,3],[1,2,3]] is tiled along
753 /// axis 0 for a tile size equal to 1.
754 /// - A Tensor with size [2, 4] equal to [[1, 2, 1, 2],[3, 4, 3, 4]] is tiled
755 /// along axis 1 for a tile size equal to 2.
  /// When the tile size matches the dimension size this function returns TRUE.
  /// If the \p fractional flag is given then this function will also perform
  /// fractional tiling verification (default is FALSE). Some examples:
759 /// - For a Tensor with size [5] equal to [1,2,3,1,2], axis 0 and tile size 3,
760 /// this function returns TRUE if \p fractional is TRUE and returns FALSE if
761 /// \p fractional is FALSE.
762 bool isTiled(unsigned_t axis, dim_t size = 1, bool fractional = false) const;
763
764 /// \returns whether this Tensor is tiled (repeated) along \p axes for the
765 /// given tile sizes \p sizes. Some examples:
766 /// - A Tensor with size [2, 4] equal to [[1,2,1,2],[1,2,1,2]] is tiled along
767 /// axes {0,1} for the tile sizes {1,2}.
  /// When the tile sizes match the dimension sizes this function returns TRUE.
  /// If the \p fractional flag is given then this function will also perform
  /// fractional tiling verification (default is FALSE). Some examples:
771 /// - For a Tensor with size [5] equal to [1,2,3,1,2], axes {0} and sizes {3},
772 /// this function returns TRUE if \p fractional is TRUE and returns FALSE if
773 /// \p fractional is FALSE.
774 bool isTiled(llvm::ArrayRef<unsigned_t> axes, llvm::ArrayRef<dim_t> sizes,
775 bool fractional = false) const;
776
777 /// Update the content and type of the tensor from the tensor \p t.
778 void assign(const Tensor *t) {
779 assert(!isDeviceResident() && "Tensor must reside on host to access data.");
780 assert(this != t && "Copying to self");
781 const size_t bufferSize = t->getUnpaddedSizeInBytes();
782 reset(t, bufferSize);
783 std::copy(&t->getData()[0], &t->getData()[bufferSize], getData());
784 }
785
786 /// Update the raw data of the tensor from the tensor \p t.
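  /// Unlike \ref assign(), this does not re-type or re-allocate this tensor.
  /// An illustrative contrast (the tensor names are assumptions):
  ///   dst.assign(&src);      // dst takes src's type, shape, and contents
  ///   dst.copyRawFrom(&src); // dst must already match src's type and size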
787 void copyRawFrom(const Tensor *t) {
788 assert(!isDeviceResident() && "Tensor must reside on host to access data.");
789 assert(this != t && "Copying to self");
790 assert(actualSize() == t->actualSize());
791 assert(getElementType() == t->getElementType() && "Invalid element type");
792 assert(t->getUnpaddedSizeInBytes() == getUnpaddedSizeInBytes() &&
793 "Do not support copying between different unpadded sized tensors");
794 size_t bufferSize = type_.getSizeInBytes();
795 std::copy(&t->getData()[0], &t->getData()[bufferSize], getData());
796 }
797
798 /// Update the raw data of the tensor from a raw buffer \p data.
799 void copyRawFrom(const char *data) {
800 assert(!isDeviceResident() && "Tensor must reside on host to access data.");
801 assert(data && "Null data pointer!");
802 assert(getData() != data && "Copying to self");
803 size_t bufferSize = type_.getSizeInBytes();
804 std::memcpy(getData(), data, bufferSize);
805 }
806
807 /// Update the content of the tensor with a slice from tensor \p t. A slice
808 /// is one index from the first dimension of the tensor.
809 void copySlice(const Tensor *t, size_t slice) {
810 assert(!isDeviceResident() && "Tensor must reside on host to access data.");
811 auto dim = t->dims().slice(1);
812 (void)dim;
813 assert(dim == dims() && "Invalid slice size");
814 assert(getElementType() == t->getElementType() && "Invalid element type");
815
816 size_t bufferSize = type_.getSizeInBytes();
817 std::copy(&t->getData()[bufferSize * slice],
818 &t->getData()[bufferSize * (slice + 1)], getData());
819 }
820
821 /// Update the content of the tensor with a sequence of slices from the
822 /// tensor \p t. A slice is one index from the first dimension of the tensor.
823 /// The copying operation may overlap the end of the tensor \p t one or more
824 /// times. This means that the data in the input tensor may be duplicated.
825 void copyConsecutiveSlices(const Tensor *t, size_t startSliceIdx) {
826 assert(!isDeviceResident() && "Tensor must reside on host to access data.");
827 auto onceSliceDim = t->dims().slice(1);
828 (void)onceSliceDim;
829 assert(onceSliceDim == dims().slice(1) && "Invalid slice size");
830 assert(getElementType() == t->getElementType() && "Invalid element type");
831 assert(dims().size() > 1 && "Tensor must contain at least two dimensions");
832
833 size_t numSlicesInInput = t->dims()[0];
834 size_t numElementsInSlice = actualSize() / dims()[0];
835 size_t bufferSize = numElementsInSlice * type_.getElementSize();
836
837 // For each outer slice in the current tensor:
838 for (size_t n = 0, e = dims()[0]; n < e; n++) {
839 size_t startIdx = (startSliceIdx + n) % numSlicesInInput;
840 std::copy(&t->getData()[bufferSize * startIdx],
841 &t->getData()[bufferSize * (startIdx + 1)],
842 &getData()[bufferSize * n]);
843 }
844 }
845
846 /// Convenience method to copy the content of \p t
847 /// to this while both have different underlying types.
848 /// This copy will read each element of \p t as SrcElemType
849 /// and cast them to DestElemType in this.
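  /// An illustrative sketch (the shapes are assumptions):
  ///   Tensor f32(ElemKind::FloatTy, {8});
  ///   Tensor f16(ElemKind::Float16Ty, {8});
  ///   f16.copyWithCast<float16_t, float>(&f32); // per-element float->float16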
850 template <typename DestElemType, typename SrcElemType>
851 void copyWithCast(const Tensor *t) {
852 assert(!isDeviceResident() && "Tensor must reside on host to access data.");
853 static_assert(!std::is_same<DestElemType, SrcElemType>::value,
854 "Use copyRawFrom instead");
855 assert(this != t && "Copying to self");
856 assert(getElementType() != t->getElementType() &&
857 "Use copyRawFrom instead");
858 assert(actualSize() == t->actualSize() && "Different sizes");
859 const auto *src = t->getRawDataPointer<SrcElemType>();
860 auto *dst = getRawDataPointer<DestElemType>();
861 for (size_t idx = 0, end = actualSize(); idx != end; ++idx) {
862 dst[idx] = DestElemType(src[idx]);
863 }
864 }
865
866 /// Convert each element of this tensor to \p newTy. Calls into
867 /// \ref getCopyConvertedToType() to do the conversion, and hence supports
868 /// converting between whatever ElemKinds it supports.
869 void convertToType(ElemKind newTy);
870
871 /// \returns a copy of the Tensor but converted to \p newKind. Currently
872 /// supports conversion for:
873 /// - FloatTy to Float16Ty
874 /// - FloatTy to BFloat16Ty
875 /// - Float16Ty to FloatTy
876 /// - BFloat16Ty to FloatTy
877 /// - UInt8FusedQTy to UInt8FusedFP16QTy
878 Tensor getCopyConvertedToType(ElemKind newKind) const;
879
880 /// Transpose the tensor \p src into the empty tensor \p dest. Shuffle the
881 /// axis based on the list \p shuffle, where each element is the src index.
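  /// An illustrative sketch (the NHWC-to-NCHW interpretation is an assumption):
  ///   Tensor nhwc(ElemKind::FloatTy, {1, 224, 224, 3});
  ///   Tensor nchw;
  ///   nhwc.transpose(&nchw, {0, 3, 1, 2}); // nchw gets shape {1, 3, 224, 224}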
882 void transpose(Tensor *dest, llvm::ArrayRef<unsigned_t> shuffle) const {
883 assert(!isDeviceResident() && "Tensor must reside on host to access data.");
884 genericTranspose(this, dest, shuffle);
885 }
886
887 /// Create a new copy of the current tensor.
888 Tensor clone() const {
889 assert(!isDeviceResident() && "Tensor must reside on host to access data.");
890 Tensor slice;
891 slice.assign(this);
892 return slice;
893 }
894
895 /// Return the raw unsafe pointer to the tensor payload.
896 char *getUnsafePtr() const { return getData(); }
897
898 /// \returns true if tensor data is stored on a device
899 bool isDeviceResident() const {
900 return deviceResidency_ && deviceResidency_->isDeviceResident();
901 }
902
903 /// Update device residency info with new device manager and context
904 void moveToDevice(DeviceTensorTransferManager *deviceManager,
905 void *locationContext);
906
907 /// If device resident, copy Tensor contents back to host memory and release
908 /// associated device memory.
909 void ensureOnHost();
910
911 /// Updates contents of a device resident Tensor with the data from \p t
912 /// without copying its contents to host.
913 void copyRawToDevice(const Tensor *t);
914
915 /// \returns the pointer to the device manager where the tensor resides.
916 DeviceTensorTransferManager *getDeviceManager() const {
917 assert(deviceResidency_ != nullptr && "DeviceResidencyInfo must exist");
918 assert(deviceResidency_->isDeviceResident() &&
919 "Tensor must be device resident");
920 return deviceResidency_->getDeviceManager();
921 }
922
923 /// \returns the pointer to the location context of where the tensor resides.
924 void *getLocationContext() const {
925 assert(deviceResidency_ != nullptr && "DeviceResidencyInfo must exist");
926 assert(deviceResidency_->isDeviceResident() &&
927 "Tensor must be device resident");
928 return deviceResidency_->getLocationContext();
929 }
930
931 void resetDeviceInfo() {
932 if (deviceResidency_ && ownsDeviceResidency_) {
933 deviceResidency_->clear();
934 return;
935 }
936
937 deviceResidency_ = new DeviceResidencyInfo();
938 ownsDeviceResidency_ = true;
939 }
940
941 /// Clears DeviceResidencyInfo.
942 /// Note that this does not affect the associated DeviceManager or device
943 /// memory.
944 void clearDeviceResidency() {
945 assert(deviceResidency_ != nullptr && "DeviceResidencyInfo must exist");
946 assert(deviceResidency_->isDeviceResident() &&
947 "Tensor must be device resident");
948 deviceResidency_->clear();
949 }
950
951 /// \return a new handle that points and manages this tensor.
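  /// An illustrative sketch of typical handle usage (the shape and values are
  /// assumptions):
  ///   Tensor t(ElemKind::FloatTy, {2, 3});
  ///   auto h = t.getHandle<float>();
  ///   h.clear(0.0f);
  ///   h.at({1, 2}) = 42.0f;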
952 template <class ElemTy = float> Handle<ElemTy> getHandle() &;
953
954 template <class ElemTy = float> const Handle<ElemTy> getHandle() const &;
955
956 /// If Tensor is rvalue, it is an error to get its Handle.
957 template <class ElemTy = float> Handle<ElemTy> getHandle() && = delete;
958
959private:
960 /// \returns a pointer to the raw data, of type \p ElemTy.
961 template <class ElemTy> ElemTy *getRawDataPointer() {
962 assert(!isDeviceResident() && "Tensor must reside on host to access data.");
963 assert(type_.isType<ElemTy>() && "Asking for the wrong ptr type.");
964 return reinterpret_cast<ElemTy *>(data_);
965 }
966
967 /// \returns a const pointer to the raw data, of type \p ElemTy.
968 template <class ElemTy> const ElemTy *getRawDataPointer() const {
969 assert(!isDeviceResident() && "Tensor must reside on host to access data.");
970 assert(type_.isType<ElemTy>() && "Asking for the wrong ptr type.");
971 return reinterpret_cast<const ElemTy *>(data_);
972 }
973
974 template <class ElemTy>
975 bool isEqualImpl(const Tensor &other, float allowedError,
976 bool verbose) const {
977 assert(!isDeviceResident() && "Tensor must reside on host to access data.");
978 auto thisHandle = getHandle<ElemTy>();
979 auto otherHandle = other.getHandle<ElemTy>();
980 double maxFoundError = 0.0;
981 size_t numExceedingError = 0;
982 size_t currIndex = 0;
983 size_t maxFoundErrorIdx = 0;
984 double maxRE = 0.0; // relative error.
985 size_t maxREIdx = 0;
986 for (auto thisHandleIt = thisHandle.begin(),
987 otherHandleIt = otherHandle.begin();
988 thisHandleIt != thisHandle.end() && otherHandleIt != otherHandle.end();
989 ++thisHandleIt, ++otherHandleIt, ++currIndex) {
990 double delta = *thisHandleIt - *otherHandleIt;
991 delta = std::abs(delta);
992 // Since any comparison with NAN returns false, we use a negated condition
993 // so that this function correctly returns false when delta is NAN.
994 if (!(delta <= allowedError)) {
995 if (!verbose) {
996 return false;
997 }
998 numExceedingError += 1;
999 if (!(delta <= maxFoundError)) {
1000 maxFoundError = delta;
1001 maxFoundErrorIdx = currIndex;
1002 }
1003 double sum = *thisHandleIt + *otherHandleIt;
1004 double re = delta / std::abs(sum);
1005 if (!(re <= maxRE)) {
1006 maxRE = re;
1007 maxREIdx = currIndex;
1008 }
1009 }
1010 }
1011 auto thisHandleIt = thisHandle.begin();
1012 auto otherHandleIt = otherHandle.begin();
1013 if (numExceedingError != 0) {
1014 LOG(INFO) << "Tensors not equal: " << numExceedingError << " out of "
1015 << actualSize() << " elements exceeded allowed error threshold "
1016 << allowedError << ". Maximum error found was " << maxFoundError
1017 << " at index " << maxFoundErrorIdx << ": "
1018 << *(thisHandleIt.operator+(maxFoundErrorIdx)) << " vs. "
1019 << *(otherHandleIt.operator+(maxFoundErrorIdx));
1020 LOG(INFO) << "Maximum relative error found was: " << maxRE
1021 << " at index: " << maxREIdx << ": "
                << *(thisHandleIt.operator+(maxREIdx)) << " vs. "
1023 << *(otherHandleIt.operator+(maxREIdx));
1024 }
1025 return numExceedingError == 0;
1026 }
1027
1028 bool isBitwiseEqualImpl(const Tensor &other, bool verbose) const {
1029 assert(!isDeviceResident() && "Tensor must reside on host to access data.");
1030 auto const *myData = getUnsafePtr();
1031 auto const *otherData = other.getUnsafePtr();
1032 dim_t mismatchCount = 0;
1033
1034 if (verbose) {
1035 for (size_t i = 0, e = getSizeInBytes(); i < e; i++) {
1036 if (myData[i] != otherData[i]) {
1037 ++mismatchCount;
1038 }
1039 }
1040 if (mismatchCount != 0) {
1041 LOG(INFO) << "Tensors not bitwise equal: " << mismatchCount
1042 << " bytes out of " << getSizeInBytes() << " mismatched.";
1043 }
1044 } else {
1045 mismatchCount = memcmp(myData, otherData, getSizeInBytes());
1046 }
1047
1048 return mismatchCount == 0;
1049 }
1050};
1051
1052//===----------------------------------------------------------------------===//
1053// Tensor Handle
1054//===----------------------------------------------------------------------===//
1055
1056constexpr unsigned MAX_DUMP_ELEMS = 100;
1057
1058void dumpAsciiImpl(const Tensor *T, llvm::raw_ostream &os);
1059void dumpAsciiImpl(const Tensor *T);
1060
1061void dumpImpl(const Tensor *T, llvm::raw_ostream &os,
1062 unsigned maxNumElem = MAX_DUMP_ELEMS);
1063void dumpImpl(const Tensor *T, unsigned maxNumElem);
1064void dumpImpl(const Tensor *T);
1065
1066template <class ElemTy> class Handle;
1067
1068/// A class that provides ability to iterate over a Handle<ElemTy>. Since it's
1069/// common to have both mutating and const iterators, this class has template
1070/// parameter IsConst, which is true to create const_iterator and false
1071/// otherwise.
1072template <class ElemTy, bool IsConst>
1073class HandleIterator
1074 : public std::iterator<std::random_access_iterator_tag, ElemTy> {
1075 using HandleTy = typename std::conditional_t<IsConst, const Handle<ElemTy> *,
1076 Handle<ElemTy> *>;
1077 using ElemTyRef =
1078 typename std::conditional_t<IsConst, const ElemTy &, ElemTy &>;
1079
1080 /// At every given moment, the iterator maintains an index, which is used to
1081 /// access the Handle. When moving the iterator forward, the index is
1082 /// incremented. Only valid elements can be accessed.
1083 /// 0 <= idx_ <= handle_->size()
1084 HandleTy handle_;
1085 llvm::ArrayRef<dim_t> sizes_;
1086 dim_t idx_;
1087 /// Holds true if the underlying tensor has non-trivial alignment (i.e. not 1)
1088 bool isAligned_;
1089
1090 HandleIterator() = default;
1091
1092 HandleIterator(HandleTy handle) : handle_(handle) {
1093 sizes_ = handle->dims();
1094 isAligned_ = handle->size() < handle->actualSize();
1095 }
1096
1097 static HandleIterator begin(HandleTy handle) {
1098 auto res = HandleIterator(handle);
1099 res.idx_ = 0;
1100 return res;
1101 }
1102
1103 static HandleIterator end(HandleTy handle) {
1104 auto res = HandleIterator(handle);
1105 res.idx_ = res.handle_->getRealNumElements();
1106 return res;
1107 }
1108
1109 friend class Handle<ElemTy>;
1110
1111public:
1112 HandleIterator &operator++() {
1113 if (*this != handle_->end()) {
1114 idx_++;
1115 }
1116 return *this;
1117 }
1118 HandleIterator &operator--() {
1119 if (idx_) {
1120 idx_--;
1121 }
1122 return *this;
1123 }
1124 HandleIterator operator+(int n) const {
1125 auto res = HandleIterator(handle_);
1126 res.idx_ = std::max(static_cast<int>(idx_) + n, 0);
1127 res.idx_ = std::min(res.idx_, res.handle_->size());
1128 return res;
1129 }
1130 HandleIterator operator-(int n) const { return *this + (-n); }
1131 operator int() const { return idx_; }
1132
1133 ElemTyRef operator*() {
1134 if (!isAligned_) {
1135 return handle_->raw(idx_);
1136 }
1137 std::vector<dim_t> indices(sizes_.size(), 0);
1138 size_t rem = idx_;
1139 for (int i = static_cast<int>(sizes_.size()) - 1; i >= 0; i--) {
1140 indices[i] = rem % sizes_[i];
1141 rem /= sizes_[i];
1142 }
1143 return handle_->at(indices);
1144 }
1145
1146 bool operator==(const HandleIterator<ElemTy, IsConst> &other) const {
1147 return idx_ == other.idx_;
1148 }
1149
1150 bool operator!=(const HandleIterator<ElemTy, IsConst> &other) const {
1151 return !(*this == other);
1152 }
1153};
1154
1155/// Helper which \returns the flattened 1D offset given \p indices into a tensor
1156/// with \p strides.
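/// For example (a sketch; the strides below are those of a contiguous 2x3
/// tensor): with strides {3, 1} and indices {1, 2} the flattened offset is
/// 1 * 3 + 2 * 1 = 5.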
1157inline size_t getFlattenedOffset(llvm::ArrayRef<dim_t> strides,
1158 llvm::ArrayRef<dim_t> indices) {
1159 assert(indices.size() <= strides.size() && "Invalid number of indices");
1160 // The loop below can be rewritten using std::inner_product. Unfortunately
1161 // std::inner_product does not optimize very well and loops that use this
1162 // method don't get vectorized. Don't change this loop without benchmarking
1163 // the program on a few compilers.
1164 size_t index = 0;
1165 for (size_t i = 0, e = indices.size(); i < e; i++) {
1166 index += size_t(strides[i]) * size_t(indices[i]);
1167 }
1168
1169 return index;
1170}
1171
1172/// Helper function which \returns true if a slice with the shape \p sliceShape
1173/// referenced from a larger tensor with the shape \p tensorShape is contiguous
1174/// in memory (assuming the tensor it is referenced from is contiguous). This
1175/// happens when the slice dimensions:
1176/// - Start with singleton dimensions (dimensions equal to 1).
1177/// - Continue with a partially extracted dimension (one maximum).
1178/// - End with fully extracted dimensions.
1179bool isSliceContiguous(llvm::ArrayRef<dim_t> sliceShape,
1180 llvm::ArrayRef<dim_t> tensorShape);
1181
1182/// A class that provides indexed access to a tensor. This class has value
1183/// semantics and it's copied around. One of the reasons for making this class
1184/// value semantics is to allow efficient index calculation that the compiler
1185/// can optimize (because stack allocated structures don't alias).
1186template <class ElemTy> class Handle final {
1187 /// A pointer to the tensor that this handle wraps.
1188 Tensor *tensor_{nullptr};
1189
  /// Contains the multiplication of the sizes from current position to end.
  /// For example, for index (w,x,y,z): [x * y * z, y * z, z, 1]
1192 dim_t sizeIntegral_[max_tensor_dimensions] = {
1193 0,
1194 };
1195
1196 dim_t sizes_[max_tensor_dimensions] = {
1197 0,
1198 };
1199
1200 /// Saves the number of dimensions used in the tensor.
1201 uint8_t numDims_{0};
1202
1203 /// Remember end iterators. This is needed to speed up iterator increment,
1204 /// which has to check that iterator hasn't reached the end yet.
1205 HandleIterator<ElemTy, false> mutating_end_;
1206 HandleIterator<ElemTy, true> const_end_;
1207
1208 /// Create a new invalid handle. Notice that this method is private and may
1209 /// only be used by the static factory method below.
1210 Handle() = default;
1211
1212public:
1213 /// \returns an iterator to the first element of the tensor.
1214 HandleIterator<ElemTy, false> begin() {
1215 return HandleIterator<ElemTy, false>::begin(this);
1216 }
1217 HandleIterator<ElemTy, true> begin() const {
1218 return HandleIterator<ElemTy, true>::begin(this);
1219 }
1220
1221 /// \returns an iterator referring to the past-the-end element.
1222 HandleIterator<ElemTy, false> end() { return mutating_end_; }
1223 HandleIterator<ElemTy, true> end() const { return const_end_; }
1224
1225 /// Allocate a new invalid handle.
1226 static Handle createInvalidHandle() { return Handle(); }
1227
1228 /// \returns true if this Handle points to a valid tensor.
1229 bool isValid() const { return tensor_; }
1230
1231 /// Calculate the index for a specific element in the tensor. Notice that
1232 /// the list of indices may be incomplete. This method provides access to
1233 /// padding elements, meaning that it's possible to get an index pointing at
1234 /// data, added to meet alignment requirements.
1235 size_t getElementPtr(llvm::ArrayRef<dim_t> indices) const {
1236 return getFlattenedOffset(llvm::makeArrayRef(sizeIntegral_, numDims_),
1237 indices);
1238 }
1239
1240 /// \returns the value of the n'th dimension \p dim, for the index \p idx.
  /// 0 <= idx < size(), meaning that \p idx addresses a real data element,
  /// not padding.
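  /// For example (a sketch for a 2x3 tensor, i.e. sizes_ = {2, 3}): idx = 4
  /// corresponds to element (1, 1), so getDimForPtr(0, 4) == 1 and
  /// getDimForPtr(1, 4) == 1.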
1243 size_t getDimForPtr(size_t dim, size_t idx) const {
1244 assert(dim < numDims_ && "Invalid dimension");
1245 assert(idx < size() && "Invalid index");
1246 auto R = idx;
1247 for (size_t i = dim + 1; i < numDims_; i++) {
1248 R /= sizes_[i];
1249 }
1250 return R % sizes_[dim];
1251 }
1252
1253 /// \returns the type of the tensor.
1254 const Type &getType() const { return tensor_->getType(); }
1255
1256 /// \returns the element type of the tensor.
1257 ElemKind getElementType() const { return tensor_->getElementType(); }
1258
1259 /// Construct a Tensor handle.
1260 explicit Handle(Tensor *tensor) : tensor_(tensor) {
1261 auto sizes = tensor->dims();
1262 numDims_ = sizes.size();
1263
1264 /// We allow handles that wrap uninitialized tensors.
1265 if (numDims_) {
1266 // Copy the sizes of the tensor.
1267 memcpy(sizes_, tensor_->type_.sizes_,
1268 max_tensor_dimensions * sizeof(sizes_[0]));
1269 // Copy the strides of the tensor.
1270 memcpy(sizeIntegral_, tensor_->type_.strides_,
1271 max_tensor_dimensions * sizeof(tensor_->type_.strides_[0]));
1272 assert(numDims_ <= max_tensor_dimensions && "Too many dimensions.");
1273 }
1274
1275 mutating_end_ = HandleIterator<ElemTy, false>::end(this);
1276 const_end_ = HandleIterator<ElemTy, true>::end(this);
1277 }
1278
1279 llvm::ArrayRef<dim_t> dims() const {
1280 return llvm::ArrayRef<dim_t>(sizes_, numDims_);
1281 }
1282
1283 /// \returns the number of elements in the whole tensor.
1284 dim_t size() const { return tensor_->size(); }
1285
1286 /// \returns the actual number of elements in the tensor taking striding into
1287 /// account. Since size() does not take striding into account, size() is
1288 /// always <= actualSize().
1289 dim_t actualSize() const { return tensor_->actualSize(); }
1290
1291 /// \returns the unpadded size of the underlying \ref tensor_.
1292 size_t getUnpaddedSizeInBytes() const {
1293 return tensor_->getUnpaddedSizeInBytes();
1294 }
1295
1296 /// \returns the number of unpadded elements in the underlying \ref tensor_.
1297 size_t getRealNumElements() const { return tensor_->getRealNumElements(); }
1298
1299 bool isInBounds(llvm::ArrayRef<dim_t> indices) const {
1300 return tensor_->isInBounds(indices);
1301 }
1302
1303 void clear(ElemTy value = 0) { std::fill(begin(), end(), value); }
1304
1305 /// Returns reference to a meaningful data element. This method does not
1306 /// address padding elements.
1307 ElemTy &at(llvm::ArrayRef<dim_t> indices) {
1308 size_t index = getElementPtr(indices);
1309 auto *data = tensor_->getRawDataPointer<ElemTy>();
1310 return data[index];
1311 }
1312
1313 const ElemTy &at(llvm::ArrayRef<dim_t> indices) const {
1314 size_t index = getElementPtr(indices);
1315 auto *data = tensor_->getRawDataPointer<ElemTy>();
1316 return data[index];
1317 }
1318
1319 /// \returns the element at offset \p idx without any size calculations.
1320 /// The returned element can be a pad element.
1321 ElemTy &raw(size_t index) {
1322 auto *data = tensor_->getRawDataPointer<ElemTy>();
1323 return data[index];
1324 }
1325
1326 /// \returns the element at offset \p idx without any size calculations.
1327 /// The returned element can be a pad element.
1328 const ElemTy &raw(size_t index) const {
1329 auto *data = tensor_->getRawDataPointer<ElemTy>();
1330 return data[index];
1331 }
1332
1333 /// Extract a smaller dimension tensor from a specific slice (that has to be
1334 /// the first dimension).
1335 Tensor extractSlice(size_t idx) const {
1336 auto sizes = tensor_->dims();
1337 assert(sizes.size() > 1 && "Tensor must have at least two dimensions");
1338 assert(idx < sizes[0] && "Invalid first index");
1339
1340 Tensor slice{Type::newShape(tensor_->getType(), sizes.slice(1),
1341 tensor_->type_.strides().slice(1))};
1342
1343 // Extract the whole slice.
1344 size_t startIdx = sizeIntegral_[0] * idx;
1345 ElemTy *base = tensor_->getRawDataPointer<ElemTy>() + startIdx;
1346 auto *dest = slice.getRawDataPointer<ElemTy>();
1347 std::copy(base, base + sizeIntegral_[0], dest);
1348
1349 return slice;
1350 }
1351
1352 /// Insert a smaller dimension tensor into a larger tensor at a specific
1353 /// first-dimension index.
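  /// An illustrative sketch using \ref extractSlice() above together with this
  /// method (the shapes are assumptions):
  ///   Tensor batch(ElemKind::FloatTy, {10, 28, 28});
  ///   auto h = batch.getHandle<float>();
  ///   Tensor third = h.extractSlice(3); // copy of slice 3, shape {28, 28}
  ///   h.insertSlice(third, 0);          // overwrite slice 0 with that copy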
1354 void insertSlice(const Tensor &slice, size_t idx) {
1355 auto dims = tensor_->dims();
1356 (void)dims;
1357 assert(getElementType() == slice.getElementType());
1358 assert(dims.size() > 1 && "Tensor must have at least two dimensions");
1359 assert(idx < dims[0] && "Invalid first index");
1360
1361 auto sliceSize = sizeIntegral_[0];
1362 size_t startIdx = sliceSize * idx;
1363 ElemTy *base = &raw(startIdx);
    const ElemTy *slicePtr = slice.getRawDataPointer<ElemTy>();
1365 std::copy(slicePtr, slicePtr + sliceSize, base);
1366 }
1367
1368 /// Create a new copy of the current tensor.
1369 Tensor clone() const { return tensor_->clone(); }
1370
1371 /// Update the content of the tensor from a literal list:
1372 void operator=(const std::initializer_list<ElemTy> &vec) {
1373 assert(actualSize() == vec.size() && "Invalid input size.");
1374 size_t i = 0;
1375 for (auto &e : vec) {
1376 raw(i++) = e;
1377 }
1378 }
1379
1380 void operator=(llvm::ArrayRef<ElemTy> array) {
1381 assert(actualSize() == array.size() && "Invalid input size.");
1382 std::copy(array.begin(), array.end(), &raw(0));
1383 }
1384
1385 void dumpAscii(llvm::raw_ostream &os) const { dumpAsciiImpl(tensor_, os); }
1386 void dumpAscii() const { dumpAsciiImpl(tensor_); }
1387
1388 /// \returns the raw indices of a min and max values from the tensor.
1389 /// In case of multiple min or max, the smallest index is returned.
1390 std::pair<dim_t, dim_t> minMaxArg() const {
1391 ElemTy max = raw(0);
1392 ElemTy min = raw(0);
1393
1394 size_t maxIdx = 0;
1395 size_t minIdx = 0;
1396
1397 for (size_t i = 1, e = actualSize(); i < e; i++) {
1398 ElemTy val = raw(i);
1399 if (val > max) {
1400 max = val;
1401 maxIdx = i;
1402 } else if (val < min) {
1403 min = val;
1404 minIdx = i;
1405 }
1406 }
1407
1408 return std::make_pair(minIdx, maxIdx);
1409 }
1410
1411 /// \returns true if tensor contains only elements equal to zero.
1412 /// \p allowedError represents the delta from zero that is allowed before
1413 /// returning false.
1414 bool isZero(float allowedError = 0.0) const {
1415#define RETURN_WHETHER_FUSED_IS_ZERO(DATA_TYPE) \
1416 assert(dims().size() == 2 && "Fused tensor must be 2-dimensional."); \
1417 assert(dims()[1] > 2 * sizeof(DATA_TYPE) && \
1418 "Fused tensor must have space for scale/offset."); \
1419 const dim_t dataWidth = dims()[1]; \
1420 const dim_t alignedLength = tensor_->getType().strides()[0]; \
1421 auto *data = reinterpret_cast<uint8_t *>(tensor_->getUnsafePtr()); \
1422 for (dim_t i = 0, e = dims()[0]; i < e; i++) { \
1423 uint8_t *scaleOffsetPtr = \
1424 data + i * alignedLength + dataWidth - 2 * sizeof(DATA_TYPE); \
1425 DATA_TYPE scale, offset; \
1426 memcpy(&scale, scaleOffsetPtr, sizeof(DATA_TYPE)); \
1427 memcpy(&offset, scaleOffsetPtr + sizeof(DATA_TYPE), sizeof(DATA_TYPE)); \
1428 for (dim_t j = 0, e = dataWidth - 2 * sizeof(DATA_TYPE); j < e; j++) { \
1429 float currVal = (at({i, j}) * (float)scale) + (float)offset; \
1430 if (std::abs(currVal) > allowedError) { \
1431 return false; \
1432 } \
1433 } \
1434 } \
1435 return true;
1436
1437 if (getElementType() == ElemKind::UInt8FusedQTy) {
1438 RETURN_WHETHER_FUSED_IS_ZERO(float);
1439 }
1440 if (getElementType() == ElemKind::UInt8FusedFP16QTy) {
1441 RETURN_WHETHER_FUSED_IS_ZERO(float16_t);
1442 }
1443#undef RETURN_WHETHER_FUSED_IS_ZERO
1444
1445 int32_t trueZero = getType().isQuantizedType() ? getType().getOffset() : 0;
1446 return std::all_of(begin(), end(), [=](ElemTy e) { return e == trueZero; });
1447 }
1448
1449 void dump(llvm::raw_ostream &os, unsigned maxNumElem = MAX_DUMP_ELEMS) const {
1450 dumpImpl(tensor_, os, maxNumElem);
1451 }
1452 void dump(unsigned maxNumElem) const { dumpImpl(tensor_, maxNumElem); }
1453 void dump() const { dumpImpl(tensor_, MAX_DUMP_ELEMS); }
1454
  /// Fill the tensor with random data that is close to zero using the
  /// Xavier method, based on the paper [Bengio and Glorot 2010].
  /// This type of initialization facilitates better training performance.
  /// The parameter \p filterSize is the number of "input" neurons in the
  /// tensor (or the relevant slice). For example, consider the case of MatMul:
  /// NxM (\p input) * MxK (\p weights) == NxK (\p result)
  /// The correct \p filterSize for the weights tensor is M, so that the norm
  /// of each row of \p input equals the norm of the corresponding row of
  /// \p result.
1463 void initXavier(size_t filterSize, PseudoRNG &PRNG) {
1464 assert(filterSize > 0 && "invalid filter size");
1465 assert(getType().isFPType() &&
1466 "Only support floating point Xavier initialization.");
1467 double scale = std::sqrt(3.0 / double(filterSize));
1468 std::uniform_real_distribution<> dist(-scale, scale);
1469 for (auto &e : *this) {
1470 e = dist(PRNG);
1471 }
1472 }
1473
1474 /// Fill the tensor with uniformly distributed values in the range
1475 /// [low .. high).
1476 template <typename T = ElemTy>
1477 typename std::enable_if<std::is_floating_point<T>::value>::type
1478 randomize(float low, float high, PseudoRNG &PRNG) {
1479 assert(low <= high && "invalid range");
1480 std::uniform_real_distribution<ElemTy> dist(low, high);
1481 for (auto &elem : *this) {
1482 elem = dist(PRNG);
1483 }
1484 }
1485
  /// Fill the tensor with uniformly distributed values in the range
  /// [low .. high]. For fused rowwise-quantized tensors, the per-row
  /// scales/offsets are left unchanged.
1488 template <typename T = ElemTy>
1489 typename std::enable_if<std::is_integral<T>::value>::type
1490 randomize(int low, int high, PseudoRNG &PRNG) {
1491 assert(low <= high && "invalid range");
1492 assert(low >= std::numeric_limits<ElemTy>::lowest() &&
1493 high <= std::numeric_limits<ElemTy>::max() &&
1494 "Cannot initialize outside range of representable values.");
1495 std::uniform_int_distribution<long long> dist(low, high);
1496 switch (getElementType()) {
1497 default: {
1498 for (auto &elem : *this) {
1499 elem = dist(PRNG);
1500 }
1501 return;
1502 }
1503
1504#define FUSED_CASE(ELEM_KIND, DATA_TYPE) \
1505 case ElemKind::ELEM_KIND: { \
1506 assert(dims().size() == 2 && "Fused tensor must be 2-dimensional."); \
1507 assert(dims()[1] > 2 * sizeof(DATA_TYPE) && \
1508 "Fused tensor must have space for scale/offset."); \
1509 for (dim_t i = 0, e = dims()[0]; i < e; i++) { \
1510 for (dim_t j = 0, f = dims()[1] - 2 * sizeof(DATA_TYPE); j < f; j++) { \
1511 at({i, j}) = dist(PRNG); \
1512 } \
1513 } \
1514 return; \
1515 }
1516 FUSED_CASE(UInt8FusedQTy, float);
1517 FUSED_CASE(UInt8FusedFP16QTy, float16_t);
1518#undef FUSED_CASE
1519 }
1520 }
1521
  /// Fill the tensor with uniformly distributed values in the range
  /// [low .. high). This overload is used for element types that are neither
  /// standard floating-point nor integral (e.g. float16_t).
1524 template <typename T = ElemTy>
1525 typename std::enable_if<!std::is_floating_point<T>::value &&
1526 !std::is_integral<T>::value>::type
1527 randomize(float low, float high, PseudoRNG &PRNG) {
1528 assert(low <= high && "invalid range");
1529 std::uniform_real_distribution<float> dist(low, high);
1530 for (auto &elem : *this) {
1531 elem = dist(PRNG);
1532 }
1533 }
1534
1535 /// \returns the mean and variance of the tensor.
1536 std::pair<double, double> calculateMeanVariance() const {
1537 size_t n = actualSize();
1538 assert(n > 1 && "Input must have at least 2 elements.");
1539
1540 // Calculate mean.
1541 double mean = 0;
1542 for (size_t i = 0; i < n; i++) {
      mean += raw(i);
1544 }
1545 mean /= n;
1546
1547 // Calculate variance.
1548 double var = 0;
1549 for (size_t i = 0; i < n; i++) {
      double t = raw(i) - mean;
1551 var += t * t;
1552 }
1553 var /= (n - 1);
1554
1555 return {mean, var};
1556 }
1557
  /// Insert the tensor \p slice at location \p offset, \p count times along
  /// the \p axis. This operation is equivalent to scanning the source tensor
  /// and saving the value stored at coordinate {d_0, d_1, ... d_n} into the
  /// new tensor at {d_0 + O_0, d_1 + O_1, ... d_n + O_n}, where O is the
  /// offset vector, assuming \p count = 1. For \p count > 1, the same tensor
  /// is copied \p count times along the given \p axis. The tensors must have
  /// compatible dimensions.
1565 void insertTensors(Handle<ElemTy> &slice, llvm::ArrayRef<dim_t> offset,
1566 size_t count = 1, size_t axis = 0) {
1567 auto sliceCoor = slice.dims().vec();
1568 auto fusedCoor = dims().vec();
1569 insertTensorsImpl(sliceCoor, fusedCoor, slice, true, offset, count, axis,
1570 0);
1571 }
1572
  /// Extract the tensor \p slice at location \p offset. This operation is
  /// equivalent to scanning the destination tensor and copying into the cell
  /// at coordinate {d_0, d_1, ... d_n} the value stored in this tensor at
  /// {d_0 + O_0, d_1 + O_1, ... d_n + O_n}, where O is the offset vector. The
  /// tensors must have compatible dimensions.
1578 void extractTensors(Handle<ElemTy> &slice, llvm::ArrayRef<dim_t> offset) {
1579 auto sliceCoor = slice.dims().vec();
1580 auto fusedCoor = dims().vec();
1581 insertTensorsImpl(sliceCoor, fusedCoor, slice, false, offset, /* count */ 1,
1582 /* axis */ 0, 0);
1583 }
1584
  /// \returns a pair of the scale and offset from row \p rowIdx of a
  /// FusedRowwiseQuantized Tensor.
1587 template <typename T>
1588 std::pair<T, T> getFusedScaleOffsetFromRow(dim_t rowIdx) {
1589 ElemTy *rowScaleOffsetPtr = getFusedRowScaleOffsetPtr<T>(rowIdx);
1590 T scale;
1591 T offset;
1592 memcpy(&scale, rowScaleOffsetPtr, sizeof(T));
1593 memcpy(&offset, rowScaleOffsetPtr + sizeof(T), sizeof(T));
1594 return std::make_pair(scale, offset);
1595 }
1596
  /// Sets the \p scale and \p offset of row \p rowIdx of a
  /// FusedRowwiseQuantized Tensor.
1599 template <typename T>
1600 void setFusedScaleOffsetInRow(dim_t rowIdx, T scale, T offset) {
1601 ElemTy *rowScaleOffsetPtr = getFusedRowScaleOffsetPtr<T>(rowIdx);
    memcpy(rowScaleOffsetPtr, &scale, sizeof(T));
    memcpy(rowScaleOffsetPtr + sizeof(T), &offset, sizeof(T));
1606 }
1607
1608private:
  /// Concatenates or extracts a slice from a tensor.
  /// \p sliceCoor and \p fusedCoor are temporary storage that the function
  /// uses to construct the coordinates for accessing the tensors. They must be
  /// initialized to the size of the shape of the tensor. \p slice is the
  /// tensor to insert into, or extract from, this handle's tensor (the fused
  /// tensor). \p offset is the per-dimension offset of the slice to add or
  /// extract. \p d is the recursion depth parameter that tracks the current
  /// dimension. If \p isInsert is set, data is copied from \p slice to the
  /// fused tensor; otherwise data is copied from the fused tensor to
  /// \p slice. \p count and \p axis are used in conjunction for inserting the
  /// same tensor \p count times along the \p axis.
1621 void insertTensorsImpl(llvm::MutableArrayRef<dim_t> sliceCoor,
1622 llvm::MutableArrayRef<dim_t> fusedCoor,
1623 Handle<ElemTy> &slice, bool isInsert,
1624 llvm::ArrayRef<dim_t> offset, size_t count,
1625 size_t axis, unsigned d) {
1626 bool isDone = (d == slice.dims().size());
1627
1628 if (isDone) {
1629 if (isInsert) {
1630 at(fusedCoor) = slice.at(sliceCoor);
1631 } else {
1632 slice.at(sliceCoor) = at(fusedCoor);
1633 }
1634 return;
1635 }
1636
1637 // Only need to iterate over count if the current dimension d is equal to
1638 // the axis we're inserting over.
1639 const size_t countIters = (axis == d) ? count : 1;
1640 for (size_t c = 0; c < countIters; c++) {
1641 for (size_t i = 0, e = slice.dims()[d]; i < e; i++) {
1642 // Construct the coordinates for the slice and for the joint shape.
1643 // Add the 'offset' to the dimension that we concat the shapes on.
1644 sliceCoor[d] = i;
1645 // If this is the correct axis to insert multiple times then calculate
1646 // the additional offset to use.
1647 const size_t countAxisOffset = (axis == d) ? c * slice.dims()[d] : 0;
1648 fusedCoor[d] = i + offset[d] + countAxisOffset;
1649 insertTensorsImpl(sliceCoor, fusedCoor, slice, isInsert, offset, count,
1650 axis, d + 1);
1651 }
1652 }
1653 }
1654
  /// Given a Fused tensor, \returns a pointer to the scale and offset of type
  /// \p T for row \p rowIdx.
1657 template <typename T> ElemTy *getFusedRowScaleOffsetPtr(dim_t rowIdx) {
1658 switch (getElementType()) {
1659 case ElemKind::UInt8FusedQTy:
1660 case ElemKind::UInt4FusedQTy: {
1661 constexpr auto isFloat = std::is_same<float, T>::value;
1662 DCHECK(isFloat) << "Expected float scale/offset";
1663 break;
1664 }
1665 case ElemKind::UInt4FusedFP16QTy:
1666 case ElemKind::UInt8FusedFP16QTy: {
1667 constexpr auto isFloat16 = std::is_same<float16_t, T>::value;
1668 DCHECK(isFloat16) << "Expected float16_t scale/offset";
1669 break;
1670 }
1671 default:
1672 llvm_unreachable("Must be used with Tensor of supported Fused ElemKind");
1673 }
1674
1675 static_assert(std::is_same<uint8_t, ElemTy>::value,
1676 "Handle of current Fused tensors expected to be uint8_t.");
1677 const dim_t colIdx = dims()[1] - 2 * sizeof(T);
1678 return &at({rowIdx, colIdx});
1679 }
1680};
1681
/// \returns a typed Handle that provides indexed access to the tensor's data.
/// The tensor must be host-resident and \p ElemTy must match the tensor's
/// element type.
template <class ElemTy> Handle<ElemTy> Tensor::getHandle() & {
1683 assert(!isDeviceResident() && "Tensor must reside on host to access data.");
1684 assert(type_.isType<ElemTy>() && "Getting a handle to the wrong type.");
1685 return Handle<ElemTy>(this);
1686}
1687
/// \returns a constant typed Handle to the tensor's data. See the non-const
/// overload above for the preconditions.
template <class ElemTy> const Handle<ElemTy> Tensor::getHandle() const & {
1689 assert(!isDeviceResident() && "Tensor must reside on host to access data.");
1690 assert(type_.isType<ElemTy>() && "Getting a handle to the wrong type.");
1691 return Handle<ElemTy>(const_cast<Tensor *>(this));
1692}
1693
/// Prints the textual representation of the Tensor \p t to \p os.
llvm::raw_ostream &operator<<(llvm::raw_ostream &os, const Tensor &t);

/// Prints the textual representation of the Tensor pointed to by \p t to
/// \p os.
llvm::raw_ostream &operator<<(llvm::raw_ostream &os, const Tensor *t);
1697} // namespace glow
1698
1699#endif // GLOW_BASE_TENSOR_H
1700