1 | /** |
2 | * Copyright (c) Glow Contributors. See CONTRIBUTORS file. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
16 | #ifndef GLOW_BASE_TENSOR_H |
17 | #define GLOW_BASE_TENSOR_H |
18 | |
19 | #include <algorithm> |
20 | #include <cassert> |
21 | #include <vector> |
22 | |
23 | #include "glow/Base/DeviceTensorTransferManager.h" |
24 | #include "glow/Base/Type.h" |
25 | #include "glow/Support/Compiler.h" |
26 | #include "glow/Support/Memory.h" |
27 | #include "glow/Support/Random.h" |
28 | |
29 | #include "llvm/ADT/ArrayRef.h" |
30 | #include "llvm/Support/raw_ostream.h" |
31 | |
32 | namespace glow { |
33 | |
34 | //===----------------------------------------------------------------------===// |
35 | // Tensor |
36 | //===----------------------------------------------------------------------===// |
37 | |
38 | template <class ElemTy> class Handle; |
39 | |
40 | class Tensor; |
41 | class TensorPool; |
42 | |
43 | void genericTranspose(const Tensor *src, Tensor *dest, |
44 | llvm::ArrayRef<unsigned_t> shuffle); |
45 | |
/// Helper function that \returns a ShapeVector of the dimensions in \p
/// currDims expanded with dimensions equal to 1 until the maximum tensor
/// dimension is reached. The total number of elements represented by the
/// input dims is the same as for the returned dims. For example, input
/// {2,1,4} would result in {2,1,4,1,1,1}.
50 | ShapeVector expandDimsToMax(llvm::ArrayRef<dim_t> currDims); |
51 | |
/// Helper function that \returns a ShapeVector obtained from \p dims by
/// reducing (setting to 1) the dimensions given by \p axes. If the flag
/// \p keepDims is also used then the reduced dimensions are kept, otherwise
/// they are pruned. For example, given the dimensions [2,3,4] and axes [0,2],
/// the returned shape will be [1,3,1] for keepDims true and [3] for keepDims
/// false.
57 | ShapeVector reduceDims(llvm::ArrayRef<dim_t> dims, |
58 | llvm::ArrayRef<unsigned_t> axes, bool keepDims); |
59 | |
/// Helper function that \returns the transpose shuffle that would undo the
/// given \p shuffle so that if two transposes were composed with the given
/// shuffle and the result of this function, it would result in the identity
/// shuffle.
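/// For example, the inverse of the shuffle {2,0,1} is {1,2,0}: applying
/// {2,0,1} and then {1,2,0} restores the original axis order.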
64 | std::vector<unsigned_t> getInverseTranspose(llvm::ArrayRef<unsigned_t> shuffle); |
65 | |
66 | namespace runtime { |
67 | class DeviceManager; |
68 | } |
69 | |
/// Holds information regarding whether this Tensor exists in a device-specific
/// form, either resident on or bound to a particular device, and which device
/// holds it.
72 | class DeviceResidencyInfo final { |
73 | enum class TensorResidency { |
74 | Host, |
75 | Device, |
76 | }; |
77 | |
/// A pointer to the device manager of the device on which the tensor
/// resides.
80 | DeviceTensorTransferManager *deviceManager_{nullptr}; |
81 | /// The residency status of the tensor. |
82 | TensorResidency tensorResidency_{TensorResidency::Host}; |
/// A pointer to a context structure, containing the required info to access
/// tensor data and perform transfers.
85 | void *locationContext_{nullptr}; |
86 | |
87 | public: |
88 | DeviceResidencyInfo() |
89 | : deviceManager_(nullptr), tensorResidency_(TensorResidency::Host), |
90 | locationContext_(nullptr) {} |
91 | |
92 | /// Move ctor. |
93 | DeviceResidencyInfo(DeviceResidencyInfo &&other) = delete; |
94 | |
95 | /// Move assignment operator. |
96 | DeviceResidencyInfo &operator=(DeviceResidencyInfo &&other) = delete; |
97 | |
98 | ~DeviceResidencyInfo() { |
// If a tensor is device resident, let its device manager free the device
// buffer.
101 | if (isDeviceResident()) { |
102 | deviceManager_->releaseDeviceTensor(locationContext_); |
103 | } |
104 | } |
105 | |
106 | /// Removes all device specific state. |
107 | void clear() { |
108 | deviceManager_ = nullptr; |
109 | locationContext_ = nullptr; |
110 | tensorResidency_ = TensorResidency::Host; |
111 | } |
112 | |
113 | /// \returns true if this Tensor is resident or specific for a device. |
114 | bool isDeviceResident() const { |
115 | assert((tensorResidency_ == TensorResidency::Host || deviceManager_) && |
116 | "Device resident tensor must have an assigned device manager." ); |
117 | return tensorResidency_ == TensorResidency::Device; |
118 | } |
119 | |
120 | /// \returns the DeviceManager this tensor is resident on, if any. |
121 | DeviceTensorTransferManager *getDeviceManager() const { |
122 | return deviceManager_; |
123 | } |
124 | |
125 | /// \returns the device specific location context for a resident Tensor. |
126 | void *getLocationContext() const { return locationContext_; } |
127 | |
128 | friend class Tensor; |
129 | }; |
130 | |
131 | /// A class that represents a contiguous n-dimensional array (a tensor). |
132 | class Tensor final { |
133 | public: |
/// Specifies the kind of initialization for the tensor.
135 | enum class InitKind { |
136 | Zero, // The tensor is initialized to zero. |
137 | Broadcast, // Broadcast a single value to all elements. |
138 | Xavier, // Init the tensor with random values using the Xavier method. |
139 | }; |
140 | |
141 | private: |
142 | /// A pointer to the tensor data. |
143 | char *data_{nullptr}; |
144 | |
145 | /// The type of the tensor. |
146 | Type type_; |
147 | |
148 | /// If the tensor is unowned. |
149 | bool isUnowned_{false}; |
150 | |
151 | /// The TensorPool that is managing this Tensor (if any). |
152 | TensorPool *tensorPool_{nullptr}; |
153 | |
/// The device residency info associated with the tensor.
155 | DeviceResidencyInfo *deviceResidency_{nullptr}; |
156 | |
157 | /// If this tensor owns the DeviceResidencyInfo. |
158 | bool ownsDeviceResidency_{false}; |
159 | |
/// Size in bytes of the unpadded memory region. This is useful for
/// communicating the actual size of the data, which allows copying only the
/// inputs (and not the padding) to the device.
163 | size_t unpaddedSize_{0}; |
164 | |
165 | template <class ElemTy> friend class Handle; |
166 | |
167 | /// \returns a pointer to the tensor data buffer. |
168 | char *getData() const { return data_; } |
169 | |
170 | public: |
171 | /// \returns true if it is an unowned tensor. |
172 | bool isUnowned() const { return isUnowned_; } |
173 | |
174 | /// \returns the number of allocated bytes pointed to by \ref data_. |
175 | size_t getUnpaddedSizeInBytes() const { return unpaddedSize_; } |
176 | |
/// \returns the number of real elements in a Tensor, excluding any extra
/// padding, and excluding declared elements that do not actually exist for a
/// partial tensor. Note that Tensors cannot be both custom aligned and
/// partial.
181 | size_t getRealNumElements() const { |
// If custom aligned then size() already gives the number of real elements.
183 | if (size() < actualSize()) { |
184 | return size(); |
185 | } |
// Else assume no custom alignment, so return number of elements based on
// unpaddedSize_, i.e. accounts for partial Tensors.
188 | return unpaddedSize_ / type_.getElementSize(); |
189 | } |
190 | |
191 | /// \returns the type of the tensor. |
192 | const Type &getType() const { return type_; } |
193 | |
194 | /// Set the type of the Tensor to \p t. |
195 | void setType(const TypeRef t) { |
196 | assert(type_.dims() == t->dims() && "New type must retain the same shape." ); |
197 | assert(((type_.getElementType() == t->getElementType() && |
198 | type_.size() == t->size()) || |
199 | type_.getSizeInBytes() == t->getSizeInBytes()) && |
200 | "New type must retain the same size in bytes." ); |
201 | type_ = *t; |
202 | } |
203 | |
/// \returns the element type of the tensor.
205 | ElemKind getElementType() const { return type_.getElementType(); } |
206 | |
207 | /// \returns True if the coordinate is within the array. |
208 | bool isInBounds(llvm::ArrayRef<dim_t> indices) const { |
209 | assert(type_.numSizes_ == indices.size() && "Invalid number of indices" ); |
210 | for (size_t i = 0u, e = indices.size(); i < e; i++) { |
211 | if (indices[i] >= type_.sizes_[i]) { |
212 | return false; |
213 | } |
214 | } |
215 | return true; |
216 | } |
217 | |
/// Set the content of the tensor to zero. If \p resetFusedScalesOffsets, then
/// fused scales/offsets will be set to 1.0/0.0 as well.
220 | void zero(bool resetFusedScalesOffsets = false) { |
221 | assert(!isDeviceResident() && "Tensor must reside on host to access data." ); |
222 | size_t size = actualSize(); |
223 | // Quantized tensors should go to their offset. |
224 | switch (type_.getElementType()) { |
225 | case ElemKind::Int8QTy: { |
226 | auto *data = reinterpret_cast<int8_t *>(getData()); |
227 | std::fill(&data[0], &data[0] + size, (int8_t)type_.getOffset()); |
228 | break; |
229 | } |
230 | case ElemKind::UInt8QTy: { |
231 | auto *data = reinterpret_cast<uint8_t *>(getData()); |
232 | std::fill(&data[0], &data[0] + size, (uint8_t)type_.getOffset()); |
233 | break; |
234 | } |
235 | case ElemKind::Int16QTy: { |
236 | auto *data = reinterpret_cast<int16_t *>(getData()); |
237 | std::fill(&data[0], &data[0] + size, (int16_t)type_.getOffset()); |
238 | break; |
239 | } |
240 | case ElemKind::Int32QTy: { |
241 | auto *data = reinterpret_cast<int32_t *>(getData()); |
242 | std::fill(&data[0], &data[0] + size, (int32_t)type_.getOffset()); |
243 | break; |
244 | } |
245 | #define FUSED_CASE(ELEM_KIND, DATA_TYPE) \ |
246 | case ElemKind::ELEM_KIND: { \ |
247 | assert(dims().size() == 2 && "Fused tensor must be 2-dimensional."); \ |
assert(dims()[1] > 2 * sizeof(DATA_TYPE) && \
249 | "Fused tensor must have space for scale and offset."); \ |
250 | const size_t dataWidth = dims()[1]; \ |
251 | const size_t alignedLength = type_.strides()[0]; \ |
252 | auto *data = reinterpret_cast<uint8_t *>(getData()); \ |
253 | for (size_t i = 0, e = dims()[0]; i < e; i++) { \ |
254 | uint8_t *scaleOffsetPtr = \ |
255 | data + i * alignedLength + dataWidth - 2 * sizeof(DATA_TYPE); \ |
256 | DATA_TYPE scale, offset; \ |
257 | if (resetFusedScalesOffsets) { \ |
258 | /* Use these as defaults, and copy them into each row. */ \ |
259 | scale = 1.0; \ |
260 | offset = 0.0; \ |
261 | memcpy(scaleOffsetPtr, &scale, sizeof(DATA_TYPE)); \ |
262 | memcpy(scaleOffsetPtr + sizeof(DATA_TYPE), &offset, \ |
263 | sizeof(DATA_TYPE)); \ |
264 | } else { \ |
265 | memcpy(&scale, scaleOffsetPtr, sizeof(DATA_TYPE)); \ |
266 | memcpy(&offset, scaleOffsetPtr + sizeof(DATA_TYPE), \ |
267 | sizeof(DATA_TYPE)); \ |
268 | } \ |
269 | DCHECK_NE(static_cast<float>(scale), 0.0) \ |
270 | << "Disallow scale = 0.0 for Fused ElemKinds; causes div by zero."; \ |
271 | float zero = nearbyintf(-1 * static_cast<float>(offset / scale)); \ |
272 | std::fill(data + i * alignedLength, scaleOffsetPtr, \ |
273 | static_cast<uint8_t>(zero)); \ |
274 | } \ |
275 | break; \ |
276 | } |
277 | FUSED_CASE(UInt8FusedQTy, float); |
278 | FUSED_CASE(UInt8FusedFP16QTy, float16_t); |
279 | #undef FUSED_CASE |
280 | |
281 | default: |
282 | // Non-quantized tensors are set to 0. |
283 | std::fill(&getData()[0], &getData()[0] + size * type_.getElementSize(), |
284 | 0); |
285 | break; |
286 | } |
287 | } |
288 | |
289 | /// \returns the shape of the tensor. |
290 | llvm::ArrayRef<dim_t> dims() const { return type_.dims(); } |
291 | |
/// \returns the number of real meaningful elements in the tensor. Does not
/// take strides into account.
294 | dim_t size() const { return type_.size(); } |
295 | |
/// \returns the actual number of elements in the tensor taking striding into
/// account. Since size() does not take striding into account, size() is
/// always <= actualSize().
299 | dim_t actualSize() const { return type_.actualSize(); } |
300 | |
/// \returns the number of bytes required to store the tensor based on its
/// Type. Note that this includes the size required for padding.
303 | uint64_t getSizeInBytes() const { return type_.getSizeInBytes(); } |
304 | |
/// \returns the TensorPool managing this object, or nullptr if it is
/// unmanaged.
307 | TensorPool *getOwningPool() { return tensorPool_; } |
308 | |
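/// \returns a Tensor of element kind \p elemKind with shape \p dims, whose
/// contents are initialized from \p data. A minimal usage sketch (the
/// concrete values are only illustrative):
///   auto t = Tensor::fromData<float>(ElemKind::FloatTy, {2, 2},
///                                    {1.0f, 2.0f, 3.0f, 4.0f});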
309 | template <typename DataType> |
310 | static Tensor fromData(ElemKind elemKind, llvm::ArrayRef<dim_t> dims, |
311 | const std::initializer_list<DataType> &data) { |
312 | Tensor tensor(elemKind, dims); |
313 | tensor.getHandle<DataType>() = data; |
314 | return tensor; |
315 | } |
316 | |
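/// \returns a quantized Tensor of element kind \p elemKind with shape \p dims,
/// quantization parameters \p scale and \p offset, and contents initialized
/// from \p data. A minimal usage sketch (illustrative values only):
///   auto q = Tensor::fromData<int8_t>(ElemKind::Int8QTy, 0.5f, 0, {3},
///                                     {1, 2, 3});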
317 | template <typename DataType> |
318 | static Tensor fromData(ElemKind elemKind, float scale, int32_t offset, |
319 | llvm::ArrayRef<dim_t> dims, |
320 | const std::initializer_list<DataType> &data) { |
321 | Tensor tensor(elemKind, dims, scale, offset); |
322 | tensor.getHandle<DataType>() = data; |
323 | return tensor; |
324 | } |
325 | |
326 | /// Initialize an empty tensor. |
327 | Tensor() = default; |
328 | |
329 | /// Initialize from a list of float literals. |
330 | Tensor(const std::initializer_list<float> &vec) { |
331 | reset(ElemKind::FloatTy, {(dim_t)vec.size()}); |
332 | auto *data = getRawDataPointer<float>(); |
333 | int i = 0; |
334 | for (auto &f : vec) { |
335 | data[i++] = f; |
336 | } |
337 | } |
338 | |
339 | /// Allocate and initialize a new tensor. |
340 | explicit Tensor(TypeRef ty) : data_(nullptr), type_(*ty), isUnowned_{false} { |
341 | reset(*ty); |
342 | } |
343 | |
344 | /// Allocate and initialize a new tensor. |
345 | explicit Tensor(const Type &ty) |
346 | : data_(nullptr), type_(ty), isUnowned_{false} { |
347 | reset(ty); |
348 | } |
349 | |
/// Allocate and initialize a new tensor of element kind \p elemTy with shape
/// \p dims.
351 | Tensor(ElemKind elemTy, llvm::ArrayRef<dim_t> dims) |
352 | : data_(nullptr), type_(elemTy, dims), isUnowned_{false} { |
353 | reset(elemTy, dims); |
354 | } |
355 | |
/// Construct an unowned tensor provided an existing payload buffer.
/// This constructor can be used when there is a need to work with
/// "externally" managed payload buffers using Tensor APIs. Additionally,
/// \p unpaddedSize can be set to indicate the actual size of the inputs. If
/// negative then it defaults back to the size of the input type.
361 | Tensor(void *data, TypeRef ty, ssize_t unpaddedSize = -1) |
362 | : data_(reinterpret_cast<char *>(data)), type_(*ty) { |
363 | // Mark as unowned. |
364 | isUnowned_ = true; |
// We still want our own DeviceResidencyInfo, since there is no owning Glow
// Tensor to provide one.
366 | resetDeviceInfo(); |
367 | if (unpaddedSize < 0) { |
368 | unpaddedSize_ = type_.getSizeInBytes(); |
369 | } else { |
370 | unpaddedSize_ = static_cast<size_t>(unpaddedSize); |
371 | } |
372 | } |
373 | |
374 | /// Allocate and initialize a new integer tensor with \p scale and \p offset. |
375 | Tensor(ElemKind elemTy, llvm::ArrayRef<dim_t> dims, float scale, |
376 | int32_t offset) |
377 | : data_(nullptr), type_(elemTy, dims, scale, offset), isUnowned_{false} { |
378 | reset(type_); |
379 | } |
380 | |
381 | /// Allocate a new Tensor managed by the \p tensorPool. |
382 | explicit Tensor(TypeRef ty, TensorPool *tensorPool) |
383 | : data_(nullptr), type_(*ty), tensorPool_(tensorPool) { |
384 | reset(*ty); |
385 | } |
386 | |
387 | Tensor(const Tensor &other) = delete; |
388 | Tensor &operator=(const Tensor &other) = delete; |
389 | |
/// Initialize the content of the tensor using the \p init method. The value
/// \p val is the initialization parameter. \p PRNG is used to generate random
/// numbers. Note that if the tensor's kind is Fused, then the fused
/// scales/offsets will not be modified.
394 | void init(InitKind init, float val, PseudoRNG &PRNG); |
395 | |
396 | /// \returns an unowned tensor with the exact same dimensions as this. |
397 | Tensor getUnowned() const { return getUnowned(dims()); } |
398 | |
/// \returns an unowned tensor using the same data buffer as the current tensor
/// but having different dimensions \p dims. \p offsets represents an optional
/// offset into the tensor representing the location of the first element to
/// start a subview from. The returned unowned tensor is essentially a
/// different view or subview on the same data.
///
/// The lifetime of the returned unowned tensor should always be within
/// the lifetime of its parent tensor, i.e. the unowned tensor should not
/// outlive its parent tensor.
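///
/// A minimal usage sketch (shapes and offsets are only illustrative):
///   Tensor base(ElemKind::FloatTy, {4, 4});
///   // 2x2 view starting at row 1, column 1 of the base tensor.
///   Tensor view = base.getUnowned({2, 2}, {1, 1});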
408 | Tensor getUnowned(llvm::ArrayRef<dim_t> dims, |
409 | llvm::ArrayRef<dim_t> offsets = {}) const { |
410 | Tensor unownedTensor; |
411 | |
412 | auto *firstElemPtr = getData(); |
413 | if (offsets.size()) { |
414 | assert(offsets.size() == this->dims().size() && |
415 | "Number of dims of tensor must equal number of dims in offsets" ); |
416 | // Find the index of the first element and use it to find the pointer to |
417 | // the first element. |
418 | size_t index = 0; |
419 | for (size_t i = 0; i < this->dims().size(); i++) { |
420 | index += type_.strides()[i] * offsets[i]; |
421 | } |
422 | firstElemPtr = &firstElemPtr[index * type_.getElementSize()]; |
423 | } |
424 | |
425 | unownedTensor.data_ = firstElemPtr; |
426 | unownedTensor.isUnowned_ = true; |
427 | unownedTensor.type_ = Type::newShape(getType(), dims); |
428 | unownedTensor.deviceResidency_ = deviceResidency_; |
429 | |
// If the original base Tensor is padded, then we only allow the unowned
// Tensor to be padded if there are no offsets. Otherwise assert that the
// base Tensor is not padded, and set unpaddedSize to that of the new
// unowned type.
434 | if (offsets.size() == 0) { |
435 | unownedTensor.unpaddedSize_ = unpaddedSize_; |
436 | assert(actualSize() == unownedTensor.actualSize() && |
437 | "The size of the unowned tensor " |
438 | "should be the same as the size of " |
439 | "the original tensor" ); |
440 | |
441 | } else { |
442 | unownedTensor.unpaddedSize_ = unownedTensor.type_.getSizeInBytes(); |
443 | assert(getSizeInBytes() == getUnpaddedSizeInBytes() && |
444 | "Problematic to get unowned offsetted view of a padded tensor" ); |
445 | assert(actualSize() >= unownedTensor.actualSize() && |
446 | "The size of the unowned tensor " |
447 | "should be no greater than the " |
448 | "size of the original tensor" ); |
449 | } |
450 | return unownedTensor; |
451 | } |
452 | |
/// This is the same as \ref getUnowned() but it produces an owned tensor
/// instead. \returns owned tensor copied from the data buffer of the current
/// tensor but having different dimensions \p dims. \p offsets represents an
/// optional offset into the tensor representing the location of the first
/// element to start a subview from.
458 | Tensor getOwnedSlice(llvm::ArrayRef<dim_t> dims, |
459 | llvm::ArrayRef<dim_t> offsets = {}) const { |
460 | assert(!isDeviceResident() && "Tensor must reside on host to access data." ); |
461 | return getUnowned(dims, offsets).clone(); |
462 | } |
463 | |
/// Reset the shape and type of this tensor to match the shape and type of
/// \p other. The size of the buffer is set to \p unpaddedSize unless it is
/// negative, which will instead default back to the number of bytes needed
/// for the type of \p other.
468 | void reset(const Tensor *other, ssize_t unpaddedSize = -1) { |
469 | reset(other->getType(), unpaddedSize); |
470 | } |
471 | |
472 | void reset(ElemKind elemTy, llvm::ArrayRef<dim_t> shape) { |
473 | Type t(elemTy, shape); |
474 | reset(t); |
475 | } |
476 | |
477 | void reset(ElemKind elemTy, llvm::ArrayRef<dim_t> shape, float scale, |
478 | int32_t offset) { |
479 | Type t(elemTy, shape, scale, offset); |
480 | reset(t); |
481 | } |
482 | |
/// Assigns a new shape to the tensor and allocates a new buffer. The size of
/// the buffer is set to \p unpaddedSize unless it is negative, which will
/// instead default back to the number of bytes needed for \p T.
486 | void reset(const Type &T, ssize_t unpaddedSize = -1) { |
487 | assert(!isDeviceResident() && "Tensor must reside on host to access data." ); |
488 | |
489 | // If negative then fall back to the passed in Type's padded size. |
490 | if (unpaddedSize < 0) { |
491 | unpaddedSize = T.getSizeInBytes(); |
492 | } |
493 | |
// If the new size is identical to the allocated size then there is no need
// to re-allocate the buffer.
496 | const bool isOrigPadded = |
497 | getSizeInBytes() != uint64_t(getUnpaddedSizeInBytes()); |
498 | const bool isNewPadded = T.getSizeInBytes() != size_t(unpaddedSize); |
499 | const bool isBufReuseAllowed = |
500 | (isOrigPadded == isNewPadded) && |
501 | (getUnpaddedSizeInBytes() == size_t(unpaddedSize)); |
502 | if (type_ == T && getData() && isBufReuseAllowed) { |
503 | #ifdef GLOW_DEBUG_TENSOR_INIT |
504 | PseudoRNG rng; |
505 | init(InitKind::Broadcast, GLOW_DEBUG_TENSOR_INIT, rng); |
506 | #endif |
507 | resetDeviceInfo(); |
508 | return; |
509 | } |
510 | |
511 | // Delete the old buffer, update the shape, and allocate a new one. |
512 | if (!isUnowned()) |
513 | alignedFree(getData()); |
514 | type_ = T; |
515 | |
516 | // We are allocating memory specifically for this tensor, thus, it owns it. |
517 | isUnowned_ = false; |
518 | |
519 | // We are allocating memory on the host so it is not device resident. |
520 | resetDeviceInfo(); |
521 | |
// Note: zero-dimensional tensors (i.e. {}) have size 1. However, Tensors
// may have 0 for some dimension, meaning they have size of 0, and so we do
// not allocate anything for them.
525 | data_ = unpaddedSize == 0 ? nullptr |
526 | : reinterpret_cast<char *>(alignedAlloc( |
527 | unpaddedSize, TensorAlignment)); |
528 | |
529 | // Set unpaddedSize_ to the actual number of bytes. |
530 | unpaddedSize_ = unpaddedSize; |
531 | |
532 | assert(!(size() < actualSize() && |
533 | getSizeInBytes() != getUnpaddedSizeInBytes()) && |
534 | "Custom aligned Tensors cannot also be partial" ); |
535 | |
536 | #ifdef GLOW_DEBUG_TENSOR_INIT |
537 | PseudoRNG rng; |
538 | init(InitKind::Broadcast, GLOW_DEBUG_TENSOR_INIT, rng); |
539 | #endif |
540 | } |
/// Releases the data buffer and sets the unowned flag to true. This is useful
/// for keeping metadata around but not the actual contents.
543 | void release() { |
544 | if (!isUnowned()) { |
545 | alignedFree(getData()); |
546 | } |
547 | if (ownsDeviceResidency_) { |
548 | delete deviceResidency_; |
549 | ownsDeviceResidency_ = false; |
550 | } |
551 | |
552 | isUnowned_ = true; |
553 | } |
554 | ~Tensor() { |
555 | if (!isUnowned()) { |
556 | alignedFree(getData()); |
557 | } |
558 | |
559 | if (ownsDeviceResidency_) { |
560 | delete deviceResidency_; |
561 | ownsDeviceResidency_ = false; |
562 | } |
563 | } |
564 | |
565 | // Move ctor. |
566 | Tensor(Tensor &&other) noexcept { |
567 | if (!isUnowned()) { |
568 | alignedFree(getData()); |
569 | } |
570 | if (ownsDeviceResidency_) { |
571 | delete deviceResidency_; |
572 | } |
573 | data_ = other.data_; |
574 | type_ = other.type_; |
575 | isUnowned_ = other.isUnowned_; |
576 | tensorPool_ = other.tensorPool_; |
577 | unpaddedSize_ = other.unpaddedSize_; |
578 | deviceResidency_ = other.deviceResidency_; |
579 | ownsDeviceResidency_ = other.ownsDeviceResidency_; |
580 | other.data_ = nullptr; |
581 | other.isUnowned_ = true; |
582 | other.tensorPool_ = nullptr; |
583 | other.deviceResidency_ = nullptr; |
584 | other.ownsDeviceResidency_ = false; |
585 | } |
586 | |
587 | /// Move assignment operator. |
588 | Tensor &operator=(Tensor &&other) { |
589 | if (!isUnowned()) { |
590 | alignedFree(getData()); |
591 | } |
592 | if (ownsDeviceResidency_) { |
593 | delete deviceResidency_; |
594 | } |
595 | data_ = other.data_; |
596 | type_ = other.type_; |
597 | isUnowned_ = other.isUnowned_; |
598 | tensorPool_ = other.tensorPool_; |
599 | unpaddedSize_ = other.unpaddedSize_; |
600 | deviceResidency_ = other.deviceResidency_; |
601 | ownsDeviceResidency_ = other.ownsDeviceResidency_; |
602 | other.data_ = nullptr; |
603 | other.isUnowned_ = true; |
604 | other.tensorPool_ = nullptr; |
605 | other.deviceResidency_ = nullptr; |
606 | other.ownsDeviceResidency_ = false; |
607 | return *this; |
608 | } |
609 | |
610 | /// Dump a textual representation of the Tensor into provided output stream. |
611 | void dump(llvm::raw_ostream &os) const; |
612 | |
613 | /// Dump a textual representation of the Tensor into default output stream. |
614 | void dump() const; |
615 | |
616 | /// Dump a textual representation of a specific number of elements in the |
617 | /// Tensor into provided output stream. |
618 | void dump(llvm::raw_ostream &os, unsigned maxNumElem) const; |
619 | |
620 | /// Dump a textual representation of a specific number of elements in the |
621 | /// Tensor into default output stream. |
622 | void dump(unsigned maxNumElem) const; |
623 | |
624 | /// Dump a textual representation of the Tensor to std::string. |
625 | std::string toString() const; |
626 | |
627 | /// Dump a textual representation of a specific number of elements in the |
628 | /// Tensor to std::string. |
629 | std::string toString(unsigned maxNumElem) const; |
630 | |
631 | /// Dump a textual representation of the shape of this Tensor to std::string. |
632 | std::string getShapeToString() const; |
633 | |
/// \returns true if the content of the other tensor \p other is identical to
/// this one, given some \p allowedError. If \p verbose and the tensors are
/// not equal, then we will log information about the mismatch (number of
/// elements exceeding allowed error; maximum error and location found; etc.).
638 | bool isEqual(const Tensor &other, float allowedError = 0.0001, |
639 | bool verbose = true) const { |
640 | if (isDeviceResident()) { |
641 | if (!other.isDeviceResident()) { |
642 | if (verbose) { |
643 | LOG(INFO) << "Tensors cannot be compared as they are not resident in " |
644 | "the same location." ; |
645 | } |
646 | return false; |
647 | } |
648 | |
649 | return getDeviceManager() == other.getDeviceManager() && |
650 | getLocationContext() == other.getLocationContext(); |
651 | } |
652 | return isEqualImpl(other, /*isBitwise=*/false, allowedError, verbose); |
653 | } |
654 | |
655 | /// \returns true if the content of the other tensor \p other is bitwise |
656 | /// identical to this one. |
657 | bool isBitwiseEqual(const Tensor &other, bool verbose = false) const { |
658 | return isEqualImpl(other, /*isBitwise=*/true, /*allowedError=*/0.0, |
659 | verbose); |
660 | } |
661 | |
662 | bool isEqualImpl(const Tensor &other, bool isBitwise, float allowedError, |
663 | bool verbose) const { |
664 | if (other.dims() != dims()) { |
665 | if (verbose) { |
666 | LOG(INFO) << "Tensors are not equal as they have different shapes: " |
667 | << this->getShapeToString() << " vs. " |
668 | << other.getShapeToString(); |
669 | } |
670 | return false; |
671 | } |
672 | |
// For now, make sure that either both or neither of the tensors have
// UInt8FusedQTy or UInt8FusedFP16QTy. While it is possible for an Int8QTy
// tensor to equal a fused tensor if the fused tensor has the same
// scale/offset on all of its rows, and that scale/offset match that of the
// Int8QTy, we do not support checking this for now.
678 | assert(((getElementType() == ElemKind::UInt8FusedQTy && |
679 | other.getElementType() == ElemKind::UInt8FusedQTy) || |
680 | (getElementType() == ElemKind::UInt8FusedFP16QTy && |
681 | other.getElementType() == ElemKind::UInt8FusedFP16QTy) || |
(getElementType() != ElemKind::UInt8FusedQTy &&
getElementType() != ElemKind::UInt8FusedFP16QTy &&
other.getElementType() != ElemKind::UInt8FusedQTy &&
other.getElementType() != ElemKind::UInt8FusedFP16QTy)) &&
684 | "Fused ElemKinds only supports comparing against same ElemKind." ); |
685 | |
686 | // Assert that the scale and offset match for the quantized types. |
687 | switch (getElementType()) { |
688 | default: |
689 | break; |
690 | case ElemKind::Int8QTy: |
691 | case ElemKind::UInt8QTy: |
692 | case ElemKind::Int16QTy: |
693 | case ElemKind::Int32QTy: |
694 | assert(getType().getScale() == other.getType().getScale() && |
695 | "Scales must match." ); |
696 | assert(getType().getOffset() == other.getType().getOffset() && |
697 | "Offsets must match." ); |
698 | } |
699 | |
700 | // Bitwise compare. |
701 | if (isBitwise) { |
702 | return isBitwiseEqualImpl(other, verbose); |
703 | } |
704 | |
705 | switch (getElementType()) { |
706 | case ElemKind::FloatTy: |
707 | return isEqualImpl<float>(other, allowedError, verbose); |
708 | case ElemKind::Float16Ty: |
709 | return isEqualImpl<float16_t>(other, allowedError, verbose); |
710 | case ElemKind::BFloat16Ty: |
711 | return isEqualImpl<bfloat16_t>(other, allowedError, verbose); |
712 | case ElemKind::Float64Ty: |
713 | return isEqualImpl<double>(other, allowedError, verbose); |
714 | case ElemKind::Int8QTy: |
715 | return isEqualImpl<int8_t>(other, allowedError, verbose); |
716 | case ElemKind::UInt8QTy: |
717 | return isEqualImpl<uint8_t>(other, allowedError, verbose); |
718 | case ElemKind::Int16QTy: |
719 | return isEqualImpl<int16_t>(other, allowedError, verbose); |
720 | case ElemKind::Int32QTy: |
721 | return isEqualImpl<int32_t>(other, allowedError, verbose); |
722 | case ElemKind::Int64QTy: |
723 | return isEqualImpl<int64_t>(other, allowedError, verbose); |
724 | case ElemKind::UInt8ITy: |
725 | return isEqualImpl<uint8_t>(other, allowedError, verbose); |
726 | case ElemKind::Int32ITy: |
727 | return isEqualImpl<int32_t>(other, allowedError, verbose); |
728 | case ElemKind::Int64ITy: |
729 | return isEqualImpl<int64_t>(other, allowedError, verbose); |
730 | // Note: We can use isEqualImpl() here because the scales/offsets will be |
731 | // compared as if they were data, so we will return false if any rowwise |
732 | // scale/offset do not match. |
733 | case ElemKind::UInt8FusedQTy: |
734 | return isEqualImpl<uint8_t>(other, allowedError, verbose); |
735 | case ElemKind::UInt8FusedFP16QTy: |
736 | return isEqualImpl<uint8_t>(other, allowedError, verbose); |
737 | case ElemKind::UInt4FusedFP16QTy: |
738 | return isEqualImpl<uint8_t>(other, allowedError, verbose); |
739 | case ElemKind::UInt4FusedQTy: |
740 | return isEqualImpl<uint8_t>(other, allowedError, verbose); |
741 | case ElemKind::BoolTy: |
742 | return isEqualImpl<bool>(other, allowedError, verbose); |
743 | } |
744 | |
745 | // This is to make compiler happy. It can never reach this point as switch |
746 | // always covers all possible values. |
747 | llvm_unreachable("unreachable" ); |
748 | } |
749 | |
/// \returns whether this Tensor is tiled (repeated) along \p axis for the
/// given tile size \p size. Some examples:
/// - A Tensor with size [2, 3] equal to [[1,2,3],[1,2,3]] is tiled along
///   axis 0 for a tile size equal to 1.
/// - A Tensor with size [2, 4] equal to [[1, 2, 1, 2],[3, 4, 3, 4]] is tiled
///   along axis 1 for a tile size equal to 2.
/// When the tile size matches the dimension size this function returns TRUE.
/// If the \p fractional flag is given then this function will also
/// perform fractional tiling verification (default is FALSE). Some examples:
/// - For a Tensor with size [5] equal to [1,2,3,1,2], axis 0 and tile size 3,
///   this function returns TRUE if \p fractional is TRUE and returns FALSE if
///   \p fractional is FALSE.
762 | bool isTiled(unsigned_t axis, dim_t size = 1, bool fractional = false) const; |
763 | |
/// \returns whether this Tensor is tiled (repeated) along \p axes for the
/// given tile sizes \p sizes. Some examples:
/// - A Tensor with size [2, 4] equal to [[1,2,1,2],[1,2,1,2]] is tiled along
///   axes {0,1} for the tile sizes {1,2}.
/// When the tile sizes match the dimension sizes this function returns TRUE.
/// If the \p fractional flag is given then this function will also
/// perform fractional tiling verification (default is FALSE). Some examples:
/// - For a Tensor with size [5] equal to [1,2,3,1,2], axes {0} and sizes {3},
///   this function returns TRUE if \p fractional is TRUE and returns FALSE if
///   \p fractional is FALSE.
774 | bool isTiled(llvm::ArrayRef<unsigned_t> axes, llvm::ArrayRef<dim_t> sizes, |
775 | bool fractional = false) const; |
776 | |
777 | /// Update the content and type of the tensor from the tensor \p t. |
778 | void assign(const Tensor *t) { |
779 | assert(!isDeviceResident() && "Tensor must reside on host to access data." ); |
780 | assert(this != t && "Copying to self" ); |
781 | const size_t bufferSize = t->getUnpaddedSizeInBytes(); |
782 | reset(t, bufferSize); |
783 | std::copy(&t->getData()[0], &t->getData()[bufferSize], getData()); |
784 | } |
785 | |
786 | /// Update the raw data of the tensor from the tensor \p t. |
787 | void copyRawFrom(const Tensor *t) { |
788 | assert(!isDeviceResident() && "Tensor must reside on host to access data." ); |
789 | assert(this != t && "Copying to self" ); |
790 | assert(actualSize() == t->actualSize()); |
791 | assert(getElementType() == t->getElementType() && "Invalid element type" ); |
792 | assert(t->getUnpaddedSizeInBytes() == getUnpaddedSizeInBytes() && |
793 | "Do not support copying between different unpadded sized tensors" ); |
794 | size_t bufferSize = type_.getSizeInBytes(); |
795 | std::copy(&t->getData()[0], &t->getData()[bufferSize], getData()); |
796 | } |
797 | |
798 | /// Update the raw data of the tensor from a raw buffer \p data. |
799 | void copyRawFrom(const char *data) { |
800 | assert(!isDeviceResident() && "Tensor must reside on host to access data." ); |
801 | assert(data && "Null data pointer!" ); |
802 | assert(getData() != data && "Copying to self" ); |
803 | size_t bufferSize = type_.getSizeInBytes(); |
804 | std::memcpy(getData(), data, bufferSize); |
805 | } |
806 | |
807 | /// Update the content of the tensor with a slice from tensor \p t. A slice |
808 | /// is one index from the first dimension of the tensor. |
809 | void copySlice(const Tensor *t, size_t slice) { |
810 | assert(!isDeviceResident() && "Tensor must reside on host to access data." ); |
811 | auto dim = t->dims().slice(1); |
812 | (void)dim; |
813 | assert(dim == dims() && "Invalid slice size" ); |
814 | assert(getElementType() == t->getElementType() && "Invalid element type" ); |
815 | |
816 | size_t bufferSize = type_.getSizeInBytes(); |
817 | std::copy(&t->getData()[bufferSize * slice], |
818 | &t->getData()[bufferSize * (slice + 1)], getData()); |
819 | } |
820 | |
/// Update the content of the tensor with a sequence of slices from the
/// tensor \p t. A slice is one index from the first dimension of the tensor.
/// The copying operation may wrap around the end of the tensor \p t one or
/// more times. This means that the data in the input tensor may be duplicated.
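/// For example (an illustrative sketch): if \p t has 3 slices and this tensor
/// has 5, then starting from \p startSliceIdx = 0 the copied slice indices of
/// \p t are 0, 1, 2, 0, 1.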
825 | void copyConsecutiveSlices(const Tensor *t, size_t startSliceIdx) { |
826 | assert(!isDeviceResident() && "Tensor must reside on host to access data." ); |
827 | auto onceSliceDim = t->dims().slice(1); |
828 | (void)onceSliceDim; |
829 | assert(onceSliceDim == dims().slice(1) && "Invalid slice size" ); |
830 | assert(getElementType() == t->getElementType() && "Invalid element type" ); |
831 | assert(dims().size() > 1 && "Tensor must contain at least two dimensions" ); |
832 | |
833 | size_t numSlicesInInput = t->dims()[0]; |
834 | size_t numElementsInSlice = actualSize() / dims()[0]; |
835 | size_t bufferSize = numElementsInSlice * type_.getElementSize(); |
836 | |
837 | // For each outer slice in the current tensor: |
838 | for (size_t n = 0, e = dims()[0]; n < e; n++) { |
839 | size_t startIdx = (startSliceIdx + n) % numSlicesInInput; |
840 | std::copy(&t->getData()[bufferSize * startIdx], |
841 | &t->getData()[bufferSize * (startIdx + 1)], |
842 | &getData()[bufferSize * n]); |
843 | } |
844 | } |
845 | |
/// Convenience method to copy the content of \p t into this tensor when the
/// two tensors have different underlying element types. This copy reads each
/// element of \p t as SrcElemType and casts it to DestElemType in this tensor.
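/// A minimal usage sketch (the tensor names are only illustrative):
///   Tensor dst(ElemKind::FloatTy, {4});  // given a Float16Ty tensor `src`
///   dst.copyWithCast<float, float16_t>(&src);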
850 | template <typename DestElemType, typename SrcElemType> |
851 | void copyWithCast(const Tensor *t) { |
852 | assert(!isDeviceResident() && "Tensor must reside on host to access data." ); |
853 | static_assert(!std::is_same<DestElemType, SrcElemType>::value, |
854 | "Use copyRawFrom instead" ); |
855 | assert(this != t && "Copying to self" ); |
856 | assert(getElementType() != t->getElementType() && |
857 | "Use copyRawFrom instead" ); |
858 | assert(actualSize() == t->actualSize() && "Different sizes" ); |
859 | const auto *src = t->getRawDataPointer<SrcElemType>(); |
860 | auto *dst = getRawDataPointer<DestElemType>(); |
861 | for (size_t idx = 0, end = actualSize(); idx != end; ++idx) { |
862 | dst[idx] = DestElemType(src[idx]); |
863 | } |
864 | } |
865 | |
866 | /// Convert each element of this tensor to \p newTy. Calls into |
867 | /// \ref getCopyConvertedToType() to do the conversion, and hence supports |
868 | /// converting between whatever ElemKinds it supports. |
869 | void convertToType(ElemKind newTy); |
870 | |
871 | /// \returns a copy of the Tensor but converted to \p newKind. Currently |
872 | /// supports conversion for: |
873 | /// - FloatTy to Float16Ty |
874 | /// - FloatTy to BFloat16Ty |
875 | /// - Float16Ty to FloatTy |
876 | /// - BFloat16Ty to FloatTy |
877 | /// - UInt8FusedQTy to UInt8FusedFP16QTy |
878 | Tensor getCopyConvertedToType(ElemKind newKind) const; |
879 | |
/// Transpose this tensor into the empty tensor \p dest. Shuffle the
/// axes based on the list \p shuffle, where each element is an index into the
/// source (this tensor's) dimensions.
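/// For example (an illustrative sketch), for an NHWC tensor the shuffle
/// {0, 3, 1, 2} produces an NCHW-ordered \p dest.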
882 | void transpose(Tensor *dest, llvm::ArrayRef<unsigned_t> shuffle) const { |
883 | assert(!isDeviceResident() && "Tensor must reside on host to access data." ); |
884 | genericTranspose(this, dest, shuffle); |
885 | } |
886 | |
887 | /// Create a new copy of the current tensor. |
888 | Tensor clone() const { |
889 | assert(!isDeviceResident() && "Tensor must reside on host to access data." ); |
890 | Tensor slice; |
891 | slice.assign(this); |
892 | return slice; |
893 | } |
894 | |
895 | /// Return the raw unsafe pointer to the tensor payload. |
896 | char *getUnsafePtr() const { return getData(); } |
897 | |
898 | /// \returns true if tensor data is stored on a device |
899 | bool isDeviceResident() const { |
900 | return deviceResidency_ && deviceResidency_->isDeviceResident(); |
901 | } |
902 | |
903 | /// Update device residency info with new device manager and context |
904 | void moveToDevice(DeviceTensorTransferManager *deviceManager, |
905 | void *locationContext); |
906 | |
907 | /// If device resident, copy Tensor contents back to host memory and release |
908 | /// associated device memory. |
909 | void ensureOnHost(); |
910 | |
911 | /// Updates contents of a device resident Tensor with the data from \p t |
912 | /// without copying its contents to host. |
913 | void copyRawToDevice(const Tensor *t); |
914 | |
915 | /// \returns the pointer to the device manager where the tensor resides. |
916 | DeviceTensorTransferManager *getDeviceManager() const { |
917 | assert(deviceResidency_ != nullptr && "DeviceResidencyInfo must exist" ); |
918 | assert(deviceResidency_->isDeviceResident() && |
919 | "Tensor must be device resident" ); |
920 | return deviceResidency_->getDeviceManager(); |
921 | } |
922 | |
923 | /// \returns the pointer to the location context of where the tensor resides. |
924 | void *getLocationContext() const { |
925 | assert(deviceResidency_ != nullptr && "DeviceResidencyInfo must exist" ); |
926 | assert(deviceResidency_->isDeviceResident() && |
927 | "Tensor must be device resident" ); |
928 | return deviceResidency_->getLocationContext(); |
929 | } |
930 | |
931 | void resetDeviceInfo() { |
932 | if (deviceResidency_ && ownsDeviceResidency_) { |
933 | deviceResidency_->clear(); |
934 | return; |
935 | } |
936 | |
937 | deviceResidency_ = new DeviceResidencyInfo(); |
938 | ownsDeviceResidency_ = true; |
939 | } |
940 | |
941 | /// Clears DeviceResidencyInfo. |
942 | /// Note that this does not affect the associated DeviceManager or device |
943 | /// memory. |
944 | void clearDeviceResidency() { |
945 | assert(deviceResidency_ != nullptr && "DeviceResidencyInfo must exist" ); |
946 | assert(deviceResidency_->isDeviceResident() && |
947 | "Tensor must be device resident" ); |
948 | deviceResidency_->clear(); |
949 | } |
950 | |
/// \returns a new handle that points to and manages this tensor.
952 | template <class ElemTy = float> Handle<ElemTy> getHandle() &; |
953 | |
954 | template <class ElemTy = float> const Handle<ElemTy> getHandle() const &; |
955 | |
956 | /// If Tensor is rvalue, it is an error to get its Handle. |
957 | template <class ElemTy = float> Handle<ElemTy> getHandle() && = delete; |
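
/// A minimal usage sketch (values are only illustrative):
///   Tensor t(ElemKind::FloatTy, {2, 3});
///   auto h = t.getHandle<float>();
///   h.clear(1.0f);           // fill every element with 1.0
///   float v = h.at({1, 2});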
958 | |
959 | private: |
960 | /// \returns a pointer to the raw data, of type \p ElemTy. |
961 | template <class ElemTy> ElemTy *getRawDataPointer() { |
962 | assert(!isDeviceResident() && "Tensor must reside on host to access data." ); |
963 | assert(type_.isType<ElemTy>() && "Asking for the wrong ptr type." ); |
964 | return reinterpret_cast<ElemTy *>(data_); |
965 | } |
966 | |
967 | /// \returns a const pointer to the raw data, of type \p ElemTy. |
968 | template <class ElemTy> const ElemTy *getRawDataPointer() const { |
969 | assert(!isDeviceResident() && "Tensor must reside on host to access data." ); |
970 | assert(type_.isType<ElemTy>() && "Asking for the wrong ptr type." ); |
971 | return reinterpret_cast<const ElemTy *>(data_); |
972 | } |
973 | |
974 | template <class ElemTy> |
975 | bool isEqualImpl(const Tensor &other, float allowedError, |
976 | bool verbose) const { |
977 | assert(!isDeviceResident() && "Tensor must reside on host to access data." ); |
978 | auto thisHandle = getHandle<ElemTy>(); |
979 | auto otherHandle = other.getHandle<ElemTy>(); |
980 | double maxFoundError = 0.0; |
981 | size_t numExceedingError = 0; |
982 | size_t currIndex = 0; |
983 | size_t maxFoundErrorIdx = 0; |
984 | double maxRE = 0.0; // relative error. |
985 | size_t maxREIdx = 0; |
986 | for (auto thisHandleIt = thisHandle.begin(), |
987 | otherHandleIt = otherHandle.begin(); |
988 | thisHandleIt != thisHandle.end() && otherHandleIt != otherHandle.end(); |
989 | ++thisHandleIt, ++otherHandleIt, ++currIndex) { |
990 | double delta = *thisHandleIt - *otherHandleIt; |
991 | delta = std::abs(delta); |
992 | // Since any comparison with NAN returns false, we use a negated condition |
993 | // so that this function correctly returns false when delta is NAN. |
994 | if (!(delta <= allowedError)) { |
995 | if (!verbose) { |
996 | return false; |
997 | } |
998 | numExceedingError += 1; |
999 | if (!(delta <= maxFoundError)) { |
1000 | maxFoundError = delta; |
1001 | maxFoundErrorIdx = currIndex; |
1002 | } |
1003 | double sum = *thisHandleIt + *otherHandleIt; |
1004 | double re = delta / std::abs(sum); |
1005 | if (!(re <= maxRE)) { |
1006 | maxRE = re; |
1007 | maxREIdx = currIndex; |
1008 | } |
1009 | } |
1010 | } |
1011 | auto thisHandleIt = thisHandle.begin(); |
1012 | auto otherHandleIt = otherHandle.begin(); |
1013 | if (numExceedingError != 0) { |
1014 | LOG(INFO) << "Tensors not equal: " << numExceedingError << " out of " |
1015 | << actualSize() << " elements exceeded allowed error threshold " |
1016 | << allowedError << ". Maximum error found was " << maxFoundError |
1017 | << " at index " << maxFoundErrorIdx << ": " |
1018 | << *(thisHandleIt.operator+(maxFoundErrorIdx)) << " vs. " |
1019 | << *(otherHandleIt.operator+(maxFoundErrorIdx)); |
1020 | LOG(INFO) << "Maximum relative error found was: " << maxRE |
1021 | << " at index: " << maxREIdx << ": " |
1022 | << *(thisHandleIt.operator+(maxREIdx)) << " v.s. " |
1023 | << *(otherHandleIt.operator+(maxREIdx)); |
1024 | } |
1025 | return numExceedingError == 0; |
1026 | } |
1027 | |
1028 | bool isBitwiseEqualImpl(const Tensor &other, bool verbose) const { |
1029 | assert(!isDeviceResident() && "Tensor must reside on host to access data." ); |
1030 | auto const *myData = getUnsafePtr(); |
1031 | auto const *otherData = other.getUnsafePtr(); |
1032 | dim_t mismatchCount = 0; |
1033 | |
1034 | if (verbose) { |
1035 | for (size_t i = 0, e = getSizeInBytes(); i < e; i++) { |
1036 | if (myData[i] != otherData[i]) { |
1037 | ++mismatchCount; |
1038 | } |
1039 | } |
1040 | if (mismatchCount != 0) { |
1041 | LOG(INFO) << "Tensors not bitwise equal: " << mismatchCount |
1042 | << " bytes out of " << getSizeInBytes() << " mismatched." ; |
1043 | } |
1044 | } else { |
1045 | mismatchCount = memcmp(myData, otherData, getSizeInBytes()); |
1046 | } |
1047 | |
1048 | return mismatchCount == 0; |
1049 | } |
1050 | }; |
1051 | |
1052 | //===----------------------------------------------------------------------===// |
1053 | // Tensor Handle |
1054 | //===----------------------------------------------------------------------===// |
1055 | |
1056 | constexpr unsigned MAX_DUMP_ELEMS = 100; |
1057 | |
1058 | void dumpAsciiImpl(const Tensor *T, llvm::raw_ostream &os); |
1059 | void dumpAsciiImpl(const Tensor *T); |
1060 | |
1061 | void dumpImpl(const Tensor *T, llvm::raw_ostream &os, |
1062 | unsigned maxNumElem = MAX_DUMP_ELEMS); |
1063 | void dumpImpl(const Tensor *T, unsigned maxNumElem); |
1064 | void dumpImpl(const Tensor *T); |
1065 | |
1066 | template <class ElemTy> class Handle; |
1067 | |
1068 | /// A class that provides ability to iterate over a Handle<ElemTy>. Since it's |
1069 | /// common to have both mutating and const iterators, this class has template |
1070 | /// parameter IsConst, which is true to create const_iterator and false |
1071 | /// otherwise. |
1072 | template <class ElemTy, bool IsConst> |
1073 | class HandleIterator |
1074 | : public std::iterator<std::random_access_iterator_tag, ElemTy> { |
1075 | using HandleTy = typename std::conditional_t<IsConst, const Handle<ElemTy> *, |
1076 | Handle<ElemTy> *>; |
1077 | using ElemTyRef = |
1078 | typename std::conditional_t<IsConst, const ElemTy &, ElemTy &>; |
1079 | |
1080 | /// At every given moment, the iterator maintains an index, which is used to |
1081 | /// access the Handle. When moving the iterator forward, the index is |
1082 | /// incremented. Only valid elements can be accessed. |
1083 | /// 0 <= idx_ <= handle_->size() |
1084 | HandleTy handle_; |
1085 | llvm::ArrayRef<dim_t> sizes_; |
1086 | dim_t idx_; |
1087 | /// Holds true if the underlying tensor has non-trivial alignment (i.e. not 1) |
1088 | bool isAligned_; |
1089 | |
1090 | HandleIterator() = default; |
1091 | |
1092 | HandleIterator(HandleTy handle) : handle_(handle) { |
1093 | sizes_ = handle->dims(); |
1094 | isAligned_ = handle->size() < handle->actualSize(); |
1095 | } |
1096 | |
1097 | static HandleIterator begin(HandleTy handle) { |
1098 | auto res = HandleIterator(handle); |
1099 | res.idx_ = 0; |
1100 | return res; |
1101 | } |
1102 | |
1103 | static HandleIterator end(HandleTy handle) { |
1104 | auto res = HandleIterator(handle); |
1105 | res.idx_ = res.handle_->getRealNumElements(); |
1106 | return res; |
1107 | } |
1108 | |
1109 | friend class Handle<ElemTy>; |
1110 | |
1111 | public: |
1112 | HandleIterator &operator++() { |
1113 | if (*this != handle_->end()) { |
1114 | idx_++; |
1115 | } |
1116 | return *this; |
1117 | } |
1118 | HandleIterator &operator--() { |
1119 | if (idx_) { |
1120 | idx_--; |
1121 | } |
1122 | return *this; |
1123 | } |
1124 | HandleIterator operator+(int n) const { |
1125 | auto res = HandleIterator(handle_); |
1126 | res.idx_ = std::max(static_cast<int>(idx_) + n, 0); |
1127 | res.idx_ = std::min(res.idx_, res.handle_->size()); |
1128 | return res; |
1129 | } |
1130 | HandleIterator operator-(int n) const { return *this + (-n); } |
1131 | operator int() const { return idx_; } |
1132 | |
1133 | ElemTyRef operator*() { |
1134 | if (!isAligned_) { |
1135 | return handle_->raw(idx_); |
1136 | } |
1137 | std::vector<dim_t> indices(sizes_.size(), 0); |
1138 | size_t rem = idx_; |
1139 | for (int i = static_cast<int>(sizes_.size()) - 1; i >= 0; i--) { |
1140 | indices[i] = rem % sizes_[i]; |
1141 | rem /= sizes_[i]; |
1142 | } |
1143 | return handle_->at(indices); |
1144 | } |
1145 | |
1146 | bool operator==(const HandleIterator<ElemTy, IsConst> &other) const { |
1147 | return idx_ == other.idx_; |
1148 | } |
1149 | |
1150 | bool operator!=(const HandleIterator<ElemTy, IsConst> &other) const { |
1151 | return !(*this == other); |
1152 | } |
1153 | }; |
1154 | |
/// Helper which \returns the flattened 1D offset given \p indices into a tensor
/// with \p strides.
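/// For example, with strides {12, 4, 1} and indices {1, 2, 3} the flattened
/// offset is 1*12 + 2*4 + 3*1 = 23.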
1157 | inline size_t getFlattenedOffset(llvm::ArrayRef<dim_t> strides, |
1158 | llvm::ArrayRef<dim_t> indices) { |
1159 | assert(indices.size() <= strides.size() && "Invalid number of indices" ); |
1160 | // The loop below can be rewritten using std::inner_product. Unfortunately |
1161 | // std::inner_product does not optimize very well and loops that use this |
1162 | // method don't get vectorized. Don't change this loop without benchmarking |
1163 | // the program on a few compilers. |
1164 | size_t index = 0; |
1165 | for (size_t i = 0, e = indices.size(); i < e; i++) { |
1166 | index += size_t(strides[i]) * size_t(indices[i]); |
1167 | } |
1168 | |
1169 | return index; |
1170 | } |
1171 | |
/// Helper function which \returns true if a slice with the shape \p sliceShape
/// referenced from a larger tensor with the shape \p tensorShape is contiguous
/// in memory (assuming the tensor it is referenced from is contiguous). This
/// happens when the slice dimensions:
/// - Start with singleton dimensions (dimensions equal to 1).
/// - Continue with a partially extracted dimension (one maximum).
/// - End with fully extracted dimensions.
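/// For example, a {1, 2, 4} slice of a {5, 2, 4} tensor is contiguous, while a
/// {1, 2, 2} slice of the same tensor is not.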
1179 | bool isSliceContiguous(llvm::ArrayRef<dim_t> sliceShape, |
1180 | llvm::ArrayRef<dim_t> tensorShape); |
1181 | |
/// A class that provides indexed access to a tensor. This class has value
/// semantics and is copied around. One of the reasons for giving this class
/// value semantics is to allow efficient index calculation that the compiler
/// can optimize (because stack allocated structures don't alias).
1186 | template <class ElemTy> class Handle final { |
1187 | /// A pointer to the tensor that this handle wraps. |
1188 | Tensor *tensor_{nullptr}; |
1189 | |
/// Contains the product of the sizes from the current position to the end.
/// For example, for index (w,x,y,z): [x * y * z, y * z, z, 1]
1192 | dim_t sizeIntegral_[max_tensor_dimensions] = { |
1193 | 0, |
1194 | }; |
1195 | |
1196 | dim_t sizes_[max_tensor_dimensions] = { |
1197 | 0, |
1198 | }; |
1199 | |
1200 | /// Saves the number of dimensions used in the tensor. |
1201 | uint8_t numDims_{0}; |
1202 | |
1203 | /// Remember end iterators. This is needed to speed up iterator increment, |
1204 | /// which has to check that iterator hasn't reached the end yet. |
1205 | HandleIterator<ElemTy, false> mutating_end_; |
1206 | HandleIterator<ElemTy, true> const_end_; |
1207 | |
1208 | /// Create a new invalid handle. Notice that this method is private and may |
1209 | /// only be used by the static factory method below. |
1210 | Handle() = default; |
1211 | |
1212 | public: |
1213 | /// \returns an iterator to the first element of the tensor. |
1214 | HandleIterator<ElemTy, false> begin() { |
1215 | return HandleIterator<ElemTy, false>::begin(this); |
1216 | } |
1217 | HandleIterator<ElemTy, true> begin() const { |
1218 | return HandleIterator<ElemTy, true>::begin(this); |
1219 | } |
1220 | |
1221 | /// \returns an iterator referring to the past-the-end element. |
1222 | HandleIterator<ElemTy, false> end() { return mutating_end_; } |
1223 | HandleIterator<ElemTy, true> end() const { return const_end_; } |
1224 | |
1225 | /// Allocate a new invalid handle. |
1226 | static Handle createInvalidHandle() { return Handle(); } |
1227 | |
1228 | /// \returns true if this Handle points to a valid tensor. |
1229 | bool isValid() const { return tensor_; } |
1230 | |
/// Calculate the index for a specific element in the tensor. Notice that
/// the list of indices may be incomplete. This method provides access to
/// padding elements, meaning that it's possible to get an index pointing at
/// data that was added only to meet alignment requirements.
1235 | size_t getElementPtr(llvm::ArrayRef<dim_t> indices) const { |
1236 | return getFlattenedOffset(llvm::makeArrayRef(sizeIntegral_, numDims_), |
1237 | indices); |
1238 | } |
1239 | |
/// \returns the value of the n'th dimension \p dim, for the index \p idx.
/// 0 <= idx < size(), meaning that \p idx addresses a real data element,
/// not padding.
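/// For example, for dims {2, 3, 4}, getDimForPtr(1, 7) returns 1, since the
/// flat index 7 corresponds to the coordinate (0, 1, 3).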
1243 | size_t getDimForPtr(size_t dim, size_t idx) const { |
1244 | assert(dim < numDims_ && "Invalid dimension" ); |
1245 | assert(idx < size() && "Invalid index" ); |
1246 | auto R = idx; |
1247 | for (size_t i = dim + 1; i < numDims_; i++) { |
1248 | R /= sizes_[i]; |
1249 | } |
1250 | return R % sizes_[dim]; |
1251 | } |
1252 | |
1253 | /// \returns the type of the tensor. |
1254 | const Type &getType() const { return tensor_->getType(); } |
1255 | |
1256 | /// \returns the element type of the tensor. |
1257 | ElemKind getElementType() const { return tensor_->getElementType(); } |
1258 | |
1259 | /// Construct a Tensor handle. |
1260 | explicit Handle(Tensor *tensor) : tensor_(tensor) { |
1261 | auto sizes = tensor->dims(); |
1262 | numDims_ = sizes.size(); |
1263 | |
1264 | /// We allow handles that wrap uninitialized tensors. |
1265 | if (numDims_) { |
1266 | // Copy the sizes of the tensor. |
1267 | memcpy(sizes_, tensor_->type_.sizes_, |
1268 | max_tensor_dimensions * sizeof(sizes_[0])); |
1269 | // Copy the strides of the tensor. |
1270 | memcpy(sizeIntegral_, tensor_->type_.strides_, |
1271 | max_tensor_dimensions * sizeof(tensor_->type_.strides_[0])); |
1272 | assert(numDims_ <= max_tensor_dimensions && "Too many dimensions." ); |
1273 | } |
1274 | |
1275 | mutating_end_ = HandleIterator<ElemTy, false>::end(this); |
1276 | const_end_ = HandleIterator<ElemTy, true>::end(this); |
1277 | } |
1278 | |
1279 | llvm::ArrayRef<dim_t> dims() const { |
1280 | return llvm::ArrayRef<dim_t>(sizes_, numDims_); |
1281 | } |
1282 | |
1283 | /// \returns the number of elements in the whole tensor. |
1284 | dim_t size() const { return tensor_->size(); } |
1285 | |
1286 | /// \returns the actual number of elements in the tensor taking striding into |
1287 | /// account. Since size() does not take striding into account, size() is |
1288 | /// always <= actualSize(). |
1289 | dim_t actualSize() const { return tensor_->actualSize(); } |
1290 | |
1291 | /// \returns the unpadded size of the underlying \ref tensor_. |
1292 | size_t getUnpaddedSizeInBytes() const { |
1293 | return tensor_->getUnpaddedSizeInBytes(); |
1294 | } |
1295 | |
1296 | /// \returns the number of unpadded elements in the underlying \ref tensor_. |
1297 | size_t getRealNumElements() const { return tensor_->getRealNumElements(); } |
1298 | |
1299 | bool isInBounds(llvm::ArrayRef<dim_t> indices) const { |
1300 | return tensor_->isInBounds(indices); |
1301 | } |
1302 | |
1303 | void clear(ElemTy value = 0) { std::fill(begin(), end(), value); } |
1304 | |
1305 | /// Returns reference to a meaningful data element. This method does not |
1306 | /// address padding elements. |
1307 | ElemTy &at(llvm::ArrayRef<dim_t> indices) { |
1308 | size_t index = getElementPtr(indices); |
1309 | auto *data = tensor_->getRawDataPointer<ElemTy>(); |
1310 | return data[index]; |
1311 | } |
1312 | |
1313 | const ElemTy &at(llvm::ArrayRef<dim_t> indices) const { |
1314 | size_t index = getElementPtr(indices); |
1315 | auto *data = tensor_->getRawDataPointer<ElemTy>(); |
1316 | return data[index]; |
1317 | } |
1318 | |
/// \returns the element at offset \p index without any size calculations.
/// The returned element can be a pad element.
1321 | ElemTy &raw(size_t index) { |
1322 | auto *data = tensor_->getRawDataPointer<ElemTy>(); |
1323 | return data[index]; |
1324 | } |
1325 | |
  /// \returns the element at offset \p index without any size calculations.
1327 | /// The returned element can be a pad element. |
1328 | const ElemTy &raw(size_t index) const { |
1329 | auto *data = tensor_->getRawDataPointer<ElemTy>(); |
1330 | return data[index]; |
1331 | } |
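
  // Sketch of the difference between at() and raw() (illustrative only; 'T'
  // and 'H' are hypothetical local names):
  //   Tensor T(ElemKind::FloatTy, {4, 4});
  //   auto H = T.getHandle<float>();
  //   H.at({2, 3}) = 1.0f; // Logical coordinates, mapped through the strides.
  //   float v = H.raw(11); // Flat offset 2 * 4 + 3 into the raw buffer; for
  //                        // this dense tensor it reads the same element.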
1332 | |
  /// Extract a lower-dimensional tensor from the slice at index \p idx along
  /// the first dimension.
1335 | Tensor extractSlice(size_t idx) const { |
1336 | auto sizes = tensor_->dims(); |
    assert(sizes.size() > 1 && "Tensor must have at least two dimensions");
    assert(idx < sizes[0] && "Invalid first index");
1339 | |
1340 | Tensor slice{Type::newShape(tensor_->getType(), sizes.slice(1), |
1341 | tensor_->type_.strides().slice(1))}; |
1342 | |
1343 | // Extract the whole slice. |
1344 | size_t startIdx = sizeIntegral_[0] * idx; |
1345 | ElemTy *base = tensor_->getRawDataPointer<ElemTy>() + startIdx; |
1346 | auto *dest = slice.getRawDataPointer<ElemTy>(); |
1347 | std::copy(base, base + sizeIntegral_[0], dest); |
1348 | |
1349 | return slice; |
1350 | } |
1351 | |
  /// Insert a lower-dimensional tensor \p slice into this tensor at a specific
  /// first-dimension index \p idx.
1354 | void insertSlice(const Tensor &slice, size_t idx) { |
1355 | auto dims = tensor_->dims(); |
1356 | (void)dims; |
1357 | assert(getElementType() == slice.getElementType()); |
    assert(dims.size() > 1 && "Tensor must have at least two dimensions");
    assert(idx < dims[0] && "Invalid first index");
1360 | |
1361 | auto sliceSize = sizeIntegral_[0]; |
1362 | size_t startIdx = sliceSize * idx; |
1363 | ElemTy *base = &raw(startIdx); |
    const ElemTy *slicePtr = slice.getRawDataPointer<ElemTy>();
1365 | std::copy(slicePtr, slicePtr + sliceSize, base); |
1366 | } |
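
  // Round-trip sketch for extractSlice()/insertSlice() (illustrative only;
  // 'batch' is a hypothetical float tensor of dims {8, 32, 32}):
  //   auto H = batch.getHandle<float>();
  //   Tensor row = H.extractSlice(3); // 'row' has dims {32, 32}.
  //   H.insertSlice(row, 5);          // Copy it back in at first-dim index 5.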
1367 | |
1368 | /// Create a new copy of the current tensor. |
1369 | Tensor clone() const { return tensor_->clone(); } |
1370 | |
1371 | /// Update the content of the tensor from a literal list: |
1372 | void operator=(const std::initializer_list<ElemTy> &vec) { |
    assert(actualSize() == vec.size() && "Invalid input size.");
1374 | size_t i = 0; |
1375 | for (auto &e : vec) { |
1376 | raw(i++) = e; |
1377 | } |
1378 | } |
1379 | |
1380 | void operator=(llvm::ArrayRef<ElemTy> array) { |
    assert(actualSize() == array.size() && "Invalid input size.");
1382 | std::copy(array.begin(), array.end(), &raw(0)); |
1383 | } |
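
  // Initialization sketch (illustrative only; 'T' is a hypothetical tensor):
  //   Tensor T(ElemKind::FloatTy, {2, 2});
  //   T.getHandle<float>() = {1.0f, 2.0f, 3.0f, 4.0f}; // Row-major fill.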
1384 | |
1385 | void dumpAscii(llvm::raw_ostream &os) const { dumpAsciiImpl(tensor_, os); } |
1386 | void dumpAscii() const { dumpAsciiImpl(tensor_); } |
1387 | |
  /// \returns the raw indices of the min and max values in the tensor.
  /// If there are multiple min or max values, the smallest index is returned.
1390 | std::pair<dim_t, dim_t> minMaxArg() const { |
1391 | ElemTy max = raw(0); |
1392 | ElemTy min = raw(0); |
1393 | |
1394 | size_t maxIdx = 0; |
1395 | size_t minIdx = 0; |
1396 | |
1397 | for (size_t i = 1, e = actualSize(); i < e; i++) { |
1398 | ElemTy val = raw(i); |
1399 | if (val > max) { |
1400 | max = val; |
1401 | maxIdx = i; |
1402 | } else if (val < min) { |
1403 | min = val; |
1404 | minIdx = i; |
1405 | } |
1406 | } |
1407 | |
1408 | return std::make_pair(minIdx, maxIdx); |
1409 | } |
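
  // Worked example (illustrative only): for a handle holding {3, -1, 7, -1, 7}
  // minMaxArg() returns {1, 2}: index 1 is the first occurrence of the minimum
  // (-1) and index 2 is the first occurrence of the maximum (7).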
1410 | |
1411 | /// \returns true if tensor contains only elements equal to zero. |
1412 | /// \p allowedError represents the delta from zero that is allowed before |
1413 | /// returning false. |
1414 | bool isZero(float allowedError = 0.0) const { |
1415 | #define RETURN_WHETHER_FUSED_IS_ZERO(DATA_TYPE) \ |
1416 | assert(dims().size() == 2 && "Fused tensor must be 2-dimensional."); \ |
1417 | assert(dims()[1] > 2 * sizeof(DATA_TYPE) && \ |
1418 | "Fused tensor must have space for scale/offset."); \ |
1419 | const dim_t dataWidth = dims()[1]; \ |
1420 | const dim_t alignedLength = tensor_->getType().strides()[0]; \ |
1421 | auto *data = reinterpret_cast<uint8_t *>(tensor_->getUnsafePtr()); \ |
1422 | for (dim_t i = 0, e = dims()[0]; i < e; i++) { \ |
1423 | uint8_t *scaleOffsetPtr = \ |
1424 | data + i * alignedLength + dataWidth - 2 * sizeof(DATA_TYPE); \ |
1425 | DATA_TYPE scale, offset; \ |
1426 | memcpy(&scale, scaleOffsetPtr, sizeof(DATA_TYPE)); \ |
1427 | memcpy(&offset, scaleOffsetPtr + sizeof(DATA_TYPE), sizeof(DATA_TYPE)); \ |
1428 | for (dim_t j = 0, e = dataWidth - 2 * sizeof(DATA_TYPE); j < e; j++) { \ |
1429 | float currVal = (at({i, j}) * (float)scale) + (float)offset; \ |
1430 | if (std::abs(currVal) > allowedError) { \ |
1431 | return false; \ |
1432 | } \ |
1433 | } \ |
1434 | } \ |
1435 | return true; |
1436 | |
1437 | if (getElementType() == ElemKind::UInt8FusedQTy) { |
1438 | RETURN_WHETHER_FUSED_IS_ZERO(float); |
1439 | } |
1440 | if (getElementType() == ElemKind::UInt8FusedFP16QTy) { |
1441 | RETURN_WHETHER_FUSED_IS_ZERO(float16_t); |
1442 | } |
1443 | #undef RETURN_WHETHER_FUSED_IS_ZERO |
1444 | |
1445 | int32_t trueZero = getType().isQuantizedType() ? getType().getOffset() : 0; |
1446 | return std::all_of(begin(), end(), [=](ElemTy e) { return e == trueZero; }); |
1447 | } |
1448 | |
1449 | void dump(llvm::raw_ostream &os, unsigned maxNumElem = MAX_DUMP_ELEMS) const { |
1450 | dumpImpl(tensor_, os, maxNumElem); |
1451 | } |
1452 | void dump(unsigned maxNumElem) const { dumpImpl(tensor_, maxNumElem); } |
1453 | void dump() const { dumpImpl(tensor_, MAX_DUMP_ELEMS); } |
1454 | |
  /// Fill the tensor with random data that's close to zero using the
  /// Xavier method, based on the paper [Bengio and Glorot 2010].
  /// This type of initialization facilitates better training performance.
  /// The parameter \p filterSize is the number of "input" neurons in the
  /// tensor (or the relevant slice). For example, consider the case of MatMul:
  /// NxM (\p input) * MxK (\p weights) == NxK (\p result)
  /// The correct \p filterSize for the weights tensor is M, so that the norm
  /// of each row of \p input equals the norm of the corresponding row of
  /// \p result.
1463 | void initXavier(size_t filterSize, PseudoRNG &PRNG) { |
    assert(filterSize > 0 && "invalid filter size");
    assert(getType().isFPType() &&
           "Only support floating point Xavier initialization.");
1467 | double scale = std::sqrt(3.0 / double(filterSize)); |
1468 | std::uniform_real_distribution<> dist(-scale, scale); |
1469 | for (auto &e : *this) { |
1470 | e = dist(PRNG); |
1471 | } |
1472 | } |
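
  // Sketch of choosing filterSize (illustrative only; 'weights', 'M' and
  // 'PRNG' are hypothetical names): for a fully connected layer computing
  // {N, M} * {M, K}, the weights would typically be initialized with the
  // fan-in M:
  //   weights.getHandle<float>().initXavier(M, PRNG);
  // which draws uniformly from [-sqrt(3.0 / M), sqrt(3.0 / M)).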
1473 | |
1474 | /// Fill the tensor with uniformly distributed values in the range |
1475 | /// [low .. high). |
1476 | template <typename T = ElemTy> |
1477 | typename std::enable_if<std::is_floating_point<T>::value>::type |
1478 | randomize(float low, float high, PseudoRNG &PRNG) { |
    assert(low <= high && "invalid range");
1480 | std::uniform_real_distribution<ElemTy> dist(low, high); |
1481 | for (auto &elem : *this) { |
1482 | elem = dist(PRNG); |
1483 | } |
1484 | } |
1485 | |
  /// Fill the tensor with uniformly distributed values in the range
  /// [low .. high]. For fused quantized tensors, the scales/offsets are left
  /// unchanged.
1488 | template <typename T = ElemTy> |
1489 | typename std::enable_if<std::is_integral<T>::value>::type |
1490 | randomize(int low, int high, PseudoRNG &PRNG) { |
    assert(low <= high && "invalid range");
    assert(low >= std::numeric_limits<ElemTy>::lowest() &&
           high <= std::numeric_limits<ElemTy>::max() &&
           "Cannot initialize outside range of representable values.");
1495 | std::uniform_int_distribution<long long> dist(low, high); |
1496 | switch (getElementType()) { |
1497 | default: { |
1498 | for (auto &elem : *this) { |
1499 | elem = dist(PRNG); |
1500 | } |
1501 | return; |
1502 | } |
1503 | |
1504 | #define FUSED_CASE(ELEM_KIND, DATA_TYPE) \ |
1505 | case ElemKind::ELEM_KIND: { \ |
1506 | assert(dims().size() == 2 && "Fused tensor must be 2-dimensional."); \ |
1507 | assert(dims()[1] > 2 * sizeof(DATA_TYPE) && \ |
1508 | "Fused tensor must have space for scale/offset."); \ |
1509 | for (dim_t i = 0, e = dims()[0]; i < e; i++) { \ |
1510 | for (dim_t j = 0, f = dims()[1] - 2 * sizeof(DATA_TYPE); j < f; j++) { \ |
1511 | at({i, j}) = dist(PRNG); \ |
1512 | } \ |
1513 | } \ |
1514 | return; \ |
1515 | } |
1516 | FUSED_CASE(UInt8FusedQTy, float); |
1517 | FUSED_CASE(UInt8FusedFP16QTy, float16_t); |
1518 | #undef FUSED_CASE |
1519 | } |
1520 | } |
1521 | |
1522 | /// Fill the tensor with uniformly distributed values in the range |
1523 | /// [low .. high). |
1524 | template <typename T = ElemTy> |
1525 | typename std::enable_if<!std::is_floating_point<T>::value && |
1526 | !std::is_integral<T>::value>::type |
1527 | randomize(float low, float high, PseudoRNG &PRNG) { |
    assert(low <= high && "invalid range");
1529 | std::uniform_real_distribution<float> dist(low, high); |
1530 | for (auto &elem : *this) { |
1531 | elem = dist(PRNG); |
1532 | } |
1533 | } |
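
  // Usage sketch (illustrative only; assumes the quantized Tensor constructor
  // taking a scale and offset; 'T' and 'PRNG' are hypothetical locals):
  //   PseudoRNG PRNG;
  //   Tensor T(ElemKind::Int8QTy, {16}, /* scale */ 0.5, /* offset */ 0);
  //   T.getHandle<int8_t>().randomize(-10, 10, PRNG); // Integral overload.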
1534 | |
1535 | /// \returns the mean and variance of the tensor. |
1536 | std::pair<double, double> calculateMeanVariance() const { |
1537 | size_t n = actualSize(); |
    assert(n > 1 && "Input must have at least 2 elements.");
1539 | |
1540 | // Calculate mean. |
1541 | double mean = 0; |
1542 | for (size_t i = 0; i < n; i++) { |
1543 | mean += raw({i}); |
1544 | } |
1545 | mean /= n; |
1546 | |
1547 | // Calculate variance. |
1548 | double var = 0; |
1549 | for (size_t i = 0; i < n; i++) { |
1550 | double t = raw({i}) - mean; |
1551 | var += t * t; |
1552 | } |
1553 | var /= (n - 1); |
1554 | |
1555 | return {mean, var}; |
1556 | } |
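
  // Worked example (illustrative only): for the elements {1, 2, 3, 4} the mean
  // is 2.5 and the variance is (2.25 + 0.25 + 0.25 + 2.25) / (4 - 1) = 5 / 3,
  // since the implementation uses the sample variance (dividing by n - 1).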
1557 | |
1558 | /// Insert the tensor \p slice at location \p offset \p count times along the |
1559 | /// \p axis. This operation is equivalent to the operation of scanning the |
1560 | /// source tensor, and saving the value that is stored at coordinate {d_0, |
1561 | /// d_1, ... d_n} in the new tensor at {d_0 + O_0, d_1 + O_1, ... d_n + O_n}, |
1562 | /// where O is the offset vector, assuming \p count = 1. For \p count > 1, the |
1563 | /// same Tensor is copied \p count times along the provided \p axis. The |
1564 | /// tensors must be of the right dimensions. |
1565 | void insertTensors(Handle<ElemTy> &slice, llvm::ArrayRef<dim_t> offset, |
1566 | size_t count = 1, size_t axis = 0) { |
1567 | auto sliceCoor = slice.dims().vec(); |
1568 | auto fusedCoor = dims().vec(); |
1569 | insertTensorsImpl(sliceCoor, fusedCoor, slice, true, offset, count, axis, |
1570 | 0); |
1571 | } |
1572 | |
1573 | /// Extract the tensor \p slice at location \p offset. This operation is |
1574 | /// equivalent to the operation of scanning the destination tensor, and |
1575 | /// copying into the cell at coordinate {d_0, d_1, ... d_n} a value from the |
1576 | /// tensor at {d_0 + O_0, d_1 + O_1, ... d_n + O_n}, where O is the offset |
1577 | /// vector. The tensors must be of the right dimensions. |
1578 | void extractTensors(Handle<ElemTy> &slice, llvm::ArrayRef<dim_t> offset) { |
1579 | auto sliceCoor = slice.dims().vec(); |
1580 | auto fusedCoor = dims().vec(); |
1581 | insertTensorsImpl(sliceCoor, fusedCoor, slice, false, offset, /* count */ 1, |
1582 | /* axis */ 0, 0); |
1583 | } |
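
  // Tiling sketch (illustrative only; 'big' and 'small' are hypothetical float
  // tensors of dims {4, 6} and {2, 3}):
  //   auto B = big.getHandle<float>();
  //   auto S = small.getHandle<float>();
  //   B.insertTensors(S, {0, 0}, /* count */ 2, /* axis */ 1);
  //   // Copies 'small' twice along axis 1, filling columns [0, 6) of the
  //   // first two rows.
  //   B.extractTensors(S, {2, 3});
  //   // Reads the {2, 3} block whose top-left corner is at {2, 3} back into
  //   // 'small'.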
1584 | |
1585 | /// \returns a pair of the scale and offset from a row \p rowIdx of a |
1586 | /// FusedRowwiseQuantized Tensor. |
1587 | template <typename T> |
1588 | std::pair<T, T> getFusedScaleOffsetFromRow(dim_t rowIdx) { |
1589 | ElemTy *rowScaleOffsetPtr = getFusedRowScaleOffsetPtr<T>(rowIdx); |
1590 | T scale; |
1591 | T offset; |
1592 | memcpy(&scale, rowScaleOffsetPtr, sizeof(T)); |
1593 | memcpy(&offset, rowScaleOffsetPtr + sizeof(T), sizeof(T)); |
1594 | return std::make_pair(scale, offset); |
1595 | } |
1596 | |
  /// Sets the \p scale and \p offset for row \p rowIdx of a
1598 | /// FusedRowwiseQuantized Tensor. |
1599 | template <typename T> |
1600 | void setFusedScaleOffsetInRow(dim_t rowIdx, T scale, T offset) { |
1601 | ElemTy *rowScaleOffsetPtr = getFusedRowScaleOffsetPtr<T>(rowIdx); |
1602 | T finalScale = static_cast<T>(scale); |
1603 | T finalOffset = static_cast<T>(offset); |
1604 | memcpy(rowScaleOffsetPtr, &finalScale, sizeof(T)); |
1605 | memcpy(rowScaleOffsetPtr + sizeof(T), &finalOffset, sizeof(T)); |
1606 | } |
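
  // Usage sketch for fused rowwise-quantized tensors (illustrative only;
  // 'fused' is a hypothetical UInt8FusedQTy tensor whose rows end in a float
  // scale and float offset):
  //   auto H = fused.getHandle<uint8_t>();
  //   H.setFusedScaleOffsetInRow<float>(0, /* scale */ 0.1f, /* offset */ -3.f);
  //   auto scaleAndOffset = H.getFusedScaleOffsetFromRow<float>(0);
  //   // scaleAndOffset.first == 0.1f, scaleAndOffset.second == -3.f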
1607 | |
1608 | private: |
  /// Concatenates or splits tensors.
  /// This method copies a slice into, or extracts a slice from, this tensor.
  /// \p sliceCoor and \p fusedCoor are temporary storage that the function uses
  /// to construct the coordinates used to access the tensors. They must be
  /// initialized to the shape of the slice and of this tensor, respectively.
  /// \p slice is the tensor to insert or extract. \p offset is the offset of
  /// the slice to add or extract along each dimension. \p d is the recursion
  /// depth parameter, which follows the current axis. If \p isInsert is set,
  /// data is copied from \p slice into this tensor; otherwise data is copied
  /// from this tensor into \p slice. \p count and \p axis are used in
  /// conjunction for inserting the same tensor \p count times along the \p
  /// axis.
1621 | void insertTensorsImpl(llvm::MutableArrayRef<dim_t> sliceCoor, |
1622 | llvm::MutableArrayRef<dim_t> fusedCoor, |
1623 | Handle<ElemTy> &slice, bool isInsert, |
1624 | llvm::ArrayRef<dim_t> offset, size_t count, |
1625 | size_t axis, unsigned d) { |
1626 | bool isDone = (d == slice.dims().size()); |
1627 | |
1628 | if (isDone) { |
1629 | if (isInsert) { |
1630 | at(fusedCoor) = slice.at(sliceCoor); |
1631 | } else { |
1632 | slice.at(sliceCoor) = at(fusedCoor); |
1633 | } |
1634 | return; |
1635 | } |
1636 | |
1637 | // Only need to iterate over count if the current dimension d is equal to |
1638 | // the axis we're inserting over. |
1639 | const size_t countIters = (axis == d) ? count : 1; |
1640 | for (size_t c = 0; c < countIters; c++) { |
1641 | for (size_t i = 0, e = slice.dims()[d]; i < e; i++) { |
1642 | // Construct the coordinates for the slice and for the joint shape. |
1643 | // Add the 'offset' to the dimension that we concat the shapes on. |
1644 | sliceCoor[d] = i; |
1645 | // If this is the correct axis to insert multiple times then calculate |
1646 | // the additional offset to use. |
1647 | const size_t countAxisOffset = (axis == d) ? c * slice.dims()[d] : 0; |
1648 | fusedCoor[d] = i + offset[d] + countAxisOffset; |
1649 | insertTensorsImpl(sliceCoor, fusedCoor, slice, isInsert, offset, count, |
1650 | axis, d + 1); |
1651 | } |
1652 | } |
1653 | } |
1654 | |
  /// Given a Fused tensor, \returns a pointer to the scale and offset (of type
  /// \p T) stored at the end of row \p rowIdx.
1657 | template <typename T> ElemTy *getFusedRowScaleOffsetPtr(dim_t rowIdx) { |
1658 | switch (getElementType()) { |
1659 | case ElemKind::UInt8FusedQTy: |
1660 | case ElemKind::UInt4FusedQTy: { |
1661 | constexpr auto isFloat = std::is_same<float, T>::value; |
      DCHECK(isFloat) << "Expected float scale/offset";
      break;
    }
    case ElemKind::UInt4FusedFP16QTy:
    case ElemKind::UInt8FusedFP16QTy: {
      constexpr auto isFloat16 = std::is_same<float16_t, T>::value;
      DCHECK(isFloat16) << "Expected float16_t scale/offset";
      break;
    }
    default:
      llvm_unreachable("Must be used with Tensor of supported Fused ElemKind");
    }

    static_assert(std::is_same<uint8_t, ElemTy>::value,
                  "Handle of current Fused tensors expected to be uint8_t.");
1677 | const dim_t colIdx = dims()[1] - 2 * sizeof(T); |
1678 | return &at({rowIdx, colIdx}); |
1679 | } |
1680 | }; |
1681 | |
1682 | template <class ElemTy> Handle<ElemTy> Tensor::getHandle() & { |
  assert(!isDeviceResident() && "Tensor must reside on host to access data.");
  assert(type_.isType<ElemTy>() && "Getting a handle to the wrong type.");
1685 | return Handle<ElemTy>(this); |
1686 | } |
1687 | |
1688 | template <class ElemTy> const Handle<ElemTy> Tensor::getHandle() const & { |
  assert(!isDeviceResident() && "Tensor must reside on host to access data.");
  assert(type_.isType<ElemTy>() && "Getting a handle to the wrong type.");
1691 | return Handle<ElemTy>(const_cast<Tensor *>(this)); |
1692 | } |
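
// Typical access pattern (illustrative only; 'T' is a hypothetical tensor):
// data is read and written through a typed handle, e.g.
//   Tensor T(ElemKind::FloatTy, {2, 3});
//   auto H = T.getHandle<float>();
//   H.clear(0.0f);
//   H.at({1, 2}) = 42.0f;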
1693 | |
1694 | llvm::raw_ostream &operator<<(llvm::raw_ostream &os, const Tensor &t); |
1695 | |
1696 | llvm::raw_ostream &operator<<(llvm::raw_ostream &os, const Tensor *t); |
1697 | } // namespace glow |
1698 | |
1699 | #endif // GLOW_BASE_TENSOR_H |
1700 | |