1/**
2 * Copyright (c) Glow Contributors. See CONTRIBUTORS file.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16#ifndef GLOW_BASE_TENSOR_H
17#define GLOW_BASE_TENSOR_H
18
19#include <algorithm>
20#include <cassert>
21#include <vector>
22
23#include "glow/Base/DeviceTensorTransferManager.h"
24#include "glow/Base/Type.h"
25#include "glow/Support/Compiler.h"
26#include "glow/Support/Memory.h"
27#include "glow/Support/Random.h"
28
29#include "llvm/ADT/ArrayRef.h"
30#include "llvm/Support/raw_ostream.h"
31
32namespace glow {
33
34//===----------------------------------------------------------------------===//
35// Tensor
36//===----------------------------------------------------------------------===//
37
38template <class ElemTy> class Handle;
39
40class Tensor;
41class TensorPool;
42
43void genericTranspose(const Tensor *src, Tensor *dest,
44 llvm::ArrayRef<unsigned_t> shuffle);
45
/// Helper function that \returns a ShapeVector of those dimensions in \p
/// currDims expanded with dimension = 1 until the maximum tensor dimension is
/// reached. The total number of tensor elements (the product of the dims) is
/// unchanged. For example, input {2,1,4} would result in {2,1,4,1,1,1}.
50ShapeVector expandDimsToMax(llvm::ArrayRef<dim_t> currDims);
51
/// Helper function that \returns a ShapeVector obtained from \p dims by
/// reducing (setting to 1) the dimensions given by \p axes. If the flag
/// \p keepDims is set then the reduced dimensions are kept, otherwise they
/// are pruned. For example, given the dimensions [2,3,4] and axes [0,2] the
/// returned shape will be [1,3,1] for keepDims true and [3] for keepDims false.
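/// An illustrative sketch of the two modes (call syntax only; a minimal
/// example, not taken from the implementation):
///   reduceDims({2, 3, 4}, {0, 2}, /* keepDims */ true);  // -> {1, 3, 1}
///   reduceDims({2, 3, 4}, {0, 2}, /* keepDims */ false); // -> {3}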
57ShapeVector reduceDims(llvm::ArrayRef<dim_t> dims,
58 llvm::ArrayRef<unsigned_t> axes, bool keepDims);
59
60/// Helper function that \returns the transpose shuffle that would undo the
61/// given \p shuffle so that if two transposes were composed with the given
62/// shuffle and the result of this function, it would result in the identity
63/// shuffle.
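/// For example (an illustrative sketch): the inverse of the shuffle {2, 0, 1}
/// is {1, 2, 0}, since composing the two maps every axis back to itself.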
64std::vector<unsigned_t> getInverseTranspose(llvm::ArrayRef<unsigned_t> shuffle);
65
66namespace runtime {
67class DeviceManager;
68}
69
/// Holds information about whether this Tensor exists in a device-specific
/// form (i.e. is resident on, or specific to, a device), and which device
/// holds it.
72class DeviceResidencyInfo final {
73 enum class TensorResidency {
74 Host,
75 Device,
76 };
77
  /// A pointer to the device manager of the device on which the tensor
  /// resides.
80 DeviceTensorTransferManager *deviceManager_{nullptr};
81 /// The residency status of the tensor.
82 TensorResidency tensorResidency_{TensorResidency::Host};
  /// A pointer to a context structure, containing the required info to access
  /// tensor data and perform transfers.
85 void *locationContext_{nullptr};
86
87public:
88 DeviceResidencyInfo()
89 : deviceManager_(nullptr), tensorResidency_(TensorResidency::Host),
90 locationContext_(nullptr) {}
91
92 /// Move ctor.
93 DeviceResidencyInfo(DeviceResidencyInfo &&other) = delete;
94
95 /// Move assignment operator.
96 DeviceResidencyInfo &operator=(DeviceResidencyInfo &&other) = delete;
97
98 ~DeviceResidencyInfo() {
99 // If a tensor is device resident, let its device manager free the device
100 // buffer.
101 if (isDeviceResident()) {
102 deviceManager_->releaseDeviceTensor(locationContext_);
103 }
104 }
105
106 /// Removes all device specific state.
107 void clear() {
108 deviceManager_ = nullptr;
109 locationContext_ = nullptr;
110 tensorResidency_ = TensorResidency::Host;
111 }
112
  /// \returns true if this Tensor is resident on, or specific to, a device.
114 bool isDeviceResident() const {
115 assert((tensorResidency_ == TensorResidency::Host || deviceManager_) &&
116 "Device resident tensor must have an assigned device manager.");
117 return tensorResidency_ == TensorResidency::Device;
118 }
119
120 /// \returns the DeviceManager this tensor is resident on, if any.
121 DeviceTensorTransferManager *getDeviceManager() const {
122 return deviceManager_;
123 }
124
125 /// \returns the device specific location context for a resident Tensor.
126 void *getLocationContext() const { return locationContext_; }
127
128 friend class Tensor;
129};
130
131/// A class that represents a contiguous n-dimensional array (a tensor).
132class Tensor final {
133public:
  /// Specifies the kind of initialization for the tensor.
135 enum class InitKind {
136 Zero, // The tensor is initialized to zero.
137 Broadcast, // Broadcast a single value to all elements.
138 Xavier, // Init the tensor with random values using the Xavier method.
139 };
140
141private:
142 /// A pointer to the tensor data.
143 char *data_{nullptr};
144
145 /// The type of the tensor.
146 Type type_;
147
148 /// If the tensor is unowned.
149 bool isUnowned_{false};
150
151 /// The TensorPool that is managing this Tensor (if any).
152 TensorPool *tensorPool_{nullptr};
153
  /// The device residency info associated with the tensor.
155 DeviceResidencyInfo *deviceResidency_{nullptr};
156
157 /// If this tensor owns the DeviceResidencyInfo.
158 bool ownsDeviceResidency_{false};
159
  /// Size in bytes of the unpadded region of memory. This is useful for
  /// communicating the actual size of the data; it allows copying only the
  /// inputs, and not the padding, to the device.
163 size_t unpaddedSize_{0};
164
165 template <class ElemTy> friend class Handle;
166
167 /// \returns a pointer to the tensor data buffer.
168 char *getData() const { return data_; }
169
170public:
171 /// \returns true if it is an unowned tensor.
172 bool isUnowned() const { return isUnowned_; }
173
174 /// \returns the number of allocated bytes pointed to by \ref data_.
175 size_t getUnpaddedSizeInBytes() const { return unpaddedSize_; }
176
  /// \returns the number of real elements in a Tensor, i.e. excluding any
  /// alignment padding and excluding elements that fall outside of a partial
  /// tensor's shape. Note that Tensors cannot be both custom aligned and
  /// partial.
181 size_t getRealNumElements() const {
    // If the Tensor is custom aligned then return size().
183 if (size() < actualSize()) {
184 return size();
185 }
186 // Else assume no custom alignment, so return number of elements based on
187 // unpaddedSize_, i.e. accounts for partial Tensors.
188 return unpaddedSize_ / type_.getElementSize();
189 }
190
191 /// \returns the type of the tensor.
192 const Type &getType() const { return type_; }
193
194 /// Set the type of the Tensor to \p t.
195 void setType(const TypeRef t) {
196 assert(type_.dims() == t->dims() && "New type must retain the same shape.");
197 assert(((type_.getElementType() == t->getElementType() &&
198 type_.size() == t->size()) ||
199 type_.getSizeInBytes() == t->getSizeInBytes()) &&
200 "New type must retain the same size in bytes.");
201 type_ = *t;
202 }
203
204 /// \return the element type of the tensor.
205 ElemKind getElementType() const { return type_.getElementType(); }
206
207 /// \returns True if the coordinate is within the array.
208 bool isInBounds(llvm::ArrayRef<dim_t> indices) const {
209 assert(type_.numSizes_ == indices.size() && "Invalid number of indices");
210 for (size_t i = 0u, e = indices.size(); i < e; i++) {
211 if (indices[i] >= type_.sizes_[i]) {
212 return false;
213 }
214 }
215 return true;
216 }
217
218 /// Set the content of the tensor to zero. If \p resetFusedScalesOffsets, then
219 /// fused scales/offsets will be set to 1.0/0.0 as well.
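  /// An illustrative sketch of the quantized behavior described above (the
  /// scale and offset values are assumptions):
  ///   Tensor q(ElemKind::Int8QTy, {4}, /* scale */ 0.1f, /* offset */ -3);
  ///   q.zero(); // raw payload becomes {-3, -3, -3, -3}, i.e. 0.0 dequantized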
220 void zero(bool resetFusedScalesOffsets = false) {
221 assert(!isDeviceResident() && "Tensor must reside on host to access data.");
222 size_t size = actualSize();
223 // Quantized tensors should go to their offset.
224 switch (type_.getElementType()) {
225 case ElemKind::Int8QTy: {
226 auto *data = reinterpret_cast<int8_t *>(getData());
227 std::fill(&data[0], &data[0] + size, (int8_t)type_.getOffset());
228 break;
229 }
230 case ElemKind::UInt8QTy: {
231 auto *data = reinterpret_cast<uint8_t *>(getData());
232 std::fill(&data[0], &data[0] + size, (uint8_t)type_.getOffset());
233 break;
234 }
235 case ElemKind::Int16QTy: {
236 auto *data = reinterpret_cast<int16_t *>(getData());
237 std::fill(&data[0], &data[0] + size, (int16_t)type_.getOffset());
238 break;
239 }
240 case ElemKind::Int32QTy: {
241 auto *data = reinterpret_cast<int32_t *>(getData());
242 std::fill(&data[0], &data[0] + size, (int32_t)type_.getOffset());
243 break;
244 }
245#define FUSED_CASE(ELEM_KIND, DATA_TYPE) \
246 case ElemKind::ELEM_KIND: { \
247 assert(dims().size() == 2 && "Fused tensor must be 2-dimensional."); \
    assert(dims()[1] > 2 * sizeof(DATA_TYPE) &&                                \
           "Fused tensor must have space for scale and offset.");              \
250 const size_t dataWidth = dims()[1]; \
251 const size_t alignedLength = type_.strides()[0]; \
252 auto *data = reinterpret_cast<uint8_t *>(getData()); \
253 for (size_t i = 0, e = dims()[0]; i < e; i++) { \
254 uint8_t *scaleOffsetPtr = \
255 data + i * alignedLength + dataWidth - 2 * sizeof(DATA_TYPE); \
256 DATA_TYPE scale, offset; \
257 if (resetFusedScalesOffsets) { \
258 /* Use these as defaults, and copy them into each row. */ \
259 scale = 1.0; \
260 offset = 0.0; \
261 memcpy(scaleOffsetPtr, &scale, sizeof(DATA_TYPE)); \
262 memcpy(scaleOffsetPtr + sizeof(DATA_TYPE), &offset, \
263 sizeof(DATA_TYPE)); \
264 } else { \
265 memcpy(&scale, scaleOffsetPtr, sizeof(DATA_TYPE)); \
266 memcpy(&offset, scaleOffsetPtr + sizeof(DATA_TYPE), \
267 sizeof(DATA_TYPE)); \
268 } \
269 DCHECK_NE(static_cast<float>(scale), 0.0) \
270 << "Disallow scale = 0.0 for Fused ElemKinds; causes div by zero."; \
271 float zero = nearbyintf(-1 * static_cast<float>(offset / scale)); \
272 std::fill(data + i * alignedLength, scaleOffsetPtr, \
273 static_cast<uint8_t>(zero)); \
274 } \
275 break; \
276 }
277 FUSED_CASE(UInt8FusedQTy, float);
278 FUSED_CASE(UInt8FusedFP16QTy, float16_t);
279#undef FUSED_CASE
280
281 default:
282 // Non-quantized tensors are set to 0.
283 std::fill(&getData()[0], &getData()[0] + size * type_.getElementSize(),
284 0);
285 break;
286 }
287 }
288
289 /// \returns the shape of the tensor.
290 llvm::ArrayRef<dim_t> dims() const { return type_.dims(); }
291
292 /// \returns the number of real meaningful elements in the tensor. Does not
293 /// take strides into account.
294 dim_t size() const { return type_.size(); }
295
296 /// \returns the actual number of elements in the tensor taking striding into
297 /// account. Since size() does not take striding into account, size() is
298 /// always <= actualSize().
299 dim_t actualSize() const { return type_.actualSize(); }
300
301 /// \returns the number of bytes required to store the tensor based on its
302 /// Type. Note that this includes the size required for padding.
303 uint64_t getSizeInBytes() const { return type_.getSizeInBytes(); }
304
305 /// \returns the TensorPool managing this object, or nullptr if it is
306 /// unmanaged.
307 TensorPool *getOwningPool() { return tensorPool_; }
308
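  /// An illustrative sketch of building small test tensors with \ref fromData
  /// below (the shapes and values are assumptions):
  ///   Tensor a = Tensor::fromData<float>(ElemKind::FloatTy, {2, 2},
  ///                                      {1.0f, 2.0f, 3.0f, 4.0f});
  ///   Tensor q = Tensor::fromData<int8_t>(ElemKind::Int8QTy, /* scale */ 0.5f,
  ///                                       /* offset */ 0, {4}, {1, 2, 3, 4});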
309 template <typename DataType>
310 static Tensor fromData(ElemKind elemKind, llvm::ArrayRef<dim_t> dims,
311 const std::initializer_list<DataType> &data) {
312 Tensor tensor(elemKind, dims);
313 tensor.getHandle<DataType>() = data;
314 return tensor;
315 }
316
317 template <typename DataType>
318 static Tensor fromData(ElemKind elemKind, float scale, int32_t offset,
319 llvm::ArrayRef<dim_t> dims,
320 const std::initializer_list<DataType> &data) {
321 Tensor tensor(elemKind, dims, scale, offset);
322 tensor.getHandle<DataType>() = data;
323 return tensor;
324 }
325
326 /// Initialize an empty tensor.
327 Tensor() = default;
328
329 /// Initialize from a list of float literals.
330 Tensor(const std::initializer_list<float> &vec) {
331 reset(ElemKind::FloatTy, {(dim_t)vec.size()});
332 auto *data = getRawDataPointer<float>();
333 int i = 0;
334 for (auto &f : vec) {
335 data[i++] = f;
336 }
337 }
338
339 /// Allocate and initialize a new tensor.
340 explicit Tensor(TypeRef ty) : data_(nullptr), type_(*ty), isUnowned_{false} {
341 reset(*ty);
342 }
343
344 /// Allocate and initialize a new tensor.
345 explicit Tensor(const Type &ty)
346 : data_(nullptr), type_(ty), isUnowned_{false} {
347 reset(ty);
348 }
349
  /// Allocate and initialize a new tensor with element kind \p elemTy.
351 Tensor(ElemKind elemTy, llvm::ArrayRef<dim_t> dims)
352 : data_(nullptr), type_(elemTy, dims), isUnowned_{false} {
353 reset(elemTy, dims);
354 }
355
  /// Construct an unowned tensor provided an existing payload buffer.
  /// This constructor can be used when there is a need to work with
  /// "externally" managed payload buffers using Tensor APIs. Additionally,
  /// \p unpaddedSize can be set to indicate the actual size of the inputs. If
  /// negative then it defaults back to the size of the input type.
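  /// An illustrative sketch of wrapping an externally managed buffer (the
  /// buffer and shape here are assumptions):
  ///   float backing[6] = {0};
  ///   Type ty(ElemKind::FloatTy, {2, 3});
  ///   Tensor view(backing, &ty); // unowned: the Tensor will not free backing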
361 Tensor(void *data, TypeRef ty, ssize_t unpaddedSize = -1)
362 : data_(reinterpret_cast<char *>(data)), type_(*ty) {
363 // Mark as unowned.
364 isUnowned_ = true;
365 // We do want DeviceResidency however, since there is no owning Glow Tensor.
366 resetDeviceInfo();
367 if (unpaddedSize < 0) {
368 unpaddedSize_ = type_.getSizeInBytes();
369 } else {
370 unpaddedSize_ = static_cast<size_t>(unpaddedSize);
371 }
372 }
373
374 /// Allocate and initialize a new integer tensor with \p scale and \p offset.
375 Tensor(ElemKind elemTy, llvm::ArrayRef<dim_t> dims, float scale,
376 int32_t offset)
377 : data_(nullptr), type_(elemTy, dims, scale, offset), isUnowned_{false} {
378 reset(type_);
379 }
380
381 /// Allocate a new Tensor managed by the \p tensorPool.
382 explicit Tensor(TypeRef ty, TensorPool *tensorPool)
383 : data_(nullptr), type_(*ty), tensorPool_(tensorPool) {
384 reset(*ty);
385 }
386
387 Tensor(const Tensor &other) = delete;
388 Tensor &operator=(const Tensor &other) = delete;
389
390 /// Initialize the content of the tensor using the \p init method. The value
391 /// \p val is the initialization parameter. \p PRNG is used to generate random
392 /// numbers. Note that if the tensor's kind is Fused, then the fused
393 /// scaled/offsets will not be modified.
394 void init(InitKind init, float val, PseudoRNG &PRNG);
395
396 /// \returns an unowned tensor with the exact same dimensions as this.
397 Tensor getUnowned() const { return getUnowned(dims()); }
398
399 /// \returns unowned tensor using the same data buffer as the current tensor
400 /// but having different dimensions \p dims. \p offsets represents an optional
401 /// offset into the tensor representing the location of the first element to
  /// start a subview from. The returned unowned tensor is essentially a
403 /// different view or subview on the same data.
404 ///
405 /// The lifetime of the returned unowned tensor should be always within
406 /// the lifetime of its parent tensor, i.e. the unowned tensor should not
407 /// outlive its parent tensor.
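  /// An illustrative subview sketch (the shapes and offsets are assumptions):
  ///   Tensor base(ElemKind::FloatTy, {4, 10});
  ///   // A 2x10 view starting at row 1, sharing base's storage.
  ///   Tensor view = base.getUnowned({2, 10}, {1, 0});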
408 Tensor getUnowned(llvm::ArrayRef<dim_t> dims,
409 llvm::ArrayRef<dim_t> offsets = {}) const {
410 Tensor unownedTensor;
411
412 auto *firstElemPtr = getData();
413 if (offsets.size()) {
414 assert(offsets.size() == this->dims().size() &&
415 "Number of dims of tensor must equal number of dims in offsets");
416 // Find the index of the first element and use it to find the pointer to
417 // the first element.
418 size_t index = 0;
419 for (size_t i = 0; i < this->dims().size(); i++) {
420 index += type_.strides()[i] * offsets[i];
421 }
422 firstElemPtr = &firstElemPtr[index * type_.getElementSize()];
423 }
424
425 unownedTensor.data_ = firstElemPtr;
426 unownedTensor.isUnowned_ = true;
427 unownedTensor.type_ = Type::newShape(getType(), dims);
428 unownedTensor.deviceResidency_ = deviceResidency_;
429
430 // If the original base Tensor is padded, then we only allow the unowned
431 // Tensor to be padded if there are no offsets. Otherwise assert that the
432 // base Tensor is not padded, and set unpaddedSize to that of the new
433 // unowned type.
434 if (offsets.size() == 0) {
435 unownedTensor.unpaddedSize_ = unpaddedSize_;
436 assert(actualSize() == unownedTensor.actualSize() &&
437 "The size of the unowned tensor "
438 "should be the same as the size of "
439 "the original tensor");
440
441 } else {
442 unownedTensor.unpaddedSize_ = unownedTensor.type_.getSizeInBytes();
443 assert(getSizeInBytes() == getUnpaddedSizeInBytes() &&
444 "Problematic to get unowned offsetted view of a padded tensor");
445 assert(actualSize() >= unownedTensor.actualSize() &&
446 "The size of the unowned tensor "
447 "should be no greater than the "
448 "size of the original tensor");
449 }
450 return unownedTensor;
451 }
452
453 /// This is the same as \ref getUnowned() but it produces an owned tensor
454 /// instead. \returns owned tensor copied from the data buffer of the current
455 /// tensor but having different dimensions \p dims. \p offsets represents an
456 /// optional offset into the tensor representing the location of the first
457 /// element to start a subview from.
458 Tensor getOwnedSlice(llvm::ArrayRef<dim_t> dims,
459 llvm::ArrayRef<dim_t> offsets = {}) const {
460 assert(!isDeviceResident() && "Tensor must reside on host to access data.");
461 return getUnowned(dims, offsets).clone();
462 }
463
464 /// Reset the shape and type of this tensor to match the shape and type of
465 /// \p other. The size of the buffer is set to \p unpaddedSize unless it is
466 /// negative, which will instead default back to the number of bytes needed
467 /// for the type of \p other.
468 void reset(const Tensor *other, ssize_t unpaddedSize = -1) {
469 reset(other->getType(), unpaddedSize);
470 }
471
472 void reset(ElemKind elemTy, llvm::ArrayRef<dim_t> shape) {
473 Type t(elemTy, shape);
474 reset(t);
475 }
476
477 void reset(ElemKind elemTy, llvm::ArrayRef<dim_t> shape, float scale,
478 int32_t offset) {
479 Type t(elemTy, shape, scale, offset);
480 reset(t);
481 }
482
483 /// Assigns a new shape to the tensor and allocates a new buffer. The size of
484 /// the buffer is set to \p unpaddedSize unless it is negative, which will
485 /// instead default back to the number of bytes needed for \p T.
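  /// An illustrative sketch of re-using a Tensor object across shapes (the
  /// shapes are assumptions):
  ///   Tensor t(ElemKind::FloatTy, {4});
  ///   t.reset(ElemKind::FloatTy, {2, 8}); // re-allocates for the new type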
486 void reset(const Type &T, ssize_t unpaddedSize = -1) {
487 assert(!isDeviceResident() && "Tensor must reside on host to access data.");
488
489 // If negative then fall back to the passed in Type's padded size.
490 if (unpaddedSize < 0) {
491 unpaddedSize = T.getSizeInBytes();
492 }
493
494 // If the new size is identical to the allocated size then there is no need
495 // to re-allocate the buffer.
496 const bool isOrigPadded =
497 getSizeInBytes() != uint64_t(getUnpaddedSizeInBytes());
498 const bool isNewPadded = T.getSizeInBytes() != size_t(unpaddedSize);
499 const bool isBufReuseAllowed =
500 (isOrigPadded == isNewPadded) &&
501 (getUnpaddedSizeInBytes() == size_t(unpaddedSize));
502 if (type_ == T && getData() && isBufReuseAllowed) {
503#ifdef GLOW_DEBUG_TENSOR_INIT
504 PseudoRNG rng;
505 init(InitKind::Broadcast, GLOW_DEBUG_TENSOR_INIT, rng);
506#endif
507 resetDeviceInfo();
508 return;
509 }
510
511 // Delete the old buffer, update the shape, and allocate a new one.
512 if (!isUnowned())
513 alignedFree(getData());
514 type_ = T;
515
516 // We are allocating memory specifically for this tensor, thus, it owns it.
517 isUnowned_ = false;
518
519 // We are allocating memory on the host so it is not device resident.
520 resetDeviceInfo();
521
522 // Note: zero-dimensional tensors (i.e. {}) have size 1. However, Tensors
523 // may have 0 for some dimension, meaning they have size of 0, and so we do
524 // not allocate anything for them.
525 data_ = unpaddedSize == 0 ? nullptr
526 : reinterpret_cast<char *>(alignedAlloc(
527 unpaddedSize, TensorAlignment));
528
529 // Set unpaddedSize_ to the actual number of bytes.
530 unpaddedSize_ = unpaddedSize;
531
532 assert(!(size() < actualSize() &&
533 getSizeInBytes() != getUnpaddedSizeInBytes()) &&
534 "Custom aligned Tensors cannot also be partial");
535
536#ifdef GLOW_DEBUG_TENSOR_INIT
537 PseudoRNG rng;
538 init(InitKind::Broadcast, GLOW_DEBUG_TENSOR_INIT, rng);
539#endif
540 }

  /// Releases the data buffer and sets the unowned flag to true. This is
  /// useful for keeping metadata around but not the actual contents.
543 void release() {
544 if (!isUnowned()) {
545 alignedFree(getData());
546 }
547 if (ownsDeviceResidency_) {
548 delete deviceResidency_;
549 ownsDeviceResidency_ = false;
550 }
551
552 isUnowned_ = true;
553 }
554 ~Tensor() {
555 if (!isUnowned()) {
556 alignedFree(getData());
557 }
558
559 if (ownsDeviceResidency_) {
560 delete deviceResidency_;
561 ownsDeviceResidency_ = false;
562 }
563 }
564
  /// Move ctor.
566 Tensor(Tensor &&other) noexcept {
567 if (!isUnowned()) {
568 alignedFree(getData());
569 }
570 if (ownsDeviceResidency_) {
571 delete deviceResidency_;
572 }
573 data_ = other.data_;
574 type_ = other.type_;
575 isUnowned_ = other.isUnowned_;
576 tensorPool_ = other.tensorPool_;
577 unpaddedSize_ = other.unpaddedSize_;
578 deviceResidency_ = other.deviceResidency_;
579 ownsDeviceResidency_ = other.ownsDeviceResidency_;
580 other.data_ = nullptr;
581 other.isUnowned_ = true;
582 other.tensorPool_ = nullptr;
583 other.deviceResidency_ = nullptr;
584 other.ownsDeviceResidency_ = false;
585 }
586
587 /// Move assignment operator.
588 Tensor &operator=(Tensor &&other) {
589 if (!isUnowned()) {
590 alignedFree(getData());
591 }
592 if (ownsDeviceResidency_) {
593 delete deviceResidency_;
594 }
595 data_ = other.data_;
596 type_ = other.type_;
597 isUnowned_ = other.isUnowned_;
598 tensorPool_ = other.tensorPool_;
599 unpaddedSize_ = other.unpaddedSize_;
600 deviceResidency_ = other.deviceResidency_;
601 ownsDeviceResidency_ = other.ownsDeviceResidency_;
602 other.data_ = nullptr;
603 other.isUnowned_ = true;
604 other.tensorPool_ = nullptr;
605 other.deviceResidency_ = nullptr;
606 other.ownsDeviceResidency_ = false;
607 return *this;
608 }
609
610 /// Dump a textual representation of the Tensor into provided output stream.
611 void dump(llvm::raw_ostream &os) const;
612
613 /// Dump a textual representation of the Tensor into default output stream.
614 void dump() const;
615
616 /// Dump a textual representation of a specific number of elements in the
617 /// Tensor into provided output stream.
618 void dump(llvm::raw_ostream &os, unsigned maxNumElem) const;
619
620 /// Dump a textual representation of a specific number of elements in the
621 /// Tensor into default output stream.
622 void dump(unsigned maxNumElem) const;
623
624 /// Dump a textual representation of the Tensor to std::string.
625 std::string toString() const;
626
627 /// Dump a textual representation of a specific number of elements in the
628 /// Tensor to std::string.
629 std::string toString(unsigned maxNumElem) const;
630
631 /// Dump a textual representation of the shape of this Tensor to std::string.
632 std::string getShapeToString() const;
633
634 /// \returns true if the content of the other tensor \p other is identical to
635 /// this one, given some \p allowedError. If \p verbose and the tensors are
636 /// not equal, then we will log information about the mismatch (number of
637 /// elements exceeding allowed error; maximum error and location found; etc.).
638 bool isEqual(const Tensor &other, float allowedError = 0.0001,
639 bool verbose = true) const {
640 if (isDeviceResident()) {
641 if (!other.isDeviceResident()) {
642 if (verbose) {
643 LOG(INFO) << "Tensors cannot be compared as they are not resident in "
644 "the same location.";
645 }
646 return false;
647 }
648
649 return getDeviceManager() == other.getDeviceManager() &&
650 getLocationContext() == other.getLocationContext();
651 }
652 return isEqualImpl(other, /*isBitwise=*/false, allowedError, verbose);
653 }
654
655 /// \returns true if the content of the other tensor \p other is bitwise
656 /// identical to this one.
657 bool isBitwiseEqual(const Tensor &other, bool verbose = false) const {
658 return isEqualImpl(other, /*isBitwise=*/true, /*allowedError=*/0.0,
659 verbose);
660 }
661
662 bool isEqualImpl(const Tensor &other, bool isBitwise, float allowedError,
663 bool verbose) const {
664 if (other.dims() != dims()) {
665 if (verbose) {
666 LOG(INFO) << "Tensors are not equal as they have different shapes: "
667 << this->getShapeToString() << " vs. "
668 << other.getShapeToString();
669 }
670 return false;
671 }
672
    // For now, make sure that either both or neither of the tensors have
    // UInt8FusedQTy or UInt8FusedFP16QTy. While it is possible for an Int8QTy
    // tensor to equal a fused tensor if the fused tensor has the same
    // scale/offset on all of its rows, and that scale/offset match that of the
    // Int8QTy, we do not support checking this for now.
    assert(((getElementType() == ElemKind::UInt8FusedQTy &&
             other.getElementType() == ElemKind::UInt8FusedQTy) ||
            (getElementType() == ElemKind::UInt8FusedFP16QTy &&
             other.getElementType() == ElemKind::UInt8FusedFP16QTy) ||
            (getElementType() != ElemKind::UInt8FusedQTy &&
             getElementType() != ElemKind::UInt8FusedFP16QTy &&
             other.getElementType() != ElemKind::UInt8FusedQTy &&
             other.getElementType() != ElemKind::UInt8FusedFP16QTy)) &&
           "Fused ElemKinds only support comparing against the same ElemKind.");
685
686 // Assert that the scale and offset match for the quantized types.
687 switch (getElementType()) {
688 default:
689 break;
690 case ElemKind::Int8QTy:
691 case ElemKind::UInt8QTy:
692 case ElemKind::Int16QTy:
693 case ElemKind::Int32QTy:
694 assert(getType().getScale() == other.getType().getScale() &&
695 "Scales must match.");
696 assert(getType().getOffset() == other.getType().getOffset() &&
697 "Offsets must match.");
698 }
699
700 // Bitwise compare.
701 if (isBitwise) {
702 return isBitwiseEqualImpl(other, verbose);
703 }
704
705 switch (getElementType()) {
706 case ElemKind::FloatTy:
707 return isEqualImpl<float>(other, allowedError, verbose);
708 case ElemKind::Float16Ty:
709 return isEqualImpl<float16_t>(other, allowedError, verbose);
710 case ElemKind::BFloat16Ty:
711 return isEqualImpl<bfloat16_t>(other, allowedError, verbose);
712 case ElemKind::Float64Ty:
713 return isEqualImpl<double>(other, allowedError, verbose);
714 case ElemKind::Int8QTy:
715 return isEqualImpl<int8_t>(other, allowedError, verbose);
716 case ElemKind::UInt8QTy:
717 return isEqualImpl<uint8_t>(other, allowedError, verbose);
718 case ElemKind::Int16QTy:
719 return isEqualImpl<int16_t>(other, allowedError, verbose);
720 case ElemKind::Int32QTy:
721 return isEqualImpl<int32_t>(other, allowedError, verbose);
722 case ElemKind::Int64QTy:
723 return isEqualImpl<int64_t>(other, allowedError, verbose);
724 case ElemKind::UInt8ITy:
725 return isEqualImpl<uint8_t>(other, allowedError, verbose);
726 case ElemKind::Int32ITy:
727 return isEqualImpl<int32_t>(other, allowedError, verbose);
728 case ElemKind::Int64ITy:
729 return isEqualImpl<int64_t>(other, allowedError, verbose);
730 // Note: We can use isEqualImpl() here because the scales/offsets will be
731 // compared as if they were data, so we will return false if any rowwise
732 // scale/offset do not match.
733 case ElemKind::UInt8FusedQTy:
734 return isEqualImpl<uint8_t>(other, allowedError, verbose);
735 case ElemKind::UInt8FusedFP16QTy:
736 return isEqualImpl<uint8_t>(other, allowedError, verbose);
737 case ElemKind::UInt4FusedFP16QTy:
738 return isEqualImpl<uint8_t>(other, allowedError, verbose);
739 case ElemKind::UInt4FusedQTy:
740 return isEqualImpl<uint8_t>(other, allowedError, verbose);
741 case ElemKind::BoolTy:
742 return isEqualImpl<bool>(other, allowedError, verbose);
743 }
744
745 // This is to make compiler happy. It can never reach this point as switch
746 // always covers all possible values.
747 llvm_unreachable("unreachable");
748 }
749
750 /// \returns whether this Tensor is tiled (repeated) along \p axis for the
751 /// given tile size \p size. Some examples:
752 /// - A Tensor with size [2, 3] equal to [[1,2,3],[1,2,3]] is tiled along
753 /// axis 0 for a tile size equal to 1.
754 /// - A Tensor with size [2, 4] equal to [[1, 2, 1, 2],[3, 4, 3, 4]] is tiled
755 /// along axis 1 for a tile size equal to 2.
  /// When the tile size matches the dimension size this function returns TRUE.
  /// If the \p fractional flag is given then this function will also perform
  /// fractional tiling verification (default is FALSE). Some examples:
759 /// - For a Tensor with size [5] equal to [1,2,3,1,2], axis 0 and tile size 3,
760 /// this function returns TRUE if \p fractional is TRUE and returns FALSE if
761 /// \p fractional is FALSE.
762 bool isTiled(unsigned_t axis, dim_t size = 1, bool fractional = false) const;
763
764 /// \returns whether this Tensor is tiled (repeated) along \p axes for the
765 /// given tile sizes \p sizes. Some examples:
766 /// - A Tensor with size [2, 4] equal to [[1,2,1,2],[1,2,1,2]] is tiled along
767 /// axes {0,1} for the tile sizes {1,2}.
  /// When the tile sizes match the dimension sizes this function returns TRUE.
  /// If the \p fractional flag is given then this function will also perform
  /// fractional tiling verification (default is FALSE). Some examples:
771 /// - For a Tensor with size [5] equal to [1,2,3,1,2], axes {0} and sizes {3},
772 /// this function returns TRUE if \p fractional is TRUE and returns FALSE if
773 /// \p fractional is FALSE.
774 bool isTiled(llvm::ArrayRef<unsigned_t> axes, llvm::ArrayRef<dim_t> sizes,
775 bool fractional = false) const;
776
777 /// Update the content and type of the tensor from the tensor \p t.
778 void assign(const Tensor *t) {
779 assert(!isDeviceResident() && "Tensor must reside on host to access data.");
780 assert(this != t && "Copying to self");
781 const size_t bufferSize = t->getUnpaddedSizeInBytes();
782 reset(t, bufferSize);
783 std::copy(&t->getData()[0], &t->getData()[bufferSize], getData());
784 }
785
786 /// Update the raw data of the tensor from the tensor \p t.
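  /// Unlike \ref assign(), this does not re-type or re-allocate this tensor.
  /// An illustrative contrast (the tensor names are assumptions):
  ///   dst.assign(&src);      // dst takes src's type, shape, and contents
  ///   dst.copyRawFrom(&src); // dst must already match src's type and size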
787 void copyRawFrom(const Tensor *t) {
788 assert(!isDeviceResident() && "Tensor must reside on host to access data.");
789 assert(this != t && "Copying to self");
790 assert(actualSize() == t->actualSize());
791 assert(getElementType() == t->getElementType() && "Invalid element type");
792 assert(t->getUnpaddedSizeInBytes() == getUnpaddedSizeInBytes() &&
793 "Do not support copying between different unpadded sized tensors");
794 size_t bufferSize = type_.getSizeInBytes();
795 std::copy(&t->getData()[0], &t->getData()[bufferSize], getData());
796 }
797
798 /// Update the raw data of the tensor from a raw buffer \p data.
799 void copyRawFrom(const char *data) {
800 assert(!isDeviceResident() && "Tensor must reside on host to access data.");
801 assert(data && "Null data pointer!");
802 assert(getData() != data && "Copying to self");
803 size_t bufferSize = type_.getSizeInBytes();
804 std::memcpy(getData(), data, bufferSize);
805 }
806
807 /// Update the content of the tensor with a slice from tensor \p t. A slice
808 /// is one index from the first dimension of the tensor.
809 void copySlice(const Tensor *t, size_t slice) {
810 assert(!isDeviceResident() && "Tensor must reside on host to access data.");
811 auto dim = t->dims().slice(1);
812 (void)dim;
813 assert(dim == dims() && "Invalid slice size");
814 assert(getElementType() == t->getElementType() && "Invalid element type");
815
816 size_t bufferSize = type_.getSizeInBytes();
817 std::copy(&t->getData()[bufferSize * slice],
818 &t->getData()[bufferSize * (slice + 1)], getData());
819 }
820
821 /// Update the content of the tensor with a sequence of slices from the
822 /// tensor \p t. A slice is one index from the first dimension of the tensor.
823 /// The copying operation may overlap the end of the tensor \p t one or more
824 /// times. This means that the data in the input tensor may be duplicated.
825 void copyConsecutiveSlices(const Tensor *t, size_t startSliceIdx) {
826 assert(!isDeviceResident() && "Tensor must reside on host to access data.");
827 auto onceSliceDim = t->dims().slice(1);
828 (void)onceSliceDim;
829 assert(onceSliceDim == dims().slice(1) && "Invalid slice size");
830 assert(getElementType() == t->getElementType() && "Invalid element type");
831 assert(dims().size() > 1 && "Tensor must contain at least two dimensions");
832
833 size_t numSlicesInInput = t->dims()[0];
834 size_t numElementsInSlice = actualSize() / dims()[0];
835 size_t bufferSize = numElementsInSlice * type_.getElementSize();
836
837 // For each outer slice in the current tensor:
838 for (size_t n = 0, e = dims()[0]; n < e; n++) {
839 size_t startIdx = (startSliceIdx + n) % numSlicesInInput;
840 std::copy(&t->getData()[bufferSize * startIdx],
841 &t->getData()[bufferSize * (startIdx + 1)],
842 &getData()[bufferSize * n]);
843 }
844 }
845
846 /// Convenience method to copy the content of \p t
847 /// to this while both have different underlying types.
848 /// This copy will read each element of \p t as SrcElemType
849 /// and cast them to DestElemType in this.
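  /// An illustrative sketch (the shapes are assumptions):
  ///   Tensor f32(ElemKind::FloatTy, {8});
  ///   Tensor f16(ElemKind::Float16Ty, {8});
  ///   f16.copyWithCast<float16_t, float>(&f32); // per-element float->float16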
850 template <typename DestElemType, typename SrcElemType>
851 void copyWithCast(const Tensor *t) {
852 assert(!isDeviceResident() && "Tensor must reside on host to access data.");
853 static_assert(!std::is_same<DestElemType, SrcElemType>::value,
854 "Use copyRawFrom instead");
855 assert(this != t && "Copying to self");
856 assert(getElementType() != t->getElementType() &&
857 "Use copyRawFrom instead");
858 assert(actualSize() == t->actualSize() && "Different sizes");
859 const auto *src = t->getRawDataPointer<SrcElemType>();
860 auto *dst = getRawDataPointer<DestElemType>();
861 for (size_t idx = 0, end = actualSize(); idx != end; ++idx) {
862 dst[idx] = DestElemType(src[idx]);
863 }
864 }
865
866 /// Convert each element of this tensor to \p newTy. Calls into
867 /// \ref getCopyConvertedToType() to do the conversion, and hence supports
868 /// converting between whatever ElemKinds it supports.
869 void convertToType(ElemKind newTy);
870
871 /// \returns a copy of the Tensor but converted to \p newKind. Currently
872 /// supports conversion for:
873 /// - FloatTy to Float16Ty
874 /// - FloatTy to BFloat16Ty
875 /// - Float16Ty to FloatTy
876 /// - BFloat16Ty to FloatTy
877 /// - UInt8FusedQTy to UInt8FusedFP16QTy
878 Tensor getCopyConvertedToType(ElemKind newKind) const;
879
880 /// Transpose the tensor \p src into the empty tensor \p dest. Shuffle the
881 /// axis based on the list \p shuffle, where each element is the src index.
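  /// An illustrative sketch (the NHWC-to-NCHW interpretation is an assumption):
  ///   Tensor nhwc(ElemKind::FloatTy, {1, 224, 224, 3});
  ///   Tensor nchw;
  ///   nhwc.transpose(&nchw, {0, 3, 1, 2}); // nchw gets shape {1, 3, 224, 224}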
882 void transpose(Tensor *dest, llvm::ArrayRef<unsigned_t> shuffle) const {
883 assert(!isDeviceResident() && "Tensor must reside on host to access data.");
884 genericTranspose(this, dest, shuffle);
885 }
886
887 /// Create a new copy of the current tensor.
888 Tensor clone() const {
889 assert(!isDeviceResident() && "Tensor must reside on host to access data.");
890 Tensor slice;
891 slice.assign(this);
892 return slice;
893 }
894
895 /// Return the raw unsafe pointer to the tensor payload.
896 char *getUnsafePtr() const { return getData(); }
897
898 /// \returns true if tensor data is stored on a device
899 bool isDeviceResident() const {
900 return deviceResidency_ && deviceResidency_->isDeviceResident();
901 }
902
903 /// Update device residency info with new device manager and context
904 void moveToDevice(DeviceTensorTransferManager *deviceManager,
905 void *locationContext);
906
907 /// If device resident, copy Tensor contents back to host memory and release
908 /// associated device memory.
909 void ensureOnHost();
910
911 /// Updates contents of a device resident Tensor with the data from \p t
912 /// without copying its contents to host.
913 void copyRawToDevice(const Tensor *t);
914
915 /// \returns the pointer to the device manager where the tensor resides.
916 DeviceTensorTransferManager *getDeviceManager() const {
917 assert(deviceResidency_ != nullptr && "DeviceResidencyInfo must exist");
918 assert(deviceResidency_->isDeviceResident() &&
919 "Tensor must be device resident");
920 return deviceResidency_->getDeviceManager();
921 }
922
923 /// \returns the pointer to the location context of where the tensor resides.
924 void *getLocationContext() const {
925 assert(deviceResidency_ != nullptr && "DeviceResidencyInfo must exist");
926 assert(deviceResidency_->isDeviceResident() &&
927 "Tensor must be device resident");
928 return deviceResidency_->getLocationContext();
929 }
930
931 void resetDeviceInfo() {
932 if (deviceResidency_ && ownsDeviceResidency_) {
933 deviceResidency_->clear();
934 return;
935 }
936
937 deviceResidency_ = new DeviceResidencyInfo();
938 ownsDeviceResidency_ = true;
939 }
940
941 /// Clears DeviceResidencyInfo.
942 /// Note that this does not affect the associated DeviceManager or device
943 /// memory.
944 void clearDeviceResidency() {
945 assert(deviceResidency_ != nullptr && "DeviceResidencyInfo must exist");
946 assert(deviceResidency_->isDeviceResident() &&
947 "Tensor must be device resident");
948 deviceResidency_->clear();
949 }
950
951 /// \return a new handle that points and manages this tensor.
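  /// An illustrative sketch of typical handle usage (the shape and values are
  /// assumptions):
  ///   Tensor t(ElemKind::FloatTy, {2, 3});
  ///   auto h = t.getHandle<float>();
  ///   h.clear(0.0f);
  ///   h.at({1, 2}) = 42.0f;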
952 template <class ElemTy = float> Handle<ElemTy> getHandle() &;
953
954 template <class ElemTy = float> const Handle<ElemTy> getHandle() const &;
955
956 /// If Tensor is rvalue, it is an error to get its Handle.
957 template <class ElemTy = float> Handle<ElemTy> getHandle() && = delete;
958
959private:
960 /// \returns a pointer to the raw data, of type \p ElemTy.
961 template <class ElemTy> ElemTy *getRawDataPointer() {
962 assert(!isDeviceResident() && "Tensor must reside on host to access data.");
963 assert(type_.isType<ElemTy>() && "Asking for the wrong ptr type.");
964 return reinterpret_cast<ElemTy *>(data_);
965 }
966
967 /// \returns a const pointer to the raw data, of type \p ElemTy.
968 template <class ElemTy> const ElemTy *getRawDataPointer() const {
969 assert(!isDeviceResident() && "Tensor must reside on host to access data.");
970 assert(type_.isType<ElemTy>() && "Asking for the wrong ptr type.");
971 return reinterpret_cast<const ElemTy *>(data_);
972 }
973
974 template <class ElemTy>
975 bool isEqualImpl(const Tensor &other, float allowedError,
976 bool verbose) const {
977 assert(!isDeviceResident() && "Tensor must reside on host to access data.");
978 auto thisHandle = getHandle<ElemTy>();
979 auto otherHandle = other.getHandle<ElemTy>();
980 double maxFoundError = 0.0;
981 size_t numExceedingError = 0;
982 size_t currIndex = 0;
983 size_t maxFoundErrorIdx = 0;
984 double maxRE = 0.0; // relative error.
985 size_t maxREIdx = 0;
986 for (auto thisHandleIt = thisHandle.begin(),
987 otherHandleIt = otherHandle.begin();
988 thisHandleIt != thisHandle.end() && otherHandleIt != otherHandle.end();
989 ++thisHandleIt, ++otherHandleIt, ++currIndex) {
990 double delta = *thisHandleIt - *otherHandleIt;
991 delta = std::abs(delta);
992 // Since any comparison with NAN returns false, we use a negated condition
993 // so that this function correctly returns false when delta is NAN.
994 if (!(delta <= allowedError)) {
995 if (!verbose) {
996 return false;
997 }
998 numExceedingError += 1;
999 if (!(delta <= maxFoundError)) {
1000 maxFoundError = delta;
1001 maxFoundErrorIdx = currIndex;
1002 }
1003 double sum = *thisHandleIt + *otherHandleIt;
1004 double re = delta / std::abs(sum);
1005 if (!(re <= maxRE)) {
1006 maxRE = re;
1007 maxREIdx = currIndex;
1008 }
1009 }
1010 }
1011 auto thisHandleIt = thisHandle.begin();
1012 auto otherHandleIt = otherHandle.begin();
1013 if (numExceedingError != 0) {
1014 LOG(INFO) << "Tensors not equal: " << numExceedingError << " out of "
1015 << actualSize() << " elements exceeded allowed error threshold "
1016 << allowedError << ". Maximum error found was " << maxFoundError
1017 << " at index " << maxFoundErrorIdx << ": "
1018 << *(thisHandleIt.operator+(maxFoundErrorIdx)) << " vs. "
1019 << *(otherHandleIt.operator+(maxFoundErrorIdx));
1020 LOG(INFO) << "Maximum relative error found was: " << maxRE
1021 << " at index: " << maxREIdx << ": "
                << *(thisHandleIt.operator+(maxREIdx)) << " vs. "
1023 << *(otherHandleIt.operator+(maxREIdx));
1024 }
1025 return numExceedingError == 0;
1026 }
1027
1028 bool isBitwiseEqualImpl(const Tensor &other, bool verbose) const {
1029 assert(!isDeviceResident() && "Tensor must reside on host to access data.");
1030 auto const *myData = getUnsafePtr();
1031 auto const *otherData = other.getUnsafePtr();
1032 dim_t mismatchCount = 0;
1033
1034 if (verbose) {
1035 for (size_t i = 0, e = getSizeInBytes(); i < e; i++) {
1036 if (myData[i] != otherData[i]) {
1037 ++mismatchCount;
1038 }
1039 }
1040 if (mismatchCount != 0) {
1041 LOG(INFO) << "Tensors not bitwise equal: " << mismatchCount
1042 << " bytes out of " << getSizeInBytes() << " mismatched.";
1043 }
1044 } else {
1045 mismatchCount = memcmp(myData, otherData, getSizeInBytes());
1046 }
1047
1048 return mismatchCount == 0;
1049 }
1050};
1051
1052//===----------------------------------------------------------------------===//
1053// Tensor Handle
1054//===----------------------------------------------------------------------===//
1055
1056constexpr unsigned MAX_DUMP_ELEMS = 100;
1057
1058void dumpAsciiImpl(const Tensor *T, llvm::raw_ostream &os);
1059void dumpAsciiImpl(const Tensor *T);
1060
1061void dumpImpl(const Tensor *T, llvm::raw_ostream &os,
1062 unsigned maxNumElem = MAX_DUMP_ELEMS);
1063void dumpImpl(const Tensor *T, unsigned maxNumElem);
1064void dumpImpl(const Tensor *T);
1065
1066template <class ElemTy> class Handle;
1067
1068/// A class that provides ability to iterate over a Handle<ElemTy>. Since it's
1069/// common to have both mutating and const iterators, this class has template
1070/// parameter IsConst, which is true to create const_iterator and false
1071/// otherwise.
1072template <class ElemTy, bool IsConst>
1073class HandleIterator
1074 : public std::iterator<std::random_access_iterator_tag, ElemTy> {
1075 using HandleTy = typename std::conditional_t<IsConst, const Handle<ElemTy> *,
1076 Handle<ElemTy> *>;
1077 using ElemTyRef =
1078 typename std::conditional_t<IsConst, const ElemTy &, ElemTy &>;
1079
1080 /// At every given moment, the iterator maintains an index, which is used to
1081 /// access the Handle. When moving the iterator forward, the index is
1082 /// incremented. Only valid elements can be accessed.
1083 /// 0 <= idx_ <= handle_->size()
1084 HandleTy handle_;
1085 llvm::ArrayRef<dim_t> sizes_;
1086 dim_t idx_;
1087 /// Holds true if the underlying tensor has non-trivial alignment (i.e. not 1)
1088 bool isAligned_;
1089
1090 HandleIterator() = default;
1091
1092 HandleIterator(HandleTy handle) : handle_(handle) {
1093 sizes_ = handle->dims();
1094 isAligned_ = handle->size() < handle->actualSize();
1095 }
1096
1097 static HandleIterator begin(HandleTy handle) {
1098 auto res = HandleIterator(handle);
1099 res.idx_ = 0;
1100 return res;
1101 }
1102
1103 static HandleIterator end(HandleTy handle) {
1104 auto res = HandleIterator(handle);
1105 res.idx_ = res.handle_->getRealNumElements();
1106 return res;
1107 }
1108
1109 friend class Handle<ElemTy>;
1110
1111public:
1112 HandleIterator &operator++() {
1113 if (*this != handle_->end()) {
1114 idx_++;
1115 }
1116 return *this;
1117 }
1118 HandleIterator &operator--() {
1119 if (idx_) {
1120 idx_--;
1121 }
1122 return *this;
1123 }
1124 HandleIterator operator+(int n) const {
1125 auto res = HandleIterator(handle_);
1126 res.idx_ = std::max(static_cast<int>(idx_) + n, 0);
1127 res.idx_ = std::min(res.idx_, res.handle_->size());
1128 return res;
1129 }
1130 HandleIterator operator-(int n) const { return *this + (-n); }
1131 operator int() const { return idx_; }
1132
1133 ElemTyRef operator*() {
1134 if (!isAligned_) {
1135 return handle_->raw(idx_);
1136 }
1137 std::vector<dim_t> indices(sizes_.size(), 0);
1138 size_t rem = idx_;
1139 for (int i = static_cast<int>(sizes_.size()) - 1; i >= 0; i--) {
1140 indices[i] = rem % sizes_[i];
1141 rem /= sizes_[i];
1142 }
1143 return handle_->at(indices);
1144 }
1145
1146 bool operator==(const HandleIterator<ElemTy, IsConst> &other) const {
1147 return idx_ == other.idx_;
1148 }
1149
1150 bool operator!=(const HandleIterator<ElemTy, IsConst> &other) const {
1151 return !(*this == other);
1152 }
1153};
1154
1155/// Helper which \returns the flattened 1D offset given \p indices into a tensor
1156/// with \p strides.
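/// For example (a sketch; the strides below are those of a contiguous 2x3
/// tensor): with strides {3, 1} and indices {1, 2} the flattened offset is
/// 1 * 3 + 2 * 1 = 5.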
1157inline size_t getFlattenedOffset(llvm::ArrayRef<dim_t> strides,
1158 llvm::ArrayRef<dim_t> indices) {
1159 assert(indices.size() <= strides.size() && "Invalid number of indices");
1160 // The loop below can be rewritten using std::inner_product. Unfortunately
1161 // std::inner_product does not optimize very well and loops that use this
1162 // method don't get vectorized. Don't change this loop without benchmarking
1163 // the program on a few compilers.
1164 size_t index = 0;
1165 for (size_t i = 0, e = indices.size(); i < e; i++) {
1166 index += size_t(strides[i]) * size_t(indices[i]);
1167 }
1168
1169 return index;
1170}
1171
1172/// Helper function which \returns true if a slice with the shape \p sliceShape
1173/// referenced from a larger tensor with the shape \p tensorShape is contiguous
1174/// in memory (assuming the tensor it is referenced from is contiguous). This
1175/// happens when the slice dimensions:
1176/// - Start with singleton dimensions (dimensions equal to 1).
1177/// - Continue with a partially extracted dimension (one maximum).
1178/// - End with fully extracted dimensions.
1179bool isSliceContiguous(llvm::ArrayRef<dim_t> sliceShape,
1180 llvm::ArrayRef<dim_t> tensorShape);
1181
1182/// A class that provides indexed access to a tensor. This class has value
1183/// semantics and it's copied around. One of the reasons for making this class
1184/// value semantics is to allow efficient index calculation that the compiler
1185/// can optimize (because stack allocated structures don't alias).
1186template <class ElemTy> class Handle final {
1187 /// A pointer to the tensor that this handle wraps.
1188 Tensor *tensor_{nullptr};
1189
  /// Contains the multiplication of the sizes from current position to end.
  /// For example, for index (w,x,y,z): [x * y * z, y * z, z, 1]
1192 dim_t sizeIntegral_[max_tensor_dimensions] = {
1193 0,
1194 };
1195
1196 dim_t sizes_[max_tensor_dimensions] = {
1197 0,
1198 };
1199
1200 /// Saves the number of dimensions used in the tensor.
1201 uint8_t numDims_{0};
1202
1203 /// Remember end iterators. This is needed to speed up iterator increment,
1204 /// which has to check that iterator hasn't reached the end yet.
1205 HandleIterator<ElemTy, false> mutating_end_;
1206 HandleIterator<ElemTy, true> const_end_;
1207
1208 /// Create a new invalid handle. Notice that this method is private and may
1209 /// only be used by the static factory method below.
1210 Handle() = default;
1211
1212public:
1213 /// \returns an iterator to the first element of the tensor.
1214 HandleIterator<ElemTy, false> begin() {
1215 return HandleIterator<ElemTy, false>::begin(this);
1216 }
1217 HandleIterator<ElemTy, true> begin() const {
1218 return HandleIterator<ElemTy, true>::begin(this);
1219 }
1220
1221 /// \returns an iterator referring to the past-the-end element.
1222 HandleIterator<ElemTy, false> end() { return mutating_end_; }
1223 HandleIterator<ElemTy, true> end() const { return const_end_; }
1224
1225 /// Allocate a new invalid handle.
1226 static Handle createInvalidHandle() { return Handle(); }
1227
1228 /// \returns true if this Handle points to a valid tensor.
1229 bool isValid() const { return tensor_; }
1230
1231 /// Calculate the index for a specific element in the tensor. Notice that
1232 /// the list of indices may be incomplete. This method provides access to
1233 /// padding elements, meaning that it's possible to get an index pointing at
1234 /// data, added to meet alignment requirements.
1235 size_t getElementPtr(llvm::ArrayRef<dim_t> indices) const {
1236 return getFlattenedOffset(llvm::makeArrayRef(sizeIntegral_, numDims_),
1237 indices);
1238 }
1239
1240 /// \returns the value of the n'th dimension \p dim, for the index \p idx.
  /// 0 <= idx < size(), meaning that \p idx addresses a real data element,
  /// not padding.
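  /// For example (a sketch for a 2x3 tensor, i.e. sizes_ = {2, 3}): idx = 4
  /// corresponds to element (1, 1), so getDimForPtr(0, 4) == 1 and
  /// getDimForPtr(1, 4) == 1.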
1243 size_t getDimForPtr(size_t dim, size_t idx) const {
1244 assert(dim < numDims_ && "Invalid dimension");
1245 assert(idx < size() && "Invalid index");
1246 auto R = idx;
1247 for (size_t i = dim + 1; i < numDims_; i++) {
1248 R /= sizes_[i];
1249 }
1250 return R % sizes_[dim];
1251 }
1252
1253 /// \returns the type of the tensor.
1254 const Type &getType() const { return tensor_->getType(); }
1255
1256 /// \returns the element type of the tensor.
1257 ElemKind getElementType() const { return tensor_->getElementType(); }
1258
1259 /// Construct a Tensor handle.
1260 explicit Handle(Tensor *tensor) : tensor_(tensor) {
1261 auto sizes = tensor->dims();
1262 numDims_ = sizes.size();
1263
1264 /// We allow handles that wrap uninitialized tensors.
1265 if (numDims_) {
1266 // Copy the sizes of the tensor.
1267 memcpy(sizes_, tensor_->type_.sizes_,
1268 max_tensor_dimensions * sizeof(sizes_[0]));
1269 // Copy the strides of the tensor.
1270 memcpy(sizeIntegral_, tensor_->type_.strides_,
1271 max_tensor_dimensions * sizeof(tensor_->type_.strides_[0]));
1272 assert(numDims_ <= max_tensor_dimensions && "Too many dimensions.");
1273 }
1274
1275 mutating_end_ = HandleIterator<ElemTy, false>::end(this);
1276 const_end_ = HandleIterator<ElemTy, true>::end(this);
1277 }
1278
1279 llvm::ArrayRef<dim_t> dims() const {
1280 return llvm::ArrayRef<dim_t>(sizes_, numDims_);
1281 }
1282
1283 /// \returns the number of elements in the whole tensor.
1284 dim_t size() const { return tensor_->size(); }
1285
1286 /// \returns the actual number of elements in the tensor taking striding into
1287 /// account. Since size() does not take striding into account, size() is
1288 /// always <= actualSize().
1289 dim_t actualSize() const { return tensor_->actualSize(); }
1290
1291 /// \returns the unpadded size of the underlying \ref tensor_.
1292 size_t getUnpaddedSizeInBytes() const {
1293 return tensor_->getUnpaddedSizeInBytes();
1294 }
1295
1296 /// \returns the number of unpadded elements in the underlying \ref tensor_.
1297 size_t getRealNumElements() const { return tensor_->getRealNumElements(); }
1298
1299 bool isInBounds(llvm::ArrayRef<dim_t> indices) const {
1300 return tensor_->isInBounds(indices);
1301 }
1302
1303 void clear(ElemTy value = 0) { std::fill(begin(), end(), value); }
1304
1305 /// Returns reference to a meaningful data element. This method does not
1306 /// address padding elements.
1307 ElemTy &at(llvm::ArrayRef<dim_t> indices) {
1308 size_t index = getElementPtr(indices);
1309 auto *data = tensor_->getRawDataPointer<ElemTy>();
1310 return data[index];
1311 }
1312
1313 const ElemTy &at(llvm::ArrayRef<dim_t> indices) const {
1314 size_t index = getElementPtr(indices);
1315 auto *data = tensor_->getRawDataPointer<ElemTy>();
1316 return data[index];
1317 }
1318
1319 /// \returns the element at offset \p idx without any size calculations.
1320 /// The returned element can be a pad element.
1321 ElemTy &raw(size_t index) {
1322 auto *data = tensor_->getRawDataPointer<ElemTy>();
1323 return data[index];
1324 }
1325
1326 /// \returns the element at offset \p idx without any size calculations.
1327 /// The returned element can be a pad element.
1328 const ElemTy &raw(size_t index) const {
1329 auto *data = tensor_->getRawDataPointer<ElemTy>();
1330 return data[index];
1331 }
1332
1333 /// Extract a smaller dimension tensor from a specific slice (that has to be
1334 /// the first dimension).
1335 Tensor extractSlice(size_t idx) const {
1336 auto sizes = tensor_->dims();
1337 assert(sizes.size() > 1 && "Tensor must have at least two dimensions");
1338 assert(idx < sizes[0] && "Invalid first index");
1339
1340 Tensor slice{Type::newShape(tensor_->getType(), sizes.slice(1),
1341 tensor_->type_.strides().slice(1))};
1342
1343 // Extract the whole slice.
1344 size_t startIdx = sizeIntegral_[0] * idx;
1345 ElemTy *base = tensor_->getRawDataPointer<ElemTy>() + startIdx;
1346 auto *dest = slice.getRawDataPointer<ElemTy>();
1347 std::copy(base, base + sizeIntegral_[0], dest);
1348
1349 return slice;
1350 }
1351
1352 /// Insert a smaller dimension tensor into a larger tensor at a specific
1353 /// first-dimension index.
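  /// An illustrative sketch using \ref extractSlice() above together with this
  /// method (the shapes are assumptions):
  ///   Tensor batch(ElemKind::FloatTy, {10, 28, 28});
  ///   auto h = batch.getHandle<float>();
  ///   Tensor third = h.extractSlice(3); // copy of slice 3, shape {28, 28}
  ///   h.insertSlice(third, 0);          // overwrite slice 0 with that copy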
1354 void insertSlice(const Tensor &slice, size_t idx) {
1355 auto dims = tensor_->dims();
1356 (void)dims;
1357 assert(getElementType() == slice.getElementType());
1358 assert(dims.size() > 1 && "Tensor must have at least two dimensions");
1359 assert(idx < dims[0] && "Invalid first index");
1360
1361 auto sliceSize = sizeIntegral_[0];
1362 size_t startIdx = sliceSize * idx;
1363 ElemTy *base = &raw(startIdx);
    const ElemTy *slicePtr = slice.getRawDataPointer<ElemTy>();
1365 std::copy(slicePtr, slicePtr + sliceSize, base);
1366 }
1367
1368 /// Create a new copy of the current tensor.
1369 Tensor clone() const { return tensor_->clone(); }
1370
1371 /// Update the content of the tensor from a literal list:
1372 void operator=(const std::initializer_list<ElemTy> &vec) {
1373 assert(actualSize() == vec.size() && "Invalid input size.");
1374 size_t i = 0;
1375 for (auto &e : vec) {
1376 raw(i++) = e;
1377 }
1378 }
1379
1380 void operator=(llvm::ArrayRef<ElemTy> array) {
1381 assert(actualSize() == array.size() && "Invalid input size.");
1382 std::copy(array.begin(), array.end(), &raw(0));
1383 }
1384
1385 void dumpAscii(llvm::raw_ostream &os) const { dumpAsciiImpl(tensor_, os); }
1386 void dumpAscii() const { dumpAsciiImpl(tensor_); }
1387
1388 /// \returns the raw indices of a min and max values from the tensor.
1389 /// In case of multiple min or max, the smallest index is returned.
1390 std::pair<dim_t, dim_t> minMaxArg() const {
1391 ElemTy max = raw(0);
1392 ElemTy min = raw(0);
1393
1394 size_t maxIdx = 0;
1395 size_t minIdx = 0;
1396
1397 for (size_t i = 1, e = actualSize(); i < e; i++) {
1398 ElemTy val = raw(i);
1399 if (val > max) {
1400 max = val;
1401 maxIdx = i;
1402 } else if (val < min) {
1403 min = val;
1404 minIdx = i;
1405 }
1406 }
1407
1408 return std::make_pair(minIdx, maxIdx);
1409 }
1410
1411 /// \returns true if tensor contains only elements equal to zero.
1412 /// \p allowedError represents the delta from zero that is allowed before
1413 /// returning false.
1414 bool isZero(float allowedError = 0.0) const {
1415#define RETURN_WHETHER_FUSED_IS_ZERO(DATA_TYPE) \
1416 assert(dims().size() == 2 && "Fused tensor must be 2-dimensional."); \
1417 assert(dims()[1] > 2 * sizeof(DATA_TYPE) && \
1418 "Fused tensor must have space for scale/offset."); \
1419 const dim_t dataWidth = dims()[1]; \
1420 const dim_t alignedLength = tensor_->getType().strides()[0]; \
1421 auto *data = reinterpret_cast<uint8_t *>(tensor_->getUnsafePtr()); \
1422 for (dim_t i = 0, e = dims()[0]; i < e; i++) { \
1423 uint8_t *scaleOffsetPtr = \
1424 data + i * alignedLength + dataWidth - 2 * sizeof(DATA_TYPE); \
1425 DATA_TYPE scale, offset; \
1426 memcpy(&scale, scaleOffsetPtr, sizeof(DATA_TYPE)); \
1427 memcpy(&offset, scaleOffsetPtr + sizeof(DATA_TYPE), sizeof(DATA_TYPE)); \
1428 for (dim_t j = 0, e = dataWidth - 2 * sizeof(DATA_TYPE); j < e; j++) { \
1429 float currVal = (at({i, j}) * (float)scale) + (float)offset; \
1430 if (std::abs(currVal) > allowedError) { \
1431 return false; \
1432 } \
1433 } \
1434 } \
1435 return true;
1436
1437 if (getElementType() == ElemKind::UInt8FusedQTy) {
1438 RETURN_WHETHER_FUSED_IS_ZERO(float);
1439 }
1440 if (getElementType() == ElemKind::UInt8FusedFP16QTy) {
1441 RETURN_WHETHER_FUSED_IS_ZERO(float16_t);
1442 }
1443#undef RETURN_WHETHER_FUSED_IS_ZERO
1444
1445 int32_t trueZero = getType().isQuantizedType() ? getType().getOffset() : 0;
1446 return std::all_of(begin(), end(), [=](ElemTy e) { return e == trueZero; });
1447 }
1448
1449 void dump(llvm::raw_ostream &os, unsigned maxNumElem = MAX_DUMP_ELEMS) const {
1450 dumpImpl(tensor_, os, maxNumElem);
1451 }
1452 void dump(unsigned maxNumElem) const { dumpImpl(tensor_, maxNumElem); }
1453 void dump() const { dumpImpl(tensor_, MAX_DUMP_ELEMS); }
1454
  /// Fill the tensor with random data that is close to zero using the
  /// Xavier method, based on the paper [Bengio and Glorot 2010].
  /// This type of initialization facilitates better training performance.
  /// The parameter \p filterSize is the number of "input" neurons in the
  /// tensor (or the relevant slice). For example, consider the case of MatMul:
  /// NxM (\p input) * MxK (\p weights) == NxK (\p result)
  /// The correct \p filterSize for the weights tensor is M, so that the norm
  /// of each row of \p input equals the norm of the corresponding row of
  /// \p result.
1463 void initXavier(size_t filterSize, PseudoRNG &PRNG) {
1464 assert(filterSize > 0 && "invalid filter size");
1465 assert(getType().isFPType() &&
1466 "Only support floating point Xavier initialization.");
1467 double scale = std::sqrt(3.0 / double(filterSize));
1468 std::uniform_real_distribution<> dist(-scale, scale);
1469 for (auto &e : *this) {
1470 e = dist(PRNG);
1471 }
1472 }
1473
1474 /// Fill the tensor with uniformly distributed values in the range
1475 /// [low .. high).
1476 template <typename T = ElemTy>
1477 typename std::enable_if<std::is_floating_point<T>::value>::type
1478 randomize(float low, float high, PseudoRNG &PRNG) {
1479 assert(low <= high && "invalid range");
1480 std::uniform_real_distribution<ElemTy> dist(low, high);
1481 for (auto &elem : *this) {
1482 elem = dist(PRNG);
1483 }
1484 }
1485
  /// Fill the tensor with uniformly distributed values in the range
  /// [low .. high]. For fused rowwise-quantized tensors, the per-row
  /// scales/offsets are left unchanged.
1488 template <typename T = ElemTy>
1489 typename std::enable_if<std::is_integral<T>::value>::type
1490 randomize(int low, int high, PseudoRNG &PRNG) {
1491 assert(low <= high && "invalid range");
1492 assert(low >= std::numeric_limits<ElemTy>::lowest() &&
1493 high <= std::numeric_limits<ElemTy>::max() &&
1494 "Cannot initialize outside range of representable values.");
1495 std::uniform_int_distribution<long long> dist(low, high);
1496 switch (getElementType()) {
1497 default: {
1498 for (auto &elem : *this) {
1499 elem = dist(PRNG);
1500 }
1501 return;
1502 }
1503
1504#define FUSED_CASE(ELEM_KIND, DATA_TYPE) \
1505 case ElemKind::ELEM_KIND: { \
1506 assert(dims().size() == 2 && "Fused tensor must be 2-dimensional."); \
1507 assert(dims()[1] > 2 * sizeof(DATA_TYPE) && \
1508 "Fused tensor must have space for scale/offset."); \
1509 for (dim_t i = 0, e = dims()[0]; i < e; i++) { \
1510 for (dim_t j = 0, f = dims()[1] - 2 * sizeof(DATA_TYPE); j < f; j++) { \
1511 at({i, j}) = dist(PRNG); \
1512 } \
1513 } \
1514 return; \
1515 }
1516 FUSED_CASE(UInt8FusedQTy, float);
1517 FUSED_CASE(UInt8FusedFP16QTy, float16_t);
1518#undef FUSED_CASE
1519 }
1520 }
1521
  /// Fill the tensor with uniformly distributed values in the range
  /// [low .. high). This overload is used for element types that are neither
  /// standard floating-point nor integral (e.g. float16_t).
1524 template <typename T = ElemTy>
1525 typename std::enable_if<!std::is_floating_point<T>::value &&
1526 !std::is_integral<T>::value>::type
1527 randomize(float low, float high, PseudoRNG &PRNG) {
1528 assert(low <= high && "invalid range");
1529 std::uniform_real_distribution<float> dist(low, high);
1530 for (auto &elem : *this) {
1531 elem = dist(PRNG);
1532 }
1533 }
1534
1535 /// \returns the mean and variance of the tensor.
1536 std::pair<double, double> calculateMeanVariance() const {
1537 size_t n = actualSize();
1538 assert(n > 1 && "Input must have at least 2 elements.");
1539
1540 // Calculate mean.
1541 double mean = 0;
1542 for (size_t i = 0; i < n; i++) {
      mean += raw(i);
1544 }
1545 mean /= n;
1546
1547 // Calculate variance.
1548 double var = 0;
1549 for (size_t i = 0; i < n; i++) {
      double t = raw(i) - mean;
1551 var += t * t;
1552 }
1553 var /= (n - 1);
1554
1555 return {mean, var};
1556 }
1557
  /// Insert the tensor \p slice at location \p offset, \p count times along
  /// the \p axis. This operation is equivalent to scanning the source tensor
  /// and saving the value stored at coordinate {d_0, d_1, ... d_n} into the
  /// new tensor at {d_0 + O_0, d_1 + O_1, ... d_n + O_n}, where O is the
  /// offset vector, assuming \p count = 1. For \p count > 1, the same tensor
  /// is copied \p count times along the given \p axis. The tensors must have
  /// compatible dimensions.
1565 void insertTensors(Handle<ElemTy> &slice, llvm::ArrayRef<dim_t> offset,
1566 size_t count = 1, size_t axis = 0) {
1567 auto sliceCoor = slice.dims().vec();
1568 auto fusedCoor = dims().vec();
1569 insertTensorsImpl(sliceCoor, fusedCoor, slice, true, offset, count, axis,
1570 0);
1571 }
1572
  /// Extract the tensor \p slice at location \p offset. This operation is
  /// equivalent to scanning the destination tensor and copying into the cell
  /// at coordinate {d_0, d_1, ... d_n} the value stored in this tensor at
  /// {d_0 + O_0, d_1 + O_1, ... d_n + O_n}, where O is the offset vector. The
  /// tensors must have compatible dimensions.
1578 void extractTensors(Handle<ElemTy> &slice, llvm::ArrayRef<dim_t> offset) {
1579 auto sliceCoor = slice.dims().vec();
1580 auto fusedCoor = dims().vec();
1581 insertTensorsImpl(sliceCoor, fusedCoor, slice, false, offset, /* count */ 1,
1582 /* axis */ 0, 0);
1583 }
1584
  /// \returns a pair of the scale and offset from row \p rowIdx of a
  /// FusedRowwiseQuantized Tensor.
1587 template <typename T>
1588 std::pair<T, T> getFusedScaleOffsetFromRow(dim_t rowIdx) {
1589 ElemTy *rowScaleOffsetPtr = getFusedRowScaleOffsetPtr<T>(rowIdx);
1590 T scale;
1591 T offset;
1592 memcpy(&scale, rowScaleOffsetPtr, sizeof(T));
1593 memcpy(&offset, rowScaleOffsetPtr + sizeof(T), sizeof(T));
1594 return std::make_pair(scale, offset);
1595 }
1596
  /// Sets the \p scale and \p offset of row \p rowIdx of a
  /// FusedRowwiseQuantized Tensor.
1599 template <typename T>
1600 void setFusedScaleOffsetInRow(dim_t rowIdx, T scale, T offset) {
1601 ElemTy *rowScaleOffsetPtr = getFusedRowScaleOffsetPtr<T>(rowIdx);
    memcpy(rowScaleOffsetPtr, &scale, sizeof(T));
    memcpy(rowScaleOffsetPtr + sizeof(T), &offset, sizeof(T));
1606 }
1607
1608private:
  /// Concatenates or extracts a slice from a tensor.
  /// \p sliceCoor and \p fusedCoor are temporary storage that the function
  /// uses to construct the coordinates for accessing the tensors. They must be
  /// initialized to the size of the shape of the tensor. \p slice is the
  /// tensor to insert into, or extract from, this handle's tensor (the fused
  /// tensor). \p offset is the per-dimension offset of the slice to add or
  /// extract. \p d is the recursion depth parameter that tracks the current
  /// dimension. If \p isInsert is set, data is copied from \p slice to the
  /// fused tensor; otherwise data is copied from the fused tensor to
  /// \p slice. \p count and \p axis are used in conjunction for inserting the
  /// same tensor \p count times along the \p axis.
1621 void insertTensorsImpl(llvm::MutableArrayRef<dim_t> sliceCoor,
1622 llvm::MutableArrayRef<dim_t> fusedCoor,
1623 Handle<ElemTy> &slice, bool isInsert,
1624 llvm::ArrayRef<dim_t> offset, size_t count,
1625 size_t axis, unsigned d) {
1626 bool isDone = (d == slice.dims().size());
1627
1628 if (isDone) {
1629 if (isInsert) {
1630 at(fusedCoor) = slice.at(sliceCoor);
1631 } else {
1632 slice.at(sliceCoor) = at(fusedCoor);
1633 }
1634 return;
1635 }
1636
1637 // Only need to iterate over count if the current dimension d is equal to
1638 // the axis we're inserting over.
1639 const size_t countIters = (axis == d) ? count : 1;
1640 for (size_t c = 0; c < countIters; c++) {
1641 for (size_t i = 0, e = slice.dims()[d]; i < e; i++) {
1642 // Construct the coordinates for the slice and for the joint shape.
1643 // Add the 'offset' to the dimension that we concat the shapes on.
1644 sliceCoor[d] = i;
1645 // If this is the correct axis to insert multiple times then calculate
1646 // the additional offset to use.
1647 const size_t countAxisOffset = (axis == d) ? c * slice.dims()[d] : 0;
1648 fusedCoor[d] = i + offset[d] + countAxisOffset;
1649 insertTensorsImpl(sliceCoor, fusedCoor, slice, isInsert, offset, count,
1650 axis, d + 1);
1651 }
1652 }
1653 }
1654
  /// Given a Fused tensor, \returns a pointer to the scale and offset of type
  /// \p T for row \p rowIdx.
1657 template <typename T> ElemTy *getFusedRowScaleOffsetPtr(dim_t rowIdx) {
1658 switch (getElementType()) {
1659 case ElemKind::UInt8FusedQTy:
1660 case ElemKind::UInt4FusedQTy: {
1661 constexpr auto isFloat = std::is_same<float, T>::value;
1662 DCHECK(isFloat) << "Expected float scale/offset";
1663 break;
1664 }
1665 case ElemKind::UInt4FusedFP16QTy:
1666 case ElemKind::UInt8FusedFP16QTy: {
1667 constexpr auto isFloat16 = std::is_same<float16_t, T>::value;
1668 DCHECK(isFloat16) << "Expected float16_t scale/offset";
1669 break;
1670 }
1671 default:
1672 llvm_unreachable("Must be used with Tensor of supported Fused ElemKind");
1673 }
1674
1675 static_assert(std::is_same<uint8_t, ElemTy>::value,
1676 "Handle of current Fused tensors expected to be uint8_t.");
1677 const dim_t colIdx = dims()[1] - 2 * sizeof(T);
1678 return &at({rowIdx, colIdx});
1679 }
1680};
1681
/// \returns a typed Handle that provides indexed access to the tensor's data.
/// The tensor must be host-resident and \p ElemTy must match the tensor's
/// element type.
template <class ElemTy> Handle<ElemTy> Tensor::getHandle() & {
1683 assert(!isDeviceResident() && "Tensor must reside on host to access data.");
1684 assert(type_.isType<ElemTy>() && "Getting a handle to the wrong type.");
1685 return Handle<ElemTy>(this);
1686}
1687
/// \returns a constant typed Handle to the tensor's data. See the non-const
/// overload above for the preconditions.
template <class ElemTy> const Handle<ElemTy> Tensor::getHandle() const & {
1689 assert(!isDeviceResident() && "Tensor must reside on host to access data.");
1690 assert(type_.isType<ElemTy>() && "Getting a handle to the wrong type.");
1691 return Handle<ElemTy>(const_cast<Tensor *>(this));
1692}
1693
/// Prints the textual representation of the Tensor \p t to \p os.
llvm::raw_ostream &operator<<(llvm::raw_ostream &os, const Tensor &t);

/// Prints the textual representation of the Tensor pointed to by \p t to
/// \p os.
llvm::raw_ostream &operator<<(llvm::raw_ostream &os, const Tensor *t);
1697} // namespace glow
1698
1699#endif // GLOW_BASE_TENSOR_H
1700