/* Copyright 2019 Google LLC. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// Internal types and helpers for matrices.
// "Mat" is the name we use to refer to our internal matrix classes; it can be
// thought of as a shorter version of "InternalMatrix".
//
// Ruy has four internal matrix classes, besides the
// Matrix<T> class that we expose to the user-facing API.
//
// TODO(silvasean): Put parts of this architecture description somewhere more
// prominent.
//
// The 4 internal matrix classes are named Mat, EMat, PMat, PEMat, where:
// - "E" indicates a type-erased class, storing a void* pointer and a runtime
//   enum value to track the scalar type, as opposed to being templatized
//   on a Scalar type and storing a Scalar* pointer.
// - "P" indicates a packed matrix class, the output of the packing code and
//   input of the kernel code. See comments in pack.h regarding packing.
//
// In other words:
//
//                   Plain matrices   Packed matrices
//                 +---------------------------------
//   Templated     |  Mat, Matrix      PMat
//   Type-erased   |  EMat             PEMat
//
// Note that Matrix<T> is *not* implemented in terms of the internal types. It
// is an independent, simple, and user-facing type. Matrix<T> is functionally
// equivalent to Mat, but we keep it separate to insulate internals from
// interface and to be able to make different compromises in internals
// vs interface: in internals we prefer Mat to be a C-style struct with
// raw data member access and to be similar to the other PMat/EMat/PEMat
// classes for consistency.
//
// The use of type-erasure might seem surprising for a library like Ruy with a
// heavily-templated entry point, but it is motivated by the desire for most of
// Ruy's "middle-end" to be non-templated. Ruy can be thought of as having 4
// main parts:
// - "entry-point" (ruy.h) - this is the highly templated ruy::Mul entry
//   point.
// - "front-end" (frontend.*, validate.*, create_trmul_params.*,
//   prepare_packed_matrices.*) - the work to handle the entry-point call down
//   to the point where it can be handed off to the middle/back ends below.
//   That includes routines that select RunKernel and RunPack implementations
//   statically based on the entry-point's template parameters.
// - "back-end" (kernel_*.*, pack_*.*) - this consists of the implementations
//   of RunKernel and RunPack, often in assembly code, which are the building
//   blocks that Ruy calls to perform matrix multiplication. These are
//   templated so that only the requested types/Paths are actually emitted by
//   the compiler.
// - "middle-end" (trmul.*) - this is the part of Ruy that orchestrates the
//   calls to the "back-end" optimized building blocks. This layer has to deal
//   with issues like cache locality and low-overhead multi-threading.
//
// There is a desire for the "middle-end" to be non-templated in order to
// simplify the implementation and reduce code-size. We type-erase when going
// from the "front-end" to the "middle-end", and un-type-erase going from the
// "middle-end" to the "back-end". The un-type-erasure is possible because the
// "front-end" is responsible for instantiating the needed "back-end"
// templates, and thus the static type information is still present.
//
// Each layer of Ruy uses matrix types:
// - "entry-point": Matrix<T>
// - "front-end": Mat
// - "middle-end": EMat, PEMat
// - "back-end": Mat, PMat
//
// The use of separate types for packed matrices is not essential, but makes it
// obvious at a glance whether a matrix is a packed matrix or not. We would
// reconsider this decision if there were significant duplication between
// packed and unpacked matrices, but that doesn't seem to be the case at the
// moment.
//
// Another goal is to keep the user-facing Matrix<T> as simple and
// understandable as possible. Ideally, a user should be able to read the
// struct definition for Matrix<T> and see a very simple definition with no
// internal details like sums and kernel block layout.

#ifndef RUY_RUY_INTERNAL_MATRIX_H_
#define RUY_RUY_INTERNAL_MATRIX_H_

#include <cstddef>
#include <cstdint>
#include <type_traits>
#include <utility>

#include "ruy/check_macros.h"
#include "ruy/matrix.h"
#include "ruy/size_util.h"

namespace ruy {

// Internal counterpart of Layout, used by Mat.
struct MatLayout final {
  std::int32_t rows = 0;
  std::int32_t cols = 0;
  // Stride is the offset between two adjacent matrix elements
  // in the non-contiguous direction.
  std::int32_t stride = 0;
  Order order = Order::kColMajor;
};

inline MatLayout ToInternal(const Layout& src) {
  MatLayout ret;
  ret.rows = src.rows();
  ret.cols = src.cols();
  ret.stride = src.stride();
  ret.order = src.order();
  return ret;
}

// Internal counterpart of Matrix.
template <typename Scalar>
struct Mat final {
  detail::ConstCheckingPtr<Scalar> data;
  MatLayout layout;
  Scalar zero_point = 0;
  CachePolicy cache_policy = CachePolicy::kNeverCache;
};

template <typename Scalar>
inline Mat<Scalar> ToInternal(const Matrix<Scalar>& src) {
  Mat<Scalar> ret;
  ret.data.set(src.data());
  ret.layout = ToInternal(src.layout());
  ret.zero_point = src.zero_point();
  ret.cache_policy = src.cache_policy();
  return ret;
}

template <typename Scalar>
inline Mat<Scalar> ToInternal(Matrix<Scalar>& src) {
  Mat<Scalar> ret;
  ret.data.set(src.data());
  ret.layout = ToInternal(src.layout());
  ret.zero_point = src.zero_point();
  ret.cache_policy = src.cache_policy();
  return ret;
}
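
// A hypothetical usage sketch, for illustration only. It assumes a matrix
// built via the public API in ruy/matrix.h (MakeSimpleLayout, set_data):
//
//   float data[12];
//   ruy::Matrix<float> user_matrix;
//   ruy::MakeSimpleLayout(3, 4, ruy::Order::kColMajor,
//                         user_matrix.mutable_layout());
//   user_matrix.set_data(data);
//   Mat<float> internal = ToInternal(user_matrix);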

// KernelLayout describes small-scale block structure in a packed matrix
// layout. It's a runtime (as opposed to compile-time-constant) version of the
// FixedKernelLayout struct used to declare kernel layouts.
//
// This is sometimes known as "tiling" in other contexts.
//
// For example, consider a packed matrix in column-major format with a
// column-major KernelLayout. The matrix logically has a shape of
// `[cols, rows]`. However, the matrix is laid out as though it were a 4D array
// of shape `[cols / kcols, rows / krows, kcols, krows]`.
//
// Note that in the case of kcols=1, krows=1, this degenerates to
// `[cols, rows, 1, 1]` which is equivalent to having no small-scale block
// structure.
struct KernelLayout final {
  Order order = Order::kColMajor;
  std::uint8_t rows = 1;
  std::uint8_t cols = 1;
};
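
// For instance, with rows=8, cols=4 and a 4x2 KernelLayout (krows=4,
// kcols=2), both column-major, the packed data is laid out as a 4D array of
// shape [cols / kcols, rows / krows, kcols, krows] = [2, 2, 2, 4]: 4x2
// blocks of scalars, stored contiguously block-by-block down each
// block-column.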

// A packed matrix has a small-scale block structure that is not present in
// the input matrices. This block structure is necessary for the kernels to
// process data efficiently.
//
// This struct is very similar to MatLayout, but has the extra KernelLayout
// field.
struct PMatLayout final {
  std::int32_t rows = 0;
  std::int32_t cols = 0;
  // Stride is the offset between two adjacent matrix elements
  // in the non-contiguous direction.
  std::int32_t stride = 0;
  Order order = Order::kColMajor;
  // Small scale layout shuffling, potentially departing from
  // linear row-major or column-major storage. See KernelLayout.
  KernelLayout kernel;
};

inline bool operator==(const PMatLayout& a, const PMatLayout& b) {
  return a.cols == b.cols && a.rows == b.rows && a.stride == b.stride &&
         a.order == b.order && a.kernel.rows == b.kernel.rows &&
         a.kernel.cols == b.kernel.cols && a.kernel.order == b.kernel.order;
}

// Dynamic representation for a type.
//
// The most important field in this struct is the size, which Ruy uses to know
// how much memory to allocate without having to be templated on a type.
// Signed-ness and floating-point-ness are mainly present as debugging checks.
//
// Note: Ruy does not use this struct to dynamically dispatch between
// different typed implementations. As described in the comment at the top of
// this file, Ruy's "front-end", which is templated, instantiates all the
// necessary "back-end" routines with complete static knowledge of all the
// types.
struct Type final {
  template <typename T>
  static Type Create() {
    Type ret;
    ret.is_signed = std::is_signed<T>::value;
    ret.is_floating_point = std::is_floating_point<T>::value;
    ret.size = sizeof(T);
    return ret;
  }

  template <typename T>
  void AssertIs() const {
    RUY_DCHECK_EQ(is_signed, Create<T>().is_signed);
    RUY_DCHECK_EQ(is_floating_point, Create<T>().is_floating_point);
    RUY_DCHECK_EQ(size, Create<T>().size);
  }

  bool is_signed = false;
  bool is_floating_point = false;
  std::uint8_t size = 0;
};

inline bool operator==(const Type& type1, const Type& type2) {
  return type1.is_signed == type2.is_signed &&
         type1.is_floating_point == type2.is_floating_point &&
         type1.size == type2.size;
}
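
// For example, Type::Create<std::int8_t>() yields {is_signed = true,
// is_floating_point = false, size = 1}; it compares equal to the Type of any
// other 1-byte signed integer type, since only these three traits are kept.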

// Type-erased matrix.
struct EMat final {
  Type data_type;
  void* data = nullptr;
  MatLayout layout;
  std::int32_t zero_point = 0;
  CachePolicy cache_policy = CachePolicy::kNeverCache;
};

// Type-erased packed matrix.
struct PEMat final {
  Type data_type;
  void* data = nullptr;
  Type sums_type;
  void* sums = nullptr;
  PMatLayout layout;
  std::int32_t zero_point = 0;
};

// Convenient typed helper for packed matrices.
template <typename Scalar>
struct PMat final {
  // The row/column sums needed for quantized matrix multiplication when
  // the opposite operand of the multiplication uses a non-symmetric zero
  // point.
  // This member is only relevant for packed matrices.
  // Additionally, Ruy always uses 32-bit signed accumulators for quantized
  // matrix multiplication.
  // For floating point types, there is no quantization, so this pointer
  // will always be null. We still need code referencing it to compile
  // though, even if it is always branched around. Hence we use Scalar*
  // itself as the type in that case.
  using SumsType =
      typename std::conditional<std::is_floating_point<Scalar>::value, Scalar,
                                std::int32_t>::type;

  Scalar* data = nullptr;
  SumsType* sums = nullptr;
  PMatLayout layout;
  std::int32_t zero_point = 0;
};
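
// For example, PMat<std::int8_t>::SumsType is std::int32_t (sums of quantized
// values are accumulated in 32 bits), while PMat<float>::SumsType is float
// and its `sums` pointer stays null.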

template <typename T>
EMat EraseType(const Mat<T>& matrix) {
  EMat ret;
  ret.data_type = Type::Create<T>();
  ret.data = const_cast<void*>(static_cast<const void*>(matrix.data.get()));
  ret.layout = matrix.layout;
  ret.zero_point = matrix.zero_point;
  ret.cache_policy = matrix.cache_policy;
  return ret;
}

template <typename T>
Mat<T> UneraseType(const EMat& matrix) {
  matrix.data_type.AssertIs<T>();
  Mat<T> ret;
  ret.data.set(static_cast<T*>(matrix.data));
  ret.layout = matrix.layout;
  ret.zero_point = matrix.zero_point;
  ret.cache_policy = matrix.cache_policy;
  return ret;
}

template <typename T>
PMat<T> UneraseType(const PEMat& matrix) {
  using SumsType = typename PMat<T>::SumsType;
  matrix.data_type.AssertIs<T>();
  matrix.sums_type.AssertIs<SumsType>();
  PMat<T> ret;
  ret.data = static_cast<T*>(matrix.data);
  ret.sums = static_cast<SumsType*>(matrix.sums);
  ret.layout = matrix.layout;
  ret.zero_point = matrix.zero_point;
  return ret;
}
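
// A hypothetical round-trip sketch: the front-end erases the type before
// handing matrices to the middle-end, and the back-end recovers it. Here,
// `typed` stands for any Mat<float> the front-end has already built:
//
//   EMat erased = EraseType(typed);                // front-end -> middle-end
//   Mat<float> recovered = UneraseType<float>(erased);  // -> back-end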

// Helpers for MatLayout / PMatLayout.

inline bool IsUnstrided(const MatLayout& layout) {
  if (layout.order == Order::kColMajor) {
    return layout.stride == layout.rows;
  } else {
    return layout.stride == layout.cols;
  }
}

inline bool IsRowMajor(const MatLayout& layout) {
  return layout.order == Order::kRowMajor;
}

inline bool IsColMajor(const MatLayout& layout) {
  return layout.order == Order::kColMajor;
}

inline std::ptrdiff_t FlatSize(const MatLayout& layout) {
  const int outerdim =
      layout.order == Order::kColMajor ? layout.cols : layout.rows;
  return layout.stride * outerdim;
}
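
// For example, a column-major layout with rows=3, cols=5, stride=4 has
// FlatSize = stride * cols = 20 elements, even though only 15 of them are
// addressable matrix elements; it is strided (stride != rows).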

inline bool IsUnstrided(const PMatLayout& layout) {
  if (layout.order == Order::kColMajor) {
    return layout.stride == layout.rows;
  } else {
    return layout.stride == layout.cols;
  }
}

inline bool IsRowMajor(const PMatLayout& layout) {
  return layout.order == Order::kRowMajor;
}

inline bool IsColMajor(const PMatLayout& layout) {
  return layout.order == Order::kColMajor;
}

inline std::ptrdiff_t FlatSize(const PMatLayout& layout) {
  const int outerdim =
      layout.order == Order::kColMajor ? layout.cols : layout.rows;
  return layout.stride * outerdim;
}

// TODO(b/130417400) add a unit test
inline int Offset(const MatLayout& layout, int row, int col) {
  // TODO(benoitjacob) - we should check row/col bounds here, but these checks
  // make the _slow tests take 5x longer. Find a mitigation like in Eigen with
  // an 'internal' variant bypassing the check?
  // RUY_DCHECK_GE(row, 0);
  // RUY_DCHECK_GE(col, 0);
  // RUY_DCHECK_LT(row, layout.rows);
  // RUY_DCHECK_LT(col, layout.cols);
  int row_stride = layout.order == Order::kColMajor ? 1 : layout.stride;
  int col_stride = layout.order == Order::kRowMajor ? 1 : layout.stride;
  return row * row_stride + col * col_stride;
}
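
// For a column-major layout this reduces to row + col * stride; e.g. with
// stride=4, element (2, 3) is at offset 2 + 3 * 4 = 14.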

// TODO(b/130417400) add a unit test
inline int Offset(const PMatLayout& layout, int row, int col) {
  RUY_DCHECK(is_pot(layout.kernel.rows));
  RUY_DCHECK(is_pot(layout.kernel.cols));
  int row_outer = row & ~(layout.kernel.rows - 1);
  int col_outer = col & ~(layout.kernel.cols - 1);
  int row_stride_outer =
      layout.order == Order::kColMajor ? layout.kernel.cols : layout.stride;
  int col_stride_outer =
      layout.order == Order::kRowMajor ? layout.kernel.rows : layout.stride;
  int offset_outer =
      row_outer * row_stride_outer + col_outer * col_stride_outer;
  int row_inner = row - row_outer;
  int col_inner = col - col_outer;
  int row_stride_inner =
      layout.kernel.order == Order::kColMajor ? 1 : layout.kernel.cols;
  int col_stride_inner =
      layout.kernel.order == Order::kRowMajor ? 1 : layout.kernel.rows;
  int offset_inner =
      row_inner * row_stride_inner + col_inner * col_stride_inner;
  return offset_outer + offset_inner;
}
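
// Worked example: column-major outer order, row-major 4x2 kernel blocks
// (kernel.rows=4, kernel.cols=2), rows=8, cols=4, stride=8. For element
// (5, 3): the enclosing block starts at (4, 2), so
// offset_outer = 4 * 2 + 2 * 8 = 24; within the block, (1, 1) in row-major
// 4x2 order gives offset_inner = 1 * 2 + 1 = 3; total offset = 27.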

// Helpers for Mat<T>.

template <typename Scalar>
const Scalar* ElementPtr(const Mat<Scalar>& mat, int row, int col) {
  return mat.data.get() + Offset(mat.layout, row, col);
}

template <typename Scalar>
Scalar* ElementPtr(Mat<Scalar>* mat, int row, int col) {
  return mat->data.get() + Offset(mat->layout, row, col);
}

template <typename Scalar>
Scalar Element(const Mat<Scalar>& mat, int row, int col) {
  return *ElementPtr(mat, row, col);
}

// Helpers for PMat<T>.
// Duplicated from the Mat<T> helpers above, but the duplication seems
// acceptable.

template <typename Scalar>
const Scalar* ElementPtr(const PMat<Scalar>& mat, int row, int col) {
  return mat.data + Offset(mat.layout, row, col);
}

template <typename Scalar>
Scalar* ElementPtr(PMat<Scalar>* mat, int row, int col) {
  return mat->data + Offset(mat->layout, row, col);
}

template <typename Scalar>
Scalar Element(const PMat<Scalar>& mat, int row, int col) {
  return *ElementPtr(mat, row, col);
}

// Helpers for PEMat.

inline std::ptrdiff_t DataBytes(const PEMat& packed) {
  return FlatSize(packed.layout) * packed.data_type.size;
}

inline std::ptrdiff_t SumsBytes(const PEMat& packed) {
  // Packed matrices are only relevant for Ruy's TrMul implementations. For
  // TrMul, the number of sums is always equal to the number of columns.
  return packed.layout.cols * packed.sums_type.size;
}

// Transpose helpers.

inline Order Transpose(Order order) {
  return order == Order::kColMajor ? Order::kRowMajor : Order::kColMajor;
}

inline MatLayout Transpose(const MatLayout& layout) {
  MatLayout result(layout);
  result.order = Transpose(result.order);
  std::swap(result.rows, result.cols);
  return result;
}

template <typename Scalar>
Mat<Scalar> Transpose(const Mat<Scalar>& matrix) {
  Mat<Scalar> result(matrix);
  result.layout = Transpose(result.layout);
  return result;
}
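
// For example, transposing a column-major 3x5 Mat yields a row-major 5x3 Mat
// aliasing the same data with the same stride; no elements are moved, and
// Offset(Transpose(layout), col, row) == Offset(layout, row, col).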

// Compile-time version of KernelLayout, used to declare kernel layouts in a
// way that can be consumed by compile-time logic.
template <Order tOrder, int tRows, int tCols>
struct FixedKernelLayout {
  static constexpr Order kOrder = tOrder;
  static constexpr int kRows = tRows;
  static constexpr int kCols = tCols;
};

template <typename FixedKernelLayout>
KernelLayout ToKernelLayout() {
  KernelLayout ret;
  ret.order = FixedKernelLayout::kOrder;
  ret.rows = FixedKernelLayout::kRows;
  ret.cols = FixedKernelLayout::kCols;
  return ret;
}
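
// For example,
//   KernelLayout kl =
//       ToKernelLayout<FixedKernelLayout<Order::kRowMajor, 4, 2>>();
// yields {order = kRowMajor, rows = 4, cols = 2}.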

#if (__cplusplus < 201703L)
// Before C++17, a static constexpr data member that is odr-used needs an
// out-of-class definition like the ones below; Clang at -O0 can fail to link
// without them. From C++17 onwards, such members are implicitly inline and
// these redeclarations are deprecated.
template <Order tOrder, int tRows, int tCols>
constexpr int FixedKernelLayout<tOrder, tRows, tCols>::kCols;
template <Order tOrder, int tRows, int tCols>
constexpr int FixedKernelLayout<tOrder, tRows, tCols>::kRows;
#endif

}  // namespace ruy

#endif  // RUY_RUY_INTERNAL_MATRIX_H_