/* Copyright 2019 Google LLC. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// Internal types and helpers for matrices.
// "Mat" is the name we use to refer to our internal matrix classes; it can be
// thought of as a shorter version of "InternalMatrix".
//
// Ruy has four internal matrix classes, besides the
// Matrix<T> class that we expose to the user-facing API.
//
// TODO(silvasean): Put parts of this architecture description somewhere more
// prominent.
//
// The 4 internal matrix classes are named Mat, EMat, PMat, PEMat, where:
// - "E" indicates a type-erased class, storing a void* pointer and a runtime
//   enum value to track the scalar type, as opposed to being templatized
//   on a Scalar type and storing a Scalar* pointer.
// - "P" indicates a packed matrix class, the output of the packing code and
//   input of the kernel code. See comments in pack.h regarding packing.
//
// In other words:
//
//                   Plain matrices   Packed matrices
//                 +---------------------------------
//   Templated     |  Mat, Matrix      PMat
//   Type-erased   |  EMat             PEMat
//
// Note that Matrix<T> is *not* implemented in terms of the internal types. It
// is an independent, simple, and user-facing type. Matrix<T> is functionally
// equivalent to Mat, but we keep it separate to insulate internals from
// interface and to be able to make different compromises in internals
// vs interface: in internals we prefer Mat to be a C-style struct with
// raw data member access and to be similar to the other PMat/EMat/PEMat
// classes for consistency.
//
// The use of type-erasure might seem surprising for a library like Ruy with a
// heavily-templated entry point, but it is motivated by the desire for most of
// Ruy's "middle-end" to be non-templated. Ruy can be thought of as having 4
// main parts:
// - "entry-point" (ruy.h) - this is the highly templated ruy::Mul entry
//   point.
// - "front-end" (frontend.*, validate.*, create_trmul_params.*,
//   prepare_packed_matrices.*) - the work to handle the entry-point call down
//   to the point where it can be handed off to the middle/back ends below.
//   That includes routines that select RunKernel and RunPack implementations
//   statically based on the entry-point's template parameters.
// - "back-end" (kernel_*.*, pack_*.*) - this consists of the implementations
//   of RunKernel and RunPack, often in assembly code, which are the building
//   blocks that Ruy calls to perform matrix multiplication. These are
//   templated so that only the requested types/Paths are actually emitted by
//   the compiler.
// - "middle-end" (trmul.*) - this is the part of Ruy that orchestrates the
//   calls to the "back-end" optimized building blocks. This layer has to deal
//   with issues like cache locality and low-overhead multi-threading.
//
// There is a desire for the "middle-end" to be non-templated in order to
// simplify the implementation and reduce code-size. We type-erase when going
// from the "front-end" to the "middle-end", and un-type-erase going from the
// "middle-end" to the "back-end". The un-type-erasure is possible because the
// "front-end" is responsible for instantiating the needed "back-end"
// templates, and thus the static type information is still present.
//
// Each layer of Ruy uses matrix types:
// - "entry-point": Matrix<T>
// - "front-end": Mat
// - "middle-end": EMat, PEMat
// - "back-end": Mat, PMat
//
// The use of separate types for packed matrices is not essential, but makes it
// obvious at a glance whether a matrix is a packed matrix or not. We would
// reconsider this decision if there were significant duplication between
// packed and unpacked matrices, but that doesn't seem to be the case at the
// moment.
//
// Another goal is to keep the user-facing Matrix<T> as simple and
// understandable as possible. Ideally, a user should be able to read the
// struct definition for Matrix<T> and see a very simple definition with no
// internal details like sums and kernel block layout.

#ifndef RUY_RUY_INTERNAL_MATRIX_H_
#define RUY_RUY_INTERNAL_MATRIX_H_

#include <cstddef>
#include <cstdint>
#include <type_traits>
#include <utility>

#include "ruy/check_macros.h"
#include "ruy/matrix.h"
#include "ruy/size_util.h"

namespace ruy {

// Internal counterpart of Layout, used by Mat.
struct MatLayout final {
  std::int32_t rows = 0;
  std::int32_t cols = 0;
  // Stride is the offset between two adjacent matrix elements
  // in the non-contiguous direction.
  std::int32_t stride = 0;
  Order order = Order::kColMajor;
};

inline MatLayout ToInternal(const Layout& src) {
  MatLayout ret;
  ret.rows = src.rows();
  ret.cols = src.cols();
  ret.stride = src.stride();
  ret.order = src.order();
  return ret;
}

// Internal counterpart of Matrix.
template <typename Scalar>
struct Mat final {
  detail::ConstCheckingPtr<Scalar> data;
  MatLayout layout;
  Scalar zero_point = 0;
  CachePolicy cache_policy = CachePolicy::kNeverCache;
};

template <typename Scalar>
inline Mat<Scalar> ToInternal(const Matrix<Scalar>& src) {
  Mat<Scalar> ret;
  ret.data.set(src.data());
  ret.layout = ToInternal(src.layout());
  ret.zero_point = src.zero_point();
  ret.cache_policy = src.cache_policy();
  return ret;
}

template <typename Scalar>
inline Mat<Scalar> ToInternal(Matrix<Scalar>& src) {
  Mat<Scalar> ret;
  ret.data.set(src.data());
  ret.layout = ToInternal(src.layout());
  ret.zero_point = src.zero_point();
  ret.cache_policy = src.cache_policy();
  return ret;
}
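
// A hypothetical usage sketch, for illustration only. It assumes a matrix
// built via the public API in ruy/matrix.h (MakeSimpleLayout, set_data):
//
//   float data[12];
//   ruy::Matrix<float> user_matrix;
//   ruy::MakeSimpleLayout(3, 4, ruy::Order::kColMajor,
//                         user_matrix.mutable_layout());
//   user_matrix.set_data(data);
//   Mat<float> internal = ToInternal(user_matrix);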

// KernelLayout describes small-scale block structure in a packed matrix
// layout. It's a runtime (as opposed to compile-time-constant) version of the
// FixedKernelLayout struct used to declare kernel layouts.
//
// This is sometimes known as "tiling" in other contexts.
//
// For example, consider a packed matrix in column-major format with a
// column-major KernelLayout. The matrix logically has a shape of
// `[cols, rows]`. However, the matrix is laid out as though it were a 4D array
// of shape `[cols / kcols, rows / krows, kcols, krows]`.
//
// Note that in the case of kcols=1, krows=1, this degenerates to
// `[cols, rows, 1, 1]` which is equivalent to having no small-scale block
// structure.
struct KernelLayout final {
  Order order = Order::kColMajor;
  std::uint8_t rows = 1;
  std::uint8_t cols = 1;
};
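
// For instance, with rows=8, cols=4 and a 4x2 KernelLayout (krows=4,
// kcols=2), both column-major, the packed data is laid out as a 4D array of
// shape [cols / kcols, rows / krows, kcols, krows] = [2, 2, 2, 4]: 4x2
// blocks of scalars, stored contiguously block-by-block down each
// block-column.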

// A packed matrix has a small-scale block structure that is not present in
// the input matrices. This block structure is necessary for the kernels to
// process data efficiently.
//
// This struct is very similar to MatLayout, but has the extra KernelLayout
// field.
struct PMatLayout final {
  std::int32_t rows = 0;
  std::int32_t cols = 0;
  // Stride is the offset between two adjacent matrix elements
  // in the non-contiguous direction.
  std::int32_t stride = 0;
  Order order = Order::kColMajor;
  // Small scale layout shuffling, potentially departing from
  // linear row-major or column-major storage. See KernelLayout.
  KernelLayout kernel;
};

inline bool operator==(const PMatLayout& a, const PMatLayout& b) {
  return a.cols == b.cols && a.rows == b.rows && a.stride == b.stride &&
         a.order == b.order && a.kernel.rows == b.kernel.rows &&
         a.kernel.cols == b.kernel.cols && a.kernel.order == b.kernel.order;
}

// Dynamic representation for a type.
//
// The most important field in this struct is the size, which Ruy uses to know
// how much memory to allocate without having to be templated on a type.
// Signed-ness and floating-point-ness are mainly present as debugging checks.
//
// Note: Ruy does not use this struct to dynamically dispatch between
// different typed implementations. As described in the comment at the top of
// this file, Ruy's "front-end", which is templated, instantiates all the
// necessary "back-end" routines with complete static knowledge of all the
// types.
struct Type final {
  template <typename T>
  static Type Create() {
    Type ret;
    ret.is_signed = std::is_signed<T>::value;
    ret.is_floating_point = std::is_floating_point<T>::value;
    ret.size = sizeof(T);
    return ret;
  }

  template <typename T>
  void AssertIs() const {
    RUY_DCHECK_EQ(is_signed, Create<T>().is_signed);
    RUY_DCHECK_EQ(is_floating_point, Create<T>().is_floating_point);
    RUY_DCHECK_EQ(size, Create<T>().size);
  }

  bool is_signed = false;
  bool is_floating_point = false;
  std::uint8_t size = 0;
};

inline bool operator==(const Type& type1, const Type& type2) {
  return type1.is_signed == type2.is_signed &&
         type1.is_floating_point == type2.is_floating_point &&
         type1.size == type2.size;
}
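
// For example, Type::Create<std::int8_t>() yields {is_signed = true,
// is_floating_point = false, size = 1}; it compares equal to the Type of any
// other 1-byte signed integer type, since only these three traits are kept.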

// Type-erased matrix.
struct EMat final {
  Type data_type;
  void* data = nullptr;
  MatLayout layout;
  std::int32_t zero_point = 0;
  CachePolicy cache_policy = CachePolicy::kNeverCache;
};

// Type-erased packed matrix.
struct PEMat final {
  Type data_type;
  void* data = nullptr;
  Type sums_type;
  void* sums = nullptr;
  PMatLayout layout;
  std::int32_t zero_point = 0;
};

// Convenient typed helper for packed matrices.
template <typename Scalar>
struct PMat final {
  // The row/column sums needed for quantized matrix multiplication when
  // the opposite operand of the multiplication uses a non-symmetric zero
  // point.
  // This member is only relevant for packed matrices.
  // Additionally, Ruy always uses 32-bit signed accumulators for quantized
  // matrix multiplication.
  // For floating point types, there is no quantization, so this pointer
  // will always be null. We still need code referencing it to compile
  // though, even if it is always branched around. Hence we use Scalar*
  // itself as the type in that case.
  using SumsType =
      typename std::conditional<std::is_floating_point<Scalar>::value, Scalar,
                                std::int32_t>::type;

  Scalar* data = nullptr;
  SumsType* sums = nullptr;
  PMatLayout layout;
  std::int32_t zero_point = 0;
};
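
// For example, PMat<std::int8_t>::SumsType is std::int32_t (sums of quantized
// values are accumulated in 32 bits), while PMat<float>::SumsType is float
// and its `sums` pointer stays null.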

template <typename T>
EMat EraseType(const Mat<T>& matrix) {
  EMat ret;
  ret.data_type = Type::Create<T>();
  ret.data = const_cast<void*>(static_cast<const void*>(matrix.data.get()));
  ret.layout = matrix.layout;
  ret.zero_point = matrix.zero_point;
  ret.cache_policy = matrix.cache_policy;
  return ret;
}

template <typename T>
Mat<T> UneraseType(const EMat& matrix) {
  matrix.data_type.AssertIs<T>();
  Mat<T> ret;
  ret.data.set(static_cast<T*>(matrix.data));
  ret.layout = matrix.layout;
  ret.zero_point = matrix.zero_point;
  ret.cache_policy = matrix.cache_policy;
  return ret;
}

template <typename T>
PMat<T> UneraseType(const PEMat& matrix) {
  using SumsType = typename PMat<T>::SumsType;
  matrix.data_type.AssertIs<T>();
  matrix.sums_type.AssertIs<SumsType>();
  PMat<T> ret;
  ret.data = static_cast<T*>(matrix.data);
  ret.sums = static_cast<SumsType*>(matrix.sums);
  ret.layout = matrix.layout;
  ret.zero_point = matrix.zero_point;
  return ret;
}
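
// A hypothetical round-trip sketch: the front-end erases the type before
// handing matrices to the middle-end, and the back-end recovers it. Here,
// `typed` stands for any Mat<float> the front-end has already built:
//
//   EMat erased = EraseType(typed);                // front-end -> middle-end
//   Mat<float> recovered = UneraseType<float>(erased);  // -> back-end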

// Helpers for MatLayout / PMatLayout.

inline bool IsUnstrided(const MatLayout& layout) {
  if (layout.order == Order::kColMajor) {
    return layout.stride == layout.rows;
  } else {
    return layout.stride == layout.cols;
  }
}

inline bool IsRowMajor(const MatLayout& layout) {
  return layout.order == Order::kRowMajor;
}

inline bool IsColMajor(const MatLayout& layout) {
  return layout.order == Order::kColMajor;
}

inline std::ptrdiff_t FlatSize(const MatLayout& layout) {
  const int outerdim =
      layout.order == Order::kColMajor ? layout.cols : layout.rows;
  return layout.stride * outerdim;
}
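
// For example, a column-major layout with rows=3, cols=5, stride=4 has
// FlatSize = stride * cols = 20 elements, even though only 15 of them are
// addressable matrix elements; it is strided (stride != rows).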

inline bool IsUnstrided(const PMatLayout& layout) {
  if (layout.order == Order::kColMajor) {
    return layout.stride == layout.rows;
  } else {
    return layout.stride == layout.cols;
  }
}

inline bool IsRowMajor(const PMatLayout& layout) {
  return layout.order == Order::kRowMajor;
}

inline bool IsColMajor(const PMatLayout& layout) {
  return layout.order == Order::kColMajor;
}

inline std::ptrdiff_t FlatSize(const PMatLayout& layout) {
  const int outerdim =
      layout.order == Order::kColMajor ? layout.cols : layout.rows;
  return layout.stride * outerdim;
}

// TODO(b/130417400) add a unit test
inline int Offset(const MatLayout& layout, int row, int col) {
  // TODO(benoitjacob) - we should check row/col bounds here, but these checks
  // make the _slow tests take 5x longer. Find a mitigation like in Eigen with
  // an 'internal' variant bypassing the check?
  // RUY_DCHECK_GE(row, 0);
  // RUY_DCHECK_GE(col, 0);
  // RUY_DCHECK_LT(row, layout.rows);
  // RUY_DCHECK_LT(col, layout.cols);
  int row_stride = layout.order == Order::kColMajor ? 1 : layout.stride;
  int col_stride = layout.order == Order::kRowMajor ? 1 : layout.stride;
  return row * row_stride + col * col_stride;
}
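
// For a column-major layout this reduces to row + col * stride; e.g. with
// stride=4, element (2, 3) is at offset 2 + 3 * 4 = 14.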

// TODO(b/130417400) add a unit test
inline int Offset(const PMatLayout& layout, int row, int col) {
  RUY_DCHECK(is_pot(layout.kernel.rows));
  RUY_DCHECK(is_pot(layout.kernel.cols));
  int row_outer = row & ~(layout.kernel.rows - 1);
  int col_outer = col & ~(layout.kernel.cols - 1);
  int row_stride_outer =
      layout.order == Order::kColMajor ? layout.kernel.cols : layout.stride;
  int col_stride_outer =
      layout.order == Order::kRowMajor ? layout.kernel.rows : layout.stride;
  int offset_outer =
      row_outer * row_stride_outer + col_outer * col_stride_outer;
  int row_inner = row - row_outer;
  int col_inner = col - col_outer;
  int row_stride_inner =
      layout.kernel.order == Order::kColMajor ? 1 : layout.kernel.cols;
  int col_stride_inner =
      layout.kernel.order == Order::kRowMajor ? 1 : layout.kernel.rows;
  int offset_inner =
      row_inner * row_stride_inner + col_inner * col_stride_inner;
  return offset_outer + offset_inner;
}
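
// Worked example: column-major outer order, row-major 4x2 kernel blocks
// (kernel.rows=4, kernel.cols=2), rows=8, cols=4, stride=8. For element
// (5, 3): the enclosing block starts at (4, 2), so
// offset_outer = 4 * 2 + 2 * 8 = 24; within the block, (1, 1) in row-major
// 4x2 order gives offset_inner = 1 * 2 + 1 = 3; total offset = 27.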

// Helpers for Mat<T>.

template <typename Scalar>
const Scalar* ElementPtr(const Mat<Scalar>& mat, int row, int col) {
  return mat.data.get() + Offset(mat.layout, row, col);
}

template <typename Scalar>
Scalar* ElementPtr(Mat<Scalar>* mat, int row, int col) {
  return mat->data.get() + Offset(mat->layout, row, col);
}

template <typename Scalar>
Scalar Element(const Mat<Scalar>& mat, int row, int col) {
  return *ElementPtr(mat, row, col);
}

// Helpers for PMat<T>.
// Duplicated from the Mat<T> helpers above, but the duplication seems
// acceptable.

template <typename Scalar>
const Scalar* ElementPtr(const PMat<Scalar>& mat, int row, int col) {
  return mat.data + Offset(mat.layout, row, col);
}

template <typename Scalar>
Scalar* ElementPtr(PMat<Scalar>* mat, int row, int col) {
  return mat->data + Offset(mat->layout, row, col);
}

template <typename Scalar>
Scalar Element(const PMat<Scalar>& mat, int row, int col) {
  return *ElementPtr(mat, row, col);
}

// Helpers for PEMat.

inline std::ptrdiff_t DataBytes(const PEMat& packed) {
  return FlatSize(packed.layout) * packed.data_type.size;
}

inline std::ptrdiff_t SumsBytes(const PEMat& packed) {
  // Packed matrices are only relevant for Ruy's TrMul implementations. For
  // TrMul, the number of sums is always equal to the number of columns.
  return packed.layout.cols * packed.sums_type.size;
}

// Transpose helpers.

inline Order Transpose(Order order) {
  return order == Order::kColMajor ? Order::kRowMajor : Order::kColMajor;
}

inline MatLayout Transpose(const MatLayout& layout) {
  MatLayout result(layout);
  result.order = Transpose(result.order);
  std::swap(result.rows, result.cols);
  return result;
}

template <typename Scalar>
Mat<Scalar> Transpose(const Mat<Scalar>& matrix) {
  Mat<Scalar> result(matrix);
  result.layout = Transpose(result.layout);
  return result;
}
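
// For example, transposing a column-major 3x5 Mat yields a row-major 5x3 Mat
// aliasing the same data with the same stride; no elements are moved, and
// Offset(Transpose(layout), col, row) == Offset(layout, row, col).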

// Compile-time version of KernelLayout, used to declare kernel layouts in a
// way that can be consumed by compile-time logic.
template <Order tOrder, int tRows, int tCols>
struct FixedKernelLayout {
  static constexpr Order kOrder = tOrder;
  static constexpr int kRows = tRows;
  static constexpr int kCols = tCols;
};

template <typename FixedKernelLayout>
KernelLayout ToKernelLayout() {
  KernelLayout ret;
  ret.order = FixedKernelLayout::kOrder;
  ret.rows = FixedKernelLayout::kRows;
  ret.cols = FixedKernelLayout::kCols;
  return ret;
}
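
// For example,
//   KernelLayout kl =
//       ToKernelLayout<FixedKernelLayout<Order::kRowMajor, 4, 2>>();
// yields {order = kRowMajor, rows = 4, cols = 2}.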

#if (__cplusplus < 201703L)
// Before C++17, a static constexpr data member that is odr-used needs an
// out-of-class definition like the ones below; Clang at -O0 can fail to link
// without them. From C++17 onwards, such members are implicitly inline and
// these redeclarations are deprecated.
template <Order tOrder, int tRows, int tCols>
constexpr int FixedKernelLayout<tOrder, tRows, tCols>::kCols;
template <Order tOrder, int tRows, int tCols>
constexpr int FixedKernelLayout<tOrder, tRows, tCols>::kRows;
#endif

}  // namespace ruy

#endif  // RUY_RUY_INTERNAL_MATRIX_H_