1 | /* Copyright 2019 Google LLC. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | |
16 | // Internal types and helpers for matrices. |
17 | // "Mat" is the name we use to refer to our internal matrix classes; it can be |
// thought of as a shorter version of "InternalMatrix".
19 | // |
20 | // Ruy has four internal matrix classes, besides the |
21 | // Matrix<T> class that we expose to the user-facing API. |
22 | // |
23 | // TODO(silvasean): Put parts of this architecture description somewhere more |
24 | // prominent. |
25 | // |
26 | // The 4 internal matrix classes are named Mat, EMat, PMat, PEMat, where: |
27 | // - "E" indicates a type-erased class, storing a void* pointer and a runtime |
28 | // enum value to track the scalar type, as opposed to being templatized |
29 | // on a Scalar type and storing a Scalar* pointer. |
30 | // - "P" indicates a packed matrix class, the output of the packing code and |
31 | // input of the kernel code. See comments in pack.h regarding packing. |
32 | // |
33 | // In other words: |
34 | // |
35 | // Plain matrices Packed matrices |
36 | // +---------------------------------- |
37 | // Templated | Mat, Matrix PMat |
38 | // Type-erased | EMat PEMat |
39 | // |
40 | // Note that Matrix<T> is *not* implemented in terms of the internal types. It |
41 | // is an independent, simple, and user-facing type. Matrix<T> is functionally |
42 | // equivalent to Mat, but we keep it separate to insulate internals from |
43 | // interface and to be able to make different compromises in internals |
44 | // vs interface: in internals we prefer Mat to be a C-style struct with |
45 | // raw data member access and to be similar to the other PMat/EMat/PEMat |
46 | // classes for consistency. |
47 | // |
48 | // The use of type-erasure might seem surprising for a library like Ruy with a |
49 | // heavily-templated entry point, but it is motivated by the desire for most of |
// Ruy's "middle-end" to be non-templated. Ruy can be thought of as having four
// main parts:
52 | // - "entry-point" (ruy.h) - this is the highly templated ruy::Mul entry |
53 | // point. |
54 | // - "front-end" (frontend.*, validate.*, create_trmul_params.*, |
55 | // prepare_packed_matrices.*) - the work to handle the entry-point call down to |
56 | // the point where it can be handed off to the middle/back ends below. That |
57 | // includes routines that select RunKernel and RunPack |
58 | // implementations statically based on those template parameters. |
59 | // - "back-end" (kernel_*.*, pack_*.*)- this consists of the implementations of |
60 | // RunKernel and RunPack, often in assembly code, which are the building blocks |
61 | // that Ruy calls to perform matrix multiplication. These are templated so that |
62 | // only the requested types/Path's are actually emitted by the compiler. |
63 | // - "middle-end" (trmul.*) - this is the part of Ruy that orchestrates the |
64 | // calls to the "back-end" optimized building blocks. This layer has to deal |
65 | // with issues like cache locality and low-overhead multi-threading. |
66 | // |
67 | // There is a desire for the "middle-end" to be non-templated in order to |
68 | // simplify the implementation and reduce code-size. We type-erase when going |
69 | // from the "front-end" to the "middle-end", and un-type-erase going from the |
70 | // "middle-end" to the "back-end". The un-type-erasure is possible because the |
71 | // "front-end" is responsible for instantiating the needed "back-end" templates, |
72 | // and thus the static type information is still present. |
73 | // |
74 | // Each layer of Ruy uses matrix types: |
75 | // - "entry-point": Matrix<T> |
76 | // - "front-end": Mat |
77 | // - "middle-end": EMat, PEMat |
78 | // - "back-end": Mat, PMat |
79 | // |
80 | // The use of separate types for packed matrices is not essential, but makes it |
81 | // obvious at a glance whether a matrix is a packed matrix or not. We would |
82 | // reconsider this decision if there was significant duplication between packed |
83 | // and unpacked matrices, but that doesn't seem to be the case at the moment. |
84 | // |
85 | // Another goal is to keep the user-facing Matrix<T> as simple and |
86 | // understandable as possible. Ideally, a user should be able to read the struct |
87 | // definition for Matrix<T> and see a very simple definition with no internal |
88 | // details like sums and kernel block layout. |
89 | |
90 | #ifndef RUY_RUY_INTERNAL_MATRIX_H_ |
91 | #define RUY_RUY_INTERNAL_MATRIX_H_ |
92 | |
93 | #include <cstddef> |
94 | #include <cstdint> |
95 | #include <type_traits> |
96 | #include <utility> |
97 | |
98 | #include "ruy/check_macros.h" |
99 | #include "ruy/matrix.h" |
100 | #include "ruy/size_util.h" |
101 | |
102 | namespace ruy { |
103 | |
104 | // Internal counterpart of Layout, used by Mat. |
struct MatLayout final {
  // Matrix dimensions.
  std::int32_t rows = 0;
  std::int32_t cols = 0;
  // Stride is the offset between two adjacent matrix elements
  // in the non-contiguous direction.
  std::int32_t stride = 0;
  // Storage order (row-major or column-major).
  Order order = Order::kColMajor;
};
113 | |
114 | inline MatLayout ToInternal(const Layout& src) { |
115 | MatLayout ret; |
116 | ret.rows = src.rows(); |
117 | ret.cols = src.cols(); |
118 | ret.stride = src.stride(); |
119 | ret.order = src.order(); |
120 | return ret; |
121 | } |
122 | |
// Internal counterpart of Matrix. A plain C-style struct with raw data-member
// access, as described in the file-level comment.
template <typename Scalar>
struct Mat final {
  // Pointer to the matrix data (see detail::ConstCheckingPtr in matrix.h).
  detail::ConstCheckingPtr<Scalar> data;
  MatLayout layout;
  // The quantization zero point, in the matrix's own scalar type.
  Scalar zero_point = 0;
  // Caching behavior for packed copies of this matrix; see CachePolicy.
  CachePolicy cache_policy = CachePolicy::kNeverCache;
};
131 | |
132 | template <typename Scalar> |
133 | inline Mat<Scalar> ToInternal(const Matrix<Scalar>& src) { |
134 | Mat<Scalar> ret; |
135 | ret.data.set(src.data()); |
136 | ret.layout = ToInternal(src.layout()); |
137 | ret.zero_point = src.zero_point(); |
138 | ret.cache_policy = src.cache_policy(); |
139 | return ret; |
140 | } |
141 | |
142 | template <typename Scalar> |
143 | inline Mat<Scalar> ToInternal(Matrix<Scalar>& src) { |
144 | Mat<Scalar> ret; |
145 | ret.data.set(src.data()); |
146 | ret.layout = ToInternal(src.layout()); |
147 | ret.zero_point = src.zero_point(); |
148 | ret.cache_policy = src.cache_policy(); |
149 | return ret; |
150 | } |
151 | |
152 | // KernelLayout describes small-scale block structure in a packed matrix layout. |
153 | // It's a runtime (as opposed to compile-time-constant) version of the |
154 | // FixedKernelLayout struct used to declare kernel layouts. |
155 | // |
// This is sometimes known as "tiling" in other contexts.
157 | // |
158 | // For example, consider a packed matrix in column-major format with a |
159 | // column-major KernelLayout. The matrix logically has a shape of |
160 | // `[cols, rows]`. However, the matrix is laid out as though it were a 4D array |
161 | // of shape `[cols / kcols, rows / krows, kcols, krows]`. |
162 | // |
163 | // Note that in the case of kcols=1, krows=1, this degenerates to |
164 | // `[cols, rows, 1, 1]` which is equivalent to having no small-scale block |
165 | // structure. |
struct KernelLayout final {
  // Storage order of elements within one kernel block.
  Order order = Order::kColMajor;
  // Kernel block dimensions (the small-scale tile sizes).
  std::uint8_t rows = 1;
  std::uint8_t cols = 1;
};
171 | |
// A packed matrix has a small-scale block structure that is not present in
173 | // the input matrices. This block structure is necessary for the kernels to |
174 | // process data efficiently. |
175 | // |
176 | // This struct is very similar to MatLayout, but has the extra KernelLayout |
177 | // field. |
struct PMatLayout final {
  // Matrix dimensions.
  std::int32_t rows = 0;
  std::int32_t cols = 0;
  // Stride is the offset between two adjacent matrix elements
  // in the non-contiguous direction.
  std::int32_t stride = 0;
  // Storage order (row-major or column-major) of the large-scale layout.
  Order order = Order::kColMajor;
  // Small scale layout shuffling, potentially departing from
  // linear row-major or column-major storage. See KernelLayout.
  KernelLayout kernel;
};
189 | |
190 | inline bool operator==(const PMatLayout& a, const PMatLayout& b) { |
191 | return a.cols == b.cols && a.rows == b.rows && a.stride == b.stride && |
192 | a.order == b.order && a.kernel.rows == b.kernel.rows && |
193 | a.kernel.cols == b.kernel.cols && a.kernel.order == b.kernel.order; |
194 | } |
195 | |
196 | // Dynamic representation for a type. |
197 | // |
198 | // The most important field in this struct is the size, which Ruy uses to know |
199 | // how much memory to allocate without having to be templated on a type. |
200 | // Signed-ness and floating-point-ness are mainly present as debugging checks. |
201 | // |
// Note: Ruy does not use this struct to dynamically dispatch between
203 | // different typed implementations. As described in the comment at the top of |
204 | // this file, Ruy's "front-end", which is templated, instantiates all the |
205 | // necessary "back-end" routines with complete static knowledge of all the |
206 | // types. |
struct Type final {
  // Builds the runtime descriptor for the compile-time scalar type T.
  template <typename T>
  static Type Create() {
    Type result;
    result.size = sizeof(T);
    result.is_floating_point = std::is_floating_point<T>::value;
    result.is_signed = std::is_signed<T>::value;
    return result;
  }

  // Debug-checks that this runtime descriptor matches the compile-time
  // type T (size, signedness and floating-point-ness all agree).
  template <typename T>
  void AssertIs() const {
    RUY_DCHECK_EQ(size, Create<T>().size);
    RUY_DCHECK_EQ(is_floating_point, Create<T>().is_floating_point);
    RUY_DCHECK_EQ(is_signed, Create<T>().is_signed);
  }

  bool is_signed = false;
  bool is_floating_point = false;
  std::uint8_t size = 0;
};
228 | |
229 | inline bool operator==(const Type& type1, const Type& type2) { |
230 | return type1.is_signed == type2.is_signed && |
231 | type1.is_floating_point == type2.is_floating_point && |
232 | type1.size == type2.size; |
233 | } |
234 | |
// Type-erased matrix.
struct EMat final {
  // Runtime descriptor of the scalar type that `data` points to.
  Type data_type;
  // Untyped pointer to the matrix data.
  void* data = nullptr;
  MatLayout layout;
  // Zero point, stored as int32 since the scalar type is erased.
  std::int32_t zero_point = 0;
  // Caching behavior for packed copies of this matrix; see CachePolicy.
  CachePolicy cache_policy = CachePolicy::kNeverCache;
};
243 | |
// Type-erased packed matrix.
struct PEMat final {
  // Runtime descriptor of the scalar type that `data` points to.
  Type data_type;
  // Untyped pointer to the packed matrix data.
  void* data = nullptr;
  // Runtime descriptor of the type of `sums` (see PMat::SumsType).
  Type sums_type;
  // Untyped pointer to the row/column sums buffer.
  void* sums = nullptr;
  PMatLayout layout;
  // Zero point, stored as int32 since the scalar type is erased.
  std::int32_t zero_point = 0;
};
253 | |
// Convenient typed helper for packed matrices.
template <typename Scalar>
struct PMat final {
  // The row/column sums needed for quantized matrix multiplication when
  // the opposite operand of the multiplication uses a non-symmetric zero
  // point.
  // This member is only relevant for packed matrices.
  // Additionally, Ruy always uses 32-bit signed accumulators for quantized
  // matrix multiplication.
  // For floating point types, there is no quantization, so this pointer
  // will always be null. We still need code referencing it to compile
  // though, even if it is always branched around. Hence we use Scalar*
  // itself as the type in that case.
  using SumsType =
      typename std::conditional<std::is_floating_point<Scalar>::value, Scalar,
                                std::int32_t>::type;

  // Pointer to the packed matrix data.
  Scalar* data = nullptr;
  // Pointer to the sums buffer (see comment above on SumsType).
  SumsType* sums = nullptr;
  // Layout, including the small-scale kernel block structure.
  PMatLayout layout;
  std::int32_t zero_point = 0;
};
276 | |
277 | template <typename T> |
278 | EMat EraseType(const Mat<T>& matrix) { |
279 | EMat ret; |
280 | ret.data_type = Type::Create<T>(); |
281 | ret.data = const_cast<void*>(static_cast<const void*>(matrix.data.get())); |
282 | ret.layout = matrix.layout; |
283 | ret.zero_point = matrix.zero_point; |
284 | ret.cache_policy = matrix.cache_policy; |
285 | return ret; |
286 | } |
287 | |
288 | template <typename T> |
289 | Mat<T> UneraseType(const EMat& matrix) { |
290 | matrix.data_type.AssertIs<T>(); |
291 | Mat<T> ret; |
292 | ret.data.set(static_cast<T*>(matrix.data)); |
293 | ret.layout = matrix.layout; |
294 | ret.zero_point = matrix.zero_point; |
295 | ret.cache_policy = matrix.cache_policy; |
296 | return ret; |
297 | } |
298 | |
299 | template <typename T> |
300 | PMat<T> UneraseType(const PEMat& matrix) { |
301 | using SumsType = typename PMat<T>::SumsType; |
302 | matrix.data_type.AssertIs<T>(); |
303 | matrix.sums_type.AssertIs<SumsType>(); |
304 | PMat<T> ret; |
305 | ret.data = static_cast<T*>(matrix.data); |
306 | ret.sums = static_cast<SumsType*>(matrix.sums); |
307 | ret.layout = matrix.layout; |
308 | ret.zero_point = matrix.zero_point; |
309 | return ret; |
310 | } |
311 | |
312 | // Helpers for MatLayout / PMatLayout. |
313 | |
314 | inline bool IsUnstrided(const MatLayout& layout) { |
315 | if (layout.order == Order::kColMajor) { |
316 | return layout.stride == layout.rows; |
317 | } else { |
318 | return layout.stride == layout.cols; |
319 | } |
320 | } |
321 | |
322 | inline bool IsRowMajor(const MatLayout& layout) { |
323 | return layout.order == Order::kRowMajor; |
324 | } |
325 | |
326 | inline bool IsColMajor(const MatLayout& layout) { |
327 | return layout.order == Order::kColMajor; |
328 | } |
329 | |
330 | inline std::ptrdiff_t FlatSize(const MatLayout& layout) { |
331 | const int outerdim = |
332 | layout.order == Order::kColMajor ? layout.cols : layout.rows; |
333 | return layout.stride * outerdim; |
334 | } |
335 | |
336 | inline bool IsUnstrided(const PMatLayout& layout) { |
337 | if (layout.order == Order::kColMajor) { |
338 | return layout.stride == layout.rows; |
339 | } else { |
340 | return layout.stride == layout.cols; |
341 | } |
342 | } |
343 | |
344 | inline bool IsRowMajor(const PMatLayout& layout) { |
345 | return layout.order == Order::kRowMajor; |
346 | } |
347 | |
348 | inline bool IsColMajor(const PMatLayout& layout) { |
349 | return layout.order == Order::kColMajor; |
350 | } |
351 | |
352 | inline std::ptrdiff_t FlatSize(const PMatLayout& layout) { |
353 | const int outerdim = |
354 | layout.order == Order::kColMajor ? layout.cols : layout.rows; |
355 | return layout.stride * outerdim; |
356 | } |
357 | |
358 | // TODO(b/130417400) add a unit test |
359 | inline int Offset(const MatLayout& layout, int row, int col) { |
360 | // TODO(benoitjacob) - should check this but this make the _slow tests take |
361 | // 5x longer. Find a mitigation like in Eigen with an 'internal' variant |
362 | // bypassing the check? |
363 | // RUY_DCHECK_GE(row, 0); |
364 | // RUY_DCHECK_GE(col, 0); |
365 | // RUY_DCHECK_LT(row, layout.rows); |
366 | // RUY_DCHECK_LT(col, layout.cols); |
367 | int row_stride = layout.order == Order::kColMajor ? 1 : layout.stride; |
368 | int col_stride = layout.order == Order::kRowMajor ? 1 : layout.stride; |
369 | return row * row_stride + col * col_stride; |
370 | } |
371 | |
// TODO(b/130417400) add a unit test
// Linear offset of element (row, col) within a packed, kernel-blocked layout.
// See the comment on KernelLayout: the matrix is stored as if it were a 4D
// array, so the offset decomposes into an "outer" part (which kernel block)
// and an "inner" part (position within that block).
inline int Offset(const PMatLayout& layout, int row, int col) {
  // Power-of-two kernel block dims allow the mask arithmetic below.
  RUY_DCHECK(is_pot(layout.kernel.rows));
  RUY_DCHECK(is_pot(layout.kernel.cols));
  // Round (row, col) down to the origin of its kernel block.
  int row_outer = row & ~(layout.kernel.rows - 1);
  int col_outer = col & ~(layout.kernel.cols - 1);
  // Outer strides: in the non-contiguous direction, block origins are
  // `stride` apart; in the contiguous direction, consecutive kernel blocks
  // are stored back to back.
  int row_stride_outer =
      layout.order == Order::kColMajor ? layout.kernel.cols : layout.stride;
  int col_stride_outer =
      layout.order == Order::kRowMajor ? layout.kernel.rows : layout.stride;
  int offset_outer =
      row_outer * row_stride_outer + col_outer * col_stride_outer;
  // Position within the kernel block, using the kernel's own order.
  int row_inner = row - row_outer;
  int col_inner = col - col_outer;
  int row_stride_inner =
      layout.kernel.order == Order::kColMajor ? 1 : layout.kernel.cols;
  int col_stride_inner =
      layout.kernel.order == Order::kRowMajor ? 1 : layout.kernel.rows;
  int offset_inner =
      row_inner * row_stride_inner + col_inner * col_stride_inner;
  return offset_outer + offset_inner;
}
394 | |
395 | // Helpers for Mat<T>. |
396 | |
397 | template <typename Scalar> |
398 | const Scalar* ElementPtr(const Mat<Scalar>& mat, int row, int col) { |
399 | return mat.data.get() + Offset(mat.layout, row, col); |
400 | } |
401 | |
402 | template <typename Scalar> |
403 | Scalar* ElementPtr(Mat<Scalar>* mat, int row, int col) { |
404 | return mat->data.get() + Offset(mat->layout, row, col); |
405 | } |
406 | |
407 | template <typename Scalar> |
408 | Scalar Element(const Mat<Scalar>& mat, int row, int col) { |
409 | return *ElementPtr(mat, row, col); |
410 | } |
411 | |
412 | // Helpers for PMat<T>. |
413 | // Duplicated from Matrix<T>, but the duplication seems acceptable. |
414 | |
415 | template <typename Scalar> |
416 | const Scalar* ElementPtr(const PMat<Scalar>& mat, int row, int col) { |
417 | return mat.data + Offset(mat.layout, row, col); |
418 | } |
419 | |
420 | template <typename Scalar> |
421 | Scalar* ElementPtr(PMat<Scalar>* mat, int row, int col) { |
422 | return mat->data + Offset(mat->layout, row, col); |
423 | } |
424 | |
425 | template <typename Scalar> |
426 | Scalar Element(const PMat<Scalar>& mat, int row, int col) { |
427 | return *ElementPtr(mat, row, col); |
428 | } |
429 | |
430 | // Helpers for PEMat. |
431 | |
432 | inline std::ptrdiff_t DataBytes(const PEMat& packed) { |
433 | return FlatSize(packed.layout) * packed.data_type.size; |
434 | } |
435 | |
436 | inline std::ptrdiff_t SumsBytes(const PEMat& packed) { |
437 | // Packed matrices are only relevant for Ruy's TrMul implementations. For |
438 | // TrMul, the number of sums is always equal to the number of columns. |
439 | return packed.layout.cols * packed.sums_type.size; |
440 | } |
441 | |
442 | // Transpose helpers. |
443 | |
444 | inline Order Transpose(Order order) { |
445 | return order == Order::kColMajor ? Order::kRowMajor : Order::kColMajor; |
446 | } |
447 | |
448 | inline MatLayout Transpose(const MatLayout& layout) { |
449 | MatLayout result(layout); |
450 | result.order = Transpose(result.order); |
451 | std::swap(result.rows, result.cols); |
452 | return result; |
453 | } |
454 | |
455 | template <typename Scalar> |
456 | Mat<Scalar> Transpose(const Mat<Scalar>& matrix) { |
457 | Mat<Scalar> result(matrix); |
458 | result.layout = Transpose(result.layout); |
459 | return result; |
460 | } |
461 | |
// Compile-time version of KernelLayout, used to declare kernel layouts in a
// way that can be consumed by compile-time logic.
template <Order tOrder, int tRows, int tCols>
struct FixedKernelLayout {
  // Storage order of elements within one kernel block.
  static constexpr Order kOrder = tOrder;
  // Kernel block dimensions.
  static constexpr int kRows = tRows;
  static constexpr int kCols = tCols;
};
470 | |
471 | template <typename FixedKernelLayout> |
472 | KernelLayout ToKernelLayout() { |
473 | KernelLayout ret; |
474 | ret.order = FixedKernelLayout::kOrder; |
475 | ret.rows = FixedKernelLayout::kRows; |
476 | ret.cols = FixedKernelLayout::kCols; |
477 | return ret; |
478 | } |
479 | |
#if (__cplusplus < 201703L)
// Until C++17, a static constexpr data member that is odr-used requires an
// out-of-line definition (without initializer). From C++17 onwards such
// members are implicitly inline, and these redeclarations are deprecated.
// Clang with -O0 without this can fail to link. kOrder is defined here too:
// omitting it would leave the same potential link failure if kOrder is ever
// odr-used (e.g. bound to a reference).
template <Order tOrder, int tRows, int tCols>
constexpr Order FixedKernelLayout<tOrder, tRows, tCols>::kOrder;
template <Order tOrder, int tRows, int tCols>
constexpr int FixedKernelLayout<tOrder, tRows, tCols>::kCols;
template <Order tOrder, int tRows, int tCols>
constexpr int FixedKernelLayout<tOrder, tRows, tCols>::kRows;
#endif
489 | |
490 | } // namespace ruy |
491 | |
492 | #endif // RUY_RUY_INTERNAL_MATRIX_H_ |
493 | |