1/* Copyright 2019 Google LLC. All Rights Reserved.
2
3Licensed under the Apache License, Version 2.0 (the "License");
4you may not use this file except in compliance with the License.
5You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9Unless required by applicable law or agreed to in writing, software
10distributed under the License is distributed on an "AS IS" BASIS,
11WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12See the License for the specific language governing permissions and
13limitations under the License.
14==============================================================================*/
15
16// # What is "packing"?
17//
18// Before feeding data to the gemm kernels (the parts of Ruy that do lots
19// of multiply-add operations), Ruy first performs a data transformation (which
20// we call "packing") on the input matrices. This transformation has two main
21// goals:
22// - rearrange data into blocks that are a convenient size/layout for the gemm
23// kernels to consume. This helps make the memory access pattern of the gemm
24// kernel simpler and more contiguous, and puts the data in a layout most
25// convenient for specific arithmetic instructions in the gemm kernel.
26// - compute row/column sums needed for handling quantization with non-symmetric
27// zero points.
28//
29// # Simplified algorithmic analysis of packing
30//
31// Packing is a relatively simple transformation which does a small constant
32// amount of work on each element of an input matrix, and hence for an NxM
33// matrix performs O(N*M) work. If N and M are of the same order, then this is
34// O(N^2) work.
35//
36// A NxKxM matrix multiplication requires N*K*M multiply-accumulate operations.
37// Note that if N, K, and M are all the same order, then the number of
38// multiply-accumulate operations is O(N^3).
39//
40// Thus, the O(N^2) cost of packing is small compared to the O(N^3) work, in the
41// case of all dimensions being roughly the same order.
42//
43// # Packing cost can be significant
44//
45// When matrix * matrix multiplications begin to look more like matrix * vector
46// multiplications, packing cost can become significant. We sometimes call these
47// cases "gemv-like".
48//
49// Continuing the algorithmic analysis above, if we consider a case where an
50// NxKxM matrix multiplication has either N = O(1) or M = O(1), then the
51// situation is different. In this case, the multiply-accumulate work is only
52// quadratic, so the quadratic cost of packing can be come significant.
53//
54// Another way to say this is that the cost of packing an input matrix (either
55// the LHS or RHS) is amortized across the non-depth dimension of the opposite
56// input matrix. Thus, when the LHS has very few rows or the RHS has very few
57// columns, the cost of packing the opposite input matrix can become
58// significant.
59//
60// As a rough rule of thumb, the cost of packing starts to become significant
61// when either N or M is below 32 (and other dimensions are hundreds), with very
62// significant packing costs at 8 or below. This varies by data type, Path, and
63// tuning, so these numbers are only rough guides.
64//
65// One practical use case that is affected by this is inference of
66// fully connected neural network layers with a low batch size. The weight
67// matrix (which is a constant for inference) is the one affected by significant
68// packing cost.
69//
70// Ruy has an optional feature, accessed by Matrix::set_cache_policy(), to
71// cache the packed forms of constant matrices.
72//
73// # Implementation notes
74//
75// Ruy's packing routines always operate on a range of columns and can be
76// applied to either the LHS or RHS. This is possible because Ruy internally
77// implements a TrMul, so the accumulation along depth is done along columns of
78// both the LHS and RHS (whereas for a normal Mul the accumulation along depth
79// for the LHS is along rows). As another example, we are always computing
80// column sums for quantization (and never row sums, since the LHS is
81// transposed).
82
83#ifndef RUY_RUY_PACK_H_
84#define RUY_RUY_PACK_H_
85
86#include "ruy/mat.h"
87#include "ruy/pack_common.h"
88#include "ruy/path.h"
89#include "ruy/platform.h"
90#include "ruy/trace.h"
91
92// IWYU pragma: begin_exports
93#if RUY_PLATFORM_NEON
94#include "ruy/pack_arm.h"
95#elif RUY_PLATFORM_X86
96#include "ruy/pack_x86.h"
97#endif
98// IWYU pragma: end_exports
99
100namespace ruy {
101
102// General implementation of the PackImpl template, overridden by template
103// specializations for specific SIMD code paths. This general implementation
104// is used with Path::kStandardCpp and its internal test-only variants.
105template <Path ThePath, typename FixedKernelLayout, typename Scalar,
106 typename PackedScalar, typename SumsType, Order SrcOrder>
107struct PackImpl {
108 static void Run(Tuning, const Mat<Scalar>& src_matrix,
109 PMat<PackedScalar>* packed_matrix, int start_col,
110 int end_col) {
111 profiler::ScopeLabel label("Pack (generic)");
112 RUY_DCHECK_EQ(SrcOrder, src_matrix.layout.order);
113 RUY_DCHECK_EQ((end_col - start_col) % FixedKernelLayout::kCols, 0);
114 SumsType* sums = packed_matrix->sums;
115 for (int col = start_col; col < end_col; col++) {
116 SumsType accum = 0;
117 for (int row = 0; row < packed_matrix->layout.rows; row++) {
118 PackedScalar packed_val;
119 if (col < src_matrix.layout.cols && row < src_matrix.layout.rows) {
120 packed_val = Pack<PackedScalar>(Element(src_matrix, row, col));
121 } else {
122 packed_val = packed_matrix->zero_point;
123 }
124 accum += packed_val;
125 *ElementPtr(packed_matrix, row, col) = packed_val;
126 }
127 if (sums) {
128 sums[col] = accum;
129 }
130 }
131 }
132};
133
134// Main entry point for packing.
135template <Path ThePath, typename FixedKernelLayout, typename Scalar,
136 typename PackedScalar>
137void RunPack(Tuning tuning, const EMat& src_matrix, PEMat* packed_matrix,
138 int start_col, int end_col) {
139 RUY_TRACE_SCOPE;
140 using SumsType = typename PMat<PackedScalar>::SumsType;
141 Mat<Scalar> src = UneraseType<Scalar>(src_matrix);
142 PMat<PackedScalar> packed = UneraseType<PackedScalar>(*packed_matrix);
143 RUY_TRACE_INFO(RUN_PACK);
144 if (src.layout.order == Order::kColMajor) {
145 PackImpl<ThePath, FixedKernelLayout, Scalar, PackedScalar, SumsType,
146 Order::kColMajor>::Run(tuning, src, &packed, start_col, end_col);
147 } else {
148 PackImpl<ThePath, FixedKernelLayout, Scalar, PackedScalar, SumsType,
149 Order::kRowMajor>::Run(tuning, src, &packed, start_col, end_col);
150 }
151}
152
153} // namespace ruy
154
155#endif // RUY_RUY_PACK_H_
156