1 | /* Copyright 2019 Google LLC. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | |
16 | // # What is "packing"? |
17 | // |
18 | // Before feeding data to the gemm kernels (the parts of Ruy that do lots |
19 | // of multiply-add operations), Ruy first performs a data transformation (which |
20 | // we call "packing") on the input matrices. This transformation has two main |
21 | // goals: |
22 | // - rearrange data into blocks that are a convenient size/layout for the gemm |
23 | // kernels to consume. This helps make the memory access pattern of the gemm |
24 | // kernel simpler and more contiguous, and puts the data in a layout most |
25 | // convenient for specific arithmetic instructions in the gemm kernel. |
26 | // - compute row/column sums needed for handling quantization with non-symmetric |
27 | // zero points. |
28 | // |
29 | // # Simplified algorithmic analysis of packing |
30 | // |
31 | // Packing is a relatively simple transformation which does a small constant |
32 | // amount of work on each element of an input matrix, and hence for an NxM |
33 | // matrix performs O(N*M) work. If N and M are of the same order, then this is |
34 | // O(N^2) work. |
35 | // |
// An NxKxM matrix multiplication requires N*K*M multiply-accumulate operations.
37 | // Note that if N, K, and M are all the same order, then the number of |
38 | // multiply-accumulate operations is O(N^3). |
39 | // |
40 | // Thus, the O(N^2) cost of packing is small compared to the O(N^3) work, in the |
41 | // case of all dimensions being roughly the same order. |
42 | // |
43 | // # Packing cost can be significant |
44 | // |
45 | // When matrix * matrix multiplications begin to look more like matrix * vector |
46 | // multiplications, packing cost can become significant. We sometimes call these |
47 | // cases "gemv-like". |
48 | // |
49 | // Continuing the algorithmic analysis above, if we consider a case where an |
50 | // NxKxM matrix multiplication has either N = O(1) or M = O(1), then the |
51 | // situation is different. In this case, the multiply-accumulate work is only |
// quadratic, so the quadratic cost of packing can become significant.
53 | // |
54 | // Another way to say this is that the cost of packing an input matrix (either |
55 | // the LHS or RHS) is amortized across the non-depth dimension of the opposite |
56 | // input matrix. Thus, when the LHS has very few rows or the RHS has very few |
57 | // columns, the cost of packing the opposite input matrix can become |
58 | // significant. |
59 | // |
60 | // As a rough rule of thumb, the cost of packing starts to become significant |
61 | // when either N or M is below 32 (and other dimensions are hundreds), with very |
62 | // significant packing costs at 8 or below. This varies by data type, Path, and |
63 | // tuning, so these numbers are only rough guides. |
64 | // |
65 | // One practical use case that is affected by this is inference of |
66 | // fully connected neural network layers with a low batch size. The weight |
67 | // matrix (which is a constant for inference) is the one affected by significant |
68 | // packing cost. |
69 | // |
70 | // Ruy has an optional feature, accessed by Matrix::set_cache_policy(), to |
71 | // cache the packed forms of constant matrices. |
72 | // |
73 | // # Implementation notes |
74 | // |
75 | // Ruy's packing routines always operate on a range of columns and can be |
76 | // applied to either the LHS or RHS. This is possible because Ruy internally |
77 | // implements a TrMul, so the accumulation along depth is done along columns of |
78 | // both the LHS and RHS (whereas for a normal Mul the accumulation along depth |
79 | // for the LHS is along rows). As another example, we are always computing |
80 | // column sums for quantization (and never row sums, since the LHS is |
81 | // transposed). |
82 | |
83 | #ifndef RUY_RUY_PACK_H_ |
84 | #define RUY_RUY_PACK_H_ |
85 | |
86 | #include "ruy/mat.h" |
87 | #include "ruy/pack_common.h" |
88 | #include "ruy/path.h" |
89 | #include "ruy/platform.h" |
90 | #include "ruy/trace.h" |
91 | |
92 | // IWYU pragma: begin_exports |
93 | #if RUY_PLATFORM_NEON |
94 | #include "ruy/pack_arm.h" |
95 | #elif RUY_PLATFORM_X86 |
96 | #include "ruy/pack_x86.h" |
97 | #endif |
98 | // IWYU pragma: end_exports |
99 | |
100 | namespace ruy { |
101 | |
102 | // General implementation of the PackImpl template, overridden by template |
103 | // specializations for specific SIMD code paths. This general implementation |
104 | // is used with Path::kStandardCpp and its internal test-only variants. |
105 | template <Path ThePath, typename FixedKernelLayout, typename Scalar, |
106 | typename PackedScalar, typename SumsType, Order SrcOrder> |
107 | struct PackImpl { |
108 | static void Run(Tuning, const Mat<Scalar>& src_matrix, |
109 | PMat<PackedScalar>* packed_matrix, int start_col, |
110 | int end_col) { |
111 | profiler::ScopeLabel label("Pack (generic)" ); |
112 | RUY_DCHECK_EQ(SrcOrder, src_matrix.layout.order); |
113 | RUY_DCHECK_EQ((end_col - start_col) % FixedKernelLayout::kCols, 0); |
114 | SumsType* sums = packed_matrix->sums; |
115 | for (int col = start_col; col < end_col; col++) { |
116 | SumsType accum = 0; |
117 | for (int row = 0; row < packed_matrix->layout.rows; row++) { |
118 | PackedScalar packed_val; |
119 | if (col < src_matrix.layout.cols && row < src_matrix.layout.rows) { |
120 | packed_val = Pack<PackedScalar>(Element(src_matrix, row, col)); |
121 | } else { |
122 | packed_val = packed_matrix->zero_point; |
123 | } |
124 | accum += packed_val; |
125 | *ElementPtr(packed_matrix, row, col) = packed_val; |
126 | } |
127 | if (sums) { |
128 | sums[col] = accum; |
129 | } |
130 | } |
131 | } |
132 | }; |
133 | |
134 | // Main entry point for packing. |
135 | template <Path ThePath, typename FixedKernelLayout, typename Scalar, |
136 | typename PackedScalar> |
137 | void RunPack(Tuning tuning, const EMat& src_matrix, PEMat* packed_matrix, |
138 | int start_col, int end_col) { |
139 | RUY_TRACE_SCOPE; |
140 | using SumsType = typename PMat<PackedScalar>::SumsType; |
141 | Mat<Scalar> src = UneraseType<Scalar>(src_matrix); |
142 | PMat<PackedScalar> packed = UneraseType<PackedScalar>(*packed_matrix); |
143 | RUY_TRACE_INFO(RUN_PACK); |
144 | if (src.layout.order == Order::kColMajor) { |
145 | PackImpl<ThePath, FixedKernelLayout, Scalar, PackedScalar, SumsType, |
146 | Order::kColMajor>::Run(tuning, src, &packed, start_col, end_col); |
147 | } else { |
148 | PackImpl<ThePath, FixedKernelLayout, Scalar, PackedScalar, SumsType, |
149 | Order::kRowMajor>::Run(tuning, src, &packed, start_col, end_col); |
150 | } |
151 | } |
152 | |
153 | } // namespace ruy |
154 | |
155 | #endif // RUY_RUY_PACK_H_ |
156 | |