1 | /* Copyright 2019 Google LLC. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | |
16 | // # What is "packing"? |
17 | // |
18 | // Before feeding data to the gemm kernels (the parts of Ruy that do lots |
19 | // of multiply-add operations), Ruy first performs a data transformation (which |
20 | // we call "packing") on the input matrices. This transformation has two main |
21 | // goals: |
22 | // - rearrange data into blocks that are a convenient size/layout for the gemm |
23 | // kernels to consume. This helps make the memory access pattern of the gemm |
24 | // kernel simpler and more contiguous, and puts the data in a layout most |
25 | // convenient for specific arithmetic instructions in the gemm kernel. |
26 | // - compute row/column sums needed for handling quantization with non-symmetric |
27 | // zero points. |
28 | // |
29 | // # Simplified algorithmic analysis of packing |
30 | // |
31 | // Packing is a relatively simple transformation which does a small constant |
32 | // amount of work on each element of an input matrix, and hence for an NxM |
33 | // matrix performs O(N*M) work. If N and M are of the same order, then this is |
34 | // O(N^2) work. |
35 | // |
// An NxKxM matrix multiplication requires N*K*M multiply-accumulate operations.
37 | // Note that if N, K, and M are all the same order, then the number of |
38 | // multiply-accumulate operations is O(N^3). |
39 | // |
40 | // Thus, the O(N^2) cost of packing is small compared to the O(N^3) work, in the |
41 | // case of all dimensions being roughly the same order. |
42 | // |
43 | // # Packing cost can be significant |
44 | // |
45 | // When matrix * matrix multiplications begin to look more like matrix * vector |
46 | // multiplications, packing cost can become significant. We sometimes call these |
47 | // cases "gemv-like". |
48 | // |
49 | // Continuing the algorithmic analysis above, if we consider a case where an |
50 | // NxKxM matrix multiplication has either N = O(1) or M = O(1), then the |
51 | // situation is different. In this case, the multiply-accumulate work is only |
// quadratic, so the quadratic cost of packing can become significant.
53 | // |
54 | // Another way to say this is that the cost of packing an input matrix (either |
55 | // the LHS or RHS) is amortized across the non-depth dimension of the opposite |
56 | // input matrix. Thus, when the LHS has very few rows or the RHS has very few |
57 | // columns, the cost of packing the opposite input matrix can become |
58 | // significant. |
59 | // |
60 | // As a rough rule of thumb, the cost of packing starts to become significant |
61 | // when either N or M is below 32 (and other dimensions are hundreds), with very |
62 | // significant packing costs at 8 or below. This varies by data type, Path, and |
63 | // tuning, so these numbers are only rough guides. |
64 | // |
65 | // One practical use case that is affected by this is inference of |
66 | // fully connected neural network layers with a low batch size. The weight |
67 | // matrix (which is a constant for inference) is the one affected by significant |
68 | // packing cost. |
69 | // |
70 | // Ruy has an optional feature, accessed by Matrix::set_cache_policy(), to |
71 | // cache the packed forms of constant matrices. |
72 | // |
73 | // # Implementation notes |
74 | // |
75 | // Ruy's packing routines always operate on a range of columns and can be |
76 | // applied to either the LHS or RHS. This is possible because Ruy internally |
77 | // implements a TrMul, so the accumulation along depth is done along columns of |
78 | // both the LHS and RHS (whereas for a normal Mul the accumulation along depth |
79 | // for the LHS is along rows). As another example, we are always computing |
80 | // column sums for quantization (and never row sums, since the LHS is |
81 | // transposed). |
82 | |
83 | #ifndef RUY_RUY_PACK_H_ |
84 | #define RUY_RUY_PACK_H_ |
85 | |
86 | #include "ruy/mat.h" |
87 | #include "ruy/pack_common.h" |
88 | #include "ruy/path.h" |
89 | #include "ruy/platform.h" |
90 | #include "ruy/trace.h" |
91 | |
92 | // IWYU pragma: begin_exports |
93 | #if RUY_PLATFORM_NEON |
94 | #include "ruy/pack_arm.h" |
95 | #elif RUY_PLATFORM_X86 |
96 | #include "ruy/pack_x86.h" |
97 | #endif |
98 | // IWYU pragma: end_exports |
99 | |
100 | namespace ruy { |
101 | |
102 | // General implementation of the PackImpl template, overridden by template |
103 | // specializations for specific SIMD code paths. This general implementation |
104 | // is used with Path::kStandardCpp and its internal test-only variants. |
105 | template <Path ThePath, typename FixedKernelLayout, typename Scalar, |
106 | typename PackedScalar, typename SumsType, Order SrcOrder> |
107 | struct PackImpl { |
108 | static void Run(Tuning, const Mat<Scalar>& src_matrix, |
109 | PMat<PackedScalar>* packed_matrix, int start_col, |
110 | int end_col) { |
111 | profiler::ScopeLabel label("Pack (generic)" ); |
112 | RUY_DCHECK_EQ(SrcOrder, src_matrix.layout.order); |
113 | RUY_DCHECK_EQ((end_col - start_col) % FixedKernelLayout::kCols, 0); |
114 | SumsType* sums = packed_matrix->sums; |
115 | for (int col = start_col; col < end_col; col++) { |
116 | SumsType accum = 0; |
117 | for (int row = 0; row < packed_matrix->layout.rows; row++) { |
118 | PackedScalar packed_val; |
119 | if (col < src_matrix.layout.cols && row < src_matrix.layout.rows) { |
120 | packed_val = Pack<PackedScalar>(Element(src_matrix, row, col)); |
121 | } else { |
122 | packed_val = packed_matrix->zero_point; |
123 | } |
124 | accum += packed_val; |
125 | *ElementPtr(packed_matrix, row, col) = packed_val; |
126 | } |
127 | if (sums) { |
128 | sums[col] = accum; |
129 | } |
130 | } |
131 | } |
132 | }; |
133 | |
134 | // Main entry point for packing. |
135 | template <Path ThePath, typename FixedKernelLayout, typename Scalar, |
136 | typename PackedScalar> |
137 | void RunPack(Tuning tuning, const EMat& src_matrix, PEMat* packed_matrix, |
138 | int start_col, int end_col) { |
139 | RUY_TRACE_SCOPE; |
140 | using SumsType = typename PMat<PackedScalar>::SumsType; |
141 | Mat<Scalar> src = UneraseType<Scalar>(src_matrix); |
142 | PMat<PackedScalar> packed = UneraseType<PackedScalar>(*packed_matrix); |
143 | RUY_TRACE_INFO(RUN_PACK); |
144 | if (src.layout.order == Order::kColMajor) { |
145 | PackImpl<ThePath, FixedKernelLayout, Scalar, PackedScalar, SumsType, |
146 | Order::kColMajor>::Run(tuning, src, &packed, start_col, end_col); |
147 | } else { |
148 | PackImpl<ThePath, FixedKernelLayout, Scalar, PackedScalar, SumsType, |
149 | Order::kRowMajor>::Run(tuning, src, &packed, start_col, end_col); |
150 | } |
151 | } |
152 | |
153 | } // namespace ruy |
154 | |
155 | #endif // RUY_RUY_PACK_H_ |
156 | |