block_params.h source code [tensorflow/external/gemmlowp/internal/block_params.h]

1	// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
2	//
3	// Licensed under the Apache License, Version 2.0 (the "License");
4	// you may not use this file except in compliance with the License.
5	// You may obtain a copy of the License at
6	//
7	// http://www.apache.org/licenses/LICENSE-2.0
8	//
9	// Unless required by applicable law or agreed to in writing, software
10	// distributed under the License is distributed on an "AS IS" BASIS,
11	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12	// See the License for the specific language governing permissions and
13	// limitations under the License.
14
15	// block_params.h: Logic to choose L1 and L2 block sizes
16	// to optimize cache-friendliness.
17
18	#ifndef GEMMLOWP_INTERNAL_BLOCK_PARAMS_H_
19	#define GEMMLOWP_INTERNAL_BLOCK_PARAMS_H_
20
21	#include "common.h"
22
23	namespace gemmlowp {
24
25	// A BlockParams instance contains a full description of all the block size
26	// parameters to be used by a Gemm.
27	// There are two nested levels of block subdivisions: first a subdivision
28	// into large blocks that should fit in last-level cache (what we call L2 here)
29	// and then another subdivision into smaller blocks that should fit in
30	// L1 cache. There is then actually a third level of subdivision to fit
31	// in registers, but we are not concerned with that here.
32	struct BlockParams {
33	// L1 block parameters determine the size of small blocks that should
34	// fit in L1 cache.
35	int l1_rows;
36	int l1_cols;
37	int l1_depth;
38
39	// L2 block parameters determine the size of larger blocks that should
40	// fit in L2 cache.
41	int l2_rows;
42	int l2_cols;
43	int l2_depth;
44
45	template <typename KernelFormat>
46	void Init(int rows, int cols, int depth, int num_threads, int l1_bytes_to_use,
47	int l2_bytes_to_use, float l2_rhs_factor) {
48	FindL2BlockSizes<KernelFormat>(rows, cols, depth, num_threads,
49	l2_bytes_to_use, l2_rhs_factor, &l2_rows,
50	&l2_cols, &l2_depth);
51	FindL1BlockSizes<KernelFormat>(l2_rows, l2_cols, l2_depth, l1_bytes_to_use,
52	&l1_rows, &l1_cols, &l1_depth);
53	}
54
55	template <typename KernelFormat>
56	static void FindL2BlockSizes(int rows, int cols, int depth, int num_threads,
57	int l2_bytes_to_use, float l2_rhs_factor,
58	int* out_l2_rows, int* out_l2_cols,
59	int* out_l2_depth) {
60	int l2_rows = `0`;
61	int l2_cols = `0`;
62	int l2_depth = `0`;
63
64	int per_thread_rows =
65	std::max(`1`, RoundUp<KernelFormat::kRows>(rows) / num_threads);
66
67	// No L2 blocking in the depth dimension at the moment.
68	// Too much loss of accuracy due to storing intermediate results in
69	// low precision.
70	// However, we still want to round l2_depth up to the next multiple
71	// of register size, so as to avoid having to special-case unaligned depths.
72	l2_depth = RoundUp<kRegisterSize>(depth);
73
74	{
75	int max_cache_friendly_l2_cols = std::max(
76	`1`, static_cast<int>(l2_rhs_factor * (l2_bytes_to_use / l2_depth)));
77	int min_l2_cols_blocks =
78	std::max(`1`, CeilQuotient(cols, max_cache_friendly_l2_cols));
79	l2_cols =
80	RoundUp<KernelFormat::kCols>(CeilQuotient(cols, min_l2_cols_blocks));
81	}
82
83	// No L2 blocking in the row dimension if l2_rhs_factor is 1.0 as the row
84	// dimension concerns only the LHS. Blocking only RHS matrix for L2 enhances
85	// the performance on x86.
86	if (l2_rhs_factor == `1.0f`) {
87	l2_rows = RoundUp<KernelFormat::kRows>(per_thread_rows);
88	} else {
89	int max_cache_friendly_l2_rows =
90	std::max(`1`, (l2_bytes_to_use - l2_depth * l2_cols) /
91	(num_threads * (l2_depth + `4` * l2_cols)));
92	int min_l2_rows_blocks = std::max(
93	`1`, CeilQuotient(per_thread_rows, max_cache_friendly_l2_rows));
94	l2_rows = RoundUp<KernelFormat::kRows>(
95	CeilQuotient(per_thread_rows, min_l2_rows_blocks));
96	}
97
98	*out_l2_rows = l2_rows;
99	*out_l2_cols = l2_cols;
100	*out_l2_depth = l2_depth;
101	}
102
103	template <typename KernelFormat>
104	static void FindL1BlockSizes(int rows, int cols, int depth,
105	int l1_bytes_to_use, int* out_l1_rows,
106	int* out_l1_cols, int* out_l1_depth) {
107	int l1_rows = `0`;
108	int l1_cols = `0`;
109	int l1_depth = `0`;
110
111	// L2 block sizes should already be multiples of kernel block sizes.
112	assert(rows % KernelFormat::kRows == `0`);
113	assert(cols % KernelFormat::kCols == `0`);
114	assert(depth % KernelFormat::kDepth == `0`);
115
116	// No L1 blocking in the columns dimension at the moment.
117	// Thought not to be needed. Similar to Eigen.
118	l1_cols = cols;
119
120	{
121	int max_cache_friendly_l1_depth = std::max(
122	`1`, (l1_bytes_to_use - `4` * KernelFormat::kRows * KernelFormat::kCols) /
123	(KernelFormat::kRows + KernelFormat::kCols));
124	int min_l1_depth_blocks =
125	std::max(`1`, CeilQuotient(depth, max_cache_friendly_l1_depth));
126	l1_depth =
127	RoundUp<kRegisterSize>(CeilQuotient(depth, min_l1_depth_blocks));
128	}
129
130	{
131	int max_cache_friendly_l1_rows =
132	std::max(`1`, l1_bytes_to_use / (l1_depth + `4` * l1_cols));
133	int min_l1_rows_blocks =
134	std::max(`1`, CeilQuotient(rows, max_cache_friendly_l1_rows));
135	l1_rows =
136	RoundUp<KernelFormat::kRows>(CeilQuotient(rows, min_l1_rows_blocks));
137	}
138
139	*out_l1_rows = l1_rows;
140	*out_l1_cols = l1_cols;
141	*out_l1_depth = l1_depth;
142	}
143	};
144
145	// A SideBlockParams instance contains only the block params relevant to
146	// one side (LHS or RHS), expressed in terms of 'width' instead of
147	// rows/colums. See the explanation in kernel.h: in the LHS, 'width' means
148	// the number of rows, while in the RHS, 'width' means the number of columns.
149	// That allows us to write generic code that applies to either LHS or RHS.
150	struct SideBlockParams {
151	// L1 block parameters determine the size of small blocks that should
152	// fit in L1 cache.
153	int l1_width;
154	int l1_depth;
155
156	// L2 block parameters determine the size of larger blocks that should
157	// fit in L2 cache.
158	int l2_width;
159	int l2_depth;
160	};
161
162	enum class Side { Lhs, Rhs };
163
164	inline void GetSideBlockParams(Side side, SideBlockParams* side_block_params,
165	const BlockParams& block_params) {
166	side_block_params->l1_width =
167	side == Side::Lhs ? block_params.l1_rows : block_params.l1_cols;
168	side_block_params->l2_width =
169	side == Side::Lhs ? block_params.l2_rows : block_params.l2_cols;
170
171	side_block_params->l1_depth = block_params.l1_depth;
172	side_block_params->l2_depth = block_params.l2_depth;
173	}
174
175	} // namespace gemmlowp
176
177	#endif // GEMMLOWP_INTERNAL_BLOCK_PARAMS_H_
178

Browse the source code of tensorflow/external/gemmlowp/internal/block_params.h