1 | // Copyright 2015 The Gemmlowp Authors. All Rights Reserved. |
2 | // |
3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
4 | // you may not use this file except in compliance with the License. |
5 | // You may obtain a copy of the License at |
6 | // |
7 | // http://www.apache.org/licenses/LICENSE-2.0 |
8 | // |
9 | // Unless required by applicable law or agreed to in writing, software |
10 | // distributed under the License is distributed on an "AS IS" BASIS, |
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | // See the License for the specific language governing permissions and |
13 | // limitations under the License. |
14 | |
15 | // block_params.h: Logic to choose L1 and L2 block sizes |
16 | // to optimize cache-friendliness. |
17 | |
18 | #ifndef GEMMLOWP_INTERNAL_BLOCK_PARAMS_H_ |
19 | #define GEMMLOWP_INTERNAL_BLOCK_PARAMS_H_ |
20 | |
21 | #include "common.h" |
22 | |
23 | namespace gemmlowp { |
24 | |
25 | // A BlockParams instance contains a full description of all the block size |
26 | // parameters to be used by a Gemm. |
27 | // There are two nested levels of block subdivisions: first a subdivision |
28 | // into large blocks that should fit in last-level cache (what we call L2 here) |
29 | // and then another subdivision into smaller blocks that should fit in |
30 | // L1 cache. There is then actually a third level of subdivision to fit |
31 | // in registers, but we are not concerned with that here. |
32 | struct BlockParams { |
33 | // L1 block parameters determine the size of small blocks that should |
34 | // fit in L1 cache. |
35 | int l1_rows; |
36 | int l1_cols; |
37 | int l1_depth; |
38 | |
39 | // L2 block parameters determine the size of larger blocks that should |
40 | // fit in L2 cache. |
41 | int l2_rows; |
42 | int l2_cols; |
43 | int l2_depth; |
44 | |
45 | template <typename KernelFormat> |
46 | void Init(int rows, int cols, int depth, int num_threads, int l1_bytes_to_use, |
47 | int l2_bytes_to_use, float l2_rhs_factor) { |
48 | FindL2BlockSizes<KernelFormat>(rows, cols, depth, num_threads, |
49 | l2_bytes_to_use, l2_rhs_factor, &l2_rows, |
50 | &l2_cols, &l2_depth); |
51 | FindL1BlockSizes<KernelFormat>(l2_rows, l2_cols, l2_depth, l1_bytes_to_use, |
52 | &l1_rows, &l1_cols, &l1_depth); |
53 | } |
54 | |
55 | template <typename KernelFormat> |
56 | static void FindL2BlockSizes(int rows, int cols, int depth, int num_threads, |
57 | int l2_bytes_to_use, float l2_rhs_factor, |
58 | int* out_l2_rows, int* out_l2_cols, |
59 | int* out_l2_depth) { |
60 | int l2_rows = 0; |
61 | int l2_cols = 0; |
62 | int l2_depth = 0; |
63 | |
64 | int per_thread_rows = |
65 | std::max(1, RoundUp<KernelFormat::kRows>(rows) / num_threads); |
66 | |
67 | // No L2 blocking in the depth dimension at the moment. |
68 | // Too much loss of accuracy due to storing intermediate results in |
69 | // low precision. |
70 | // However, we still want to round l2_depth up to the next multiple |
71 | // of register size, so as to avoid having to special-case unaligned depths. |
72 | l2_depth = RoundUp<kRegisterSize>(depth); |
73 | |
74 | { |
75 | int max_cache_friendly_l2_cols = std::max( |
76 | 1, static_cast<int>(l2_rhs_factor * (l2_bytes_to_use / l2_depth))); |
77 | int min_l2_cols_blocks = |
78 | std::max(1, CeilQuotient(cols, max_cache_friendly_l2_cols)); |
79 | l2_cols = |
80 | RoundUp<KernelFormat::kCols>(CeilQuotient(cols, min_l2_cols_blocks)); |
81 | } |
82 | |
83 | // No L2 blocking in the row dimension if l2_rhs_factor is 1.0 as the row |
84 | // dimension concerns only the LHS. Blocking only RHS matrix for L2 enhances |
85 | // the performance on x86. |
86 | if (l2_rhs_factor == 1.0f) { |
87 | l2_rows = RoundUp<KernelFormat::kRows>(per_thread_rows); |
88 | } else { |
89 | int max_cache_friendly_l2_rows = |
90 | std::max(1, (l2_bytes_to_use - l2_depth * l2_cols) / |
91 | (num_threads * (l2_depth + 4 * l2_cols))); |
92 | int min_l2_rows_blocks = std::max( |
93 | 1, CeilQuotient(per_thread_rows, max_cache_friendly_l2_rows)); |
94 | l2_rows = RoundUp<KernelFormat::kRows>( |
95 | CeilQuotient(per_thread_rows, min_l2_rows_blocks)); |
96 | } |
97 | |
98 | *out_l2_rows = l2_rows; |
99 | *out_l2_cols = l2_cols; |
100 | *out_l2_depth = l2_depth; |
101 | } |
102 | |
103 | template <typename KernelFormat> |
104 | static void FindL1BlockSizes(int rows, int cols, int depth, |
105 | int l1_bytes_to_use, int* out_l1_rows, |
106 | int* out_l1_cols, int* out_l1_depth) { |
107 | int l1_rows = 0; |
108 | int l1_cols = 0; |
109 | int l1_depth = 0; |
110 | |
111 | // L2 block sizes should already be multiples of kernel block sizes. |
112 | assert(rows % KernelFormat::kRows == 0); |
113 | assert(cols % KernelFormat::kCols == 0); |
114 | assert(depth % KernelFormat::kDepth == 0); |
115 | |
116 | // No L1 blocking in the columns dimension at the moment. |
117 | // Thought not to be needed. Similar to Eigen. |
118 | l1_cols = cols; |
119 | |
120 | { |
121 | int max_cache_friendly_l1_depth = std::max( |
122 | 1, (l1_bytes_to_use - 4 * KernelFormat::kRows * KernelFormat::kCols) / |
123 | (KernelFormat::kRows + KernelFormat::kCols)); |
124 | int min_l1_depth_blocks = |
125 | std::max(1, CeilQuotient(depth, max_cache_friendly_l1_depth)); |
126 | l1_depth = |
127 | RoundUp<kRegisterSize>(CeilQuotient(depth, min_l1_depth_blocks)); |
128 | } |
129 | |
130 | { |
131 | int max_cache_friendly_l1_rows = |
132 | std::max(1, l1_bytes_to_use / (l1_depth + 4 * l1_cols)); |
133 | int min_l1_rows_blocks = |
134 | std::max(1, CeilQuotient(rows, max_cache_friendly_l1_rows)); |
135 | l1_rows = |
136 | RoundUp<KernelFormat::kRows>(CeilQuotient(rows, min_l1_rows_blocks)); |
137 | } |
138 | |
139 | *out_l1_rows = l1_rows; |
140 | *out_l1_cols = l1_cols; |
141 | *out_l1_depth = l1_depth; |
142 | } |
143 | }; |
144 | |
145 | // A SideBlockParams instance contains only the block params relevant to |
146 | // one side (LHS or RHS), expressed in terms of 'width' instead of |
147 | // rows/colums. See the explanation in kernel.h: in the LHS, 'width' means |
148 | // the number of rows, while in the RHS, 'width' means the number of columns. |
149 | // That allows us to write generic code that applies to either LHS or RHS. |
150 | struct SideBlockParams { |
151 | // L1 block parameters determine the size of small blocks that should |
152 | // fit in L1 cache. |
153 | int l1_width; |
154 | int l1_depth; |
155 | |
156 | // L2 block parameters determine the size of larger blocks that should |
157 | // fit in L2 cache. |
158 | int l2_width; |
159 | int l2_depth; |
160 | }; |
161 | |
162 | enum class Side { Lhs, Rhs }; |
163 | |
164 | inline void GetSideBlockParams(Side side, SideBlockParams* side_block_params, |
165 | const BlockParams& block_params) { |
166 | side_block_params->l1_width = |
167 | side == Side::Lhs ? block_params.l1_rows : block_params.l1_cols; |
168 | side_block_params->l2_width = |
169 | side == Side::Lhs ? block_params.l2_rows : block_params.l2_cols; |
170 | |
171 | side_block_params->l1_depth = block_params.l1_depth; |
172 | side_block_params->l2_depth = block_params.l2_depth; |
173 | } |
174 | |
175 | } // namespace gemmlowp |
176 | |
177 | #endif // GEMMLOWP_INTERNAL_BLOCK_PARAMS_H_ |
178 | |