/* Copyright 2019 Google LLC. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "ruy/block_map.h"

#include <algorithm>
#include <cstdint>
#include <limits>

#ifdef RUY_MAKEBLOCKMAP_DEBUG
#include <cstdio>
#include <cstdlib>
#include <string>
#endif

#include "ruy/check_macros.h"
#include "ruy/opt_set.h"
#include "ruy/profiler/instrumentation.h"
#include "ruy/size_util.h"
#include "ruy/trace.h"

namespace ruy {

namespace {

void DecodeTraversalLinear(int size_log2, std::uint32_t square_index,
                           SidePair<int>* local_pos) {
  (*local_pos)[Side::kLhs] = square_index & ((1 << size_log2) - 1);
  (*local_pos)[Side::kRhs] = square_index >> size_log2;
}

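// Decodes a Z-order (Morton) curve position: the Lhs coordinate is gathered
// from the even-numbered bits of square_index and the Rhs coordinate from the
// odd-numbered bits. The masked shift/or steps below de-interleave the bits in
// 4 rounds instead of one bit at a time. For example, square_index = 0b1101
// yields Lhs = 0b11 = 3 and Rhs = 0b10 = 2.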
void DecodeTraversalFractalZ(std::uint32_t square_index,
                             SidePair<int>* local_pos) {
  const std::uint32_t n1 = square_index;
  const std::uint32_t n2 = (n1 & 0x99999999u) | ((n1 & 0x44444444u) >> 1) |
                           ((n1 & 0x22222222u) << 1);
  const std::uint32_t n4 = (n2 & 0xc3c3c3c3u) | ((n2 & 0x30303030u) >> 2) |
                           ((n2 & 0x0c0c0c0cu) << 2);
  const std::uint32_t n8 = (n4 & 0xf00ff00fu) | ((n4 & 0x0f000f00u) >> 4) |
                           ((n4 & 0x00f000f0u) << 4);
  const std::uint32_t n16 = (n8 & 0xff0000ffu) | ((n8 & 0x00ff0000u) >> 8) |
                            ((n8 & 0x0000ff00u) << 8);
  (*local_pos)[Side::kLhs] = n16 & 0xffff;
  (*local_pos)[Side::kRhs] = n16 >> 16;
}

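// Decodes a U-order curve position. This reuses the Z-order decoding and then
// XORs the Lhs coordinate with the Rhs coordinate, which within each 2x2
// sub-square turns the Z visit order (0,0),(1,0),(0,1),(1,1) into the U order
// (0,0),(1,0),(1,1),(0,1), so that consecutive blocks always share an edge.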
void DecodeTraversalFractalU(std::uint32_t square_index,
                             SidePair<int>* local_pos) {
  DecodeTraversalFractalZ(square_index, local_pos);
  // Change fractal z-order to u-order
  (*local_pos)[Side::kLhs] ^= (*local_pos)[Side::kRhs];
}

// Code inspired by the sample code in
// https://en.wikipedia.org/wiki/Hilbert_curve
// The main optimization is to avoid hard-to-predict conditional branches
// based on the bits of the square_index parameter.
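// Each loop iteration below consumes two bits of square_index that select one
// of the four quadrants at the current scale s, then reflects/swaps the
// coordinates accumulated so far and adds the quadrant offset, as in the
// Wikipedia d2xy sample, but with the branches rewritten as ternary selects.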
void DecodeTraversalFractalHilbert(int size_log2, std::uint32_t square_index,
                                   SidePair<int>* local_pos) {
  std::uint32_t t = square_index;
  std::uint32_t x = 0;
  std::uint32_t y = 0;
  // Easy-to-predict for loop, the number of iterations is the same for
  // an entire GEMM.
  for (int sb = 0; sb < size_log2; sb++) {
    std::uint32_t s = 1 << sb;
    bool rx = t & 2;
    bool ry = (t & 1) ^ rx;
    std::uint32_t tmp = rx ? (s - 1 - x) : x;
    x = ry ? x : rx ? (s - 1 - y) : y;
    y = ry ? (y + s) : tmp;
    x = rx ? (x + s) : x;
    t >>= 2;
  }
  (*local_pos)[Side::kLhs] = y;
  (*local_pos)[Side::kRhs] = x;
}

}  // end anonymous namespace

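// Decodes a linear block index into (Lhs, Rhs) block coordinates. The low
// 2 * num_blocks_base_log2 bits select a position inside one square-ish grid,
// decoded according to the traversal order; the remaining high bits select
// which repetition of that grid along the longer dimension (the
// 'rectangularness' subdivision) the block belongs to.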
void GetBlockByIndex(const BlockMap& block_map, int index,
                     SidePair<int>* block) {
  profiler::ScopeLabel label("GetBlockByIndex");
  const std::uint32_t index_u32 = index;

  const std::uint32_t num_blocks_per_local_curve =
      1u << (2 * block_map.num_blocks_base_log2);
  const std::uint32_t square_index =
      index_u32 & (num_blocks_per_local_curve - 1);

  const int size_log2 = block_map.num_blocks_base_log2;
  SidePair<int> local_pos;
  switch (block_map.traversal_order) {
    case BlockMapTraversalOrder::kFractalZ:
      DecodeTraversalFractalZ(square_index, &local_pos);
      break;
    case BlockMapTraversalOrder::kFractalU:
      DecodeTraversalFractalU(square_index, &local_pos);
      break;
    case BlockMapTraversalOrder::kFractalHilbert:
      DecodeTraversalFractalHilbert(size_log2, square_index, &local_pos);
      break;
    default:
      RUY_DCHECK(block_map.traversal_order == BlockMapTraversalOrder::kLinear);
      DecodeTraversalLinear(size_log2, square_index, &local_pos);
      break;
  }

  const std::uint32_t rectangular_index =
      index_u32 >> 2 * block_map.num_blocks_base_log2;
  for (Side side : {Side::kLhs, Side::kRhs}) {
    const std::uint32_t mask = (1u << block_map.rectangularness_log2[side]) - 1;
    const int rectangular_offset = (rectangular_index & mask)
                                   << block_map.num_blocks_base_log2;
    (*block)[side] = local_pos[side] + rectangular_offset;
  }
}

namespace {

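// Chooses between linear and fractal traversal orders. Fractal orders are only
// worth their decoding overhead when the data read while traversing one
// square-ish region (LHS rows plus RHS columns, across the whole depth)
// exceeds the local, typically per-core, cache; the Hilbert order is reserved
// for working sets that also exceed the last-level cache, where its better
// locality is most likely to pay off.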
BlockMapTraversalOrder GetTraversalOrder(
    int rows_after_rectangularness_division,
    int cols_after_rectangularness_division, int depth, int lhs_scalar_size,
    int rhs_scalar_size, const CpuCacheParams& cpu_cache_params) {
  static constexpr bool kAnyFractal =
      RUY_OPT(FRACTAL_Z) | RUY_OPT(FRACTAL_U) | RUY_OPT(FRACTAL_HILBERT);
  const int working_set_size =
      (lhs_scalar_size * rows_after_rectangularness_division +
       rhs_scalar_size * cols_after_rectangularness_division) *
      depth;
  if (kAnyFractal && (working_set_size > cpu_cache_params.local_cache_size)) {
    if (RUY_OPT(FRACTAL_HILBERT) &&
        (working_set_size > cpu_cache_params.last_level_cache_size)) {
      return BlockMapTraversalOrder::kFractalHilbert;
    } else if (RUY_OPT(FRACTAL_U)) {
      return BlockMapTraversalOrder::kFractalU;
    } else {
      return BlockMapTraversalOrder::kFractalZ;
    }
  } else {
    return BlockMapTraversalOrder::kLinear;
  }
}

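// Returns floor(log2(num / denom)), clamped to be >= 0, without performing the
// division. For example, floor_log2_quotient(1000, 8) == 6, since
// 1000 / 8 == 125 and floor(log2(125)) == 6.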
int floor_log2_quotient(int num, int denom) {
  if (num <= denom) {
    return 0;
  }
  int log2_quotient = floor_log2(num) - ceil_log2(denom);
  if ((denom << (log2_quotient + 1)) <= num) {
    log2_quotient++;
  }
  return log2_quotient;
}

// Computes the rectangularness of the matrix shape (rows, cols). This is
// essentially just the log2 of the quotient (rows / cols). The kernel_rows and
// kernel_cols only get into the picture for clamping bounds but don't affect
// the generic computation.
void GetRectangularness(int rows, int cols, int kernel_rows, int kernel_cols,
                        int* rows_rectangularness_log2,
                        int* cols_rectangularness_log2) {
  *rows_rectangularness_log2 = 0;
  *cols_rectangularness_log2 = 0;

  // In GEMV-ish cases, that is, when kernel blocks are as narrow as the kernel
  // itself, we risk having too small kernel blocks for good kernel
  // amortization. We avoid that by limiting rectangularness so that kernel
  // blocks are not too tiny at least in that dimension. Specifically, we try
  // to have at least (2^min_kernel_inner_loop_runs_log2) kernels fitting in
  // each kernel block along the large dimension.
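  // For example, with rows = 1024, cols = 8 and a 4x4 kernel,
  // floor_log2_quotient(1024, 8) == 7, but the clamp below limits
  // rows_rectangularness_log2 to 6 so that each square-ish subdivision still
  // spans 1024 >> 6 == 16 rows, i.e. 4 kernel heights.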
  const int min_kernel_inner_loop_runs_log2 = 3;
  if (rows > cols) {
    int cols_of_kernel_inner_loop_runs_log2 =
        ceil_log2(cols) - pot_log2(kernel_cols);
    int min_rows_of_kernel_inner_loop_runs_log2 =
        std::max(0, min_kernel_inner_loop_runs_log2 -
                        cols_of_kernel_inner_loop_runs_log2);
    *rows_rectangularness_log2 =
        std::min(floor_log2_quotient(rows, cols),
                 std::max(0, floor_log2(rows) - pot_log2(kernel_rows) -
                                 min_rows_of_kernel_inner_loop_runs_log2));
    // Sanity check that we did not over-estimate rows_rectangularness_log2.
    RUY_DCHECK_GE(rows >> *rows_rectangularness_log2, cols);
  } else if (cols > rows) {
    int rows_of_kernel_inner_loop_runs_log2 =
        ceil_log2(rows) - pot_log2(kernel_rows);
    int min_cols_of_kernel_inner_loop_runs_log2 =
        std::max(0, min_kernel_inner_loop_runs_log2 -
                        rows_of_kernel_inner_loop_runs_log2);
    *cols_rectangularness_log2 =
        std::min(floor_log2_quotient(cols, rows),
                 std::max(0, floor_log2(cols) - pot_log2(kernel_cols) -
                                 min_cols_of_kernel_inner_loop_runs_log2));
    // Sanity check that we did not over-estimate cols_rectangularness_log2.
    RUY_DCHECK_GE(cols >> *cols_rectangularness_log2, rows);
  }
  RUY_DCHECK(!*rows_rectangularness_log2 || !*cols_rectangularness_log2);
}

// Computes a 'multithreading score'. When multithreading, we need there to
// be at least as many tiles as there are threads, and hopefully
// substantially more than that, so we benefit from ruy's ability to
// dispatch fine-grained workloads to threads.
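// For example, 64 full blocks shared by 4 threads gives
// blocks_per_thread_log2 == 4 and the maximal score of 16, while 4 blocks for
// 4 threads gives blocks_per_thread_log2 == 0 and a score of -16.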
int GetMultithreadingScore(int block_size_log2, int rows, int cols,
                           int tentative_thread_count) {
  const int num_full_blocks_of_rows = rows >> block_size_log2;
  const int num_full_blocks_of_cols = cols >> block_size_log2;
  const int candidate_num_full_blocks_log2 = floor_log2(
      std::max(1, num_full_blocks_of_rows * num_full_blocks_of_cols));

  // The values here have been tuned on ARM Cortex-A55.
  // We expect this to have to be tuned differently for other CPUs.
  if (tentative_thread_count == 1) {
    return 0;
  } else {
    const int blocks_per_thread_log2 =
        candidate_num_full_blocks_log2 - ceil_log2(tentative_thread_count);
    if (blocks_per_thread_log2 < 0) {
      return -64;
    } else if (blocks_per_thread_log2 == 0) {
      return -16;
    } else if (blocks_per_thread_log2 == 1) {
      return -8;
    } else if (blocks_per_thread_log2 == 2) {
      return 0;
    } else if (blocks_per_thread_log2 == 3) {
      return 8;
    } else {
      return 16;
    }
  }
}

// Computes a 'cache locality score'.
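// The idea is to reward block sizes whose per-block working set (the LHS and
// RHS data read while traversing one block, across the whole depth) fits in
// the local, typically per-core, data cache, and to increasingly penalize
// block sizes that overflow it.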
int GetCacheLocalityScore(int block_size_log2, int rows, int cols, int depth,
                          int kernel_rows_log2, int kernel_cols_log2,
                          int lhs_scalar_size, int rhs_scalar_size,
                          const CpuCacheParams& cpu_cache_params) {
  // In the narrow case (e.g. matrix*vector), each byte of the big operand
  // matrix (either LHS or RHS) is traversed only once, so any notion of data
  // locality is irrelevant. Ignore the 'cache locality score' by forcing it to
  // be 0 in that case.
  if (rows <= (1 << kernel_rows_log2) || cols <= (1 << kernel_cols_log2)) {
    return 0;
  }
  const int block_rows = std::min(1 << block_size_log2, rows);
  const int block_cols = std::min(1 << block_size_log2, cols);
  const int total_read_bytes =
      (lhs_scalar_size * block_rows + rhs_scalar_size * block_cols) * depth;
  const int total_read_bytes_log2 = ceil_log2(total_read_bytes);
  const int nonlocality_log2 =
      total_read_bytes_log2 - floor_log2(cpu_cache_params.local_cache_size);
  // The values here have been tuned on ARM Cortex-A55.
  // We expect this to have to be tuned differently for other CPUs.
  if (nonlocality_log2 < -1) {
    return 64;
  } else if (nonlocality_log2 == -1) {
    return 56;
  } else if (nonlocality_log2 == 0) {
    return 48;
  } else if (nonlocality_log2 == 1) {
    return 32;
  } else if (nonlocality_log2 == 2) {
    return 16;
  } else if (nonlocality_log2 == 3) {
    return 0;
  } else {
    return -64;
  }
}

// Computes a 'kernel amortization score'. This is the notion that very small
// tiles result in more overhead outside of kernels, more complex memory
// access patterns and less benefit from ruy's fat kernels, so we reward
// larger blocks more than smaller ones.
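// For example, with an 8x8 kernel and a 64x64 block, kernels_per_block_log2 is
// 12 - 3 - 3 == 6, giving a score of 48.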
int GetKernelAmortizationScore(int block_size_log2, int rows, int cols,
                               int kernel_rows_log2, int kernel_cols_log2) {
  const int block_rows = std::min(1 << block_size_log2, rows);
  const int block_cols = std::min(1 << block_size_log2, cols);
  const int kernels_per_block_log2 =
      floor_log2(block_rows * block_cols) - kernel_rows_log2 - kernel_cols_log2;
  RUY_DCHECK_GE(kernels_per_block_log2, 0);
  // The values here have been tuned on ARM Cortex-A55.
  // We expect this to have to be tuned differently for other CPUs.
  if (kernels_per_block_log2 == 0) {
    return 0;
  } else if (kernels_per_block_log2 == 1) {
    return 8;
  } else if (kernels_per_block_log2 == 2) {
    return 16;
  } else if (kernels_per_block_log2 == 3) {
    return 24;
  } else if (kernels_per_block_log2 == 4) {
    return 32;
  } else if (kernels_per_block_log2 == 5) {
    return 40;
  } else if (kernels_per_block_log2 == 6) {
    return 48;
  } else if (kernels_per_block_log2 == 7) {
    return 56;
  } else {
    return 64;
  }
}

}  // namespace

bool IsObviouslyLinearTraversal(int rows, int cols, int depth,
                                int lhs_scalar_size, int rhs_scalar_size,
                                const CpuCacheParams& cpu_cache_params) {
  if (rows == 1 || cols == 1) {
    return true;
  }
  // Normally, GetTraversalOrder wants the dimensions (rows x cols) divided
  // by the rectangularness factors, since any non-linear traversal order will
  // be local to each subdivision. In the present function, we don't know the
  // rectangularness factors yet, and we can't just call GetRectangularness
  // as that requires knowing the kernel block layout. Since we just want
  // a coarse estimate with only the guarantee that if we return `true` then
  // linear traversal will be used, it is OK here to over-estimate `rows` and
  // `cols` by not dividing them by the rectangularness factors.
  return GetTraversalOrder(rows, cols, depth, lhs_scalar_size, rhs_scalar_size,
                           cpu_cache_params) == BlockMapTraversalOrder::kLinear;
}

void MakeBlockMap(int rows, int cols, int depth, int kernel_rows,
                  int kernel_cols, int lhs_scalar_size, int rhs_scalar_size,
                  int tentative_thread_count,
                  const CpuCacheParams& cpu_cache_params, BlockMap* block_map) {
  RUY_TRACE_SCOPE;
  profiler::ScopeLabel label("MakeBlockMap");

  RUY_DCHECK_GE(rows, kernel_rows);
  RUY_DCHECK_GE(cols, kernel_cols);
  RUY_DCHECK_EQ(rows % kernel_rows, 0);
  RUY_DCHECK_EQ(cols % kernel_cols, 0);

  // Estimate the 'rectangularness', the first level of subdivision bringing
  // the shape to within 2x of a square shape.
  int rows_rectangularness_log2 = 0;
  int cols_rectangularness_log2 = 0;
  GetRectangularness(rows, cols, kernel_rows, kernel_cols,
                     &rows_rectangularness_log2, &cols_rectangularness_log2);

  const int kernel_rows_log2 = pot_log2(kernel_rows);
  const int kernel_cols_log2 = pot_log2(kernel_cols);
  const int kernel_size_log2 = std::max(kernel_cols_log2, kernel_rows_log2);

  const int size = std::min(rows, cols);
  const int size_log2 = std::max(kernel_size_log2, floor_log2(size));

  RUY_DCHECK_GE(size_log2, kernel_size_log2);

  // Heuristic selecting the power-of-two grid subdivision inside of each
  // square-ish region (past the above subdivision by 'rectangularness').
  // Note that it is the number of subdivisions, not the resulting block size,
  // that will be a power of two. But inside of that heuristic, it simplifies
  // the code to talk in terms of 'block_size_log2', as if the block size
  // itself were a power of two. This 'block_size_log2' is to be interpreted as
  // "log2 rounded below", e.g. when block_size_log2=8 we might have a block
  // size in [256, 511]. When the shape is non-square, rows!=cols, this
  // refers to the smaller of the two, so the other might be as large as
  // 1021 (can't be 1022 because following the above 'rectangularness'
  // subdivision, the aspect ratio is already < 2).

  // We are going to try candidate values for block_size_log2 ranging from
  // kernel_size_log2 to (kernel_size_log2 + kMaxKernelsPerBlockLog2).
  // For each of them we will compute a 'score' by adding individual scores
  // for a few different considerations, all of which are entirely empirical.
  // The values (and possibly the logic) around here are all subject to tuning
  // based on benchmarks on different hardware. The current values are based
  // on benchmarking on Qualcomm S855 (big and little cores), arm64,
  // kNeonDotprod, 8bit quantized path. Don't read too much into it: go ahead
  // and tune this as needed to achieve good performance elsewhere. Use
  // the unit test, block_map_test, to encode values that should be preserved
  // on specific architectures. Use RUY_TRACE to debug the current heuristics
  // and RUY_MAKEBLOCKMAP_EXPLICIT_BLOCK_SIZE_LOG2 to test the impact of a
  // different block_size_log2 choice, to empirically find the optimal value
  // before updating the heuristic so that it produces that value.
  static constexpr int kMaxKernelsPerBlockLog2 = 6;
  const int max_block_size_log2 =
      std::min(size_log2, kernel_size_log2 + kMaxKernelsPerBlockLog2);
  int best_score = std::numeric_limits<int>::min();
  int best_score_block_size_log2 = -1;
  RUY_TRACE_INFO(MAKE_BLOCK_MAP_START);
  for (int block_size_log2 = kernel_size_log2;
       block_size_log2 <= max_block_size_log2; block_size_log2++) {
    const int multithreading_score = GetMultithreadingScore(
        block_size_log2, rows, cols, tentative_thread_count);
    const int cache_locality_score = GetCacheLocalityScore(
        block_size_log2, rows, cols, depth, kernel_rows_log2, kernel_cols_log2,
        lhs_scalar_size, rhs_scalar_size, cpu_cache_params);
    const int kernel_amortization_score = GetKernelAmortizationScore(
        block_size_log2, rows, cols, kernel_rows_log2, kernel_cols_log2);
    const int score =
        multithreading_score + cache_locality_score + kernel_amortization_score;
    if (score >= best_score) {
      best_score = score;
      best_score_block_size_log2 = block_size_log2;
    }
    RUY_TRACE_INFO(MAKE_BLOCK_MAP_EACH_TENTATIVE_BLOCK_SIZE);
  }

#ifdef RUY_MAKEBLOCKMAP_EXPLICIT_BLOCK_SIZE_LOG2
  // Useful for tuning.
  best_score_block_size_log2 = RUY_MAKEBLOCKMAP_EXPLICIT_BLOCK_SIZE_LOG2;
#endif

  // As explained in the above comment, phrasing the above code in terms of
  // block_size_log2 was only convenience inside of that heuristic. Now we
  // revert to talking in terms of grid subdivision. That is what will actually
  // be powers of two.
  int num_blocks_base_log2 = size_log2 - best_score_block_size_log2;
  RUY_DCHECK_GE(num_blocks_base_log2, 0);
  const int num_blocks_of_rows_log2 =
      num_blocks_base_log2 + rows_rectangularness_log2;
  const int num_blocks_of_cols_log2 =
      num_blocks_base_log2 + cols_rectangularness_log2;

  // Now that we know the grid subdivision, we can pinpoint the exact block
  // sizes. They can't be powers of two in general; they can't even be all
  // equal in general; so the following few parameters will govern how blocks
  // of slightly different shapes are put together in the block map.
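  // For example, with rows == 1000, kernel_rows == 8 and
  // num_blocks_of_rows_log2 == 2: small_block_rows == round_down_pot(250, 8)
  // == 248 and rows_of_large_blocks == 1, i.e. three blocks of 248 rows plus
  // one block of 256 rows, covering all 1000 rows.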
  const int small_block_rows =
      round_down_pot(rows >> num_blocks_of_rows_log2, kernel_rows);
  const int small_block_cols =
      round_down_pot(cols >> num_blocks_of_cols_log2, kernel_cols);
  const int rows_of_large_blocks =
      round_up_pot(rows - (small_block_rows << num_blocks_of_rows_log2),
                   kernel_rows) >>
      pot_log2(kernel_rows);
  const int cols_of_large_blocks =
      round_up_pot(cols - (small_block_cols << num_blocks_of_cols_log2),
                   kernel_cols) >>
      pot_log2(kernel_cols);

  // We have everything! Write out to the destination block_map.
  block_map->dims[Side::kLhs] = rows;
  block_map->dims[Side::kRhs] = cols;
  block_map->kernel_dims[Side::kLhs] = kernel_rows;
  block_map->kernel_dims[Side::kRhs] = kernel_cols;
  block_map->num_blocks_base_log2 = num_blocks_base_log2;
  block_map->rectangularness_log2[Side::kLhs] = rows_rectangularness_log2;
  block_map->rectangularness_log2[Side::kRhs] = cols_rectangularness_log2;
  block_map->small_block_dims[Side::kLhs] = small_block_rows;
  block_map->small_block_dims[Side::kRhs] = small_block_cols;
  block_map->large_blocks[Side::kLhs] = rows_of_large_blocks;
  block_map->large_blocks[Side::kRhs] = cols_of_large_blocks;
  // See the comment on GetTraversalOrder for why we are dividing `rows` and
  // `cols` by the rectangularness subdivision parameters here.
  block_map->traversal_order = GetTraversalOrder(
      rows >> rows_rectangularness_log2, cols >> cols_rectangularness_log2,
      depth, lhs_scalar_size, rhs_scalar_size, cpu_cache_params);
  // Done last: NumBlocks needs some of the block_map fields to be already set.
  block_map->thread_count =
      std::min(tentative_thread_count, NumBlocks(*block_map));
  RUY_TRACE_INFO(MAKE_BLOCK_MAP_END);
}

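// Computes the [start, end) range of rows (for Side::kLhs) or columns (for
// Side::kRhs) covered by the given block index on the given side. The first
// `large_blocks[side]` blocks each get one extra kernel_dims[side] slab on top
// of small_block_dims[side], so all block boundaries stay kernel-aligned.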
void GetBlockMatrixCoords(Side side, const BlockMap& block_map, int block,
                          int* start, int* end) {
  profiler::ScopeLabel label("GetBlockMatrixCoords");
  *start = block * block_map.small_block_dims[side] +
           std::min(block, block_map.large_blocks[side]) *
               block_map.kernel_dims[side];
  *end =
      *start + block_map.small_block_dims[side] +
      (block < block_map.large_blocks[side] ? block_map.kernel_dims[side] : 0);

  RUY_DCHECK_EQ(0, *start % block_map.kernel_dims[side]);
  RUY_DCHECK_EQ(0, *end % block_map.kernel_dims[side]);
  RUY_DCHECK_LE(*end, block_map.dims[side]);
  RUY_DCHECK_LT(*start, *end);
  RUY_DCHECK_GE(*start, 0);
}

void GetBlockMatrixCoords(const BlockMap& block_map, const SidePair<int>& block,
                          SidePair<int>* start, SidePair<int>* end) {
  for (Side side : {Side::kLhs, Side::kRhs}) {
    GetBlockMatrixCoords(side, block_map, block[side], &(*start)[side],
                         &(*end)[side]);
  }
}

}  // namespace ruy