1 | /* Copyright 2020 Google LLC. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | |
16 | #include "ruy/prepare_packed_matrices.h" |
17 | |
18 | #include "ruy/allocator.h" |
19 | #include "ruy/ctx.h" |
20 | #include "ruy/matrix.h" |
21 | #include "ruy/prepacked_cache.h" |
22 | #include "ruy/side_pair.h" |
23 | #include "ruy/trace.h" |
24 | #include "ruy/trmul_params.h" |
25 | |
26 | namespace ruy { |
27 | namespace { |
28 | |
29 | // Returns true if the operand on the given side should use caching of the |
30 | // packed form. This may either be explicitly dictated by its cache_policy |
31 | // (if it is kNeverCache, the default, or kAlwaysCache), or it may depend |
32 | // on a heuristic decision based on the other operand's width. For example, |
33 | // in a matrix*vector product, for the LHS matrix operand, the other side is |
34 | // the RHS vector, with a width of 1, causing the packing of the LHS to be |
35 | // a large fraction of the overall work, so a heuristic would typically |
36 | // decide in favor of caching, if permitted at all by the cache_policy. |
37 | bool ShouldCache(const TrMulParams& params, Side side) { |
38 | const CachePolicy cache_policy = params.src[side].cache_policy; |
39 | // The width that matters is that of the other side, it is what determines |
40 | // the amortization of the packing work done on the present side. |
41 | const Side other_side = OtherSide(side); |
42 | const int other_width = params.src[other_side].layout.cols; |
43 | const int other_kernel_width = |
44 | params.packed_matrix[other_side].layout.kernel.cols; |
45 | switch (cache_policy) { |
46 | case CachePolicy::kNeverCache: |
47 | return false; |
48 | case CachePolicy::kAlwaysCache: |
49 | return true; |
50 | case CachePolicy::kCacheIfLargeSpeedup: |
51 | // The condition (other_width <= other_kernel_width) means that the kernel |
52 | // will traverse each value of the present side only once, meaning that |
53 | // the overhead of the packing work will be maximal, hence maximally |
54 | // worth caching. |
55 | return (other_width <= other_kernel_width); |
56 | case CachePolicy::kCacheIfSignificantSpeedup: |
57 | // Variant of the heuristic used in the kCacheIfLargeSpeedup case. The |
58 | // kernel will run on each value of the present side only a few times, |
59 | // so packing overhead will be significant. |
60 | return (other_width <= 4 * other_kernel_width); |
61 | default: |
62 | RUY_DCHECK(false); |
63 | return false; |
64 | } |
65 | } |
66 | |
67 | } // namespace |
68 | |
69 | void PreparePackedMatrices(Ctx* ctx, TrMulParams* params) { |
70 | RUY_TRACE_SCOPE; |
71 | for (Side side : {Side::kLhs, Side::kRhs}) { |
72 | PEMat& packed_matrix = params->packed_matrix[side]; |
73 | if (ShouldCache(*params, side)) { |
74 | // Use a cached packed matrix (possibly packing and caching now). |
75 | auto* cache = ctx->GetPrepackedCache(); |
76 | auto action = cache->Get(params->src[side].data, &packed_matrix); |
77 | RUY_TRACE_INFO(PREPARE_PACKED_MATRICES_SHOULD_CACHE); |
78 | if (action == PrepackedCache::Action::kInsertedNewEntry) { |
79 | params->RunPack(side, ctx->GetMainThreadTuning(), 0, |
80 | packed_matrix.layout.cols); |
81 | } |
82 | params->is_prepacked[side] = true; |
83 | } else { |
84 | RUY_TRACE_INFO(PREPARE_PACKED_MATRICES_NO_CACHE); |
85 | // Do not use a cached packed matrix. Only need to allocate buffers now. |
86 | Allocator* allocator = ctx->GetMainAllocator(); |
87 | packed_matrix.data = allocator->AllocateBytesAvoidingAliasingWith( |
88 | DataBytes(packed_matrix), params->src[side].data); |
89 | packed_matrix.sums = allocator->AllocateBytes(SumsBytes(packed_matrix)); |
90 | } |
91 | } |
92 | } |
93 | |
94 | } // namespace ruy |
95 | |