/* Copyright 2020 Google LLC. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "ruy/prepare_packed_matrices.h"

#include "ruy/allocator.h"
#include "ruy/ctx.h"
#include "ruy/matrix.h"
#include "ruy/prepacked_cache.h"
#include "ruy/side_pair.h"
#include "ruy/trace.h"
#include "ruy/trmul_params.h"

namespace ruy {
namespace {

// Returns true if the operand on the given side should use caching of the
// packed form. This may either be explicitly dictated by its cache_policy
// (if it is kNeverCache, the default, or kAlwaysCache), or it may depend
// on a heuristic decision based on the other operand's width. For example,
// in a matrix*vector product, for the LHS matrix operand, the other side is
// the RHS vector, with a width of 1, causing the packing of the LHS to be
// a large fraction of the overall work, so a heuristic would typically
// decide in favor of caching, if permitted at all by the cache_policy.
bool ShouldCache(const TrMulParams& params, Side side) {
  const CachePolicy cache_policy = params.src[side].cache_policy;
  // The width that matters is that of the other side: it determines how the
  // packing work done on the present side is amortized.
  const Side other_side = OtherSide(side);
  const int other_width = params.src[other_side].layout.cols;
  const int other_kernel_width =
      params.packed_matrix[other_side].layout.kernel.cols;
  switch (cache_policy) {
    case CachePolicy::kNeverCache:
      return false;
    case CachePolicy::kAlwaysCache:
      return true;
    case CachePolicy::kCacheIfLargeSpeedup:
      // The condition (other_width <= other_kernel_width) means that the
      // kernel will traverse each value of the present side only once, so
      // the packing overhead is maximal and caching is maximally worthwhile.
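      // Illustrative numbers (a kernel width of 8 is a hypothetical value
      // chosen for the example): with other_kernel_width == 8, an other-side
      // width of at most 8 means a single traversal, so caching is enabled.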
      return (other_width <= other_kernel_width);
    case CachePolicy::kCacheIfSignificantSpeedup:
      // Variant of the heuristic used in the kCacheIfLargeSpeedup case. The
      // kernel will run on each value of the present side only a few times,
      // so packing overhead will be significant.
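      // Illustrative numbers (same hypothetical kernel width of 8): widths
      // up to 4 * 8 == 32 qualify, i.e. each value of the present side is
      // traversed at most 4 times.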
      return (other_width <= 4 * other_kernel_width);
    default:
      RUY_DCHECK(false);
      return false;
  }
}
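
// Illustrative usage (a sketch, not exercised by this file): callers control
// the cache_policy consulted above through ruy's public Matrix API in
// ruy/matrix.h, for example:
//
//   ruy::Matrix<float> lhs;
//   lhs.set_cache_policy(ruy::CachePolicy::kAlwaysCache);
//
// With kAlwaysCache, ShouldCache returns true regardless of the other side's
// width; with the default kNeverCache, it returns false.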

}  // namespace

void PreparePackedMatrices(Ctx* ctx, TrMulParams* params) {
  RUY_TRACE_SCOPE;
  for (Side side : {Side::kLhs, Side::kRhs}) {
    PEMat& packed_matrix = params->packed_matrix[side];
    if (ShouldCache(*params, side)) {
      // Use a cached packed matrix (possibly packing and caching it now).
      auto* cache = ctx->GetPrepackedCache();
      auto action = cache->Get(params->src[side].data, &packed_matrix);
      RUY_TRACE_INFO(PREPARE_PACKED_MATRICES_SHOULD_CACHE);
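      // Only a newly inserted cache entry needs packing here: if Get() found
      // an existing entry, packed_matrix already refers to previously packed
      // data.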
      if (action == PrepackedCache::Action::kInsertedNewEntry) {
        params->RunPack(side, ctx->GetMainThreadTuning(), 0,
                        packed_matrix.layout.cols);
      }
      params->is_prepacked[side] = true;
    } else {
      RUY_TRACE_INFO(PREPARE_PACKED_MATRICES_NO_CACHE);
      // Do not use a cached packed matrix. Only need to allocate buffers now.
      Allocator* allocator = ctx->GetMainAllocator();
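      // As its name suggests, AllocateBytesAvoidingAliasingWith places the
      // packed data buffer so that it does not alias (in the cache sense)
      // with the source matrix's data; the sums buffer needs no such care.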
      packed_matrix.data = allocator->AllocateBytesAvoidingAliasingWith(
          DataBytes(packed_matrix), params->src[side].data);
      packed_matrix.sums = allocator->AllocateBytes(SumsBytes(packed_matrix));
    }
  }
}

}  // namespace ruy