// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// single_thread_gemm.h: Single-threaded GEMM implementation.
// This is a good place to start reading code, as it shows the overall
// structure of a GEMM and is much simpler than multi_thread_gemm.h.

#ifndef GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_
#define GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_

#include <algorithm>
#include <cassert>

#include "../public/map.h"
#include "allocator.h"
#include "compute.h"
#include "kernel.h"
#include "pack.h"
#include "unpack.h"

#ifdef GEMMLOWP_PROFILING_SIZES
#ifndef GEMMLOWP_PROFILING
#error GEMMLOWP_PROFILING_SIZES without GEMMLOWP_PROFILING
#endif
#include <cstdint>
#include <cstdio>
#include <string>
#include <unordered_map>
#endif

namespace gemmlowp {

class SingleThreadGemmContext {
 public:
  Allocator* allocator() { return &allocator_; }

  void set_l1_bytes_to_use(int n) { l1_bytes_to_use_ = n; }
  void set_l2_bytes_to_use(int n) { l2_bytes_to_use_ = n; }
  void set_l2_rhs_factor(float n) { l2_rhs_factor_ = n; }

  int l1_bytes_to_use() const { return l1_bytes_to_use_; }
  int l2_bytes_to_use() const { return l2_bytes_to_use_; }
  float l2_rhs_factor() const { return l2_rhs_factor_; }
 protected:
  Allocator allocator_;

  // The cache configuration to use.
  int l1_bytes_to_use_ = kDefaultL1CacheSize;
  int l2_bytes_to_use_ = kDefaultL2CacheSize;
  float l2_rhs_factor_ = kDefaultL2RhsFactor;
};

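// SingleThreadGemm computes the GEMM of the given lhs and rhs matrices into
// *result on the calling thread: the operands are packed into cache-friendly
// blocks, the kernel is run on each pair of blocks, and the accumulators are
// unpacked into the result through the output pipeline.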
template <typename KernelFormat, typename InputScalar, typename OutputScalar,
          typename BitDepthParams, MapOrder LhsOrder, MapOrder RhsOrder,
          MapOrder ResultOrder, typename LhsOffset, typename RhsOffset,
          typename OutputPipelineType>
void SingleThreadGemm(SingleThreadGemmContext* context,
                      const KernelBase& kernel,
                      const MatrixMap<const InputScalar, LhsOrder>& lhs,
                      const MatrixMap<const InputScalar, RhsOrder>& rhs,
                      MatrixMap<OutputScalar, ResultOrder>* result,
                      const LhsOffset& lhs_offset, const RhsOffset& rhs_offset,
                      const OutputPipelineType& output_pipeline) {
  ScopedProfilingLabel label("gemmlowp::SingleThreadGemm");

  assert(lhs.cols() == rhs.rows());

  int rows = result->rows();
  int cols = result->cols();
  int depth = lhs.cols();

  // zero sizes should have been caught earlier and early-returned.
  assert(rows > 0);
  assert(cols > 0);
  assert(depth > 0);

  // The case of rows<cols should have been caught earlier and transposed.
  assert(rows >= cols);

  Allocator* allocator = context->allocator();

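  // Compute the L1/L2 block sizes from the matrix dimensions and the cache
  // sizes configured in the context.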
  BlockParams block_params;
  block_params.Init<KernelFormat>(
      rows, cols, depth, 1, context->l1_bytes_to_use(),
      context->l2_bytes_to_use(), context->l2_rhs_factor());

#ifdef GEMMLOWP_PROFILING_SIZES
  // Using a static map of label strings. Not reentrant at all!
  static std::unordered_map<std::uint64_t, std::string> labels_map;
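  // Pack (rows, depth, cols) into a single 64-bit key for the labels map.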
  std::uint64_t sizes_hash = static_cast<std::uint64_t>(rows) ^
                             (static_cast<std::uint64_t>(depth) << 16) ^
                             (static_cast<std::uint64_t>(cols) << 32);
  if (!labels_map.count(sizes_hash)) {
    char label[256];
    snprintf(label, sizeof(label),
             "(rows = %d, depth = %d, cols = %d, l2_rows = %d, l2_depth = %d, "
             "l2_cols = %d, l1_rows = %d, l1_depth = %d, l1_cols = %d)",
             rows, depth, cols, block_params.l2_rows, block_params.l2_depth,
             block_params.l2_cols, block_params.l1_rows, block_params.l1_depth,
             block_params.l1_cols);
    labels_map[sizes_hash] = label;
  }
  ScopedProfilingLabel size_label(labels_map[sizes_hash].c_str());
#endif

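  // Packed blocks are cache-friendly copies of blocks of the operands, laid
  // out in the order expected by the kernel (see KernelFormat in kernel.h).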
  PackedSideBlock<typename KernelFormat::Lhs> packed_lhs(Side::Lhs, allocator,
                                                         block_params);
  PackedSideBlock<typename KernelFormat::Rhs> packed_rhs(Side::Rhs, allocator,
                                                         block_params);

  PackedResult packed_result(allocator, block_params);

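  // All blocks have been registered with the allocator; Commit() performs the
  // underlying memory allocation.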
  allocator->Commit();

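  // If the whole rhs fits in a single l2 block, pack it only once up front
  // rather than re-packing it for every row block.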
  const bool pack_rhs_once = block_params.l2_cols >= cols;

  if (pack_rhs_once) {
    PackRhs(&packed_rhs, rhs);
  }

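  // Loop over l2-sized blocks of the result: the lhs block is packed once per
  // row block and reused across all column blocks of that row block.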
  for (int r = 0; r < rows; r += block_params.l2_rows) {
    int rs = std::min(block_params.l2_rows, rows - r);

    PackLhs(&packed_lhs, lhs.block(r, 0, rs, depth));

    for (int c = 0; c < cols; c += block_params.l2_cols) {
      int cs = std::min(block_params.l2_cols, cols - c);

      if (!pack_rhs_once) {
        PackRhs(&packed_rhs, rhs.block(0, c, depth, cs));
      }

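      // Run the kernel on the packed blocks, accumulating 32-bit results into
      // packed_result.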
      Compute(kernel, block_params, &packed_result, packed_lhs, packed_rhs,
              depth);

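      // Unpack the accumulators into the destination block, adding the offset
      // contributions computed from the per-slice sums and applying the
      // output pipeline.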
      UnpackResult<KernelFormat>(
          result, MatrixBlockBounds(r, c, rs, cs), packed_result, depth,
          packed_lhs.sums_of_each_slice(), packed_rhs.sums_of_each_slice(),
          lhs_offset.block(r, rs), rhs_offset.block(c, cs), output_pipeline);
    }
  }

  allocator->Decommit();
}

} // namespace gemmlowp

#endif // GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_