1 | // Copyright 2015 The Gemmlowp Authors. All Rights Reserved. |
2 | // |
3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
4 | // you may not use this file except in compliance with the License. |
5 | // You may obtain a copy of the License at |
6 | // |
7 | // http://www.apache.org/licenses/LICENSE-2.0 |
8 | // |
9 | // Unless required by applicable law or agreed to in writing, software |
10 | // distributed under the License is distributed on an "AS IS" BASIS, |
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | // See the License for the specific language governing permissions and |
13 | // limitations under the License. |
14 | |
15 | // single_thread_gemm.h: Single-threaded GEMM implementation. |
16 | // This is a good place to start reading code, as it shows the overall |
17 | // structure of a GEMM and is much simpler than multi_thread_gemm.h. |
18 | |
19 | #ifndef GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_ |
20 | #define GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_ |
21 | |
22 | #include <cassert> |
23 | |
24 | #include "../public/map.h" |
25 | #include "allocator.h" |
26 | #include "compute.h" |
27 | #include "kernel.h" |
28 | #include "pack.h" |
29 | #include "unpack.h" |
30 | |
31 | #ifdef GEMMLOWP_PROFILING_SIZES |
32 | #ifndef GEMMLOWP_PROFILING |
33 | #error GEMMLOWP_PROFILING_SIZES without GEMMLOWP_PROFILING |
34 | #endif |
35 | #include <string> |
36 | #include <unordered_map> |
37 | #endif |
38 | |
39 | namespace gemmlowp { |
40 | |
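// SingleThreadGemmContext holds the resources used by SingleThreadGemm:
// an Allocator for the packed-block storage, and the L1/L2 cache sizes
// (plus the L2 RHS factor) that drive the choice of block sizes.
//
// Illustrative usage sketch (the default cache sizes usually suffice;
// the byte values below are arbitrary examples, not recommendations):
//   SingleThreadGemmContext context;
//   context.set_l1_bytes_to_use(32 * 1024);
//   context.set_l2_bytes_to_use(512 * 1024);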
41 | class SingleThreadGemmContext { |
42 | public: |
43 | Allocator* allocator() { return &allocator_; } |
44 | |
45 | void set_l1_bytes_to_use(int n) { l1_bytes_to_use_ = n; } |
46 | void set_l2_bytes_to_use(int n) { l2_bytes_to_use_ = n; } |
47 | void set_l2_rhs_factor(float n) { l2_rhs_factor_ = n; } |
48 | |
49 | int l1_bytes_to_use() const { return l1_bytes_to_use_; } |
50 | int l2_bytes_to_use() const { return l2_bytes_to_use_; } |
51 | float l2_rhs_factor() const { return l2_rhs_factor_; } |
52 | |
53 | protected: |
54 | Allocator allocator_; |
55 | |
  // The cache configuration to use.
57 | int l1_bytes_to_use_ = kDefaultL1CacheSize; |
58 | int l2_bytes_to_use_ = kDefaultL2CacheSize; |
59 | float l2_rhs_factor_ = kDefaultL2RhsFactor; |
60 | }; |
61 | |
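// Performs a single-threaded GEMM of 'lhs' times 'rhs' into '*result'.
// The overall structure is the classic pack/compute/unpack decomposition:
//  1. Choose L1/L2 block sizes from the context's cache configuration.
//  2. Pack blocks of the LHS and RHS into kernel-friendly layouts.
//  3. Run the kernel on the packed blocks, accumulating into a packed result.
//  4. Unpack the accumulators into 'result', applying the offsets and the
//     output pipeline.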
62 | template <typename KernelFormat, typename InputScalar, typename OutputScalar, |
63 | typename BitDepthParams, MapOrder LhsOrder, MapOrder RhsOrder, |
64 | MapOrder ResultOrder, typename LhsOffset, typename RhsOffset, |
65 | typename OutputPipelineType> |
66 | void SingleThreadGemm(SingleThreadGemmContext* context, |
67 | const KernelBase& kernel, |
68 | const MatrixMap<const InputScalar, LhsOrder>& lhs, |
69 | const MatrixMap<const InputScalar, RhsOrder>& rhs, |
70 | MatrixMap<OutputScalar, ResultOrder>* result, |
71 | const LhsOffset& lhs_offset, const RhsOffset& rhs_offset, |
72 | const OutputPipelineType& output_pipeline) { |
  ScopedProfilingLabel label("gemmlowp::SingleThreadGemm");
74 | |
75 | assert(lhs.cols() == rhs.rows()); |
76 | |
77 | int rows = result->rows(); |
78 | int cols = result->cols(); |
79 | int depth = lhs.cols(); |
80 | |
  // Zero sizes should have been caught earlier and handled by an early return.
82 | assert(rows > 0); |
83 | assert(cols > 0); |
84 | assert(depth > 0); |
85 | |
  // The case of rows < cols should have been caught earlier and handled by
  // transposing the problem.
87 | assert(rows >= cols); |
88 | |
89 | Allocator* allocator = context->allocator(); |
90 | |
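  // Compute the block sizes (l2_rows/l2_depth/l2_cols and
  // l1_rows/l1_depth/l1_cols) that fit within the configured cache budgets.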
91 | BlockParams block_params; |
92 | block_params.Init<KernelFormat>( |
93 | rows, cols, depth, 1, context->l1_bytes_to_use(), |
94 | context->l2_bytes_to_use(), context->l2_rhs_factor()); |
95 | |
96 | #ifdef GEMMLOWP_PROFILING_SIZES |
97 | // Using a static map of label strings. Not reentrant at all! |
98 | static std::unordered_map<std::uint64_t, std::string> labels_map; |
99 | std::uint64_t sizes_hash = static_cast<std::uint64_t>(rows) ^ |
100 | (static_cast<std::uint64_t>(depth) << 16) ^ |
101 | (static_cast<std::uint64_t>(cols) << 32); |
102 | if (!labels_map.count(sizes_hash)) { |
103 | char label[256]; |
104 | snprintf(label, sizeof(label), |
105 | "(rows = %d, depth = %d, cols = %d, l2_rows = %d, l2_depth = %d, " |
106 | "l2_cols = %d, l1_rows = %d, l1_depth = %d, l1_cols = %d)" , |
107 | rows, depth, cols, block_params.l2_rows, block_params.l2_depth, |
108 | block_params.l2_cols, block_params.l1_rows, block_params.l1_depth, |
109 | block_params.l1_cols); |
110 | labels_map[sizes_hash] = label; |
111 | } |
112 | ScopedProfilingLabel size_label(labels_map[sizes_hash].c_str()); |
113 | #endif |
114 | |
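  // Set up the packed LHS/RHS side blocks and the packed result accumulators,
  // sized according to the block parameters and the kernel's format.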
115 | PackedSideBlock<typename KernelFormat::Lhs> packed_lhs(Side::Lhs, allocator, |
116 | block_params); |
117 | PackedSideBlock<typename KernelFormat::Rhs> packed_rhs(Side::Rhs, allocator, |
118 | block_params); |
119 | |
120 | PackedResult packed_result(allocator, block_params); |
121 | |
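  // Commit the allocator: the packed blocks above only registered their
  // storage needs; the actual memory is allocated here.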
122 | allocator->Commit(); |
123 | |
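  // If a single L2 column block covers all of the RHS, pack it just once
  // up front instead of repacking it for every row block.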
124 | const bool pack_rhs_once = block_params.l2_cols >= cols; |
125 | |
126 | if (pack_rhs_once) { |
127 | PackRhs(&packed_rhs, rhs); |
128 | } |
129 | |
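  // Loop over L2-sized blocks: pack one row block of the LHS, then sweep
  // the column blocks of the RHS, computing and unpacking each sub-result.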
130 | for (int r = 0; r < rows; r += block_params.l2_rows) { |
131 | int rs = std::min(block_params.l2_rows, rows - r); |
132 | |
133 | PackLhs(&packed_lhs, lhs.block(r, 0, rs, depth)); |
134 | |
135 | for (int c = 0; c < cols; c += block_params.l2_cols) { |
136 | int cs = std::min(block_params.l2_cols, cols - c); |
137 | |
138 | if (!pack_rhs_once) { |
139 | PackRhs(&packed_rhs, rhs.block(0, c, depth, cs)); |
140 | } |
141 | |
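      // Run the GEMM kernel on the packed blocks, accumulating into
      // packed_result.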
142 | Compute(kernel, block_params, &packed_result, packed_lhs, packed_rhs, |
143 | depth); |
144 | |
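      // Unpack the accumulators into the destination block of 'result',
      // adding the offset contributions and applying the output pipeline.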
145 | UnpackResult<KernelFormat>( |
146 | result, MatrixBlockBounds(r, c, rs, cs), packed_result, depth, |
147 | packed_lhs.sums_of_each_slice(), packed_rhs.sums_of_each_slice(), |
148 | lhs_offset.block(r, rs), rhs_offset.block(c, cs), output_pipeline); |
149 | } |
150 | } |
151 | |
152 | allocator->Decommit(); |
153 | } |
154 | |
155 | } // namespace gemmlowp |
156 | |
157 | #endif // GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_ |
158 | |