1 | // Copyright 2015 The Gemmlowp Authors. All Rights Reserved. |
2 | // |
3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
4 | // you may not use this file except in compliance with the License. |
5 | // You may obtain a copy of the License at |
6 | // |
7 | // http://www.apache.org/licenses/LICENSE-2.0 |
8 | // |
9 | // Unless required by applicable law or agreed to in writing, software |
10 | // distributed under the License is distributed on an "AS IS" BASIS, |
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | // See the License for the specific language governing permissions and |
13 | // limitations under the License. |
14 | |
15 | // single_thread_gemm.h: Single-threaded GEMM implementation. |
16 | // This is a good place to start reading code, as it shows the overall |
17 | // structure of a GEMM and is much simpler than multi_thread_gemm.h. |
18 | |
19 | #ifndef GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_ |
20 | #define GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_ |
21 | |
22 | #include <cassert> |
23 | |
24 | #include "../public/map.h" |
25 | #include "allocator.h" |
26 | #include "compute.h" |
27 | #include "kernel.h" |
28 | #include "pack.h" |
29 | #include "unpack.h" |
30 | |
31 | #ifdef GEMMLOWP_PROFILING_SIZES |
32 | #ifndef GEMMLOWP_PROFILING |
33 | #error GEMMLOWP_PROFILING_SIZES without GEMMLOWP_PROFILING |
34 | #endif |
35 | #include <string> |
36 | #include <unordered_map> |
37 | #endif |
38 | |
39 | namespace gemmlowp { |
40 | |
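// SingleThreadGemmContext holds the resources used by SingleThreadGemm:
// an Allocator for the packed-block storage, and the L1/L2 cache sizes
// (plus the L2 RHS factor) that drive the choice of block sizes.
//
// Illustrative usage sketch (the default cache sizes usually suffice;
// the byte values below are arbitrary examples, not recommendations):
//   SingleThreadGemmContext context;
//   context.set_l1_bytes_to_use(32 * 1024);
//   context.set_l2_bytes_to_use(512 * 1024);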
41 | class SingleThreadGemmContext { |
42 | public: |
43 | Allocator* allocator() { return &allocator_; } |
44 | |
45 | void set_l1_bytes_to_use(int n) { l1_bytes_to_use_ = n; } |
46 | void set_l2_bytes_to_use(int n) { l2_bytes_to_use_ = n; } |
47 | void set_l2_rhs_factor(float n) { l2_rhs_factor_ = n; } |
48 | |
49 | int l1_bytes_to_use() const { return l1_bytes_to_use_; } |
50 | int l2_bytes_to_use() const { return l2_bytes_to_use_; } |
51 | float l2_rhs_factor() const { return l2_rhs_factor_; } |
52 | |
53 | protected: |
54 | Allocator allocator_; |
55 | |
  // The cache configuration to use.
57 | int l1_bytes_to_use_ = kDefaultL1CacheSize; |
58 | int l2_bytes_to_use_ = kDefaultL2CacheSize; |
59 | float l2_rhs_factor_ = kDefaultL2RhsFactor; |
60 | }; |
61 | |
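// Performs a single-threaded GEMM of 'lhs' times 'rhs' into '*result'.
// The overall structure is the classic pack/compute/unpack decomposition:
//  1. Choose L1/L2 block sizes from the context's cache configuration.
//  2. Pack blocks of the LHS and RHS into kernel-friendly layouts.
//  3. Run the kernel on the packed blocks, accumulating into a packed result.
//  4. Unpack the accumulators into 'result', applying the offsets and the
//     output pipeline.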
62 | template <typename KernelFormat, typename InputScalar, typename OutputScalar, |
63 | typename BitDepthParams, MapOrder LhsOrder, MapOrder RhsOrder, |
64 | MapOrder ResultOrder, typename LhsOffset, typename RhsOffset, |
65 | typename OutputPipelineType> |
66 | void SingleThreadGemm(SingleThreadGemmContext* context, |
67 | const KernelBase& kernel, |
68 | const MatrixMap<const InputScalar, LhsOrder>& lhs, |
69 | const MatrixMap<const InputScalar, RhsOrder>& rhs, |
70 | MatrixMap<OutputScalar, ResultOrder>* result, |
71 | const LhsOffset& lhs_offset, const RhsOffset& rhs_offset, |
72 | const OutputPipelineType& output_pipeline) { |
  ScopedProfilingLabel label("gemmlowp::SingleThreadGemm");
74 | |
75 | assert(lhs.cols() == rhs.rows()); |
76 | |
77 | int rows = result->rows(); |
78 | int cols = result->cols(); |
79 | int depth = lhs.cols(); |
80 | |
  // Zero sizes should have been caught earlier and handled by an early return.
82 | assert(rows > 0); |
83 | assert(cols > 0); |
84 | assert(depth > 0); |
85 | |
  // The case of rows < cols should have been caught earlier and handled by
  // transposing the problem.
87 | assert(rows >= cols); |
88 | |
89 | Allocator* allocator = context->allocator(); |
90 | |
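  // Compute the block sizes (l2_rows/l2_depth/l2_cols and
  // l1_rows/l1_depth/l1_cols) that fit within the configured cache budgets.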
91 | BlockParams block_params; |
92 | block_params.Init<KernelFormat>( |
93 | rows, cols, depth, 1, context->l1_bytes_to_use(), |
94 | context->l2_bytes_to_use(), context->l2_rhs_factor()); |
95 | |
96 | #ifdef GEMMLOWP_PROFILING_SIZES |
97 | // Using a static map of label strings. Not reentrant at all! |
98 | static std::unordered_map<std::uint64_t, std::string> labels_map; |
99 | std::uint64_t sizes_hash = static_cast<std::uint64_t>(rows) ^ |
100 | (static_cast<std::uint64_t>(depth) << 16) ^ |
101 | (static_cast<std::uint64_t>(cols) << 32); |
102 | if (!labels_map.count(sizes_hash)) { |
103 | char label[256]; |
104 | snprintf(label, sizeof(label), |
105 | "(rows = %d, depth = %d, cols = %d, l2_rows = %d, l2_depth = %d, " |
106 | "l2_cols = %d, l1_rows = %d, l1_depth = %d, l1_cols = %d)" , |
107 | rows, depth, cols, block_params.l2_rows, block_params.l2_depth, |
108 | block_params.l2_cols, block_params.l1_rows, block_params.l1_depth, |
109 | block_params.l1_cols); |
110 | labels_map[sizes_hash] = label; |
111 | } |
112 | ScopedProfilingLabel size_label(labels_map[sizes_hash].c_str()); |
113 | #endif |
114 | |
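  // Set up the packed LHS/RHS side blocks and the packed result accumulators,
  // sized according to the block parameters and the kernel's format.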
115 | PackedSideBlock<typename KernelFormat::Lhs> packed_lhs(Side::Lhs, allocator, |
116 | block_params); |
117 | PackedSideBlock<typename KernelFormat::Rhs> packed_rhs(Side::Rhs, allocator, |
118 | block_params); |
119 | |
120 | PackedResult packed_result(allocator, block_params); |
121 | |
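  // Commit the allocator: the packed blocks above only registered their
  // storage needs; the actual memory is allocated here.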
122 | allocator->Commit(); |
123 | |
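  // If a single L2 column block covers all of the RHS, pack it just once
  // up front instead of repacking it for every row block.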
124 | const bool pack_rhs_once = block_params.l2_cols >= cols; |
125 | |
126 | if (pack_rhs_once) { |
127 | PackRhs(&packed_rhs, rhs); |
128 | } |
129 | |
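  // Loop over L2-sized blocks: pack one row block of the LHS, then sweep
  // the column blocks of the RHS, computing and unpacking each sub-result.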
130 | for (int r = 0; r < rows; r += block_params.l2_rows) { |
131 | int rs = std::min(block_params.l2_rows, rows - r); |
132 | |
133 | PackLhs(&packed_lhs, lhs.block(r, 0, rs, depth)); |
134 | |
135 | for (int c = 0; c < cols; c += block_params.l2_cols) { |
136 | int cs = std::min(block_params.l2_cols, cols - c); |
137 | |
138 | if (!pack_rhs_once) { |
139 | PackRhs(&packed_rhs, rhs.block(0, c, depth, cs)); |
140 | } |
141 | |
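      // Run the GEMM kernel on the packed blocks, accumulating into
      // packed_result.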
142 | Compute(kernel, block_params, &packed_result, packed_lhs, packed_rhs, |
143 | depth); |
144 | |
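      // Unpack the accumulators into the destination block of 'result',
      // adding the offset contributions and applying the output pipeline.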
145 | UnpackResult<KernelFormat>( |
146 | result, MatrixBlockBounds(r, c, rs, cs), packed_result, depth, |
147 | packed_lhs.sums_of_each_slice(), packed_rhs.sums_of_each_slice(), |
148 | lhs_offset.block(r, rs), rhs_offset.block(c, cs), output_pipeline); |
149 | } |
150 | } |
151 | |
152 | allocator->Decommit(); |
153 | } |
154 | |
155 | } // namespace gemmlowp |
156 | |
157 | #endif // GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_ |
158 | |