// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// pack.h: packing blocks of the LHS and RHS into the data layout
// that is expected by compute.h and eventually by kernels.
// Because this data layout depends on the kernel format, code here
// is templated in KernelLhsFormat/KernelRhsFormat.
//
// Reader's note: an important theme here is that we try hard to handle
// both the Lhs and the Rhs with a single piece of code, and we refer to
// either of them generically as a 'Side'. Instead of addressing matrices
// by (row, column) indices, we address them by (width, depth), as explained
// in kernel.h. This lets us treat the Lhs and the Rhs uniformly.
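//
// Concretely (a summary of the convention from kernel.h, which is what
// PackLhs()/PackRhs() below implement, with hypothetical dimensions for
// illustration): for a 100x300 Lhs block (rows x depth), width == 100 and
// depth == 300, so a RowMajor Lhs block is WidthMajor; for a 300x50 Rhs
// block (depth x cols), width == 50 and depth == 300, so a ColMajor Rhs
// block is WidthMajor. In both cases, 'depth' is the common accumulation
// dimension.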

#ifndef GEMMLOWP_INTERNAL_PACK_H_
#define GEMMLOWP_INTERNAL_PACK_H_

#include <cstring>

#include "allocator.h"
#include "block_params.h"
#include "common.h"
#include "kernel.h"

namespace gemmlowp {

// A PackedSideBlock instance is a packed block of either the LHS or RHS
// (whence the generic 'Side' name).
//
// 'Packed' means that it is laid out in the storage order that
// is expected by the specified kernel format. From a block of the input
// LHS or RHS matrix, one obtains a PackedSideBlock by calling PackLhs()
// or PackRhs().
template <typename tKernelSideFormat>
class PackedSideBlock {
 public:
  typedef tKernelSideFormat KernelSideFormat;

  PackedSideBlock(Side side, Allocator* allocator,
                  const BlockParams& block_params)
      : allocator_(allocator), pos_(0) {
    GetSideBlockParams(side, &params_, block_params);
    data_handle_ =
        allocator_->Reserve<std::uint8_t>(params_.l2_width * params_.l2_depth);
    sums_of_each_slice_handle_ =
        allocator_->Reserve<std::int32_t>(params_.l2_width);
  }

  ~PackedSideBlock() {}

  void seek_run(int start_width, int start_depth) const {
    int kernel_run_depth =
        std::min<int>(params_.l1_depth, params_.l2_depth - start_depth);
    pos_ = params_.l2_width * start_depth + start_width * kernel_run_depth;
  }

  void seek_next_cell() const { pos_ += KernelSideFormat::Cell::kSize; }

  void seek_forward_n_cells(int n) const {
    pos_ += n * KernelSideFormat::Cell::kSize;
  }

  // TODO(suharshs): The datatype can now be int8 as well. We could introduce a
  // new int8 current_data impl as well. This change would propagate to all
  // pack impls and the Kernel::Run API, which all assume uint8. For now we
  // leave this as-is pending future refactor.
  const std::uint8_t* current_data() const {
    return allocator_->GetPointer<std::uint8_t>(data_handle_) + pos_;
  }

  std::uint8_t* current_data() {
    return allocator_->GetPointer<std::uint8_t>(data_handle_) + pos_;
  }

  std::int32_t* sums_of_each_slice() {
    return allocator_->GetPointer<std::int32_t>(sums_of_each_slice_handle_);
  }

  const std::int32_t* sums_of_each_slice() const {
    return allocator_->GetPointer<const std::int32_t>(
        sums_of_each_slice_handle_);
  }

  const SideBlockParams& params() const { return params_; }

 private:
  // The block size parameters that this PackedSideBlock follows.
  // The L2 parameters determine its overall size, while the L1 parameters,
  // together with the kernel format template parameter, determine
  // the fine details of the storage/traversal order.
  SideBlockParams params_;

  // Pointer to the allocator provided by the caller. Not owned.
  // The Allocator is assumed to outlive the PackedSideBlock.
  Allocator* const allocator_;

  // Handle on the buffer backing this packed block. Owned.
  Allocator::Handle data_handle_;

  // Handle on the additional buffer backing the vector of sums of slices
  // associated with this block. Owned.
  Allocator::Handle sums_of_each_slice_handle_;

  // pos_ is the current position in the buffer, which we access
  // sequentially, like a file.
  // The idea is that we pack data in the same order as it is
  // going to be traversed during the computation. For cache-friendliness
  // reasons that order is complicated, so random access would require
  // intricate offset calculations. We therefore give up random-access
  // addressing and content ourselves with sequential access.
  //
  // pos_ is mutable because during the computation we will want to
  // be able to iterate on the data in a const PackedSideBlock.
  mutable int pos_;
};
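
// A minimal sketch of the intended sequential-access pattern (the names
// packed_block, start_width and start_depth are hypothetical; the real
// consumers are the pack and compute stages):
//
//   packed_block.seek_run(start_width, start_depth);
//   const std::uint8_t* cell_data = packed_block.current_data();
//   // ... hand cell_data for one cell to the kernel ...
//   packed_block.seek_next_cell();  // advance the cursor to the next cell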

// WidthMajor and DepthMajor are custom phrases modelled after the
// standard terminology 'row-major' and 'column-major'. Their meaning
// should be transparent once one has read the explanation in kernel.h:
// for example, in the Lhs, the 'width' dimension is the rows dimension,
// so there WidthMajor means RowMajor, while in the Rhs it is the opposite.
// Another way to put it: WidthMajor means that contiguous storage is used
// for entries having the same 'width' index.
enum class SideMapOrder { WidthMajor, DepthMajor };

// Similar to MatrixMap from map.h, but in terms of width/depth instead of
// rows/columns. Used to address blocks of the input LHS/RHS matrices when
// packing them.
template <typename tScalar, SideMapOrder tOrder>
class SideMap {
 public:
  typedef tScalar Scalar;
  static constexpr SideMapOrder kOrder = tOrder;

  SideMap(Scalar* data, int width, int depth, int stride)
      : data_(data), width_(width), depth_(depth), stride_(stride) {}

  SideMap(Scalar* data, int width, int depth)
      : data_(data), width_(width), depth_(depth) {
    stride_ = kOrder == SideMapOrder::WidthMajor ? depth_ : width_;
  }

  SideMap(const SideMap& other) = default;
  SideMap& operator=(const SideMap& other) = default;

  int width() const { return width_; }
  int depth() const { return depth_; }
  int stride() const { return stride_; }
  int width_stride() const {
    return kOrder == SideMapOrder::DepthMajor ? 1 : stride_;
  }
  int depth_stride() const {
    return kOrder == SideMapOrder::WidthMajor ? 1 : stride_;
  }
  Scalar* data() const { return data_; }
  Scalar* data(int w, int d) const {
    return data_ + w * width_stride() + d * depth_stride();
  }
  Scalar operator()(int w, int d) const { return *data(w, d); }
  Scalar& operator()(int w, int d) { return *data(w, d); }

  SideMap block(int start_width, int start_depth, int block_width,
                int block_depth) const {
    assert(start_width >= 0);
    assert(start_width + block_width <= width_);
    assert(start_depth >= 0);
    assert(start_depth + block_depth <= depth_);

    return SideMap(data(start_width, start_depth), block_width, block_depth,
                   stride_);
  }

 private:
  Scalar* data_;  // not owned.
  int width_, depth_, stride_;
};
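
// For illustration, a small worked example of the addressing above (the
// pointer name src_ptr and the 8x32 shape are hypothetical): a WidthMajor
// SideMap constructed with width 8 and depth 32 gets the default stride 32,
// so entry (w, d) lives at offset w * 32 + d, i.e. entries sharing a 'width'
// index are contiguous; a DepthMajor map of the same shape gets stride 8 and
// places entry (w, d) at offset w + d * 8. Both follow directly from
// width_stride() and depth_stride() above.
//
//   SideMap<const std::uint8_t, SideMapOrder::WidthMajor> m(src_ptr, 8, 32);
//   const std::uint8_t* p = m.data(/*w=*/3, /*d=*/5);  // src_ptr + 3*32 + 5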

// A PackingRegisterBlock is a small fixed-size block of a matrix being
// packed. This is the generic, non-optimized base implementation; it is
// inherited by the generic implementation of PackingRegisterBlock, which
// may be overridden by template specialization. Overriding it is how one
// may provide optimized packing code paths.
//
// The packing of a block proceeds in two steps:
//   1. Ensuring that we have a complete block of source data, i.e. a block of
//      the compile-time prescribed size. This is where we handle unaligned
//      boundaries: if we don't have a complete block of source data, then
//      we copy and zero-extend it into a local temporary (complete_src_),
//      see MakeCompleteSrc. In the generic case, we do have a complete block,
//      so we just use it in-place, see UseCompleteSrcInPlace.
//   2. Packing a complete block into the destination, see Pack. This is the
//      most critical part, so it's convenient that unaligned boundaries have
//      already been handled in step 1.
template <typename SrcMapType, typename PackedSideBlock>
class PackingRegisterBlockBase {
 public:
  typedef typename PackedSideBlock::KernelSideFormat KernelSideFormat;
  typedef typename KernelSideFormat::Cell CellFormat;
  typedef typename KernelSideFormat::InputScalar KernelInputScalar;
  typedef typename KernelSideFormat::Scalar KernelScalar;
  static constexpr int kCells = KernelSideFormat::kCells;
  static constexpr int kCellWidth = CellFormat::kWidth;
  static constexpr int kKernelWidth = CellFormat::kWidth * kCells;
  static constexpr int kCellDepth = CellFormat::kDepth;
  static constexpr int kCellSize = CellFormat::kSize;
  static constexpr SideMapOrder kSrcOrder = SrcMapType::kOrder;
  static constexpr int kZeroPointInputValue =
      ZeroPointInputValue<KernelInputScalar, KernelScalar>::kValue;

  PackingRegisterBlockBase() : complete_src_(nullptr, 0, 0, 0) {}

 protected:
  // The source data that's ready for packing. May point in-place to the
  // actual source data if it's already a complete block
  // (see UseCompleteSrcInPlace), or to the local buf_ below, into which we
  // copy incomplete blocks (see MakeCompleteSrc).
  SrcMapType complete_src_;

  // Temporary buffer into which we load incomplete blocks,
  // in the source storage order.
  std::uint8_t buf_[kKernelWidth * kRegisterSize];

 public:
  // Selects a block of in-place source data that's already a complete block.
  void UseCompleteSrcInPlace(const SrcMapType& src) { complete_src_ = src; }
  // Copies an incomplete block of source data into a local temporary
  // complete block by zero-extending it.
  void MakeCompleteSrc(const SrcMapType& src) {
    memset(buf_, kZeroPointInputValue, kKernelWidth * kRegisterSize);
    if (kSrcOrder == SideMapOrder::WidthMajor) {
      for (int w = 0; w < src.width(); w++) {
        memcpy(buf_ + w * kRegisterSize, src.data(w, 0), src.depth());
      }
    } else {
      assert(kSrcOrder == SideMapOrder::DepthMajor);
      for (int d = 0; d < src.depth(); d++) {
        memcpy(buf_ + d * kKernelWidth, src.data(0, d), src.width());
      }
    }

    // Since the KernelInputScalar type may not be uint8, we need to cast buf_.
    complete_src_ = SrcMapType(reinterpret_cast<KernelInputScalar*>(buf_),
                               kKernelWidth, kRegisterSize);
  }
  // Packs a complete block into the destination. This is the most
  // critical part and the part that we most typically want to
  // override in architecture-specific optimized specializations.
  void Pack(PackedSideBlock* dst, int start_width) {
    std::uint8_t* dst_ptr = dst->current_data();
    for (int cell_start_depth = 0; cell_start_depth < kRegisterSize;
         cell_start_depth += kCellDepth) {
      for (int cell_start_width = 0; cell_start_width < kKernelWidth;
           cell_start_width += kCellWidth) {
        std::int32_t* cell_sums_of_each_slice_ptr =
            dst->sums_of_each_slice() + start_width + cell_start_width;
        const SideMap<const std::uint8_t, kSrcOrder> src_cell_map(
            complete_src_.block(cell_start_width, cell_start_depth, kCellWidth,
                                kCellDepth));
        for (int w = 0; w < kCellWidth; w++) {
          std::int32_t sum = 0;
          for (int d = 0; d < kCellDepth; d++) {
            const std::uint8_t src_val = src_cell_map(w, d);
            const std::int16_t kernel_val_unwrapped =
                src_val - kZeroPointInputValue;
            const std::uint8_t kernel_val_uint8 = kernel_val_unwrapped;
            dst_ptr[OffsetIntoCell<CellFormat>(w, d)] = kernel_val_uint8;
            sum += kernel_val_unwrapped;
          }
          cell_sums_of_each_slice_ptr[w] += sum;
        }
        dst_ptr += kCellSize;
      }
    }
    dst->seek_forward_n_cells(kCells * kRegisterSize / kCellDepth);
  }
};
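
// A minimal sketch of the two-step protocol above, as a hypothetical driver
// (the type and variable names are illustrative; PackSideBlockImpl::PackRun
// below is the real driver):
//
//   PackingRegisterBlock<SrcMapType, PackedSideBlockType> b;
//   if (have_complete_kernel_width_by_register_size_block) {
//     b.UseCompleteSrcInPlace(src_map.block(w0, d0, kKernelWidth,
//                                           kRegisterSize));       // step 1
//   } else {
//     b.MakeCompleteSrc(src_map.block(w0, d0, partial_width,
//                                     partial_depth));             // step 1
//   }
//   b.Pack(&packed_side_block, w0);                                // step 2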

template <typename SrcMapType, typename PackedSideBlock>
class PackingRegisterBlock
    : public PackingRegisterBlockBase<SrcMapType, PackedSideBlock> {};

// Large-scale implementation of packing.
template <typename SrcMapType, typename PackedSideBlock>
class PackSideBlockImpl {
 public:
  typedef typename PackedSideBlock::KernelSideFormat KernelSideFormat;
  typedef typename KernelSideFormat::Cell CellFormat;
  static constexpr int kCells = KernelSideFormat::kCells;
  static constexpr int kCellWidth = CellFormat::kWidth;
  static constexpr int kKernelWidth = CellFormat::kWidth * kCells;
  static constexpr int kCellDepth = CellFormat::kDepth;

  typedef PackingRegisterBlock<SrcMapType, PackedSideBlock>
      PackingRegisterBlockType;

  PackSideBlockImpl(PackedSideBlock* packed_side_block,
                    const SrcMapType& src_map)
      : packed_side_block_(packed_side_block), src_map_(src_map) {}

  PackedSideBlock* packed_side_block() const { return packed_side_block_; }

  const SrcMapType& src_map() const { return src_map_; }

  // The public entry point to pack a block.
  void PackL2() {
    memset(packed_side_block_->sums_of_each_slice(), 0,
           sizeof(std::int32_t) * packed_side_block_->params().l2_width);
    for (int d = 0; d < src_map_.depth();
         d += packed_side_block_->params().l1_depth) {
      int ds = std::min<int>(packed_side_block_->params().l1_depth,
                             src_map_.depth() - d);

      for (int w = 0; w < src_map_.width();
           w += packed_side_block_->params().l1_width) {
        int ws = std::min<int>(packed_side_block_->params().l1_width,
                               src_map_.width() - w);

        PrefetchL1(w, ws, d, ds);
        PackL1(w, ws, d, ds);
      }
    }
  }

 protected:
  // The intermediate-level loops, between PackL2 and PackRun.
  void PackL1(int start_width, int width, int start_depth, int depth) {
    for (int w = 0; w < width; w += kKernelWidth) {
      int ws = std::min(+kKernelWidth, width - w);
      packed_side_block_->seek_run(start_width + w, start_depth);
      PackRun(start_width + w, ws, start_depth, depth);
    }
  }

  // Prefetches the data that will be read by PackL1.
  void PrefetchL1(int start_width, int width, int start_depth, int depth) {
    if (SrcMapType::kOrder == SideMapOrder::WidthMajor) {
      for (int d = 0; d < depth; d += kDefaultCacheLineSize) {
        for (int w = 0; w < width; w += 1) {
          Prefetch(src_map_.data(start_width + w, start_depth + d));
        }
      }
    } else {
      for (int d = 0; d < depth; d++) {
        for (int w = 0; w < width; w += kDefaultCacheLineSize) {
          Prefetch(src_map_.data(start_width + w, start_depth + d));
        }
      }
    }
  }

  // PackRun packs a single run, i.e. it is the inner loop in the depth
  // dimension.
  void PackRun(int start_width, int width, int start_depth, int depth) {
    PackingRegisterBlockType b;
    if (width == kKernelWidth) {
      const int register_aligned_depth = RoundDown<kRegisterSize>(depth);
      if (register_aligned_depth) {
        for (int d = 0; d < register_aligned_depth; d += kRegisterSize) {
          b.UseCompleteSrcInPlace(src_map_.block(start_width, start_depth + d,
                                                 width, kRegisterSize));
          b.Pack(packed_side_block_, start_width);
        }
      }
      if (register_aligned_depth < depth) {
        b.MakeCompleteSrc(
            src_map_.block(start_width, start_depth + register_aligned_depth,
                           width, depth - register_aligned_depth));
        b.Pack(packed_side_block_, start_width);
      }
    } else {
      assert(width < kKernelWidth);
      for (int d = 0; d < depth; d += kRegisterSize) {
        const int ds = std::min(+kRegisterSize, depth - d);
        b.MakeCompleteSrc(
            src_map_.block(start_width, start_depth + d, width, ds));
        b.Pack(packed_side_block_, start_width);
      }
    }
  }

  // The PackedSideBlock being packed, i.e. the 'destination'.
  PackedSideBlock* const packed_side_block_;

  // A map on the block of the original matrix being packed,
  // i.e. the 'source'.
  const SrcMapType& src_map_;
};
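
// To summarize the loop nesting implemented above (a sketch, not extra API):
//
//   PackL2()             // whole packed block; zeroes sums_of_each_slice()
//     -> PackL1()        // L1-sized sub-blocks, one kernel width at a time
//       -> PackRun()     // inner loop over depth, in kRegisterSize steps
//         -> PackingRegisterBlock::UseCompleteSrcInPlace / MakeCompleteSrc
//         -> PackingRegisterBlock::Pack()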

// Packs a block of the input LHS matrix into a PackedSideBlock.
template <typename PackedSideBlock, typename MatrixMapType>
void PackLhs(PackedSideBlock* dst, const MatrixMapType& src) {
  ScopedProfilingLabel label("pack LHS");
  static const SideMapOrder kSideMapOrder =
      MatrixMapType::kOrder == MapOrder::RowMajor ? SideMapOrder::WidthMajor
                                                  : SideMapOrder::DepthMajor;
  typedef typename MatrixMapType::Scalar Scalar;
  typedef SideMap<Scalar, kSideMapOrder> SideMapType;
  SideMapType src_side_map(src.data(), src.rows(), src.cols(), src.stride());
  typedef PackSideBlockImpl<SideMapType, PackedSideBlock> ImplType;
  ImplType impl(dst, src_side_map);
  impl.PackL2();
}

// Packs a block of the input RHS matrix into a PackedSideBlock.
template <typename PackedSideBlock, typename MatrixMapType>
void PackRhs(PackedSideBlock* dst, const MatrixMapType& src) {
  ScopedProfilingLabel label("pack RHS");
  static const SideMapOrder kSideMapOrder =
      MatrixMapType::kOrder == MapOrder::ColMajor ? SideMapOrder::WidthMajor
                                                  : SideMapOrder::DepthMajor;
  typedef typename MatrixMapType::Scalar Scalar;
  typedef SideMap<Scalar, kSideMapOrder> SideMapType;
  SideMapType src_side_map(src.data(), src.cols(), src.rows(), src.stride());
  typedef PackSideBlockImpl<SideMapType, PackedSideBlock> ImplType;
  ImplType impl(dst, src_side_map);
  impl.PackL2();
}
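
// A rough usage sketch of the two entry points above (the names allocator,
// block_params, lhs_block and rhs_block are illustrative only; allocator is
// an Allocator*, and lhs_block/rhs_block are blocks of MatrixMaps):
//
//   PackedSideBlock<typename KernelFormat::Lhs> packed_lhs(Side::Lhs,
//                                                          allocator,
//                                                          block_params);
//   PackedSideBlock<typename KernelFormat::Rhs> packed_rhs(Side::Rhs,
//                                                          allocator,
//                                                          block_params);
//   PackLhs(&packed_lhs, lhs_block);
//   PackRhs(&packed_rhs, rhs_block);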

}  // namespace gemmlowp

#ifdef GEMMLOWP_NEON
#include "pack_neon.h"
#elif defined(GEMMLOWP_SSE4)
#include "pack_sse.h"
#elif defined(GEMMLOWP_AVX2)
#include "pack_avx.h"
#elif defined(GEMMLOWP_MSA)
#include "pack_msa.h"
#endif

#endif  // GEMMLOWP_INTERNAL_PACK_H_