// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// pack.h: packing blocks of the LHS and RHS into the data layout
// that is expected by compute.h and eventually by kernels.
// Because this data layout depends on the kernel format, code here
// is templated in KernelLhsFormat/KernelRhsFormat.
//
// Reader's note: an important theme here is that we try hard to handle
// both the Lhs and the Rhs with a single piece of code, and we refer to
// either of them generically as a 'Side'. Instead of addressing matrices
// by (row, column) indices, we address them by (width, depth), as explained
// in kernel.h. This lets us treat the Lhs and the Rhs uniformly.
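//
// Concretely (a summary of the convention from kernel.h, which is what
// PackLhs()/PackRhs() below implement, with hypothetical dimensions for
// illustration): for a 100x300 Lhs block (rows x depth), width == 100 and
// depth == 300, so a RowMajor Lhs block is WidthMajor; for a 300x50 Rhs
// block (depth x cols), width == 50 and depth == 300, so a ColMajor Rhs
// block is WidthMajor. In both cases, 'depth' is the common accumulation
// dimension.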

#ifndef GEMMLOWP_INTERNAL_PACK_H_
#define GEMMLOWP_INTERNAL_PACK_H_

#include <cstring>

#include "allocator.h"
#include "block_params.h"
#include "common.h"
#include "kernel.h"

namespace gemmlowp {

// A PackedSideBlock instance is a packed block of either the LHS or RHS
// (whence the generic 'Side' name).
//
// 'Packed' means that it is laid out in the storage order that
// is expected by the specified kernel format. From a block of the input
// LHS or RHS matrix, one obtains a PackedSideBlock by calling PackLhs()
// or PackRhs().
template <typename tKernelSideFormat>
class PackedSideBlock {
 public:
  typedef tKernelSideFormat KernelSideFormat;

  PackedSideBlock(Side side, Allocator* allocator,
                  const BlockParams& block_params)
      : allocator_(allocator), pos_(0) {
    GetSideBlockParams(side, &params_, block_params);
    data_handle_ =
        allocator_->Reserve<std::uint8_t>(params_.l2_width * params_.l2_depth);
    sums_of_each_slice_handle_ =
        allocator_->Reserve<std::int32_t>(params_.l2_width);
  }

  ~PackedSideBlock() {}

  void seek_run(int start_width, int start_depth) const {
    int kernel_run_depth =
        std::min<int>(params_.l1_depth, params_.l2_depth - start_depth);
    pos_ = params_.l2_width * start_depth + start_width * kernel_run_depth;
  }

  void seek_next_cell() const { pos_ += KernelSideFormat::Cell::kSize; }

  void seek_forward_n_cells(int n) const {
    pos_ += n * KernelSideFormat::Cell::kSize;
  }

  // TODO(suharshs): The datatype can now be int8 as well. We could introduce a
  // new int8 current_data impl as well. This change would propagate to all
  // pack impls and the Kernel::Run API, which all assume uint8. For now we
  // leave this as-is pending future refactor.
  const std::uint8_t* current_data() const {
    return allocator_->GetPointer<std::uint8_t>(data_handle_) + pos_;
  }

  std::uint8_t* current_data() {
    return allocator_->GetPointer<std::uint8_t>(data_handle_) + pos_;
  }

  std::int32_t* sums_of_each_slice() {
    return allocator_->GetPointer<std::int32_t>(sums_of_each_slice_handle_);
  }

  const std::int32_t* sums_of_each_slice() const {
    return allocator_->GetPointer<const std::int32_t>(
        sums_of_each_slice_handle_);
  }

  const SideBlockParams& params() const { return params_; }

 private:
  // The block size parameters that this PackedSideBlock follows.
  // The L2 parameters determine its overall size, while the L1 parameters,
  // together with the kernel format template parameter, determine
  // the fine details of the storage/traversal order.
  SideBlockParams params_;

  // Pointer to the allocator provided by the caller. Not owned.
  // The Allocator is assumed to outlive the PackedSideBlock.
  Allocator* const allocator_;

  // Handle on the buffer backing this packed block. Owned.
  Allocator::Handle data_handle_;

  // Handle on the additional buffer backing the vector of sums of slices
  // associated with this block. Owned.
  Allocator::Handle sums_of_each_slice_handle_;

  // pos_ is the current position in the buffer, which we access
  // sequentially, like a file.
  // The idea is that we pack data in the same order as it is
  // going to be traversed during the computation. For cache-friendliness
  // reasons that order is complicated, so random access would require
  // intricate offset calculations. We therefore give up random-access
  // addressing and content ourselves with sequential access.
  //
  // pos_ is mutable because during the computation we will want to
  // be able to iterate on the data in a const PackedSideBlock.
  mutable int pos_;
};
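
// A minimal sketch of the intended sequential-access pattern (the names
// packed_block, start_width and start_depth are hypothetical; the real
// consumers are the pack and compute stages):
//
//   packed_block.seek_run(start_width, start_depth);
//   const std::uint8_t* cell_data = packed_block.current_data();
//   // ... hand cell_data for one cell to the kernel ...
//   packed_block.seek_next_cell();  // advance the cursor to the next cell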

// WidthMajor and DepthMajor are custom phrases modelled after the
// standard terminology 'row-major' and 'column-major'. Their meaning
// should be transparent once one has read the explanation in kernel.h:
// for example, in the Lhs, the 'width' dimension is the rows dimension,
// so there WidthMajor means RowMajor, while in the Rhs it is the opposite.
// Another way to put it: WidthMajor means that contiguous storage is used
// for entries having the same 'width' index.
enum class SideMapOrder { WidthMajor, DepthMajor };

// Similar to MatrixMap from map.h, but in terms of width/depth instead of
// rows/columns. Used to address blocks of the input LHS/RHS matrices when
// packing them.
template <typename tScalar, SideMapOrder tOrder>
class SideMap {
 public:
  typedef tScalar Scalar;
  static constexpr SideMapOrder kOrder = tOrder;

  SideMap(Scalar* data, int width, int depth, int stride)
      : data_(data), width_(width), depth_(depth), stride_(stride) {}

  SideMap(Scalar* data, int width, int depth)
      : data_(data), width_(width), depth_(depth) {
    stride_ = kOrder == SideMapOrder::WidthMajor ? depth_ : width_;
  }

  SideMap(const SideMap& other) = default;
  SideMap& operator=(const SideMap& other) = default;

  int width() const { return width_; }
  int depth() const { return depth_; }
  int stride() const { return stride_; }
  int width_stride() const {
    return kOrder == SideMapOrder::DepthMajor ? 1 : stride_;
  }
  int depth_stride() const {
    return kOrder == SideMapOrder::WidthMajor ? 1 : stride_;
  }
  Scalar* data() const { return data_; }
  Scalar* data(int w, int d) const {
    return data_ + w * width_stride() + d * depth_stride();
  }
  Scalar operator()(int w, int d) const { return *data(w, d); }
  Scalar& operator()(int w, int d) { return *data(w, d); }

  SideMap block(int start_width, int start_depth, int block_width,
                int block_depth) const {
    assert(start_width >= 0);
    assert(start_width + block_width <= width_);
    assert(start_depth >= 0);
    assert(start_depth + block_depth <= depth_);

    return SideMap(data(start_width, start_depth), block_width, block_depth,
                   stride_);
  }

 private:
  Scalar* data_;  // not owned.
  int width_, depth_, stride_;
};
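
// For illustration, a small worked example of the addressing above (the
// pointer name src_ptr and the 8x32 shape are hypothetical): a WidthMajor
// SideMap constructed with width 8 and depth 32 gets the default stride 32,
// so entry (w, d) lives at offset w * 32 + d, i.e. entries sharing a 'width'
// index are contiguous; a DepthMajor map of the same shape gets stride 8 and
// places entry (w, d) at offset w + d * 8. Both follow directly from
// width_stride() and depth_stride() above.
//
//   SideMap<const std::uint8_t, SideMapOrder::WidthMajor> m(src_ptr, 8, 32);
//   const std::uint8_t* p = m.data(/*w=*/3, /*d=*/5);  // src_ptr + 3*32 + 5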

// A PackingRegisterBlock is a small fixed-size block of a matrix being
// packed. This is the generic, non-optimized base implementation; it is
// inherited by the generic implementation of PackingRegisterBlock, which
// may be overridden by template specialization. Overriding it is how one
// may provide optimized packing code paths.
//
// The packing of a block proceeds in two steps:
//   1. Ensuring that we have a complete block of source data, i.e. a block of
//      the compile-time prescribed size. This is where we handle unaligned
//      boundaries: if we don't have a complete block of source data, then
//      we copy and zero-extend it into a local temporary (complete_src_),
//      see MakeCompleteSrc. In the generic case, we do have a complete block,
//      so we just use it in-place, see UseCompleteSrcInPlace.
//   2. Packing a complete block into the destination, see Pack. This is the
//      most critical part, so it's convenient that unaligned boundaries have
//      already been handled in step 1.
template <typename SrcMapType, typename PackedSideBlock>
class PackingRegisterBlockBase {
 public:
  typedef typename PackedSideBlock::KernelSideFormat KernelSideFormat;
  typedef typename KernelSideFormat::Cell CellFormat;
  typedef typename KernelSideFormat::InputScalar KernelInputScalar;
  typedef typename KernelSideFormat::Scalar KernelScalar;
  static constexpr int kCells = KernelSideFormat::kCells;
  static constexpr int kCellWidth = CellFormat::kWidth;
  static constexpr int kKernelWidth = CellFormat::kWidth * kCells;
  static constexpr int kCellDepth = CellFormat::kDepth;
  static constexpr int kCellSize = CellFormat::kSize;
  static constexpr SideMapOrder kSrcOrder = SrcMapType::kOrder;
  static constexpr int kZeroPointInputValue =
      ZeroPointInputValue<KernelInputScalar, KernelScalar>::kValue;

  PackingRegisterBlockBase() : complete_src_(nullptr, 0, 0, 0) {}

 protected:
  // The source data that's ready for packing. May point in-place to the
  // actual source data if it's already a complete block
  // (see UseCompleteSrcInPlace), or to the local buf_ below, into which we
  // copy incomplete blocks (see MakeCompleteSrc).
  SrcMapType complete_src_;

  // Temporary buffer into which we load incomplete blocks,
  // in the source storage order.
  std::uint8_t buf_[kKernelWidth * kRegisterSize];

 public:
  // Selects a block of in-place source data that's already a complete block.
  void UseCompleteSrcInPlace(const SrcMapType& src) { complete_src_ = src; }
  // Copies an incomplete block of source data into a local temporary
  // complete block by zero-extending it.
  void MakeCompleteSrc(const SrcMapType& src) {
    memset(buf_, kZeroPointInputValue, kKernelWidth * kRegisterSize);
    if (kSrcOrder == SideMapOrder::WidthMajor) {
      for (int w = 0; w < src.width(); w++) {
        memcpy(buf_ + w * kRegisterSize, src.data(w, 0), src.depth());
      }
    } else {
      assert(kSrcOrder == SideMapOrder::DepthMajor);
      for (int d = 0; d < src.depth(); d++) {
        memcpy(buf_ + d * kKernelWidth, src.data(0, d), src.width());
      }
    }

    // Since the KernelInputScalar type may not be uint8, we need to cast buf_.
    complete_src_ = SrcMapType(reinterpret_cast<KernelInputScalar*>(buf_),
                               kKernelWidth, kRegisterSize);
  }
  // Packs a complete block into the destination. This is the most
  // critical part and the part that we most typically want to
  // override in architecture-specific optimized specializations.
  void Pack(PackedSideBlock* dst, int start_width) {
    std::uint8_t* dst_ptr = dst->current_data();
    for (int cell_start_depth = 0; cell_start_depth < kRegisterSize;
         cell_start_depth += kCellDepth) {
      for (int cell_start_width = 0; cell_start_width < kKernelWidth;
           cell_start_width += kCellWidth) {
        std::int32_t* cell_sums_of_each_slice_ptr =
            dst->sums_of_each_slice() + start_width + cell_start_width;
        const SideMap<const std::uint8_t, kSrcOrder> src_cell_map(
            complete_src_.block(cell_start_width, cell_start_depth, kCellWidth,
                                kCellDepth));
        for (int w = 0; w < kCellWidth; w++) {
          std::int32_t sum = 0;
          for (int d = 0; d < kCellDepth; d++) {
            const std::uint8_t src_val = src_cell_map(w, d);
            const std::int16_t kernel_val_unwrapped =
                src_val - kZeroPointInputValue;
            const std::uint8_t kernel_val_uint8 = kernel_val_unwrapped;
            dst_ptr[OffsetIntoCell<CellFormat>(w, d)] = kernel_val_uint8;
            sum += kernel_val_unwrapped;
          }
          cell_sums_of_each_slice_ptr[w] += sum;
        }
        dst_ptr += kCellSize;
      }
    }
    dst->seek_forward_n_cells(kCells * kRegisterSize / kCellDepth);
  }
};
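
// A minimal sketch of the two-step protocol above, as a hypothetical driver
// (the type and variable names are illustrative; PackSideBlockImpl::PackRun
// below is the real driver):
//
//   PackingRegisterBlock<SrcMapType, PackedSideBlockType> b;
//   if (have_complete_kernel_width_by_register_size_block) {
//     b.UseCompleteSrcInPlace(src_map.block(w0, d0, kKernelWidth,
//                                           kRegisterSize));       // step 1
//   } else {
//     b.MakeCompleteSrc(src_map.block(w0, d0, partial_width,
//                                     partial_depth));             // step 1
//   }
//   b.Pack(&packed_side_block, w0);                                // step 2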

template <typename SrcMapType, typename PackedSideBlock>
class PackingRegisterBlock
    : public PackingRegisterBlockBase<SrcMapType, PackedSideBlock> {};

// Large-scale implementation of packing.
template <typename SrcMapType, typename PackedSideBlock>
class PackSideBlockImpl {
 public:
  typedef typename PackedSideBlock::KernelSideFormat KernelSideFormat;
  typedef typename KernelSideFormat::Cell CellFormat;
  static constexpr int kCells = KernelSideFormat::kCells;
  static constexpr int kCellWidth = CellFormat::kWidth;
  static constexpr int kKernelWidth = CellFormat::kWidth * kCells;
  static constexpr int kCellDepth = CellFormat::kDepth;

  typedef PackingRegisterBlock<SrcMapType, PackedSideBlock>
      PackingRegisterBlockType;

  PackSideBlockImpl(PackedSideBlock* packed_side_block,
                    const SrcMapType& src_map)
      : packed_side_block_(packed_side_block), src_map_(src_map) {}

  PackedSideBlock* packed_side_block() const { return packed_side_block_; }

  const SrcMapType& src_map() const { return src_map_; }

  // The public entry point to pack a block.
  void PackL2() {
    memset(packed_side_block_->sums_of_each_slice(), 0,
           sizeof(std::int32_t) * packed_side_block_->params().l2_width);
    for (int d = 0; d < src_map_.depth();
         d += packed_side_block_->params().l1_depth) {
      int ds = std::min<int>(packed_side_block_->params().l1_depth,
                             src_map_.depth() - d);

      for (int w = 0; w < src_map_.width();
           w += packed_side_block_->params().l1_width) {
        int ws = std::min<int>(packed_side_block_->params().l1_width,
                               src_map_.width() - w);

        PrefetchL1(w, ws, d, ds);
        PackL1(w, ws, d, ds);
      }
    }
  }

 protected:
  // The intermediate-level loops, between PackL2 and PackRun.
  void PackL1(int start_width, int width, int start_depth, int depth) {
    for (int w = 0; w < width; w += kKernelWidth) {
      int ws = std::min(+kKernelWidth, width - w);
      packed_side_block_->seek_run(start_width + w, start_depth);
      PackRun(start_width + w, ws, start_depth, depth);
    }
  }

  // Prefetches the data that will be read by PackL1.
  void PrefetchL1(int start_width, int width, int start_depth, int depth) {
    if (SrcMapType::kOrder == SideMapOrder::WidthMajor) {
      for (int d = 0; d < depth; d += kDefaultCacheLineSize) {
        for (int w = 0; w < width; w += 1) {
          Prefetch(src_map_.data(start_width + w, start_depth + d));
        }
      }
    } else {
      for (int d = 0; d < depth; d++) {
        for (int w = 0; w < width; w += kDefaultCacheLineSize) {
          Prefetch(src_map_.data(start_width + w, start_depth + d));
        }
      }
    }
  }

  // PackRun packs a single run, i.e. it is the inner loop in the depth
  // dimension.
  void PackRun(int start_width, int width, int start_depth, int depth) {
    PackingRegisterBlockType b;
    if (width == kKernelWidth) {
      const int register_aligned_depth = RoundDown<kRegisterSize>(depth);
      if (register_aligned_depth) {
        for (int d = 0; d < register_aligned_depth; d += kRegisterSize) {
          b.UseCompleteSrcInPlace(src_map_.block(start_width, start_depth + d,
                                                 width, kRegisterSize));
          b.Pack(packed_side_block_, start_width);
        }
      }
      if (register_aligned_depth < depth) {
        b.MakeCompleteSrc(
            src_map_.block(start_width, start_depth + register_aligned_depth,
                           width, depth - register_aligned_depth));
        b.Pack(packed_side_block_, start_width);
      }
    } else {
      assert(width < kKernelWidth);
      for (int d = 0; d < depth; d += kRegisterSize) {
        const int ds = std::min(+kRegisterSize, depth - d);
        b.MakeCompleteSrc(
            src_map_.block(start_width, start_depth + d, width, ds));
        b.Pack(packed_side_block_, start_width);
      }
    }
  }

  // The PackedSideBlock being packed, i.e. the 'destination'.
  PackedSideBlock* const packed_side_block_;

  // A map on the block of the original matrix being packed,
  // i.e. the 'source'.
  const SrcMapType& src_map_;
};
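
// To summarize the loop nesting implemented above (a sketch, not extra API):
//
//   PackL2()             // whole packed block; zeroes sums_of_each_slice()
//     -> PackL1()        // L1-sized sub-blocks, one kernel width at a time
//       -> PackRun()     // inner loop over depth, in kRegisterSize steps
//         -> PackingRegisterBlock::UseCompleteSrcInPlace / MakeCompleteSrc
//         -> PackingRegisterBlock::Pack()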

// Packs a block of the input LHS matrix into a PackedSideBlock.
template <typename PackedSideBlock, typename MatrixMapType>
void PackLhs(PackedSideBlock* dst, const MatrixMapType& src) {
  ScopedProfilingLabel label("pack LHS");
  static const SideMapOrder kSideMapOrder =
      MatrixMapType::kOrder == MapOrder::RowMajor ? SideMapOrder::WidthMajor
                                                  : SideMapOrder::DepthMajor;
  typedef typename MatrixMapType::Scalar Scalar;
  typedef SideMap<Scalar, kSideMapOrder> SideMapType;
  SideMapType src_side_map(src.data(), src.rows(), src.cols(), src.stride());
  typedef PackSideBlockImpl<SideMapType, PackedSideBlock> ImplType;
  ImplType impl(dst, src_side_map);
  impl.PackL2();
}

// Packs a block of the input RHS matrix into a PackedSideBlock.
template <typename PackedSideBlock, typename MatrixMapType>
void PackRhs(PackedSideBlock* dst, const MatrixMapType& src) {
  ScopedProfilingLabel label("pack RHS");
  static const SideMapOrder kSideMapOrder =
      MatrixMapType::kOrder == MapOrder::ColMajor ? SideMapOrder::WidthMajor
                                                  : SideMapOrder::DepthMajor;
  typedef typename MatrixMapType::Scalar Scalar;
  typedef SideMap<Scalar, kSideMapOrder> SideMapType;
  SideMapType src_side_map(src.data(), src.cols(), src.rows(), src.stride());
  typedef PackSideBlockImpl<SideMapType, PackedSideBlock> ImplType;
  ImplType impl(dst, src_side_map);
  impl.PackL2();
}
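
// A rough usage sketch of the two entry points above (the names allocator,
// block_params, lhs_block and rhs_block are illustrative only; allocator is
// an Allocator*, and lhs_block/rhs_block are blocks of MatrixMaps):
//
//   PackedSideBlock<typename KernelFormat::Lhs> packed_lhs(Side::Lhs,
//                                                          allocator,
//                                                          block_params);
//   PackedSideBlock<typename KernelFormat::Rhs> packed_rhs(Side::Rhs,
//                                                          allocator,
//                                                          block_params);
//   PackLhs(&packed_lhs, lhs_block);
//   PackRhs(&packed_rhs, rhs_block);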

}  // namespace gemmlowp

#ifdef GEMMLOWP_NEON
#include "pack_neon.h"
#elif defined(GEMMLOWP_SSE4)
#include "pack_sse.h"
#elif defined(GEMMLOWP_AVX2)
#include "pack_avx.h"
#elif defined(GEMMLOWP_MSA)
#include "pack_msa.h"
#endif

#endif  // GEMMLOWP_INTERNAL_PACK_H_