// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// pack.h: packing blocks of the LHS and RHS into the data layout
// that is expected by compute.h and eventually by kernels.
// Because this data layout depends on the kernel format, code here
// is templated in KernelLhsFormat/KernelRhsFormat.
//
// Reader's note: an important theme around here is that we try hard
// to handle both the Lhs and Rhs with a single piece of code, and we
// refer to either of them generically as a 'Side'. Instead of addressing
// matrices by (row, column) indices, we address them by (width, depth), as
// explained in kernel.h: 'depth' is the dimension along which the GEMM's
// dot-products are taken (the Lhs columns / the Rhs rows), and 'width' is
// the other dimension (the Lhs rows / the Rhs columns). This puts the Lhs
// and Rhs on an equal footing.

#ifndef GEMMLOWP_INTERNAL_PACK_H_
#define GEMMLOWP_INTERNAL_PACK_H_

#include <cstring>

#include "allocator.h"
#include "block_params.h"
#include "common.h"
#include "kernel.h"

namespace gemmlowp {

// A PackedSideBlock instance is a packed block of either the LHS or RHS
// (whence the generic 'Side' name).
//
// 'Packed' means that it is laid out in the storage order that
// is expected by the specified kernel format. From a block of the input
// LHS or RHS matrix, one obtains a PackedSideBlock by calling PackLhs()
// or PackRhs().
template <typename tKernelSideFormat>
class PackedSideBlock {
 public:
  typedef tKernelSideFormat KernelSideFormat;

  PackedSideBlock(Side side, Allocator* allocator,
                  const BlockParams& block_params)
      : allocator_(allocator), pos_(0) {
    GetSideBlockParams(side, &params_, block_params);
    data_handle_ =
        allocator_->Reserve<std::uint8_t>(params_.l2_width * params_.l2_depth);
    sums_of_each_slice_handle_ =
        allocator_->Reserve<std::int32_t>(params_.l2_width);
  }

  ~PackedSideBlock() {}

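  // Positions pos_ at the start of the run beginning at
  // (start_width, start_depth). The formula mirrors the storage order:
  // the fully-packed l1-depth slabs before start_depth occupy
  // l2_width * start_depth entries, and within the current slab each
  // preceding width index contributes one run of kernel_run_depth entries.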
  void seek_run(int start_width, int start_depth) const {
    int kernel_run_depth =
        std::min<int>(params_.l1_depth, params_.l2_depth - start_depth);
    pos_ = params_.l2_width * start_depth + start_width * kernel_run_depth;
  }

  void seek_next_cell() const { pos_ += KernelSideFormat::Cell::kSize; }

  void seek_forward_n_cells(int n) const {
    pos_ += n * KernelSideFormat::Cell::kSize;
  }

  // TODO(suharshs): The datatype can now be int8 as well. We could introduce
  // an int8 variant of current_data(). That change would propagate to all
  // the pack impls and to the Kernel::Run API, which currently all assume
  // uint8. For now we leave this as-is pending a future refactor.
  const std::uint8_t* current_data() const {
    return allocator_->GetPointer<std::uint8_t>(data_handle_) + pos_;
  }

  std::uint8_t* current_data() {
    return allocator_->GetPointer<std::uint8_t>(data_handle_) + pos_;
  }

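  // The sums of the packed entries along each 'width' slice (i.e. each row
  // of the Lhs / each column of the Rhs). Pack() accumulates these as it
  // goes; they are used later to apply the quantization zero-point offset
  // terms of the GEMM.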
  std::int32_t* sums_of_each_slice() {
    return allocator_->GetPointer<std::int32_t>(sums_of_each_slice_handle_);
  }

  const std::int32_t* sums_of_each_slice() const {
    return allocator_->GetPointer<const std::int32_t>(
        sums_of_each_slice_handle_);
  }

  const SideBlockParams& params() const { return params_; }

 private:
  // The block size parameters that this PackedSideBlock follows.
  // The L2 parameters determine its overall size, while the L1 parameters,
  // together with the kernel format template parameter, determine
  // the fine details of the storage/traversal order.
  SideBlockParams params_;

  // Pointer to the allocator provided by the caller. Not owned.
  // The Allocator is assumed to outlive the PackedSideBlock.
  Allocator* const allocator_;

  // Handle on the buffer backing this packed block. Owned.
  Allocator::Handle data_handle_;

  // Handle on the additional buffer backing the vector of sums of slices
  // associated with this block. Owned.
  Allocator::Handle sums_of_each_slice_handle_;

  // pos_ is the current position in the buffer, which we access
  // sequentially, like a file.
  // The idea is that we pack data in the same order as it is
  // going to be traversed during the computation. For cache-friendliness
  // reasons, that order is complicated, so random access into it would
  // require intricate offset calculations. We therefore give up on
  // random-access addressing and content ourselves with sequential access.
  //
  // pos_ is mutable because during the computation we will want to
  // be able to iterate on the data in a const PackedSideBlock.
  mutable int pos_;
};
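
// Illustrative usage sketch (not part of this header's API; it loosely
// mirrors what single_thread_gemm.h does). KernelFormat and the Init()
// arguments are placeholders here:
//
//   Allocator allocator;
//   BlockParams block_params;
//   block_params.Init<KernelFormat>(rows, cols, depth, ...);
//   PackedSideBlock<typename KernelFormat::Lhs> packed_lhs(
//       Side::Lhs, &allocator, block_params);
//   allocator.Commit();  // materializes the buffers Reserve()d above
//   // packed_lhs is now ready to be filled by PackLhs(), defined below.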

// WidthMajor and DepthMajor are custom phrases modelled after the
// standard terminology 'row-major' and 'column-major'. Their meaning
// should be transparent once one has read the explanation in kernel.h:
// for example, in the Lhs, the 'width' dimension is the rows dimension,
// so for the Lhs, WidthMajor means RowMajor, while for the Rhs it is the
// opposite. Another way to put it: WidthMajor means that contiguous
// storage is used for entries having the same 'width' index.
enum class SideMapOrder { WidthMajor, DepthMajor };

// Similar to MatrixMap from map.h, but in terms of width/depth instead of
// rows/columns. Used to address blocks of the input LHS/RHS matrices when
// packing them.
template <typename tScalar, SideMapOrder tOrder>
class SideMap {
 public:
  typedef tScalar Scalar;
  static constexpr SideMapOrder kOrder = tOrder;

  SideMap(Scalar* data, int width, int depth, int stride)
      : data_(data), width_(width), depth_(depth), stride_(stride) {}

  SideMap(Scalar* data, int width, int depth)
      : data_(data), width_(width), depth_(depth) {
    stride_ = kOrder == SideMapOrder::WidthMajor ? depth_ : width_;
  }

  SideMap(const SideMap& other) = default;
  SideMap& operator=(const SideMap& other) = default;

  int width() const { return width_; }
  int depth() const { return depth_; }
  int stride() const { return stride_; }
  int width_stride() const {
    return kOrder == SideMapOrder::DepthMajor ? 1 : stride_;
  }
  int depth_stride() const {
    return kOrder == SideMapOrder::WidthMajor ? 1 : stride_;
  }
  Scalar* data() const { return data_; }
  Scalar* data(int w, int d) const {
    return data_ + w * width_stride() + d * depth_stride();
  }
  Scalar operator()(int w, int d) const { return *data(w, d); }
  Scalar& operator()(int w, int d) { return *data(w, d); }

  SideMap block(int start_width, int start_depth, int block_width,
                int block_depth) const {
    assert(start_width >= 0);
    assert(start_width + block_width <= width_);
    assert(start_depth >= 0);
    assert(start_depth + block_depth <= depth_);

    return SideMap(data(start_width, start_depth), block_width, block_depth,
                   stride_);
  }

 private:
  Scalar* data_;  // not owned.
  int width_, depth_, stride_;
};
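
// Illustrative example (assuming a 4x8 row-major Lhs block, so that
// width == rows == 4 and depth == cols == 8, making it WidthMajor):
//
//   const std::uint8_t* lhs_data = ...;  // 4x8, row-major, contiguous
//   SideMap<const std::uint8_t, SideMapOrder::WidthMajor> side(
//       lhs_data, /*width=*/4, /*depth=*/8);
//   // side(w, d) == lhs_data[w * 8 + d], since width_stride() is the
//   // stride (8) and depth_stride() is 1.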

// A PackingRegisterBlock is a small fixed-size block of a matrix being
// packed. This class, PackingRegisterBlockBase, is the generic,
// non-optimized implementation. It is inherited by the generic
// implementation of PackingRegisterBlock below, which may be overridden
// by template specialization; overriding it is how one provides optimized
// packing code paths (see e.g. pack_neon.h).
//
// The packing of a block proceeds in two steps:
// 1. Ensuring that we have a complete block of source data, i.e. a block of
//    the compile-time prescribed size. This is where we handle unaligned
//    boundaries: if we don't have a complete block of source data, then
//    we copy and zero-extend it into a local temporary (complete_src_),
//    see MakeCompleteSrc. In the generic case, we do have a complete block,
//    so we just use it in-place, see UseCompleteSrcInPlace.
// 2. Packing a complete block into the destination, see Pack. This is the
//    most critical part, so it's convenient that unaligned boundaries have
//    already been handled in step 1.
template <typename SrcMapType, typename PackedSideBlock>
class PackingRegisterBlockBase {
 public:
  typedef typename PackedSideBlock::KernelSideFormat KernelSideFormat;
  typedef typename KernelSideFormat::Cell CellFormat;
  typedef typename KernelSideFormat::InputScalar KernelInputScalar;
  typedef typename KernelSideFormat::Scalar KernelScalar;
  static constexpr int kCells = KernelSideFormat::kCells;
  static constexpr int kCellWidth = CellFormat::kWidth;
  static constexpr int kKernelWidth = CellFormat::kWidth * kCells;
  static constexpr int kCellDepth = CellFormat::kDepth;
  static constexpr int kCellSize = CellFormat::kSize;
  static constexpr SideMapOrder kSrcOrder = SrcMapType::kOrder;
  static constexpr int kZeroPointInputValue =
      ZeroPointInputValue<KernelInputScalar, KernelScalar>::kValue;

  PackingRegisterBlockBase() : complete_src_(nullptr, 0, 0, 0) {}

 protected:
  // The source data that's ready for packing. May point to the actual
  // source data in-place, if it is already a complete block
  // (see UseCompleteSrcInPlace), or to the local buf_ below, into which
  // we copy incomplete blocks (see MakeCompleteSrc).
  SrcMapType complete_src_;

  // Temporary buffer into which we copy incomplete blocks, in the source
  // storage order. Sized to hold one complete block: kKernelWidth slices
  // of kRegisterSize depth entries each.
  std::uint8_t buf_[kKernelWidth * kRegisterSize];

 public:
  // Selects a block of in-place source data that's already a complete
  // block.
  void UseCompleteSrcInPlace(const SrcMapType& src) { complete_src_ = src; }
  // Copies an incomplete block of source data into a local temporary
  // complete block by zero-extending it.
  void MakeCompleteSrc(const SrcMapType& src) {
    memset(buf_, kZeroPointInputValue, kKernelWidth * kRegisterSize);
    if (kSrcOrder == SideMapOrder::WidthMajor) {
      for (int w = 0; w < src.width(); w++) {
        memcpy(buf_ + w * kRegisterSize, src.data(w, 0), src.depth());
      }
    } else {
      assert(kSrcOrder == SideMapOrder::DepthMajor);
      for (int d = 0; d < src.depth(); d++) {
        memcpy(buf_ + d * kKernelWidth, src.data(0, d), src.width());
      }
    }

    // Since the KernelInputScalar type may not be uint8, we need to cast
    // buf_.
    complete_src_ = SrcMapType(reinterpret_cast<KernelInputScalar*>(buf_),
                               kKernelWidth, kRegisterSize);
  }
  // Packs a complete block into the destination. This is the most
  // critical part and the part that we most typically want to
  // override in architecture-specific optimized specializations.
  void Pack(PackedSideBlock* dst, int start_width) {
    std::uint8_t* dst_ptr = dst->current_data();
    for (int cell_start_depth = 0; cell_start_depth < kRegisterSize;
         cell_start_depth += kCellDepth) {
      for (int cell_start_width = 0; cell_start_width < kKernelWidth;
           cell_start_width += kCellWidth) {
        std::int32_t* cell_sums_of_each_slice_ptr =
            dst->sums_of_each_slice() + start_width + cell_start_width;
        const SideMap<const std::uint8_t, kSrcOrder> src_cell_map(
            complete_src_.block(cell_start_width, cell_start_depth, kCellWidth,
                                kCellDepth));
        for (int w = 0; w < kCellWidth; w++) {
          std::int32_t sum = 0;
          for (int d = 0; d < kCellDepth; d++) {
            const std::uint8_t src_val = src_cell_map(w, d);
            const std::int16_t kernel_val_unwrapped =
                src_val - kZeroPointInputValue;
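            // In the int8-kernel case, kZeroPointInputValue is 128, so
            // e.g. src_val == 5 unwraps to -123; the uint8 cast below
            // wraps that to 133, whose bit pattern the kernel
            // reinterprets as int8 -123 again.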
            const std::uint8_t kernel_val_uint8 = kernel_val_unwrapped;
            dst_ptr[OffsetIntoCell<CellFormat>(w, d)] = kernel_val_uint8;
            sum += kernel_val_unwrapped;
          }
          cell_sums_of_each_slice_ptr[w] += sum;
        }
        dst_ptr += kCellSize;
      }
    }
    dst->seek_forward_n_cells(kCells * kRegisterSize / kCellDepth);
  }
};
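
// Illustrative sketch of the two-step protocol described above (the names
// here are placeholders; this is essentially what PackSideBlockImpl::PackRun
// does below):
//
//   PackingRegisterBlock<SrcMapType, PackedSideBlock> b;
//   if (src_block_is_complete) {
//     b.UseCompleteSrcInPlace(complete_block);  // no copy
//   } else {
//     b.MakeCompleteSrc(partial_block);  // copy and zero-extend into buf_
//   }
//   b.Pack(packed_side_block, start_width);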

template <typename SrcMapType, typename PackedSideBlock>
class PackingRegisterBlock
    : public PackingRegisterBlockBase<SrcMapType, PackedSideBlock> {};

// Large-scale implementation of packing.
template <typename SrcMapType, typename PackedSideBlock>
class PackSideBlockImpl {
 public:
  typedef typename PackedSideBlock::KernelSideFormat KernelSideFormat;
  typedef typename KernelSideFormat::Cell CellFormat;
  static constexpr int kCells = KernelSideFormat::kCells;
  static constexpr int kCellWidth = CellFormat::kWidth;
  static constexpr int kKernelWidth = CellFormat::kWidth * kCells;
  static constexpr int kCellDepth = CellFormat::kDepth;

  typedef PackingRegisterBlock<SrcMapType, PackedSideBlock>
      PackingRegisterBlockType;

  PackSideBlockImpl(PackedSideBlock* packed_side_block,
                    const SrcMapType& src_map)
      : packed_side_block_(packed_side_block), src_map_(src_map) {}

  PackedSideBlock* packed_side_block() const { return packed_side_block_; }

  const SrcMapType& src_map() const { return src_map_; }

  // The public entry point to pack a block.
  void PackL2() {
    memset(packed_side_block_->sums_of_each_slice(), 0,
           sizeof(std::int32_t) * packed_side_block_->params().l2_width);
    for (int d = 0; d < src_map_.depth();
         d += packed_side_block_->params().l1_depth) {
      int ds = std::min<int>(packed_side_block_->params().l1_depth,
                             src_map_.depth() - d);

      for (int w = 0; w < src_map_.width();
           w += packed_side_block_->params().l1_width) {
        int ws = std::min<int>(packed_side_block_->params().l1_width,
                               src_map_.width() - w);

        PrefetchL1(w, ws, d, ds);
        PackL1(w, ws, d, ds);
      }
    }
  }

 protected:
  // The intermediate-level loops, between PackL2 and PackRun.
  void PackL1(int start_width, int width, int start_depth, int depth) {
    for (int w = 0; w < width; w += kKernelWidth) {
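      // Note: the unary + makes a prvalue copy, avoiding odr-use of the
      // static constexpr kKernelWidth (std::min takes its arguments by
      // reference, which before C++17 would require an out-of-line
      // definition).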
      int ws = std::min(+kKernelWidth, width - w);
      packed_side_block_->seek_run(start_width + w, start_depth);
      PackRun(start_width + w, ws, start_depth, depth);
    }
  }

  // Prefetches the data that will be read by PackL1.
  void PrefetchL1(int start_width, int width, int start_depth, int depth) {
    if (SrcMapType::kOrder == SideMapOrder::WidthMajor) {
      for (int d = 0; d < depth; d += kDefaultCacheLineSize) {
        for (int w = 0; w < width; w += 1) {
          Prefetch(src_map_.data(start_width + w, start_depth + d));
        }
      }
    } else {
      for (int d = 0; d < depth; d++) {
        for (int w = 0; w < width; w += kDefaultCacheLineSize) {
          Prefetch(src_map_.data(start_width + w, start_depth + d));
        }
      }
    }
  }

  // PackRun packs a single run, i.e. it is the inner loop in the depth
  // dimension.
  void PackRun(int start_width, int width, int start_depth, int depth) {
    PackingRegisterBlockType b;
    if (width == kKernelWidth) {
      const int register_aligned_depth = RoundDown<kRegisterSize>(depth);
      if (register_aligned_depth) {
        for (int d = 0; d < register_aligned_depth; d += kRegisterSize) {
          b.UseCompleteSrcInPlace(src_map_.block(start_width, start_depth + d,
                                                 width, kRegisterSize));
          b.Pack(packed_side_block_, start_width);
        }
      }
      if (register_aligned_depth < depth) {
        b.MakeCompleteSrc(
            src_map_.block(start_width, start_depth + register_aligned_depth,
                           width, depth - register_aligned_depth));
        b.Pack(packed_side_block_, start_width);
      }
    } else {
      assert(width < kKernelWidth);
      for (int d = 0; d < depth; d += kRegisterSize) {
        const int ds = std::min(+kRegisterSize, depth - d);
        b.MakeCompleteSrc(
            src_map_.block(start_width, start_depth + d, width, ds));
        b.Pack(packed_side_block_, start_width);
      }
    }
  }

  // The PackedSideBlock being packed, i.e. the 'destination'.
  PackedSideBlock* const packed_side_block_;

  // A map on the block of the original matrix being packed,
  // i.e. the 'source'.
  const SrcMapType& src_map_;
};

// Packs a block of the input LHS matrix into a PackedSideBlock.
template <typename PackedSideBlock, typename MatrixMapType>
void PackLhs(PackedSideBlock* dst, const MatrixMapType& src) {
  ScopedProfilingLabel label("pack LHS");
  static const SideMapOrder kSideMapOrder =
      MatrixMapType::kOrder == MapOrder::RowMajor ? SideMapOrder::WidthMajor
                                                  : SideMapOrder::DepthMajor;
  typedef typename MatrixMapType::Scalar Scalar;
  typedef SideMap<Scalar, kSideMapOrder> SideMapType;
  SideMapType src_side_map(src.data(), src.rows(), src.cols(), src.stride());
  typedef PackSideBlockImpl<SideMapType, PackedSideBlock> ImplType;
  ImplType impl(dst, src_side_map);
  impl.PackL2();
}

// Packs a block of the input RHS matrix into a PackedSideBlock.
template <typename PackedSideBlock, typename MatrixMapType>
void PackRhs(PackedSideBlock* dst, const MatrixMapType& src) {
  ScopedProfilingLabel label("pack RHS");
  static const SideMapOrder kSideMapOrder =
      MatrixMapType::kOrder == MapOrder::ColMajor ? SideMapOrder::WidthMajor
                                                  : SideMapOrder::DepthMajor;
  typedef typename MatrixMapType::Scalar Scalar;
  typedef SideMap<Scalar, kSideMapOrder> SideMapType;
  SideMapType src_side_map(src.data(), src.cols(), src.rows(), src.stride());
  typedef PackSideBlockImpl<SideMapType, PackedSideBlock> ImplType;
  ImplType impl(dst, src_side_map);
  impl.PackL2();
}
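
// Illustrative usage sketch (matrix sizes and block coordinates are
// placeholders): given MatrixMaps onto the whole Lhs and Rhs, pack one
// block of each:
//
//   MatrixMap<const std::uint8_t, MapOrder::RowMajor> lhs(lhs_data, m, k);
//   MatrixMap<const std::uint8_t, MapOrder::ColMajor> rhs(rhs_data, k, n);
//   PackLhs(&packed_lhs, lhs.block(r, 0, rs, k));  // rs rows, full depth
//   PackRhs(&packed_rhs, rhs.block(0, c, k, cs));  // cs cols, full depth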

}  // namespace gemmlowp

#ifdef GEMMLOWP_NEON
#include "pack_neon.h"
#elif defined(GEMMLOWP_SSE4)
#include "pack_sse.h"
#elif defined(GEMMLOWP_AVX2)
#include "pack_avx.h"
#elif defined(GEMMLOWP_MSA)
#include "pack_msa.h"
#endif

#endif  // GEMMLOWP_INTERNAL_PACK_H_