/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */
#pragma once

/**
 * Top level include file for FBGEMM.
 */
#include <cassert>
#include <cmath>
#include <limits>
#include <memory>
#include <type_traits>
#include "./ConvUtils.h"
#include "./FbgemmBuild.h"
#include "./FbgemmEmbedding.h"
#include "./FbgemmI8DepthwiseAvx2.h"
#include "./FbgemmI8DirectconvAvx2.h"
#include "./FbgemmI8Spmdm.h"
#include "./QuantUtilsAvx2.h"
#include "./Types.h"
#include "./Utils.h"

// Turning on this option will print out the time breakdown of each stage
// (e.g., input packing, the main GEMM kernel, each output processing
// pipeline). Please note that currently this option won't report accurate
// timing if multiple threads are used.
// #define FBGEMM_MEASURE_TIME_BREAKDOWN

#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
#include <chrono>
#include <iostream>
extern double packing_time;
extern double computing_time;
extern double kernel_time;
extern double postprocessing_time;
extern double run_time;
#endif
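
// The timers above are global accumulators that FBGEMM itself updates only
// when the library is rebuilt with FBGEMM_MEASURE_TIME_BREAKDOWN defined.
// A minimal sketch of reading them from client code (an assumption about how
// a caller would use these externs; resetting and running single-threaded is
// the caller's responsibility):
//
//   #ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
//   void reportTimeBreakdown() {
//     // Zero the accumulators, run one single-threaded fbgemmPacked() call,
//     // then print the per-stage breakdown.
//     packing_time = kernel_time = postprocessing_time = run_time = 0.0;
//     // ... run the GEMM here ...
//     std::cout << "packing: " << packing_time << ", kernel: " << kernel_time
//               << ", post: " << postprocessing_time << ", total: " << run_time
//               << std::endl;
//   }
//   #endif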

namespace fbgemm {

/**
 * @brief Templatized struct for packing parameters for A and B matrices.
 *
 * @tparam T input type
 * @tparam accT the type used for accumulation
 * @tparam instSet anyarch/avx2/avx512
 * @tparam int8Type an auxiliary template parameter to specialize for 8-bit
 *                  input types.
 */
template <
    typename T,
    typename accT,
    inst_set_t instSet,
    typename int8Type = void>
struct PackingTraits;

// type specialized implementation in an include file
#include "./PackingTraits-inl.h"
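
// PackingTraits is what ties an input type, an accumulation type, and an ISA
// to concrete cache- and register-blocking constants. A small sketch of
// inspecting them at compile time (an assumption that the avx2 8-bit
// specialization in PackingTraits-inl.h exposes MCB/KCB/NCB/ROW_INTERLEAVE as
// static constants, as the BlockingFactors comment further below suggests):
//
//   using Traits8 =
//       PackingTraits<std::int8_t, std::int32_t, inst_set_t::avx2>;
//   static_assert(
//       Traits8::ROW_INTERLEAVE > 0, "8-bit packing interleaves rows");
//   constexpr int kPackedBBlockElems = Traits8::KCB * Traits8::NCB;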

/**
 * @brief Base class for packing matrices for higher GEMM performance.
 *
 * The matrix is tiled into blockRows() * blockCols() blocks.
 * Each block has size blockRowSize() * blockColSize().
 * This class is designed using CRTP
 * (https://en.wikipedia.org/wiki/Curiously_recurring_template_pattern)
 *
 * @tparam PT actual packing type, e.g., PackAWithRowOffset
 */
template <typename PT, typename inpType, typename accType = std::int32_t>
class PackMatrix {
 public:
  PackMatrix() = delete; // no default constructor

  /**
   * @param rows total number of rows in the matrix
   *             (packed rows can be less than rows).
   * @param cols total number of columns in the matrix
   * @param pmat A buffer to contain the packed matrix.
   *             If nullptr, a buffer owned by PackMatrix will be allocated
   *             internally to contain the packed matrix.
   *             For non-constant matrices like activation matrices, the
   *             client code may want to pass a pre-allocated pmat to avoid
   *             the overhead of internal memory allocation every time a
   *             PackMatrix is constructed. The client code can query how big
   *             pmat should be with the packedBufferSize function.
   * @param groups when groups > 1, we compute `groups` independent GEMMs,
   *               each multiplying an A.rows x (A.cols/A.groups) matrix by a
   *               (B.rows/B.groups) x B.cols matrix (in conventional BLAS
   *               terminology this is a batched GEMM, but we use the name
   *               group to follow deep learning terminology). The result
   *               matrix has dimension A.rows x (B.cols*B.groups).
   *               A.groups must be the same as B.groups, A.groups must divide
   *               A.cols, and B.groups must divide B.rows and C.cols.
   */
  PackMatrix(
      std::int32_t rows,
      std::int32_t cols,
      inpType* pmat,
      int groups = 1,
      const BlockingFactors* params = nullptr);

  /**
   * @return true usually when the matrix is a constant matrix (e.g., weight
   *         matrices) that can be prepacked
   */
  bool isPrePacked() const {
    return static_cast<const PT*>(this)->isPrePacked();
  }

  /**
   * @return true if this is the first input matrix in GEMM (i.e., A in
   *         C = A * B)
   */
  static constexpr bool isA() {
    return PT::isA();
  }

  /**
   * @brief The size of the buffer used for packing (the size is in number of
   *        elements).
   *
   * rows and cols are only used for full packing, i.e., for the B matrix. The
   * client code can use this function to query how big the buffer used for
   * packing should be.
   */
  static int packedBufferSize(
      int rows = 0,
      int cols = 0,
      const BlockingFactors* params = nullptr);

  /**
   * @return Pointer to a buffer containing row offset results. Some packing
   *         objects fuse row offset computation for the later requantization
   *         step.
   */
  std::int32_t* getRowOffsetBuffer() const {
    return static_cast<const PT*>(this)->getRowOffsetBuffer();
  }

  /**
   * @brief When the k loop is also tiled/blocked, this function is used to
   *        check if we have executed computations for the last k block so
   *        that we can perform post-GEMM operations.
   */
  bool isThisLastKBlock(int block_id) const {
    return static_cast<const PT*>(this)->isThisLastKBlock(block_id);
  }

  /**
   * @brief Actual packing of a block of the source matrix in pmat buffer.
   */
  void pack(const block_type_t& block) {
    static_cast<PT*>(this)->pack(block);
  }

  std::int32_t numRows() const {
    return nrows_;
  }

  std::int32_t numCols() const {
    return ncols_;
  }

  /**
   * @return The number of rows in each block
   */
  std::int32_t blockRowSize() const {
    return brow_;
  }

  /**
   * @return The number of columns in each block
   */
  std::int32_t blockColSize() const {
    return bcol_;
  }

  /**
   * @return The number of blocks along rows
   */
  std::int32_t blockRows() const {
    return nbrow_;
  }

  /**
   * @return The number of blocks along columns
   */
  std::int32_t blockCols() const {
    return nbcol_;
  }

  /**
   * @return The number of rows in the currently packed block of a matrix.
   *         For a pre-packed (i.e., fully-packed) matrix, this equals the
   *         total number of rows.
   */
  std::int32_t numPackedRows() const {
    return packedBlock_.row_size;
  }

  /**
   * @return The number of columns in the currently packed block of a matrix.
   *         For a pre-packed (i.e., fully-packed) matrix, this equals the
   *         total number of columns.
   */
  std::int32_t numPackedCols() const {
    return packedBlock_.col_size;
  }

  /**
   * @return The first row of the block we're working on.
   */
  std::int32_t packedRowStart() const {
    return packedBlock_.row_start;
  }

  /**
   * @return The first column of the block we're working on.
   */
  std::int32_t packedColStart() const {
    return packedBlock_.col_start;
  }

  /**
   * @return The beginning of (rowBlockNum, colBlockNum)th block
   */
  inpType* getBuf(std::int32_t rowBlockNum = 0, std::int32_t colBlockNum = 0) {
    return buf_ + blockRowSize() * blockColSize() * rowBlockNum +
        blockRowSize() * blockColSize() * blockCols() * colBlockNum;
  }

  /**
   * @brief Print the packed block.
   */
  void printPackedMatrix(std::string name) {
    static_cast<PT*>(this)->printPackedMatrix(name);
  }

  /**
   * @return The number of rows in the last row block.
   */
  std::int32_t lastBrow() const {
    return last_brow_;
  }

  /**
   * @return The number of columns in the last column block.
   */
  std::int32_t lastBcol() const {
    return last_bcol_;
  }

  int numGroups() const {
    return G_;
  }

  /**
   * @return True if the last column block has fewer columns than the block
   *         size.
   */
  bool isThereColRemainder() const {
    return last_bcol_ != blockColSize();
  }

  virtual ~PackMatrix() {
    if (bufAllocatedHere_) {
      fbgemmAlignedFree(buf_);
    }
  }

 protected:
  /**
   * Set which block we're packing
   */
  void packedBlock(const block_type_t& block) {
    packedBlock_ = block;
    nbrow_ = (numPackedRows() + blockRowSize() - 1) / blockRowSize();
    nbcol_ = (numPackedCols() + blockColSize() - 1) / blockColSize();

    last_brow_ = ((numPackedRows() % blockRowSize()) == 0)
        ? blockRowSize()
        : (numPackedRows() % blockRowSize());
    last_bcol_ = ((numPackedCols() % blockColSize()) == 0)
        ? blockColSize()
        : (numPackedCols() % blockColSize());
  }

  inpType* buf_;
  std::int32_t brow_; ///< the number of rows in each block
  std::int32_t bcol_; ///< the number of columns in each block
  std::int32_t nbrow_; ///< the number of blocks along rows
  std::int32_t nbcol_; ///< the number of blocks along columns
  bool bufAllocatedHere_;
  const BlockingFactors*
      blocking_params; ///< MCB, KCB, NCB, MR, NR, NR_MIN, ROW_INTERLEAVE;

 private:
  std::int32_t nrows_, ncols_;
  int G_;
  block_type_t packedBlock_; ///< The block in the source matrix just packed
  std::int32_t last_brow_, last_bcol_;
};
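
// For activation matrices the packing buffer can be reused across calls. A
// minimal sketch of pre-allocating it with packedBufferSize() and handing it
// to a packing class declared further below (illustrative only; it assumes
// fbgemmAlignedAlloc from Utils.h with (alignment, size-in-bytes) arguments
// and a 64-byte alignment choice):
//
//   // Query once; rows/cols only matter for full (B-matrix) packing.
//   int buf_elems = PackAWithRowOffset<std::uint8_t>::packedBufferSize();
//   auto* pmat = static_cast<std::uint8_t*>(
//       fbgemmAlignedAlloc(64, buf_elems * sizeof(std::uint8_t)));
//   // ... construct PackAWithRowOffset with this pmat on every iteration ...
//   fbgemmAlignedFree(pmat);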

/**
 * @brief Matrix packed for the first input matrix in GEMM (usually
 *        activation). The source matrix is already quantized. Default
 *        accumulation type is int32.
 */
template <typename T, typename accT = std::int32_t>
class FBGEMM_API PackAMatrix final
    : public PackMatrix<PackAMatrix<T, accT>, T, accT> {
 public:
  using This = PackAMatrix<T, accT>;
  using BaseType = PackMatrix<This, T, accT>;
  using inpType = T;
  using accType = accT;

  PackAMatrix() = delete; // no default constructor

  PackAMatrix(
      matrix_op_t trans,
      std::int32_t nRow,
      std::int32_t nCol,
      const inpType* smat,
      std::int32_t ld,
      inpType* pmat = nullptr,
      int groups = 1,
      const BlockingFactors* params = nullptr);

  /**
   * Activation matrices are not constant, so we cannot amortize the cost of
   * pre-packing.
   */
  bool isPrePacked() const {
    return false;
  }

  /**
   * @return True if this is used as A matrix.
   */
  static constexpr bool isA() {
    return true;
  }

  /**
   * @return A pointer to the row offset buffer. This packing class does not
   *         compute row offsets, hence it returns nullptr.
   */
  std::int32_t* getRowOffsetBuffer() const {
    return nullptr;
  }

  /**
   * @return Offset of the element in the packed matrix that was at (i, j) in
   *         the source matrix.
   */
  std::int32_t addr(std::int32_t i, std::int32_t j) const;

  /**
   * @brief Packs a block of source matrix into pmat buffer.
   */
  void pack(const block_type_t& block);

  /**
   * @brief Print the packed block.
   */
  void printPackedMatrix(std::string name);

 private:
  matrix_op_t trans_;
  const T* smat_;
  std::int32_t ld_;
  std::int32_t row_interleave_B_;
};

/**
 * @brief Matrix packed for the second input matrix in GEMM (usually weight).
 *        The source matrix is already quantized. Default accumulation
 *        type is int32.
 */
template <typename T, typename accT = std::int32_t>
class FBGEMM_API PackBMatrix final
    : public PackMatrix<PackBMatrix<T, accT>, T, accT> {
 public:
  using This = PackBMatrix<T, accT>;
  using BaseType = PackMatrix<This, T, accT>;
  using inpType = T;
  using accType = accT;

  PackBMatrix() = delete; // no default constructor

  /**
   * @param groups if > 1 and trans == NoTranspose, smat is nRow x nCol with
   *               the groups vertically concatenated: each group is
   *               (nRow / groups) x nCol .
   *               if > 1 and trans == Transpose, smat is (nCol * groups) x
   *               (nRow / groups) with the groups horizontally concatenated:
   *               each group is nCol x (nRow / groups) . Each group is
   *               transposed and vertically concatenated to match with the
   *               NoTranspose case.
   */
  PackBMatrix(
      matrix_op_t trans,
      std::int32_t nRow,
      std::int32_t nCol,
      const inpType* smat,
      std::int32_t ld,
      inpType* pmat = nullptr,
      int groups = 1,
      const BlockingFactors* params = nullptr);

  /**
   * Weight matrices are usually constant, so they are worth pre-packing.
   */
  bool isPrePacked() const {
    return true;
  }

  /**
   * @return True if to be used as A matrix, False otherwise.
   */
  static constexpr bool isA() {
    return false;
  }

  /**
   * @brief When the k loop is also tiled/blocked, this function is used to
   *        check if we have executed computations for the last k block so
   *        that we can perform post-GEMM operations.
   */
  bool isThisLastKBlock(int block_id) const {
    return (BaseType::blockRows() - 1) == block_id;
  }

  /**
   * @return Offset of the element in the packed matrix that was at (i, j) in
   *         the source matrix.
   */
  std::int32_t addr(std::int32_t i, std::int32_t j) const;

  /**
   * @brief Packs a block of source matrix into pmat buffer. The blocking
   *        parameters are needed to compute the buffer size of each group.
   *        It will use default blocking parameters if params is not provided.
   */
  void pack(const block_type_t& block, const BlockingFactors* params = nullptr);

  /**
   * @brief Print the packed block.
   */
  void printPackedMatrix(
      std::string name,
      const BlockingFactors* params = nullptr);

  /**
   * @return true if meta information like matrix shape is the same.
   */
  bool metaEquals(const PackBMatrix<T, accT>& that) const;
  /**
   * @return true if matrices are the same.
   */
  bool equals(const PackBMatrix<T, accT>& that) const;

  /**
   * @brief Unpack pmat buffer to the origin_buf (used by serialization to
   *        recover the weight matrix).
   */
  void unpack(T* origin_buf, const BlockingFactors* params = nullptr);

  ~PackBMatrix() {}

 private:
  matrix_op_t trans_;
  const T* smat_;
  std::int32_t ld_;
  std::int32_t row_interleave_;

  /**
   * @brief Internal function performing both pack & unpack
   */
  void pack_unpack_(
      const block_type_t& block,
      T* unpack_buf,
      T* pack_buf,
      bool ispack,
      const BlockingFactors* params = nullptr);
};
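
// Weights are typically packed once, up front, and reused for every GEMM
// call. A minimal sketch of prepacking a quantized K x N weight matrix
// (illustrative only; K, N, and Bint8 are placeholders supplied by the
// caller):
//
//   // B is stored row-major as K x N, so the leading dimension is N.
//   fbgemm::PackBMatrix<std::int8_t> packedB(
//       fbgemm::matrix_op_t::NoTranspose,
//       /*nRow=*/K,
//       /*nCol=*/N,
//       /*smat=*/Bint8,
//       /*ld=*/N);
//   // packedB owns its buffer (pmat == nullptr) and can now be reused
//   // across many fbgemmPacked() calls and threads.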

/**
 * @brief Matrix packed for direct group convolution.
 *        The source matrix is already quantized. Default accumulation
 *        type is int32.
 */
template <typename T, typename accT = std::int32_t, int SPATIAL_DIM = 2>
class FBGEMM_API PackWeightMatrixForGConv {
 public:
  using This = PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>;
  using inpType = T;
  using accType = accT;

  PackWeightMatrixForGConv() = delete; // no default constructor

  /**
   * @param pmat if nullptr, a buffer is allocated and owned by this class.
   */
  PackWeightMatrixForGConv(
      matrix_op_t trans,
      const conv_param_t<SPATIAL_DIM>& conv_param,
      const inpType* sdata,
      inpType* pdata = nullptr);

  /**
   * Number of groups we work on at a time to fill the full SIMD width,
   * e.g., with IC_PER_G = 4 and OC_PER_G = 4 we work on two groups at a time
   * to fill the AVX2 width of 256 bits.
   */
  static int numOfGroupsTogether(const conv_param_t<SPATIAL_DIM>& conv_param);

  /**
   * @brief Packs a block of source matrix into pmat buffer.
   */
  void pack();

  /**
   * @brief Unpacks a pmat buffer into source matrix.
   */
  void unpack(T* origin_buf);

  /**
   * @brief Return packed data
   */
  inpType* getBuf() {
    return pdata_;
  }

  ~PackWeightMatrixForGConv() {
    if (bufAllocatedHere_) {
      fbgemmAlignedFree(pdata_);
    }
  }

 private:
  matrix_op_t trans_;
  const conv_param_t<SPATIAL_DIM> conv_param_;
  const T* sdata_;
  T* pdata_;
  bool bufAllocatedHere_;
  // Number of groups we work on at a time to fill the full SIMD width
  int GTogether_;

  /**
   * @brief Internal function performing both pack & unpack
   */
  void pack_unpack_(const T* src, T* dst, bool ispack);

  /**
   * @brief Get the index of the unpacked data
   */
  int unpacked_index_(int t, int r, int s, int k, int g, int c, bool tr);

  /**
   * @brief Get the index of the packed data
   */
  int packed_index_(int t, int r, int s, int k, int g, int c);
};

/**
 * @brief A container class to keep packed weight tensor for convolution.
 *        The source tensor should already be quantized.
 *
 * @tparam SPATIAL_DIM is equal to 2 for 2D convolutions and 3 for 3D
 *                     convolutions. Default value is 2.
 * @tparam T is the datatype for source tensor. Default value is int8.
 * @tparam accT is the datatype to accumulate into. Default value is int32.
 */
template <
    int SPATIAL_DIM = 2,
    typename T = std::int8_t,
    typename accT = std::int32_t>
class FBGEMM_API PackWeightsForConv {
 public:
  using This = PackWeightsForConv<SPATIAL_DIM, T, accT>;
  using inpType = T;
  using accType = accT;

  PackWeightsForConv() = delete; // no default constructor

  PackWeightsForConv(
      const conv_param_t<SPATIAL_DIM>& conv_param,
      const inpType* sdata,
      const BlockingFactors* blocking_params = nullptr);

  std::shared_ptr<PackBMatrix<T, accT>> getPackedWForIm2col() {
    return W_im2col_packed_;
  }

  std::shared_ptr<PackedDepthWiseConvMatrix> getPackedWForDepthwise() {
    return W_dw_packed_;
  }

  std::shared_ptr<PackedDirectConvMatrix> getPackedWForDirectconv() {
    return W_dc_packed_;
  }

  std::shared_ptr<PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>>
  getPackedWForGroupwise() {
    return W_gconv_packed_;
  }

  std::shared_ptr<PackBMatrix<T, accT>> getPackedWForPointwise() {
    return W_pointwise_packed_;
  }

  int inputChannels() {
    return conv_param_.IC;
  }

  int outputChannels() {
    return conv_param_.OC;
  }

  std::array<int, SPATIAL_DIM> kernelDims() {
    return conv_param_.K;
  }

  int groups() {
    return conv_param_.G;
  }

  /**
   * @brief Returns true if the packed weights would work for the given
   *        convolution parameters, and false otherwise
   */
  bool isPackingCompliant(const conv_param_t<SPATIAL_DIM>& conv_p);

  /**
   * @brief Returns a string of mismatching parameters
   */
  std::string mismatchingParams(const conv_param_t<SPATIAL_DIM>& conv_p);

  /**
   * @brief Unpack the packed matrix into origin_buf (used by serialization to
   *        recover the weight matrix).
   */
  void unpack(T* origin_buf);

 private:
  const conv_param_t<SPATIAL_DIM> conv_param_;
  // Packed weights if we use im2col based convolution implementation
  std::shared_ptr<PackBMatrix<T, accT>> W_im2col_packed_;
  // Packed weights if we use depthwise convolution implementation
  std::shared_ptr<PackedDepthWiseConvMatrix> W_dw_packed_;
  // Packed weights if we use direct convolution implementation
  std::shared_ptr<PackedDirectConvMatrix> W_dc_packed_;
  // Packed weights if we use groupwise (small channels per group) convolution
  // implementation
  std::shared_ptr<PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>>
      W_gconv_packed_;
  // Packed weights if we use direct gemm for pointwise convolution
  std::shared_ptr<PackBMatrix<T, accT>> W_pointwise_packed_;
};
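
// PackWeightsForConv picks the internal layout (depthwise, groupwise, direct,
// pointwise, or im2col/GEMM) from the convolution parameters, so the packed
// object is only valid for shapes that map to the same path. A minimal sketch
// of packing weights once and validating them against the shape used at run
// time (illustrative only; conv_p, runtime_conv_p, and Wint8 are
// placeholders; conv_param_t construction itself is sketched with fbgemmConv
// further below):
//
//   fbgemm::PackWeightsForConv<2> packedW(conv_p, Wint8);
//   if (!packedW.isPackingCompliant(runtime_conv_p)) {
//     throw std::runtime_error(
//         "packed weights do not match: " +
//         packedW.mismatchingParams(runtime_conv_p));
//   }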

/**
 * @brief Matrix packed for the first input matrix in GEMM (usually
 *        activation), where the row offsets used for requantization are
 *        computed during packing. Im2col is fused with packing here. The
 *        source matrix is already quantized.
 */
template <typename T, typename accT = std::int32_t, int SPATIAL_DIM = 2>
class FBGEMM_API PackAWithIm2Col
    : public PackMatrix<PackAWithIm2Col<T, accT, SPATIAL_DIM>, T, accT> {
 public:
  using This = PackAWithIm2Col<T, accT, SPATIAL_DIM>;
  using BaseType = PackMatrix<This, T, accT>;
  using inpType = T;
  using accType = accT;

  PackAWithIm2Col() = delete; // no default constructor
  /**
   * @param a_zero_pt the quantized value that maps to the floating-point
   *                  number 0.0f.
   * @param row_offset If nullptr, this constructor internally allocates a
   *                   buffer and owns it. Otherwise, this class doesn't own
   *                   the buffer. The buffer will be populated when the pack
   *                   function is called.
   * @param b_symmetric if true we skip row offset computation
   */
  PackAWithIm2Col(
      const conv_param_t<SPATIAL_DIM>& conv_param,
      const T* sdata,
      inpType* pmat = nullptr,
      std::int32_t a_zero_pt = 0,
      std::int32_t* row_offset = nullptr,
      bool b_symmetric = false,
      const BlockingFactors* params = nullptr);

  /**
   * Activation matrices are not constant, so we cannot amortize the cost of
   * pre-packing.
   */
  bool isPrePacked() const {
    return false;
  }

  /**
   * @return True if this is used as A matrix.
   */
  static constexpr bool isA() {
    return true;
  }

  /**
   * @brief Packs a block of source matrix into pmat buffer.
   */
  void pack(const block_type_t& block);

  /**
   * @return A pointer to the row offset buffer.
   */
  std::int32_t* getRowOffsetBuffer() const {
    return row_offset_;
  }

  /**
   * @brief Print the packed block.
   */
  void printPackedMatrix(std::string name);

  /**
   * @return Size of row offset buffer in number of elements
   */
  static int rowOffsetBufferSize(const BlockingFactors* params = nullptr);

  ~PackAWithIm2Col() {
    if (rowOffsetAllocatedHere) {
      fbgemmAlignedFree(row_offset_);
    }
  }

 private:
  const conv_param_t<SPATIAL_DIM> conv_p_;
  const T* sdata_;
  std::int32_t a_zero_pt_;
  std::int32_t* row_offset_{nullptr};
  bool rowOffsetAllocatedHere{false};
  std::int32_t row_interleave_B_;
};
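
// With PackAWithIm2Col a convolution can be lowered onto the regular
// fbgemmPacked() GEMM path: the activation tensor is im2col-ed and packed in
// one pass, and its row offsets are produced as a byproduct. A minimal sketch
// (illustrative only; conv_p, Aint8, and a_zero_point are placeholders, and
// the weight side would be a PackBMatrix over the corresponding im2col weight
// layout):
//
//   fbgemm::PackAWithIm2Col<std::uint8_t, std::int32_t, 2> packA(
//       conv_p,
//       Aint8,
//       /*pmat=*/nullptr,       // let the object allocate its pack buffer
//       /*a_zero_pt=*/a_zero_point,
//       /*row_offset=*/nullptr, // let the object allocate row offsets too
//       /*b_symmetric=*/false);
//   // packA.getRowOffsetBuffer() is later handed to ReQuantizeOutput.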

/**
 * @brief Matrix packed for the first input matrix in GEMM (usually
 *        activation), where the row offsets used for requantization are
 *        computed during packing. The source matrix is already quantized.
 */
template <typename T, typename accT = std::int32_t>
class FBGEMM_API PackAWithRowOffset final
    : public PackMatrix<PackAWithRowOffset<T, accT>, T, accT> {
 public:
  using This = PackAWithRowOffset<T, accT>;
  using BaseType = PackMatrix<This, T, accT>;
  using inpType = T;
  using accType = accT;

  PackAWithRowOffset() = delete; // no default constructor
  /**
   * @param row_offset If nullptr, this constructor internally allocates a
   *                   buffer and owns it. Otherwise, this class doesn't own
   *                   the buffer. The buffer will be populated when the pack
   *                   function is called.
   */
  PackAWithRowOffset(
      matrix_op_t trans,
      std::uint32_t nRow,
      std::uint32_t nCol,
      const T* smat,
      std::uint32_t ld,
      inpType* pmat = nullptr,
      int groups = 1,
      std::int32_t* row_offset = nullptr,
      const BlockingFactors* params = nullptr);

  /**
   * Activation matrices are not constant, so we cannot amortize the cost of
   * pre-packing.
   */
  bool isPrePacked() const {
    return false;
  }

  /**
   * @return True if this is used as A matrix.
   */
  static constexpr bool isA() {
    return true;
  }

  /**
   * @return Offset of the element in the packed matrix that was at (i, j) in
   *         the source matrix
   */
  std::int32_t addr(std::int32_t i, std::int32_t j) const;

  /**
   * @brief Packs a block of source matrix into pmat buffer.
   */
  void pack(const block_type_t& block);

  /**
   * @return A pointer to the row offset buffer.
   */
  std::int32_t* getRowOffsetBuffer() const {
    return row_offset_;
  }

  /**
   * @brief Print the packed block.
   */
  void printPackedMatrix(std::string name);

  /**
   * @return size of row offset buffer in number of elements
   */
  static int rowOffsetBufferSize(const BlockingFactors* params = nullptr);

  ~PackAWithRowOffset() {
    if (rowOffsetAllocatedHere) {
      fbgemmAlignedFree(row_offset_);
    }
  }

 private:
  matrix_op_t trans_;
  const T* smat_;
  std::uint32_t ld_;
  std::int32_t* row_offset_;
  bool rowOffsetAllocatedHere;
  std::int32_t row_interleave_B_;
};
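
// When the caller wants to own the row offset buffer (e.g., to reuse one
// allocation per thread), it must be sized with rowOffsetBufferSize(). A
// minimal sketch (illustrative only; M, K, and Auint8 are placeholders
// supplied by the caller):
//
//   std::vector<std::int32_t> row_offsets(
//       fbgemm::PackAWithRowOffset<std::uint8_t>::rowOffsetBufferSize());
//   fbgemm::PackAWithRowOffset<std::uint8_t> packA(
//       fbgemm::matrix_op_t::NoTranspose,
//       /*nRow=*/M,
//       /*nCol=*/K,
//       /*smat=*/Auint8,
//       /*ld=*/K,
//       /*pmat=*/nullptr,
//       /*groups=*/1,
//       /*row_offset=*/row_offsets.data());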

/**
 * @brief Matrix packed for the first input matrix in GEMM (usually
 *        activation), where the row offsets used for requantization are
 *        computed during packing. The source matrix is in fp32 and is
 *        quantized during packing.
 */
template <typename T, typename accT = std::int32_t>
class FBGEMM_API PackAWithQuantRowOffset final
    : public PackMatrix<PackAWithQuantRowOffset<T, accT>, T, accT> {
 public:
  using This = PackAWithQuantRowOffset<T, accT>;
  using BaseType = PackMatrix<This, T, accT>;
  using inpType = T;
  using accType = accT;

  PackAWithQuantRowOffset() = delete; // no default constructor
  /**
   * @param row_offset If nullptr, this constructor internally allocates a
   *                   buffer and owns it. Otherwise, this class doesn't own
   *                   the buffer. The buffer will be populated when the pack
   *                   function is called.
   */
  PackAWithQuantRowOffset(
      matrix_op_t trans,
      std::int32_t nRow,
      std::int32_t nCol,
      const float* smat,
      std::int32_t ld,
      inpType* pmat = nullptr,
      float scale = 1.0f,
      std::int32_t zero_pt = 0,
      int groups = 1,
      std::int32_t* row_offset = nullptr,
      const BlockingFactors* params = nullptr);

  /**
   * Activation matrices are not constant, so we cannot amortize the cost of
   * pre-packing.
   */
  bool isPrePacked() const {
    return false;
  }

  /**
   * @return True if this is used as A matrix.
   */
  static constexpr bool isA() {
    return true;
  }

  /**
   * @return offset of the element in the packed matrix that was at (i, j) in
   *         the source matrix
   */
  std::int32_t addr(std::int32_t i, std::int32_t j) const;

  /**
   * @brief Packs a block of source matrix into pmat buffer.
   */
  void pack(const block_type_t& block);

  /**
   * @return A pointer to the row offset buffer.
   */
  std::int32_t* getRowOffsetBuffer() const {
    return row_offset_;
  }

  /**
   * @brief Print the packed block.
   */
  void printPackedMatrix(std::string name);

  /**
   * @return Size of row offset buffer in number of elements
   */
  static int rowOffsetBufferSize(const BlockingFactors* params = nullptr);

  ~PackAWithQuantRowOffset() {
    if (rowOffsetAllocatedHere) {
      fbgemmAlignedFree(row_offset_);
    }
  }

 private:
  matrix_op_t trans_;
  const float* smat_;
  std::int32_t ld_;
  float scale_;
  std::int32_t zero_pt_;
  std::int32_t* row_offset_;
  bool rowOffsetAllocatedHere;
  std::int32_t row_interleave_B_;
};

/*
 *
 * Post Processing of outputs
 *
 */

/**
 * @brief Does nothing. NoOp. Used as the last operation in the output
 *        processing pipeline.
 *
 */
template <typename outT = std::uint8_t, typename inT = std::uint8_t>
class FBGEMM_API DoNothing {
 public:
  using outType = outT;
  using inpType = inT;
  DoNothing() {}
  template <inst_set_t instSet>
  int f(
      outType* /* unused */,
      inpType* /* unused */,
      const block_type_t& /* unused */,
      int /* unused */,
      int /* unused */) const {
    return 0;
  }
};

/**
 * @brief Copy data pointed by inp ptr to out ptr when
 *        inp ptr and out ptr are not the same.
 *        inp buffer: row and column start points: (0, 0)
 *        output buffer: row and column start points:
 *        (block.row_start, block.col_start)
 *
 * This is the output processing stage that should be passed when there is no
 * requantization and the output is required in the same format as the
 * internal buffer used for accumulation.
 */
template <
    typename outT = std::int32_t,
    typename inT = std::int32_t,
    typename nextOPType = DoNothing<outT, outT>>
class FBGEMM_API memCopy {
 public:
  using outType = outT;
  using inpType = inT;
  explicit memCopy(nextOPType& nextop) : nextop_(nextop) {}
  template <inst_set_t instSet>
  inline int f(
      outType* out,
      inpType* inp,
      const block_type_t& block,
      int ld_out,
      int ld_in) const;

 private:
  nextOPType& nextop_;
};

/**
 * @brief Perform scaling on accumulated data.
 */
template <
    typename outT = std::int32_t,
    typename inT = std::int32_t,
    typename nextOPType = DoNothing<outT, outT>>
class ScaleOP {
 public:
  using outType = outT;
  using inpType = inT;
  explicit ScaleOP(inpType scalingFactor) : scalingFactor_(scalingFactor) {}

  template <inst_set_t instSet>
  inline int f(
      outType* out,
      inpType* inp,
      const block_type_t& block,
      int ld_out,
      int ld_in) const;

 private:
  inpType scalingFactor_;
};

/**
 * @brief Perform Relu on accumulated data.
 */
template <
    typename outT = std::int32_t,
    typename inT = std::int32_t,
    typename nextOPType = DoNothing<outT, outT>>
class ReluOutput {
 public:
  using outType = outT;
  using inpType = inT;
  explicit ReluOutput(inpType zero_pt) : zero_pt_(zero_pt) {}

  template <inst_set_t instSet>
  inline int f(
      outType* out,
      inpType* inp,
      const block_type_t& block,
      int ld_out,
      int ld_in) const;

 private:
  inpType zero_pt_;
};

/**
 * @brief Perform Dense-Matrix * Sparse-Matrix as part of the output
 *        processing pipeline.
 *
 * SPMDM (SParse Matrix times Dense Matrix) is performed in place on the
 * 32-bit input buffer (inp). After modifying the input buffer, pass it to the
 * next op. When groups > 1, each group is a numRows() x (numCols()/groups)
 * matrix.
 */
template <
    typename outT = std::int32_t,
    typename inT = std::int32_t,
    typename nextOPType = DoNothing<inT, inT>>
class FBGEMM_API DoSpmdmOnInpBuffer {
 public:
  using outType = outT;
  using inpType = inT;
  DoSpmdmOnInpBuffer(
      nextOPType& nextop,
      const std::uint8_t* A,
      int lda,
      const CompressedSparseColumn& B_csc,
      int groups = 1)
      : nextop_(nextop), A_(A), lda_(lda), B_csc_(B_csc), groups_(groups) {}

  template <inst_set_t instSet>
  inline int f(
      outT* out,
      inT* inp,
      const block_type_t& block,
      int ld_out,
      int ld_in) const;

 private:
  nextOPType& nextop_;
  const std::uint8_t* A_;
  const int lda_;
  const CompressedSparseColumn& B_csc_;
  const int groups_;
};

/**
 * @brief Perform Dense-Matrix * Sparse-Matrix as part of the output
 *        processing pipeline.
 *
 * SPMDM (SParse Matrix times Dense Matrix) is performed in place on the
 * 32-bit input buffer (inp). After modifying the input buffer, pass it to the
 * next op. When groups > 1, each group is a numRows() x (numCols()/groups)
 * matrix.
 */
template <
    typename outT = std::int32_t,
    typename inT = std::int32_t,
    typename nextOPType = DoNothing<inT, inT>>
class FBGEMM_API DoSConvOnInpBuffer {
 public:
  using outType = outT;
  using inpType = inT;
  DoSConvOnInpBuffer(
      nextOPType& nextop,
      const std::uint8_t* A,
      const conv_param_t<>& conv_p,
      std::int32_t A_zero_point,
      const CompressedSparseColumn& B_csc)
      : nextop_(nextop),
        A_(A),
        conv_p_(conv_p),
        A_zero_point_(A_zero_point),
        B_csc_(B_csc) {}

  template <inst_set_t instSet>
  inline int f(
      outT* out,
      inT* inp,
      const block_type_t& block,
      int ld_out,
      int ld_in) const;

 private:
  nextOPType& nextop_;
  const std::uint8_t* A_;
  const conv_param_t<> conv_p_;
  const std::int32_t A_zero_point_;
  const CompressedSparseColumn& B_csc_;
};

/**
 * @brief Requantize values in the inp buffer and write the result to the out
 *        buffer, then pass the out buffer to the next op for further
 *        processing.
 */
template <
    bool FUSE_RELU,
    QuantizationGranularity Q_GRAN = QuantizationGranularity::TENSOR,
    typename BIAS_TYPE = std::int32_t,
    typename outT = std::uint8_t,
    typename inT = std::int32_t,
    typename nextOPType = DoNothing<outT, outT>>
class FBGEMM_API ReQuantizeOutput {
 public:
  static constexpr int RELU_FUSED = FUSE_RELU;
  static constexpr QuantizationGranularity QGRANType = Q_GRAN;
  using BIAS_T = BIAS_TYPE;
  using outType = outT;
  using inpType = inT;
  /**
   * @param C_multiplier The length of this array is
   *                     1 when Q_GRAN == QuantizationGranularity::TENSOR,
   *                     groups when Q_GRAN == QuantizationGranularity::GROUP,
   *                     nCol if Q_GRAN == QuantizationGranularity::OUT_CHANNEL
   * @param Bq_zero_point The length of this array should be the same as
   *                      C_multiplier.
   * @param row_offsets Typically, this should've been computed by a
   *                    PackAMatrix and should be obtained by
   *                    PackMatrix::getRowOffsetBuffer().
   *                    If Bq_zero_point == 0 (symmetric quantization of the B
   *                    matrix), we can pass nullptr.
   * @param col_offsets This should be pre-computed for example using
   *                    col_offsets_with_zero_pt_s8acc32_ref.
   *                    The length should be nCol.
   *                    See PackedRequantizeTest.cc for an example.
   *                    TODO: if Aq_zero_point == 0, allow passing nullptr.
   * @param bias can be nullptr otherwise the length should be nCol
   * @param act_times_w_scale activation_scale * weight_scale. This is only
   *                          used if bias is unquantized (i.e., float).
   */
  ReQuantizeOutput(
      nextOPType& nextop,
      const float* C_multiplier,
      std::int32_t C_zero_point,
      std::int32_t Aq_zero_point,
      const std::int32_t* Bq_zero_point,
      const std::int32_t* row_offsets,
      const std::int32_t* col_offsets,
      const BIAS_T* bias,
      std::uint32_t nCol,
      int groups = 1,
      const float* act_times_w_scale = nullptr)
      : nextop_(nextop),
        C_multiplier_(C_multiplier),
        C_zero_point_(C_zero_point),
        Aq_zero_point_(Aq_zero_point),
        Bq_zero_point_(Bq_zero_point),
        q_row_offsets_(row_offsets),
        q_col_offsets_(col_offsets),
        bias_(bias),
        ncols_(nCol),
        groups_(groups),
        act_times_w_scale_(act_times_w_scale) {}

  template <inst_set_t instSet>
  inline int f(
      outT* out,
      const inT* inp,
      const block_type_t& block,
      int ld_out,
      int ld_in) const;

  const float* getCMultiplier() const {
    return C_multiplier_;
  }
  std::int32_t getAZeroPoint() const {
    return Aq_zero_point_;
  }
  std::int32_t getCZeroPoint() const {
    return C_zero_point_;
  }
  const std::int32_t* getBZeroPoint() const {
    return Bq_zero_point_;
  }
  const std::int32_t* getRowOffsets() const {
    return q_row_offsets_;
  }
  const std::int32_t* getColOffsets() const {
    return q_col_offsets_;
  }
  const BIAS_T* getBias() const {
    return bias_;
  }
  std::uint32_t getNCols() const {
    return ncols_;
  }
  const float* getActWScale() const {
    return act_times_w_scale_;
  }

  void setRowOffsets(const std::int32_t* row_offsets) {
    q_row_offsets_ = row_offsets;
  }

 private:
  nextOPType& nextop_;
  const float* C_multiplier_;
  std::int32_t C_zero_point_;
  std::int32_t Aq_zero_point_;
  const std::int32_t* Bq_zero_point_;
  const std::int32_t* q_row_offsets_;
  const std::int32_t* q_col_offsets_;
  const BIAS_T* bias_;
  std::uint32_t ncols_;
  int groups_;
  const float* act_times_w_scale_;
};
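
// With per-tensor granularity the requantization multiplier is the usual
// C_multiplier = Aq_scale * Bq_scale / Cq_scale, and the output pipeline is
// built by chaining a terminal DoNothing into ReQuantizeOutput. A minimal
// sketch with no fused ReLU (illustrative only; the scales, zero points,
// col_offsets, N, and packA are placeholders supplied by the caller):
//
//   fbgemm::DoNothing<> doNothing{};
//   float C_multiplier = Aq_scale * Bq_scale / Cq_scale;
//   fbgemm::ReQuantizeOutput<false /* FUSE_RELU */> outputProc(
//       doNothing,
//       &C_multiplier,
//       Cq_zero_point,
//       Aq_zero_point,
//       &Bq_zero_point,
//       packA.getRowOffsetBuffer(),
//       col_offsets.data(), // length N, e.g. from
//                           // col_offsets_with_zero_pt_s8acc32_ref
//       /*bias=*/nullptr,
//       /*nCol=*/N);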

/**
 * @brief Requantize the accumulated int32 values in the inp buffer back to
 *        float, i.e., the output is produced and consumed as float.
 */
template <
    bool FUSE_RELU,
    QuantizationGranularity Q_GRAN = QuantizationGranularity::TENSOR,
    typename outT = float,
    typename inT = std::int32_t,
    typename nextOPType = DoNothing<outT, outT>>
class FBGEMM_API ReQuantizeForFloat {
 public:
  using outType = outT;
  using inpType = inT;
  /**
   * @param Bq_scale The length of this array is
   *                 1 when Q_GRAN == QuantizationGranularity::TENSOR,
   *                 groups when Q_GRAN == QuantizationGranularity::GROUP,
   *                 nCol if Q_GRAN == QuantizationGranularity::OUT_CHANNEL
   * @param Bq_zero_point The length of this array should be the same as
   *                      Bq_scale.
   * @param row_offsets Typically, this should've been computed by a
   *                    PackAMatrix and should be obtained by
   *                    PackMatrix::getRowOffsetBuffer().
   *                    If Bq_zero_point == 0 (symmetric quantization of the B
   *                    matrix), we can pass nullptr.
   * @param col_offsets This should be pre-computed for example using
   *                    col_offsets_with_zero_pt_s8acc32_ref.
   *                    The length should be nCol.
   *                    See PackedRequantizeTest.cc for an example.
   *                    TODO: if Aq_zero_point == 0, allow passing nullptr.
   * @param bias can be nullptr otherwise the length should be nCol
   */
  ReQuantizeForFloat(
      nextOPType& nextop,
      float Aq_scale,
      const float* Bq_scale,
      std::int32_t Aq_zero_point,
      const std::int32_t* Bq_zero_point,
      const std::int32_t* row_offsets,
      const std::int32_t* col_offsets,
      const float* bias,
      std::uint32_t nCol,
      int groups = 1)
      : nextop_(nextop),
        Aq_scale_(Aq_scale),
        Bq_scale_(Bq_scale),
        Aq_zero_point_(Aq_zero_point),
        Bq_zero_point_(Bq_zero_point),
        q_row_offsets_(row_offsets),
        q_col_offsets_(col_offsets),
        bias_(bias),
        ncols_(nCol),
        groups_(groups) {}

  template <inst_set_t instSet>
  inline int f(
      outT* out,
      inT* inp,
      const block_type_t& block,
      int ld_out,
      int ld_in) const;

 private:
  nextOPType& nextop_;
  float Aq_scale_;
  const float* Bq_scale_;
  std::int32_t Aq_zero_point_;
  const std::int32_t* Bq_zero_point_;
  const std::int32_t* q_row_offsets_;
  const std::int32_t* q_col_offsets_;
  const float* bias_;
  std::uint32_t ncols_;
  int groups_;
};

// type specialized implementation in an include file
#include "./OutputProcessing-inl.h"

/*
 *
 * ####### GEMM related functions #######
 *
 */

/**
 * Matrix B must be prepacked. For matrix A, the packA.pack function is called
 * to pack it.
 *
 * @tparam packingAMatrix processing of A matrix while packing,
 *                        e.g., PackAWithQuantRowOffset
 *
 * @tparam packingBMatrix processing of B matrix while packing,
 *                        e.g., pre-multiply by alpha
 * @tparam cT data type of C matrix
 * @tparam processOutputType further processing of outputs, e.g., Relu
 */
template <
    typename packingAMatrix,
    typename packingBMatrix,
    typename cT,
    typename processOutputType>
FBGEMM_API void fbgemmPacked(
    PackMatrix<
        packingAMatrix,
        typename packingAMatrix::inpType,
        typename packingAMatrix::accType>& packA,
    PackMatrix<
        packingBMatrix,
        typename packingBMatrix::inpType,
        typename packingBMatrix::accType>& packB,
    cT* C,
    std::int32_t* C_buffer,
    std::uint32_t ldc,
    const processOutputType& outProcess,
    int thread_id,
    int num_threads,
    const BlockingFactors* blocking_params = nullptr);
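
// Putting the pieces above together: a single-threaded uint8 x int8 -> uint8
// GEMM with per-tensor requantization. A minimal sketch (illustrative only;
// the quantized inputs, scales, zero points, and col_offsets are placeholders
// supplied by the caller, and a real caller would shard thread_id/num_threads
// across a thread pool):
//
//   void int8GemmSketch(
//       int M, int N, int K,
//       const std::uint8_t* Auint8, const std::int8_t* Bint8,
//       std::uint8_t* Cuint8,
//       float Aq_scale, std::int32_t Aq_zero_point,
//       float Bq_scale, std::int32_t Bq_zero_point,
//       float Cq_scale, std::int32_t Cq_zero_point,
//       const std::int32_t* col_offsets /* length N */) {
//     fbgemm::PackBMatrix<std::int8_t> packB(
//         fbgemm::matrix_op_t::NoTranspose, K, N, Bint8, /*ld=*/N);
//     fbgemm::PackAWithRowOffset<std::uint8_t> packA(
//         fbgemm::matrix_op_t::NoTranspose, M, K, Auint8, /*ld=*/K);
//
//     fbgemm::DoNothing<> doNothing{};
//     float C_multiplier = Aq_scale * Bq_scale / Cq_scale;
//     fbgemm::ReQuantizeOutput<false> outputProc(
//         doNothing, &C_multiplier, Cq_zero_point,
//         Aq_zero_point, &Bq_zero_point,
//         packA.getRowOffsetBuffer(), col_offsets,
//         /*bias=*/nullptr, /*nCol=*/N);
//
//     std::vector<std::int32_t> C_buffer(M * N); // int32 accumulation scratch
//     fbgemm::fbgemmPacked(
//         packA, packB, Cuint8, C_buffer.data(), /*ldc=*/N, outputProc,
//         /*thread_id=*/0, /*num_threads=*/1);
//   }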

/**
 * @brief Perform small-channels-per-group groupwise convolution.
 *        Note: Currently threading is not supported. This function does
 *        nothing for thread_ids > 0, i.e., returns early.
 *
 * @param rowOffsetBuf nullptr if B uses symmetric quantization
 */
template <
    typename packed_W,
    typename outType,
    bool FUSE_RELU,
    QuantizationGranularity Q_GRAN,
    int SPATIAL_DIM = 2,
    typename BIAS_TYPE = std::int32_t>
FBGEMM_API void fbgemmGroupwiseConv(
    const conv_param_t<SPATIAL_DIM>& conv_param,
    const std::uint8_t* activations,
    std::int32_t a_zero_point,
    std::int32_t* rowOffsetBuf,
    packed_W& packed_weights,
    outType* out,
    std::int32_t* outBuffer,
    const ReQuantizeOutput<FUSE_RELU, Q_GRAN, BIAS_TYPE>& outProcess,
    int thread_id,
    int num_threads);

template <
    int SPATIAL_DIM,
    QuantizationGranularity Q_GRAN,
    bool FUSE_RELU,
    typename BIAS_TYPE = std::int32_t>
FBGEMM_API void fbgemmDirectConv(
    const conv_param_t<SPATIAL_DIM>& conv_p,
    const uint8_t* Aint8,
    PackedDirectConvMatrix& Bint8_tr,
    uint8_t* C,
    int32_t* C_buffer,
    const ReQuantizeOutput<FUSE_RELU, Q_GRAN, BIAS_TYPE>& outProcess,
    const BIAS_TYPE* bias,
    int thread_id,
    int num_threads);

/**
 * @return Size of row offset buffer in number of elements needed for
 *         fbgemmGroupwiseConv
 */
template <int SPATIAL_DIM = 2>
FBGEMM_API int rowOffsetBufferSizeGConv(
    const conv_param_t<SPATIAL_DIM>& conv_param);

/**
 * @brief Is this depthwise convolution optimized?
 */
template <int SPATIAL_DIM = 2, typename ACC_T = std::int32_t>
bool takeDepthWiseFastPath(const conv_param_t<SPATIAL_DIM>& conv_p);

/**
 * @brief Is this groupwise convolution supported?
 */
template <int SPATIAL_DIM>
FBGEMM_API bool fbgemmOptimizedGConv(const conv_param_t<SPATIAL_DIM>& conv_p);

/**
 * @brief Is this convolution a direct matrix-matrix multiplication, i.e., 1x1
 *        (aka pointwise) with right paddings etc.?
 */
template <int SPATIAL_DIM>
FBGEMM_API bool takePointWiseFastPath(const conv_param_t<SPATIAL_DIM>& conv_p);

/**
 * @brief Are we running on an fbgemm-supported CPU?
 */
FBGEMM_API bool fbgemmSupportedCPU();

/**
 * @brief Performs convolution using the fastest path available.
 *
 * @tparam SPATIAL_DIM It's 2 for 2D convolutions and 3 for 3D convolutions.
 */
template <
    typename processOutputType,
    int SPATIAL_DIM = 2,
    typename ACC_T = std::int32_t>
FBGEMM_API int fbgemmConv(
    const conv_param_t<SPATIAL_DIM>& conv_p,
    const std::uint8_t* activations,
    PackWeightsForConv<SPATIAL_DIM, std::int8_t, ACC_T>& packed_weights,
    typename processOutputType::outType* out,
    std::int32_t* outBuffer,
    processOutputType& outProcess,
    int thread_id,
    int num_threads,
    const BlockingFactors* blocking_params = nullptr);
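
// A minimal end-to-end convolution sketch using fbgemmConv. The conv_param_t
// constructor argument order shown here (batch, IC, OC, input dims, groups,
// kernel, stride, pad) is an assumption to be checked against ConvUtils.h;
// the activations, weights, scales/zero points, and col_offsets are
// placeholders supplied by the caller:
//
//   // 2D conv: N=1, IC=64, OC=64, 56x56 input, groups=1, 3x3 kernel,
//   // stride 1, pad 1 on every side (output stays 56x56).
//   fbgemm::conv_param_t<2> conv_p(
//       1, 64, 64, {56, 56}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1});
//
//   fbgemm::PackWeightsForConv<2> packedW(conv_p, Wint8);
//
//   fbgemm::DoNothing<> doNothing{};
//   float C_multiplier = Aq_scale * Bq_scale / Cq_scale;
//   fbgemm::ReQuantizeOutput<false> outputProc(
//       doNothing, &C_multiplier, Cq_zero_point,
//       Aq_zero_point, &Bq_zero_point,
//       /*row_offsets=*/nullptr, // assumes the chosen conv path supplies row
//                                // offsets via setRowOffsets() internally
//       col_offsets, /*bias=*/nullptr, /*nCol=*/conv_p.OC,
//       /*groups=*/conv_p.G);
//
//   std::vector<std::int32_t> scratch(1 * 56 * 56 * 64); // output elements
//   fbgemm::fbgemmConv(
//       conv_p, Auint8, packedW, Cuint8, scratch.data(), outputProc,
//       /*thread_id=*/0, /*num_threads=*/1);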

/**
 * @brief Returns which fast path to take
 *
 * @tparam SPATIAL_DIM It's 2 for 2D convolutions and 3 for 3D convolutions.
 *
 * @return optimized_conv_t::depthwise, optimized_conv_t::groupwise or
 *         optimized_conv_t::im2col
 *
 */
template <int SPATIAL_DIM = 2, typename ACC_T = std::int32_t>
FBGEMM_API optimized_conv_t
ConvFastPath(const conv_param_t<SPATIAL_DIM>& conv_p);
} // namespace fbgemm