1 | /* |
2 | * Copyright (c) Meta Platforms, Inc. and affiliates. |
3 | * All rights reserved. |
4 | * This source code is licensed under the BSD-style license found in the |
5 | * LICENSE file in the root directory of this source tree. |
6 | */ |
7 | #pragma once |
8 | |
9 | /** |
10 | * Top level include file for FBGEMM. |
11 | */ |
12 | #include <cassert> |
13 | #include <cmath> |
14 | #include <limits> |
15 | #include <memory> |
16 | #include <type_traits> |
17 | #include "./ConvUtils.h" |
18 | #include "./FbgemmBuild.h" |
19 | #include "./FbgemmEmbedding.h" |
20 | #include "./FbgemmI8DepthwiseAvx2.h" |
21 | #include "./FbgemmI8DirectconvAvx2.h" |
22 | #include "./FbgemmI8Spmdm.h" |
23 | #include "./QuantUtilsAvx2.h" |
24 | #include "./Types.h" |
25 | #include "./Utils.h" |
26 | |
27 | // Turning on this option will print out time breakdown of each stage (e.g., |
28 | // input packing, the main GEMM kernel, each output processing pipeline). |
29 | // Please note that currently this option won't report accurate timing if |
30 | // multiple threads are used. |
31 | // #define FBGEMM_MEASURE_TIME_BREAKDOWN |
32 | |
33 | #ifdef FBGEMM_MEASURE_TIME_BREAKDOWN |
34 | #include <chrono> |
35 | #include <iostream> |
36 | extern double packing_time; |
37 | extern double computing_time; |
38 | extern double kernel_time; |
39 | extern double postprocessing_time; |
40 | extern double run_time; |
41 | #endif |
42 | |
43 | namespace fbgemm { |
44 | |
/**
 * @brief Templatized struct for packing parameters for A and B matrices.
 *
 * Only declared here; the type-specialized definitions live in
 * PackingTraits-inl.h (included below).
 *
 * @tparam T input type
 * @tparam accT the type used for accumulation
 * @tparam instSet anyarch/avx2/avx512
 * @tparam int8Type an auxiliary template parameter to specialize for 8-bit
 *                  input types.
 */
template <
    typename T,
    typename accT,
    inst_set_t instSet,
    typename int8Type = void>
struct PackingTraits;
60 | |
61 | // type specialized implementation in an include file |
62 | #include "./PackingTraits-inl.h" |
63 | |
/**
 * @brief Base class for packing matrices for higher GEMM performance.
 *
 * Matrix is tiled into blockRows() * blockCols() blocks.
 * Each block is with size blockRowSize() * blockColSize().
 * This class is designed using CRTP
 * (https://en.wikipedia.org/wiki/Curiously_recurring_template_pattern):
 * several member functions simply forward to the derived class PT.
 *
 * @tparam PT actual packing type, e.g., PackAWithRowOffset
 * @tparam inpType element type of the source and packed matrices
 * @tparam accType type used for accumulation (int32 by default)
 */
template <typename PT, typename inpType, typename accType = std::int32_t>
class PackMatrix {
 public:
  PackMatrix() = delete; // no default constructor

  /**
   * @param rows total number of rows in the matrix
   *             (packed rows can be less than rows).
   * @param cols total number of columns in the matrix
   * @param pmat A buffer to contain the packed matrix.
   *             If nullptr, a buffer owned by PackMatrix will be allocated
   *             internally to contain the packed matrix.
   *             For non-constant matrices like activation matrices, the client
   *             code may want to pass a pre-allocated pmat to avoid the
   *             overhead of internal memory allocation every time a PackMatrix
   *             is constructed. The client code can query how big pmat should
   *             be with packedBufferSize function.
   * @param groups when groups > 1, we compute groups number of GEMMs each
   *               multiplies A.rows by A.cols/A.groups matrix with
   *               B.rows/B.groups by B.cols matrix (in conventional BLAS
   *               terminology, this is a batched GEMM but we use the name group
   *               to follow deep learning terminology). The result matrix has
   *               dimension A.rows by B.cols*B.groups .
   *               A.groups must be same as B.groups, A.groups must divide
   *               A.cols, and B.groups must divide B.rows and C.cols.
   * @param params optional blocking factors; nullptr selects defaults.
   */
  PackMatrix(
      std::int32_t rows,
      std::int32_t cols,
      inpType* pmat,
      int groups = 1,
      const BlockingFactors* params = nullptr);

  /**
   * @return true usually when the matrix is constant matrix (e.g., weight
   *         matrices) that can be prepacked
   */
  bool isPrePacked() const {
    return static_cast<const PT*>(this)->isPrePacked();
  }

  /**
   * @return true if this is the first input matrix in GEMM (i.e., A in C = A *
   *         B)
   */
  static constexpr bool isA() {
    return PT::isA();
  }

  /**
   * @brief The size of the buffer used for packing (The size is in number of
   *        elements).
   *
   * rows and cols are only used for fully packing, i.e., for B matrix. The
   * client code can use this function to query how big the buffer used for
   * packing should be.
   */
  static int packedBufferSize(
      int rows = 0,
      int cols = 0,
      const BlockingFactors* params = nullptr);

  /**
   * @return Pointer to a buffer containing row offset results. Some packing
   *         objects fuse row offset computation for later requantization step.
   */
  std::int32_t* getRowOffsetBuffer() const {
    return static_cast<const PT*>(this)->getRowOffsetBuffer();
  }

  /**
   * @brief When k loop is also tiled/blocked, this function is used to check if
   * have executed computations for the last k block so that we can perform
   * post-GEMM operations.
   */
  bool isThisLastKBlock(int block_id) const {
    return static_cast<const PT*>(this)->isThisLastKBlock(block_id);
  }

  /**
   * @brief Actual packing of a block of the source matrix in pmat buffer.
   *        Forwards to the derived class implementation.
   */
  void pack(const block_type_t& block) {
    static_cast<PT*>(this)->pack(block);
  }

  /// Total number of rows in the source matrix.
  std::int32_t numRows() const {
    return nrows_;
  }

  /// Total number of columns in the source matrix.
  std::int32_t numCols() const {
    return ncols_;
  }

  /**
   * @return The number of rows in each block
   */
  std::int32_t blockRowSize() const {
    return brow_;
  }

  /**
   * @return The number of columns in each block
   */
  std::int32_t blockColSize() const {
    return bcol_;
  }

  /**
   * @return The number of blocks along rows
   */
  std::int32_t blockRows() const {
    return nbrow_;
  }

  /**
   * @return The number of blocks along columns
   */
  std::int32_t blockCols() const {
    return nbcol_;
  }

  /**
   * @return The number of the rows in the currently packed block of a matrix.
   *         For pre-packed (i.e., fully-packed), it's equal to the total number
   *         of rows.
   */
  std::int32_t numPackedRows() const {
    return packedBlock_.row_size;
  }

  /**
   * @return The number of columns in the currently packed block of a matrix.
   *         For pre-packed (i.e., fully-packed), it's equal to the number of
   *         columns.
   */
  std::int32_t numPackedCols() const {
    return packedBlock_.col_size;
  }

  /**
   * @return The first row of the block we're working on.
   */
  std::int32_t packedRowStart() const {
    return packedBlock_.row_start;
  }

  /**
   * @return The first column of the block we're working on.
   */
  std::int32_t packedColStart() const {
    return packedBlock_.col_start;
  }

  /**
   * @return The beginning of (rowBlockNum, colBlockNum)th block.
   *         Each block occupies blockRowSize() * blockColSize() elements in
   *         the packed buffer.
   */
  inpType* getBuf(std::int32_t rowBlockNum = 0, std::int32_t colBlockNum = 0) {
    return buf_ + blockRowSize() * blockColSize() * rowBlockNum +
        blockRowSize() * blockColSize() * blockCols() * colBlockNum;
  }

  /**
   * @brief Print the packed block (forwards to the derived class).
   */
  void printPackedMatrix(std::string name) {
    static_cast<PT*>(this)->printPackedMatrix(name);
  }

  /**
   * @return The number of rows in the last row block.
   */
  std::int32_t lastBrow() const {
    return last_brow_;
  }

  /**
   * @return The number of columns in the last column block.
   */
  std::int32_t lastBcol() const {
    return last_bcol_;
  }

  /// Number of groups (see the constructor documentation).
  int numGroups() const {
    return G_;
  }

  /**
   * @return True if the last column block has fewer columns than the block
   *         size.
   */
  bool isThereColRemainder() const {
    return last_bcol_ != blockColSize();
  }

  // Frees the packed buffer only when it was allocated internally.
  virtual ~PackMatrix() {
    if (bufAllocatedHere_) {
      fbgemmAlignedFree(buf_);
    }
  }

 protected:
  /**
   * Set which block we're packing and derive the block counts and the
   * (possibly partial) last-block sizes from it.
   */
  void packedBlock(const block_type_t& block) {
    packedBlock_ = block;
    // ceil-divide to get the number of blocks in each dimension
    nbrow_ = (numPackedRows() + blockRowSize() - 1) / blockRowSize();
    nbcol_ = (numPackedCols() + blockColSize() - 1) / blockColSize();

    // last block is full-sized when the dimension divides evenly,
    // otherwise it holds just the remainder
    last_brow_ = ((numPackedRows() % blockRowSize()) == 0)
        ? blockRowSize()
        : (numPackedRows() % blockRowSize());
    last_bcol_ = ((numPackedCols() % blockColSize()) == 0)
        ? blockColSize()
        : (numPackedCols() % blockColSize());
  }

  inpType* buf_; ///< pointer to the packed buffer
  std::int32_t brow_; ///< the number of rows in each block
  std::int32_t bcol_; ///< the number of columns in each block
  std::int32_t nbrow_; ///< the number of blocks along rows
  std::int32_t nbcol_; ///< the number of blocks along columns
  bool bufAllocatedHere_; ///< true if buf_ is owned (freed in the destructor)
  const BlockingFactors*
      blocking_params; ///< MCB, KCB, NCB, MR, NR, NR_MIN, ROW_INTERLEAVE;

 private:
  std::int32_t nrows_, ncols_; ///< dimensions of the source matrix
  int G_; ///< number of groups
  block_type_t packedBlock_; ///< The block in the source matrix just packed
  std::int32_t last_brow_, last_bcol_; ///< sizes of the last (partial) blocks
};
307 | |
/**
 * @brief Matrix packed for the first input matrix in GEMM (usually
 *        activation).  The source matrix is already quantized. Default
 *        accumulation type is int32.
 */
template <typename T, typename accT = std::int32_t>
class FBGEMM_API PackAMatrix final
    : public PackMatrix<PackAMatrix<T, accT>, T, accT> {
 public:
  using This = PackAMatrix<T, accT>;
  using BaseType = PackMatrix<This, T, accT>;
  using inpType = T;
  using accType = accT;

  PackAMatrix() = delete; // no default constructor

  /**
   * @param trans whether the source matrix is transposed
   * @param nRow number of rows of the source matrix
   * @param nCol number of columns of the source matrix
   * @param smat pointer to the (already quantized) source matrix
   * @param ld leading dimension of smat
   * @param pmat optional pre-allocated packed buffer (see PackMatrix)
   * @param groups number of groups (see PackMatrix)
   * @param params optional blocking factors; nullptr selects defaults
   */
  PackAMatrix(
      matrix_op_t trans,
      std::int32_t nRow,
      std::int32_t nCol,
      const inpType* smat,
      std::int32_t ld,
      inpType* pmat = nullptr,
      int groups = 1,
      const BlockingFactors* params = nullptr);

  /**
   * Activation matrices are not constant so cannot amortize the cost of
   * pre-packing.
   */
  bool isPrePacked() const {
    return false;
  }

  /**
   * @return True if this is used as A matrix.
   */
  static constexpr bool isA() {
    return true;
  }

  /**
   * @return A pointer to the row offset buffer. There is no row offset buffer
   *         calculations with this packing class, hence, it returns nullptr.
   */
  std::int32_t* getRowOffsetBuffer() const {
    return nullptr;
  }

  /**
   * @return Offset of the element in the packed matrix that was at (i, j) in
   *         the source matrix.
   */
  std::int32_t addr(std::int32_t i, std::int32_t j) const;

  /**
   * @brief Packs a block of source matrix into pmat buffer.
   */
  void pack(const block_type_t& block);

  /**
   * @brief Print the packed block.
   */
  void printPackedMatrix(std::string name);

 private:
  matrix_op_t trans_; ///< transpose flag for the source matrix
  const T* smat_; ///< non-owning pointer to the source matrix
  std::int32_t ld_; ///< leading dimension of the source matrix
  std::int32_t row_interleave_B_; ///< row interleave of the corresponding B
};
379 | |
/**
 * @brief Matrix packed for the second input matrix in GEMM (usually weight).
 *        The source matrix is already quantized. Default accumulation
 *        type is int32.
 */
template <typename T, typename accT = std::int32_t>
class FBGEMM_API PackBMatrix final
    : public PackMatrix<PackBMatrix<T, accT>, T, accT> {
 public:
  using This = PackBMatrix<T, accT>;
  using BaseType = PackMatrix<This, T, accT>;
  using inpType = T;
  using accType = accT;

  PackBMatrix() = delete; // no default constructor

  /**
   * @param groups if > 1 and trans == NoTranspose, smat is nRow x nCol with
   *               groups are vertically concatenated: each group is
   *               (nRow / groups) x nCol .
   *               if > 1 and trans == Transpose, smat is (nCol * groups) x
   *               (nRow / groups) with groups are horizontally concatenated:
   *               each group is nCol x (nRow / groups) . Each group is
   *               transposed and vertically concatenated to match with the
   *               NoTranspose case.
   */
  PackBMatrix(
      matrix_op_t trans,
      std::int32_t nRow,
      std::int32_t nCol,
      const inpType* smat,
      std::int32_t ld,
      inpType* pmat = nullptr,
      int groups = 1,
      const BlockingFactors* params = nullptr);

  /**
   * Weight matrices are usually constant so worth pre-packing.
   */
  bool isPrePacked() const {
    return true;
  }

  /**
   * @return True if to be used as A matrix, False otherwise.
   */
  static constexpr bool isA() {
    return false;
  }

  /**
   * @brief When k loop is also tiled/blocked, this function is used to check if
   * have executed computations for the last k block so that we can perform
   * post-GEMM operations.
   */
  bool isThisLastKBlock(int block_id) const {
    return (BaseType::blockRows() - 1) == block_id;
  }

  /**
   * @return Offset of the element in the packed matrix that was at (i, j) in
   *         the source matrix.
   */
  std::int32_t addr(std::int32_t i, std::int32_t j) const;

  /**
   * @brief Packs a block of source matrix into pmat buffer. The blocking
   *        parameters are needed to compute the buffer size of each group.
   *        It will use default blocking parameters if params is not provided.
   */
  void pack(const block_type_t& block, const BlockingFactors* params = nullptr);

  /**
   * @brief Print the packed block.
   */
  void printPackedMatrix(
      std::string name,
      const BlockingFactors* params = nullptr);

  /**
   * @return true if meta information like matrix shape is the same.
   */
  bool metaEquals(const PackBMatrix<T, accT>& that) const;
  /**
   * @return true if matrices are the same.
   */
  bool equals(const PackBMatrix<T, accT>& that) const;

  /**
   * @brief Unpack pmat buffer to the origin_buf (Used for the serialization to
   *        recover weight matrix).
   */
  void unpack(T* origin_buf, const BlockingFactors* params = nullptr);

  ~PackBMatrix() {}

 private:
  matrix_op_t trans_; ///< transpose flag for the source matrix
  const T* smat_; ///< non-owning pointer to the source matrix
  std::int32_t ld_; ///< leading dimension of the source matrix
  std::int32_t row_interleave_; ///< row interleave factor used when packing

  /**
   * @brief Internal function performing both pack & unpack.
   *        When ispack is true data flows unpack_buf -> pack_buf (packing);
   *        otherwise pack_buf -> unpack_buf (unpacking).
   */
  void pack_unpack_(
      const block_type_t& block,
      T* unpack_buf,
      T* pack_buf,
      bool ispack,
      const BlockingFactors* params = nullptr);
};
492 | |
/**
 * @brief Matrix packed for direct group convolution.
 *        The source matrix is already quantized. Default accumulation
 *        type is int32.
 */
template <typename T, typename accT = std::int32_t, int SPATIAL_DIM = 2>
class FBGEMM_API PackWeightMatrixForGConv {
 public:
  using This = PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>;
  using inpType = T;
  using accType = accT;

  PackWeightMatrixForGConv() = delete; // no default constructor

  /**
   * @param trans whether the source data is transposed
   * @param conv_param convolution parameters describing the weight layout
   * @param sdata pointer to the (already quantized) source weights
   * @param pmat if nullptr, a buffer is allocated and owned by this class.
   */
  PackWeightMatrixForGConv(
      matrix_op_t trans,
      const conv_param_t<SPATIAL_DIM>& conv_param,
      const inpType* sdata,
      inpType* pdata = nullptr);

  /**
   * Number of groups we work at a time to fill the full simd width
   * e.g., IC_PER_G = 4 and OC_PER_G = 4, we work on two groups at a time
   * to fill the avx2 width of 256 bits.
   */
  static int numOfGroupsTogether(const conv_param_t<SPATIAL_DIM>& conv_param);

  /**
   * @brief Packs a block of source matrix into pmat buffer.
   */
  void pack();

  /**
   * @brief Unpacks a pmat buffer into source matrix.
   */
  void unpack(T* origin_buf);

  /**
   * @brief Return packed data
   */
  inpType* getBuf() {
    return pdata_;
  }

  // Frees the packed buffer only when it was allocated internally.
  ~PackWeightMatrixForGConv() {
    if (bufAllocatedHere_) {
      fbgemmAlignedFree(pdata_);
    }
  }

 private:
  matrix_op_t trans_; ///< transpose flag for the source data
  const conv_param_t<SPATIAL_DIM> conv_param_; ///< convolution parameters
  const T* sdata_; ///< non-owning pointer to the source (unpacked) weights
  T* pdata_; ///< packed weights; owned iff bufAllocatedHere_
  bool bufAllocatedHere_; ///< true if pdata_ is owned (freed in the dtor)
  // Number of groups we work at a time to fill the full simd width
  int GTogether_;

  /**
   * @brief Internal function performing both pack & unpack
   *        (ispack selects the direction, src -> dst).
   */
  void pack_unpack_(const T* src, T* dst, bool ispack);

  /**
   * @brief Get the index of the unpacked data
   */
  int unpacked_index_(int t, int r, int s, int k, int g, int c, bool tr);

  /**
   * @brief Get the index of the packed data
   */
  int packed_index_(int t, int r, int s, int k, int g, int c);
};
570 | |
571 | /** |
572 | * @brief A container class to keep packed weight tensor for convolution. |
573 | * The source tensor should already be quantized. |
574 | * |
575 | * @tparam SPATIAL_DIM is equal to 2 for 2D convolutions and 3 for 3D |
576 | * convolutions. Default value is 2. |
577 | * @tparam T is the datatype for source tensor. Default value is int8. |
578 | * @tparam accT is the datatype to accumulate into. Default value is int32. |
579 | */ |
580 | template < |
581 | int SPATIAL_DIM = 2, |
582 | typename T = std::int8_t, |
583 | typename accT = std::int32_t> |
584 | class FBGEMM_API PackWeightsForConv { |
585 | public: |
586 | using This = PackWeightsForConv<SPATIAL_DIM, T, accT>; |
587 | using inpType = T; |
588 | using accType = accT; |
589 | |
590 | PackWeightsForConv() = delete; // no default constructor |
591 | |
592 | PackWeightsForConv( |
593 | const conv_param_t<SPATIAL_DIM>& conv_param, |
594 | const inpType* sdata, |
595 | const BlockingFactors* blocking_params = nullptr); |
596 | |
597 | std::shared_ptr<PackBMatrix<T, accT>> getPackedWForIm2col() { |
598 | return W_im2col_packed_; |
599 | } |
600 | |
601 | std::shared_ptr<PackedDepthWiseConvMatrix> getPackedWForDepthwise() { |
602 | return W_dw_packed_; |
603 | } |
604 | |
605 | std::shared_ptr<PackedDirectConvMatrix> getPackedWForDirectconv() { |
606 | return W_dc_packed_; |
607 | } |
608 | |
609 | std::shared_ptr<PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>> |
610 | getPackedWForGroupwise() { |
611 | return W_gconv_packed_; |
612 | } |
613 | |
614 | std::shared_ptr<PackBMatrix<T, accT>> getPackedWForPointwise() { |
615 | return W_pointwise_packed_; |
616 | } |
617 | |
618 | int inputChannels() { |
619 | return conv_param_.IC; |
620 | } |
621 | |
622 | int outputChannels() { |
623 | return conv_param_.OC; |
624 | } |
625 | |
626 | std::array<int, SPATIAL_DIM> kernelDims() { |
627 | return conv_param_.K; |
628 | } |
629 | |
630 | int groups() { |
631 | return conv_param_.G; |
632 | } |
633 | |
634 | /** |
635 | * @brief Returns true if the packed weights would work for the given |
636 | * convolution parameters, and false otherwise |
637 | */ |
638 | bool isPackingCompliant(const conv_param_t<SPATIAL_DIM>& conv_p); |
639 | |
640 | /** |
641 | * @brief Returns a string of mismatching parameters |
642 | */ |
643 | std::string mismatchingParams(const conv_param_t<SPATIAL_DIM>& conv_p); |
644 | |
645 | /** |
646 | * @brief Unpack packed matric into origin_buf (Used for the serialization to |
647 | * recover weight matrix). |
648 | */ |
649 | void unpack(T* origin_buf); |
650 | |
651 | private: |
652 | const conv_param_t<SPATIAL_DIM> conv_param_; |
653 | // Packed weights if we use im2col based convolution implementation |
654 | std::shared_ptr<PackBMatrix<T, accT>> W_im2col_packed_; |
655 | // Packed weights if we use depthwise convolution implementation |
656 | std::shared_ptr<PackedDepthWiseConvMatrix> W_dw_packed_; |
657 | // Packed weights if we use direct convolution implementation |
658 | std::shared_ptr<PackedDirectConvMatrix> W_dc_packed_; |
659 | // Packed weights if we use groupwise (small channels per group) convolution |
660 | // implementation |
661 | std::shared_ptr<PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>> |
662 | W_gconv_packed_; |
663 | // Packed weights if we use direct gemm for pointwise convolution |
664 | std::shared_ptr<PackBMatrix<T, accT>> W_pointwise_packed_; |
665 | }; |
666 | |
667 | /** |
668 | * @brief Matrix packed for the first input matrix in GEMM (usually activation), |
669 | * and row offsets used for requantization is computed during packing. |
670 | * Im2col is fused with packing here. The source matrix is already |
671 | * quantized. |
672 | */ |
673 | template <typename T, typename accT = std::int32_t, int SPATIAL_DIM = 2> |
674 | class FBGEMM_API PackAWithIm2Col |
675 | : public PackMatrix<PackAWithIm2Col<T, accT, SPATIAL_DIM>, T, accT> { |
676 | public: |
677 | using This = PackAWithIm2Col<T, accT, SPATIAL_DIM>; |
678 | using BaseType = PackMatrix<This, T, accT>; |
679 | using inpType = T; |
680 | using accType = accT; |
681 | |
682 | PackAWithIm2Col() = delete; // no default constructor |
683 | /** |
684 | * @param zero_pt the quantized value that maps to 0.0f floating-point number. |
685 | * @param row_offset If nullptr, this constructor internally allocates a |
686 | * buffer and owns it. Otherwise, this class doesn't own |
687 | * the buffer. The buffer will be populated when pack |
688 | * function is called. |
689 | * @param b_symmetric if true we skip row offset computation |
690 | */ |
691 | PackAWithIm2Col( |
692 | const conv_param_t<SPATIAL_DIM>& conv_param, |
693 | const T* sdata, |
694 | inpType* pmat = nullptr, |
695 | std::int32_t a_zero_pt = 0, |
696 | std::int32_t* row_offset = nullptr, |
697 | bool b_symmetric = false, |
698 | const BlockingFactors* params = nullptr); |
699 | |
700 | /** |
701 | * Activation matrices are not constant so cannot amortize the cost of |
702 | * pre-packing. |
703 | */ |
704 | bool isPrePacked() const { |
705 | return false; |
706 | } |
707 | |
708 | /** |
709 | * @return True if this is used as A matrix. |
710 | */ |
711 | static constexpr bool isA() { |
712 | return true; |
713 | } |
714 | |
715 | /** |
716 | * @brief Packs a block of source matrix into pmat buffer. |
717 | */ |
718 | void pack(const block_type_t& block); |
719 | |
720 | /** |
721 | * @return A pointer to the row offset buffer. |
722 | */ |
723 | std::int32_t* getRowOffsetBuffer() const { |
724 | return row_offset_; |
725 | } |
726 | |
727 | /** |
728 | * @brief Print the packed block. |
729 | */ |
730 | void printPackedMatrix(std::string name); |
731 | |
732 | /** |
733 | * @return Size of row offset buffer in number of elements |
734 | */ |
735 | static int rowOffsetBufferSize(const BlockingFactors* params = nullptr); |
736 | |
737 | ~PackAWithIm2Col() { |
738 | if (rowOffsetAllocatedHere) { |
739 | fbgemmAlignedFree(row_offset_); |
740 | } |
741 | } |
742 | |
743 | private: |
744 | const conv_param_t<SPATIAL_DIM> conv_p_; |
745 | const T* sdata_; |
746 | std::int32_t a_zero_pt_; |
747 | std::int32_t* row_offset_{nullptr}; |
748 | bool rowOffsetAllocatedHere{false}; |
749 | std::int32_t row_interleave_B_; |
750 | }; |
751 | |
/**
 * @brief Matrix packed for the first input matrix in GEMM (usually activation),
 *        and row offsets used for requantization is computed during packing.
 *        The source matrix is already quantized.
 */
template <typename T, typename accT = std::int32_t>
class FBGEMM_API PackAWithRowOffset final
    : public PackMatrix<PackAWithRowOffset<T, accT>, T, accT> {
 public:
  using This = PackAWithRowOffset<T, accT>;
  using BaseType = PackMatrix<This, T, accT>;
  using inpType = T;
  using accType = accT;

  PackAWithRowOffset() = delete; // no default constructor
  /**
   * @param row_offset If nullptr, this constructor internally allocates a
   *                   buffer and owns it. Otherwise, this class doesn't own
   *                   the buffer. The buffer will be populated when pack
   *                   function is called.
   */
  PackAWithRowOffset(
      matrix_op_t trans,
      std::uint32_t nRow,
      std::uint32_t nCol,
      const T* smat,
      std::uint32_t ld,
      inpType* pmat = nullptr,
      int groups = 1,
      std::int32_t* row_offset = nullptr,
      const BlockingFactors* params = nullptr);

  /**
   * Activation matrices are not constant so cannot amortize the cost of
   * pre-packing.
   */
  bool isPrePacked() const {
    return false;
  }

  /**
   * @return True if this is used as A matrix.
   */
  static constexpr bool isA() {
    return true;
  }

  /**
   * @return Offset of the element in the packed matrix that was at (i, j) in
   *         the source matrix
   */
  std::int32_t addr(std::int32_t i, std::int32_t j) const;

  /**
   * @brief Packs a block of source matrix into pmat buffer.
   */
  void pack(const block_type_t& block);

  /**
   * @return A pointer to the row offset buffer.
   */
  std::int32_t* getRowOffsetBuffer() const {
    return row_offset_;
  }

  /**
   * @brief Print the packed block.
   */
  void printPackedMatrix(std::string name);

  /**
   * @return size of row offset buffer in number of elements
   */
  static int rowOffsetBufferSize(const BlockingFactors* params = nullptr);

  // Frees the row offset buffer only when it was allocated internally.
  ~PackAWithRowOffset() {
    if (rowOffsetAllocatedHere) {
      fbgemmAlignedFree(row_offset_);
    }
  }

 private:
  matrix_op_t trans_; ///< transpose flag for the source matrix
  const T* smat_; ///< non-owning pointer to the source matrix
  std::uint32_t ld_; ///< leading dimension of the source matrix
  std::int32_t* row_offset_; ///< row offsets for requantization
  bool rowOffsetAllocatedHere; ///< true if row_offset_ is owned
  std::int32_t row_interleave_B_; ///< row interleave of the corresponding B
};
841 | |
/**
 * @brief Matrix packed for the first input matrix in GEMM (usually activation),
 *        and row offsets used for requantization is computed during packing.
 *        The source matrix is in fp32 and quantized during packing.
 */
template <typename T, typename accT = std::int32_t>
class FBGEMM_API PackAWithQuantRowOffset final
    : public PackMatrix<PackAWithQuantRowOffset<T, accT>, T, accT> {
 public:
  using This = PackAWithQuantRowOffset<T, accT>;
  using BaseType = PackMatrix<This, T, accT>;
  using inpType = T;
  using accType = accT;

  PackAWithQuantRowOffset() = delete; // no default constructor
  /**
   * @param row_offset If nullptr, this constructor internally allocates a
   *                   buffer and owns it. Otherwise, this class doesn't own
   *                   the buffer. The buffer will be populated when pack
   *                   function is called.
   */
  PackAWithQuantRowOffset(
      matrix_op_t trans,
      std::int32_t nRow,
      std::int32_t nCol,
      const float* smat,
      std::int32_t ld,
      inpType* pmat = nullptr,
      float scale = 1.0f,
      std::int32_t zero_pt = 0,
      int groups = 1,
      std::int32_t* row_offset = nullptr,
      const BlockingFactors* params = nullptr);

  /**
   * Activation matrices are not constant so cannot amortize the cost of
   * pre-packing.
   */
  bool isPrePacked() const {
    return false;
  }

  /**
   * @return True if this is used as A matrix.
   */
  static constexpr bool isA() {
    return true;
  }

  /**
   * @return offset of the element in the packed matrix that was at (i, j) in
   *         the source matrix
   */
  std::int32_t addr(std::int32_t i, std::int32_t j) const;

  /**
   * @brief Packs a block of source matrix into pmat buffer.
   */
  void pack(const block_type_t& block);

  /**
   * @return A pointer to the row offset buffer.
   */
  std::int32_t* getRowOffsetBuffer() const {
    return row_offset_;
  }

  /**
   * @brief Print the packed block.
   */
  void printPackedMatrix(std::string name);

  /**
   * @return Size of row offset buffer in number of elements
   */
  static int rowOffsetBufferSize(const BlockingFactors* params = nullptr);

  // Frees the row offset buffer only when it was allocated internally.
  ~PackAWithQuantRowOffset() {
    if (rowOffsetAllocatedHere) {
      fbgemmAlignedFree(row_offset_);
    }
  }

 private:
  matrix_op_t trans_; ///< transpose flag for the source matrix
  const float* smat_; ///< non-owning pointer to the fp32 source matrix
  std::int32_t ld_; ///< leading dimension of the source matrix
  float scale_; ///< quantization scale applied while packing
  std::int32_t zero_pt_; ///< quantization zero point applied while packing
  std::int32_t* row_offset_; ///< row offsets for requantization
  bool rowOffsetAllocatedHere; ///< true if row_offset_ is owned
  std::int32_t row_interleave_B_; ///< row interleave of the corresponding B
};
935 | |
936 | /* |
937 | * |
938 | * Post Processing of outputs |
939 | * |
940 | */ |
941 | |
942 | /** |
943 | * @brief Does nothing. NoOp. Used as the last operation in the output |
944 | * processing pipeline. |
945 | * |
946 | */ |
947 | template <typename outT = std::uint8_t, typename inT = std::uint8_t> |
948 | class FBGEMM_API DoNothing { |
949 | public: |
950 | using outType = outT; |
951 | using inpType = inT; |
952 | DoNothing() {} |
953 | template <inst_set_t instSet> |
954 | int f( |
955 | outType* /* unused */, |
956 | inpType* /* unused */, |
957 | const block_type_t& /* unused */, |
958 | int /* unused */, |
959 | int /* unused */) const { |
960 | return 0; |
961 | } |
962 | }; |
963 | |
/**
 * @brief Copy data pointed by inp ptr to out ptr when
 *        inp ptr and out ptr are not the same.
 *        inp buffer: row and column start points: (0, 0)
 *        output buffer: row and column start points:
 *        (block.row_start, block.col_start)
 *
 * This is the output processing stage that should passed when there is no
 * requantization and output is required in the same format as internal buffer
 * used for accumulation.
 */
template <
    typename outT = std::int32_t,
    typename inT = std::int32_t,
    typename nextOPType = DoNothing<outT, outT>>
class FBGEMM_API memCopy {
 public:
  using outType = outT;
  using inpType = inT;
  /// Stores a reference to the next op in the output-processing pipeline.
  explicit memCopy(nextOPType& nextop) : nextop_(nextop) {}
  /**
   * @brief Process (copy) one output block; defined out of line.
   * @param out destination buffer
   * @param inp source (accumulation) buffer
   * @param block block of the output being processed
   * @param ld_out leading dimension of out
   * @param ld_in leading dimension of inp
   */
  template <inst_set_t instSet>
  inline int f(
      outType* out,
      inpType* inp,
      const block_type_t& block,
      int ld_out,
      int ld_in) const;

 private:
  nextOPType& nextop_; ///< next operation in the output-processing pipeline
};
995 | |
996 | /** |
997 | * @brief Perform scaling on accumulated data. |
998 | */ |
999 | template < |
1000 | typename outT = std::int32_t, |
1001 | typename inT = std::int32_t, |
1002 | typename nextOPType = DoNothing<outT, outT>> |
1003 | class ScaleOP { |
1004 | public: |
1005 | using outType = outT; |
1006 | using inpType = inT; |
1007 | explicit ScaleOP(inpType scalingFactor) : scalingFactor_(scalingFactor) {} |
1008 | |
1009 | template <inst_set_t instSet> |
1010 | inline int f( |
1011 | outType* out, |
1012 | inpType* inp, |
1013 | const block_type_t& block, |
1014 | int ld_out, |
1015 | int ld_in) const; |
1016 | |
1017 | private: |
1018 | inpType scalingFactor_; |
1019 | }; |
1020 | |
1021 | /** |
1022 | * @brief Perform Relu on accumulated data. |
1023 | */ |
1024 | template < |
1025 | typename outT = std::int32_t, |
1026 | typename inT = std::int32_t, |
1027 | typename nextOPType = DoNothing<outT, outT>> |
1028 | class ReluOutput { |
1029 | public: |
1030 | using outType = outT; |
1031 | using inpType = inT; |
1032 | explicit ReluOutput(inpType zero_pt) : zero_pt_(zero_pt) {} |
1033 | |
1034 | template <inst_set_t instSet> |
1035 | inline int f( |
1036 | outType* out, |
1037 | inpType* inp, |
1038 | const block_type_t& block, |
1039 | int ld_out, |
1040 | int ld_in) const; |
1041 | |
1042 | private: |
1043 | inpType zero_pt_; |
1044 | }; |
1045 | |
1046 | /** |
1047 | * @brief Perform Dense-Matrix * Sparse-Matrix as a part the of output |
1048 | * processing pipeline. |
1049 | * |
1050 | * SPMDM (SParse Matrix times Dense Matrix) inplace on the 32-bit input buffer |
1051 | * (inp). After modifying the input buffer, pass it to the next op. |
1052 | * When groups > 1, each group is numRows() x (numCols()/groups) matrix. |
1053 | */ |
1054 | template < |
1055 | typename outT = std::int32_t, |
1056 | typename inT = std::int32_t, |
1057 | typename nextOPType = DoNothing<inT, inT>> |
1058 | class FBGEMM_API DoSpmdmOnInpBuffer { |
1059 | public: |
1060 | using outType = outT; |
1061 | using inpType = inT; |
1062 | DoSpmdmOnInpBuffer( |
1063 | nextOPType& nextop, |
1064 | const std::uint8_t* A, |
1065 | int lda, |
1066 | const CompressedSparseColumn& B_csc, |
1067 | int groups = 1) |
1068 | : nextop_(nextop), A_(A), lda_(lda), B_csc_(B_csc), groups_(groups) {} |
1069 | |
1070 | template <inst_set_t instSet> |
1071 | inline int f( |
1072 | outT* out, |
1073 | inT* inp, |
1074 | const block_type_t& block, |
1075 | int ld_out, |
1076 | int ld_in) const; |
1077 | |
1078 | private: |
1079 | nextOPType& nextop_; |
1080 | const std::uint8_t* A_; |
1081 | const int lda_; |
1082 | const CompressedSparseColumn& B_csc_; |
1083 | const int groups_; |
1084 | }; |
1085 | |
1086 | /** |
1087 | * @brief Perform Dense-Matrix * Sparse-Matrix as a part the of output |
1088 | * processing pipeline. |
1089 | * |
1090 | * SPMDM (SParse Matrix times Dense Matrix) inplace on the 32-bit input buffer |
1091 | * (inp). After modifying the input buffer, pass it to the next op. |
1092 | * When groups > 1, each group is numRows() x (numCols()/groups) matrix. |
1093 | */ |
1094 | template < |
1095 | typename outT = std::int32_t, |
1096 | typename inT = std::int32_t, |
1097 | typename nextOPType = DoNothing<inT, inT>> |
1098 | class FBGEMM_API DoSConvOnInpBuffer { |
1099 | public: |
1100 | using outType = outT; |
1101 | using inpType = inT; |
1102 | DoSConvOnInpBuffer( |
1103 | nextOPType& nextop, |
1104 | const std::uint8_t* A, |
1105 | const conv_param_t<>& conv_p, |
1106 | std::int32_t A_zero_point, |
1107 | const CompressedSparseColumn& B_csc) |
1108 | : nextop_(nextop), |
1109 | A_(A), |
1110 | conv_p_(conv_p), |
1111 | A_zero_point_(A_zero_point), |
1112 | B_csc_(B_csc) {} |
1113 | |
1114 | template <inst_set_t instSet> |
1115 | inline int f( |
1116 | outT* out, |
1117 | inT* inp, |
1118 | const block_type_t& block, |
1119 | int ld_out, |
1120 | int ld_in) const; |
1121 | |
1122 | private: |
1123 | nextOPType& nextop_; |
1124 | const std::uint8_t* A_; |
1125 | const conv_param_t<> conv_p_; |
1126 | const std::int32_t A_zero_point_; |
1127 | const CompressedSparseColumn& B_csc_; |
1128 | }; |
1129 | |
1130 | /** |
1131 | * @brief Requantize values in inp buffer and write to out buffer. |
1132 | * pass the out buffer to next op for further processing. |
1133 | */ |
1134 | template < |
1135 | bool FUSE_RELU, |
1136 | QuantizationGranularity Q_GRAN = QuantizationGranularity::TENSOR, |
1137 | typename BIAS_TYPE = std::int32_t, |
1138 | typename outT = std::uint8_t, |
1139 | typename inT = std::int32_t, |
1140 | typename nextOPType = DoNothing<outT, outT>> |
1141 | class FBGEMM_API ReQuantizeOutput { |
1142 | public: |
1143 | static constexpr int RELU_FUSED = FUSE_RELU; |
1144 | static constexpr QuantizationGranularity QGRANType = Q_GRAN; |
1145 | using BIAS_T = BIAS_TYPE; |
1146 | using outType = outT; |
1147 | using inpType = inT; |
1148 | /** |
1149 | * @param C_multiplier The length of this array is |
1150 | * 1 when Q_GRAN == QuantizationGranularity::TENSOR, |
1151 | * groups when Q_GRAN == QuantizationGranularity::GROUP, |
1152 | * nCol if Q_GRAN == QuantizationGranularity::OUT_CHANNEL |
1153 | * @param Bq_zero_point The length of this array should be the same as |
1154 | * C_multiplier. |
1155 | * @param row_offsets Typically, this should've been computed by a |
1156 | * PackAMatrix and should be obtained by |
1157 | * PackMatrix::getRowOffsetBuffer(). |
1158 | * If Bq_zero_point == 0 (symmetric quantization of B |
1159 | * matrix), we can pass nullptr. |
1160 | * @param col_offsets This should be pre-computed for example using |
1161 | * col_offsets_with_zero_pt_s8acc32_ref. |
1162 | * The length should be nCol. |
1163 | * See PackedRequantizeTest.cc for an example. |
1164 | * TODO: if Aq_zero_point == 0, allow passing nullptr. |
1165 | * @param bias can be nullptr otherwise the length should be nCol |
1166 | * @param act_times_w_scale activation_scale * weight_scale. This is only |
1167 | * used if bias is unquantized (i.e., float). |
1168 | */ |
1169 | ReQuantizeOutput( |
1170 | nextOPType& nextop, |
1171 | const float* C_multiplier, |
1172 | std::int32_t C_zero_point, |
1173 | std::int32_t Aq_zero_point, |
1174 | const std::int32_t* Bq_zero_point, |
1175 | const std::int32_t* row_offsets, |
1176 | const std::int32_t* col_offsets, |
1177 | const BIAS_T* bias, |
1178 | std::uint32_t nCol, |
1179 | int groups = 1, |
1180 | const float* act_times_w_scale = nullptr) |
1181 | : nextop_(nextop), |
1182 | C_multiplier_(C_multiplier), |
1183 | C_zero_point_(C_zero_point), |
1184 | Aq_zero_point_(Aq_zero_point), |
1185 | Bq_zero_point_(Bq_zero_point), |
1186 | q_row_offsets_(row_offsets), |
1187 | q_col_offsets_(col_offsets), |
1188 | bias_(bias), |
1189 | ncols_(nCol), |
1190 | groups_(groups), |
1191 | act_times_w_scale_(act_times_w_scale) {} |
1192 | |
1193 | template <inst_set_t instSet> |
1194 | inline int f( |
1195 | outT* out, |
1196 | const inT* inp, |
1197 | const block_type_t& block, |
1198 | int ld_out, |
1199 | int ld_in) const; |
1200 | |
1201 | const float* getCMultiplier() const { |
1202 | return C_multiplier_; |
1203 | } |
1204 | std::int32_t getAZeroPoint() const { |
1205 | return Aq_zero_point_; |
1206 | } |
1207 | std::int32_t getCZeroPoint() const { |
1208 | return C_zero_point_; |
1209 | } |
1210 | const std::int32_t* getBZeroPoint() const { |
1211 | return Bq_zero_point_; |
1212 | } |
1213 | const std::int32_t* getRowOffsets() const { |
1214 | return q_row_offsets_; |
1215 | } |
1216 | const std::int32_t* getColOffsets() const { |
1217 | return q_col_offsets_; |
1218 | } |
1219 | const BIAS_T* getBias() const { |
1220 | return bias_; |
1221 | } |
1222 | std::uint32_t getNCols() const { |
1223 | return ncols_; |
1224 | } |
1225 | const float* getActWScale() const { |
1226 | return act_times_w_scale_; |
1227 | } |
1228 | |
1229 | void setRowOffsets(const std::int32_t* row_offsets) { |
1230 | q_row_offsets_ = row_offsets; |
1231 | } |
1232 | |
1233 | private: |
1234 | nextOPType& nextop_; |
1235 | const float* C_multiplier_; |
1236 | std::int32_t C_zero_point_; |
1237 | std::int32_t Aq_zero_point_; |
1238 | const std::int32_t* Bq_zero_point_; |
1239 | const std::int32_t* q_row_offsets_; |
1240 | const std::int32_t* q_col_offsets_; |
1241 | const BIAS_T* bias_; |
1242 | std::uint32_t ncols_; |
1243 | int groups_; |
1244 | const float* act_times_w_scale_; |
1245 | }; |
1246 | |
1247 | /** |
1248 | * @brief Requantize to convert accumulated data to be used as float, i.e., the |
1249 | * output would be used as float. |
1250 | */ |
1251 | template < |
1252 | bool FUSE_RELU, |
1253 | QuantizationGranularity Q_GRAN = QuantizationGranularity::TENSOR, |
1254 | typename outT = float, |
1255 | typename inT = std::int32_t, |
1256 | typename nextOPType = DoNothing<outT, outT>> |
1257 | class FBGEMM_API ReQuantizeForFloat { |
1258 | public: |
1259 | using outType = outT; |
1260 | using inpType = inT; |
1261 | /** |
1262 | * @param Bq_scale The length of this array is |
1263 | * 1 when Q_GRAN == QuantizationGranularity::TENSOR, |
1264 | * groups when Q_GRAN == QuantizationGranularity::GROUP, |
1265 | * nCol if Q_GRAN == QuantizationGranularity::OUT_CHANNEL |
1266 | * @param Bq_zero_point The length of this array should be the same as |
1267 | * Bq_scale. |
1268 | * @param row_offsets Typically, this should've been computed by a |
1269 | * PackAMatrix and should be obtained by |
1270 | * PackMatrix::getRowOffsetBuffer(). |
1271 | * If Bq_zero_point == 0 (symmetric quantization of B |
1272 | * matrix), we can pass nullptr. |
1273 | * @param col_offsets This should be pre-computed for example using |
1274 | * col_offsets_with_zero_pt_s8acc32_ref. |
1275 | * The length should be nCol. |
1276 | * See PackedRequantizeTest.cc for an example. |
1277 | * TODO: if Aq_zero_point == 0, allow passing nullptr. |
1278 | * @param bias can be nullptr otherwise the length should be nCol |
1279 | */ |
1280 | ReQuantizeForFloat( |
1281 | nextOPType& nextop, |
1282 | float Aq_scale, |
1283 | const float* Bq_scale, |
1284 | std::int32_t Aq_zero_point, |
1285 | const std::int32_t* Bq_zero_point, |
1286 | const std::int32_t* row_offsets, |
1287 | const std::int32_t* col_offsets, |
1288 | const float* bias, |
1289 | std::uint32_t nCol, |
1290 | int groups = 1) |
1291 | : nextop_(nextop), |
1292 | Aq_scale_(Aq_scale), |
1293 | Bq_scale_(Bq_scale), |
1294 | Aq_zero_point_(Aq_zero_point), |
1295 | Bq_zero_point_(Bq_zero_point), |
1296 | q_row_offsets_(row_offsets), |
1297 | q_col_offsets_(col_offsets), |
1298 | bias_(bias), |
1299 | ncols_(nCol), |
1300 | groups_(groups) {} |
1301 | |
1302 | template <inst_set_t instSet> |
1303 | inline int f( |
1304 | outT* out, |
1305 | inT* inp, |
1306 | const block_type_t& block, |
1307 | int ld_out, |
1308 | int ld_in) const; |
1309 | |
1310 | private: |
1311 | nextOPType& nextop_; |
1312 | float Aq_scale_; |
1313 | const float* Bq_scale_; |
1314 | std::int32_t Aq_zero_point_; |
1315 | const std::int32_t* Bq_zero_point_; |
1316 | const std::int32_t* q_row_offsets_; |
1317 | const std::int32_t* q_col_offsets_; |
1318 | const float* bias_; |
1319 | std::uint32_t ncols_; |
1320 | int groups_; |
1321 | }; |
1322 | |
1323 | // type specialized implementation in an include file |
1324 | #include "./OutputProcessing-inl.h" |
1325 | |
1326 | /* |
1327 | * |
1328 | * ####### GEMM related functions ####### |
1329 | * |
1330 | */ |
1331 | |
1332 | /** |
1333 | * Matrix B must be prepacked. For matrix A, packA.pack function is called to |
1334 | * pack it. |
1335 | * |
1336 | * @tparam packingAMatrix processing of A matrix while packing, |
1337 | * e.g., PackAWithQuantRowOffset |
1338 | * |
1339 | * @tparam packingBMatrix processing of B matrix while packing, |
1340 | * e.g., pre-multiply by alpha |
1341 | * @tparam cT data type of C matrix |
1342 | * @tparam processOutputType further processing of outputs, e.g., Relu |
1343 | */ |
1344 | template < |
1345 | typename packingAMatrix, |
1346 | typename packingBMatrix, |
1347 | typename cT, |
1348 | typename processOutputType> |
1349 | FBGEMM_API void fbgemmPacked( |
1350 | PackMatrix< |
1351 | packingAMatrix, |
1352 | typename packingAMatrix::inpType, |
1353 | typename packingAMatrix::accType>& packA, |
1354 | PackMatrix< |
1355 | packingBMatrix, |
1356 | typename packingBMatrix::inpType, |
1357 | typename packingBMatrix::accType>& packB, |
1358 | cT* C, |
1359 | std::int32_t* C_buffer, |
1360 | std::uint32_t ldc, |
1361 | const processOutputType& outProcess, |
1362 | int thread_id, |
1363 | int num_threads, |
1364 | const BlockingFactors* blocking_params = nullptr); |
1365 | |
1366 | /** |
1367 | * @brief Perform small-channels-per-group groupwise convolution |
1368 | * Note: Currently threading is not supported. This function does |
1369 | * nothing for thread_ids > 0, i.e., returns early. |
1370 | * |
1371 | * @param rowOffsetBuf nullptr if B uses symmetric quantization |
1372 | * Note: Currently threading is not supported. This function does |
1373 | * nothing for thread_ids > 0, i.e., returns early. |
1374 | */ |
1375 | template < |
1376 | typename packed_W, |
1377 | typename outType, |
1378 | bool FUSE_RELU, |
1379 | QuantizationGranularity Q_GRAN, |
1380 | int SPATIAL_DIM = 2, |
1381 | typename BIAS_TYPE = std::int32_t> |
1382 | FBGEMM_API void fbgemmGroupwiseConv( |
1383 | const conv_param_t<SPATIAL_DIM>& conv_param, |
1384 | const std::uint8_t* activations, |
1385 | std::int32_t a_zero_point, |
1386 | std::int32_t* rowOffsetBuf, |
1387 | packed_W& packed_weights, |
1388 | outType* out, |
1389 | std::int32_t* outBuffer, |
1390 | const ReQuantizeOutput<FUSE_RELU, Q_GRAN, BIAS_TYPE>& outProcess, |
1391 | int thread_id, |
1392 | int num_threads); |
1393 | |
/**
 * @brief Perform convolution via the direct-convolution path (no im2col),
 *        using weights prepacked into a PackedDirectConvMatrix.
 *        NOTE(review): semantics inferred from the parameter list — confirm
 *        against the implementation.
 */
template <
    int SPATIAL_DIM,
    QuantizationGranularity Q_GRAN,
    bool FUSE_RELU,
    typename BIAS_TYPE = std::int32_t>
FBGEMM_API void fbgemmDirectConv(
    const conv_param_t<SPATIAL_DIM>& conv_p,
    const uint8_t* Aint8,
    PackedDirectConvMatrix& Bint8_tr,
    uint8_t* C,
    int32_t* C_buffer,
    const ReQuantizeOutput<FUSE_RELU, Q_GRAN, BIAS_TYPE>& outProcess,
    const BIAS_TYPE* bias,
    int thread_id,
    int num_threads);

/**
 * @return Size of row offset buffer in number of elements needed for
 *         fbgemmGroupwiseConv
 */
template <int SPATIAL_DIM = 2>
FBGEMM_API int rowOffsetBufferSizeGConv(
    const conv_param_t<SPATIAL_DIM>& conv_param);
1417 | |
1418 | /** |
1419 | * @brief Is this depthwise convolution optimized? |
1420 | */ |
1421 | template <int SPATIAL_DIM = 2, typename ACC_T = std::int32_t> |
1422 | bool takeDepthWiseFastPath(const conv_param_t<SPATIAL_DIM>& conv_p); |
1423 | |
1424 | /** |
1425 | * @brief Is this groupwise convolution supported? |
1426 | */ |
1427 | template <int SPATIAL_DIM> |
1428 | FBGEMM_API bool fbgemmOptimizedGConv(const conv_param_t<SPATIAL_DIM>& conv_p); |
1429 | |
1430 | /** |
1431 | * @brief Is this convolution a direct matrix-matrix multiplication, i.e., 1x1 |
1432 | * (aka pointwise) with right paddings etc.? |
1433 | */ |
1434 | template <int SPATIAL_DIM> |
1435 | FBGEMM_API bool takePointWiseFastPath(const conv_param_t<SPATIAL_DIM>& conv_p); |
1436 | |
1437 | /** |
1438 | * @brief Are we running on a fbgemm supported cpu? |
1439 | */ |
1440 | FBGEMM_API bool fbgemmSupportedCPU(); |
1441 | |
1442 | /** |
1443 | * @brief Performs convolution using fastest path available. |
1444 | * |
1445 | * @tparam SPATIAL_DIM It's 2 for 2D convolutions and 3 for 3D convolutions. |
1446 | */ |
1447 | template < |
1448 | typename processOutputType, |
1449 | int SPATIAL_DIM = 2, |
1450 | typename ACC_T = std::int32_t> |
1451 | FBGEMM_API int fbgemmConv( |
1452 | const conv_param_t<SPATIAL_DIM>& conv_p, |
1453 | const std::uint8_t* activations, |
1454 | PackWeightsForConv<SPATIAL_DIM, std::int8_t, ACC_T>& packed_weights, |
1455 | typename processOutputType::outType* out, |
1456 | std::int32_t* outBuffer, |
1457 | processOutputType& outProcess, |
1458 | int thread_id, |
1459 | int num_threads, |
1460 | const BlockingFactors* blocking_params = nullptr); |
1461 | |
1462 | /** |
1463 | * @brief Returns which fast path to take |
1464 | * |
1465 | * @tparam SPATIAL_DIM It's 2 for 2D convolutions and 3 for 3D convolutions. |
1466 | * |
1467 | * @return optimized_conv_t::depthwise, optimized_conv_t::groupwise or |
1468 | * optimized_conv_t::im2col |
1469 | * |
1470 | */ |
1471 | template <int SPATIAL_DIM = 2, typename ACC_T = std::int32_t> |
1472 | FBGEMM_API optimized_conv_t |
1473 | ConvFastPath(const conv_param_t<SPATIAL_DIM>& conv_p); |
1474 | } // namespace fbgemm |
1475 | |