/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */
#pragma once

/**
 * Top level include file for FBGEMM.
 */
#include <cassert>
#include <cmath>
#include <limits>
#include <memory>
#include <type_traits>
#include "./ConvUtils.h"
#include "./FbgemmBuild.h"
#include "./FbgemmEmbedding.h"
#include "./FbgemmI8DepthwiseAvx2.h"
#include "./FbgemmI8DirectconvAvx2.h"
#include "./FbgemmI8Spmdm.h"
#include "./QuantUtilsAvx2.h"
#include "./Types.h"
#include "./Utils.h"

// Turning on this option will print out the time breakdown of each stage
// (e.g., input packing, the main GEMM kernel, each output processing
// pipeline). Please note that currently this option won't report accurate
// timing if multiple threads are used.
// #define FBGEMM_MEASURE_TIME_BREAKDOWN

#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
#include <chrono>
#include <iostream>
extern double packing_time;
extern double computing_time;
extern double kernel_time;
extern double postprocessing_time;
extern double run_time;
#endif
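
// The timers above are global accumulators that FBGEMM itself updates only
// when the library is rebuilt with FBGEMM_MEASURE_TIME_BREAKDOWN defined.
// A minimal sketch of reading them from client code (an assumption about how
// a caller would use these externs; resetting and running single-threaded is
// the caller's responsibility):
//
//   #ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
//   void reportTimeBreakdown() {
//     // Zero the accumulators, run one single-threaded fbgemmPacked() call,
//     // then print the per-stage breakdown.
//     packing_time = kernel_time = postprocessing_time = run_time = 0.0;
//     // ... run the GEMM here ...
//     std::cout << "packing: " << packing_time << ", kernel: " << kernel_time
//               << ", post: " << postprocessing_time << ", total: " << run_time
//               << std::endl;
//   }
//   #endif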

namespace fbgemm {

/**
 * @brief Templatized struct for packing parameters for A and B matrices.
 *
 * @tparam T input type
 * @tparam accT the type used for accumulation
 * @tparam instSet anyarch/avx2/avx512
 * @tparam int8Type an auxiliary template parameter to specialize for 8-bit
 *                  input types.
 */
template <
    typename T,
    typename accT,
    inst_set_t instSet,
    typename int8Type = void>
struct PackingTraits;

// type specialized implementation in an include file
#include "./PackingTraits-inl.h"
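
// PackingTraits is what ties an input type, an accumulation type, and an ISA
// to concrete cache- and register-blocking constants. A small sketch of
// inspecting them at compile time (an assumption that the avx2 8-bit
// specialization in PackingTraits-inl.h exposes MCB/KCB/NCB/ROW_INTERLEAVE as
// static constants, as the BlockingFactors comment further below suggests):
//
//   using Traits8 =
//       PackingTraits<std::int8_t, std::int32_t, inst_set_t::avx2>;
//   static_assert(
//       Traits8::ROW_INTERLEAVE > 0, "8-bit packing interleaves rows");
//   constexpr int kPackedBBlockElems = Traits8::KCB * Traits8::NCB;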

/**
 * @brief Base class for packing matrices for higher GEMM performance.
 *
 * The matrix is tiled into blockRows() * blockCols() blocks.
 * Each block has size blockRowSize() * blockColSize().
 * This class is designed using CRTP
 * (https://en.wikipedia.org/wiki/Curiously_recurring_template_pattern)
 *
 * @tparam PT actual packing type, e.g., PackAWithRowOffset
 */
template <typename PT, typename inpType, typename accType = std::int32_t>
class PackMatrix {
 public:
  PackMatrix() = delete; // no default constructor

  /**
   * @param rows total number of rows in the matrix
   *             (packed rows can be less than rows).
   * @param cols total number of columns in the matrix
   * @param pmat A buffer to contain the packed matrix.
   *             If nullptr, a buffer owned by PackMatrix will be allocated
   *             internally to contain the packed matrix.
   *             For non-constant matrices like activation matrices, the
   *             client code may want to pass a pre-allocated pmat to avoid
   *             the overhead of internal memory allocation every time a
   *             PackMatrix is constructed. The client code can query how big
   *             pmat should be with the packedBufferSize function.
   * @param groups when groups > 1, we compute `groups` independent GEMMs,
   *               each multiplying an A.rows x (A.cols/A.groups) matrix by a
   *               (B.rows/B.groups) x B.cols matrix (in conventional BLAS
   *               terminology this is a batched GEMM, but we use the name
   *               group to follow deep learning terminology). The result
   *               matrix has dimension A.rows x (B.cols*B.groups).
   *               A.groups must be the same as B.groups, A.groups must divide
   *               A.cols, and B.groups must divide B.rows and C.cols.
   */
  PackMatrix(
      std::int32_t rows,
      std::int32_t cols,
      inpType* pmat,
      int groups = 1,
      const BlockingFactors* params = nullptr);

  /**
   * @return true usually when the matrix is a constant matrix (e.g., weight
   *         matrices) that can be prepacked
   */
  bool isPrePacked() const {
    return static_cast<const PT*>(this)->isPrePacked();
  }

  /**
   * @return true if this is the first input matrix in GEMM (i.e., A in
   *         C = A * B)
   */
  static constexpr bool isA() {
    return PT::isA();
  }

  /**
   * @brief The size of the buffer used for packing (the size is in number of
   *        elements).
   *
   * rows and cols are only used for full packing, i.e., for the B matrix. The
   * client code can use this function to query how big the buffer used for
   * packing should be.
   */
  static int packedBufferSize(
      int rows = 0,
      int cols = 0,
      const BlockingFactors* params = nullptr);

  /**
   * @return Pointer to a buffer containing row offset results. Some packing
   *         objects fuse row offset computation for the later requantization
   *         step.
   */
  std::int32_t* getRowOffsetBuffer() const {
    return static_cast<const PT*>(this)->getRowOffsetBuffer();
  }

  /**
   * @brief When the k loop is also tiled/blocked, this function is used to
   *        check if we have executed computations for the last k block so
   *        that we can perform post-GEMM operations.
   */
  bool isThisLastKBlock(int block_id) const {
    return static_cast<const PT*>(this)->isThisLastKBlock(block_id);
  }

  /**
   * @brief Actual packing of a block of the source matrix in pmat buffer.
   */
  void pack(const block_type_t& block) {
    static_cast<PT*>(this)->pack(block);
  }

  std::int32_t numRows() const {
    return nrows_;
  }

  std::int32_t numCols() const {
    return ncols_;
  }

  /**
   * @return The number of rows in each block
   */
  std::int32_t blockRowSize() const {
    return brow_;
  }

  /**
   * @return The number of columns in each block
   */
  std::int32_t blockColSize() const {
    return bcol_;
  }

  /**
   * @return The number of blocks along rows
   */
  std::int32_t blockRows() const {
    return nbrow_;
  }

  /**
   * @return The number of blocks along columns
   */
  std::int32_t blockCols() const {
    return nbcol_;
  }

  /**
   * @return The number of rows in the currently packed block of a matrix.
   *         For a pre-packed (i.e., fully-packed) matrix, this equals the
   *         total number of rows.
   */
  std::int32_t numPackedRows() const {
    return packedBlock_.row_size;
  }

  /**
   * @return The number of columns in the currently packed block of a matrix.
   *         For a pre-packed (i.e., fully-packed) matrix, this equals the
   *         total number of columns.
   */
  std::int32_t numPackedCols() const {
    return packedBlock_.col_size;
  }

  /**
   * @return The first row of the block we're working on.
   */
  std::int32_t packedRowStart() const {
    return packedBlock_.row_start;
  }

  /**
   * @return The first column of the block we're working on.
   */
  std::int32_t packedColStart() const {
    return packedBlock_.col_start;
  }

  /**
   * @return The beginning of (rowBlockNum, colBlockNum)th block
   */
  inpType* getBuf(std::int32_t rowBlockNum = 0, std::int32_t colBlockNum = 0) {
    return buf_ + blockRowSize() * blockColSize() * rowBlockNum +
        blockRowSize() * blockColSize() * blockCols() * colBlockNum;
  }

  /**
   * @brief Print the packed block.
   */
  void printPackedMatrix(std::string name) {
    static_cast<PT*>(this)->printPackedMatrix(name);
  }

  /**
   * @return The number of rows in the last row block.
   */
  std::int32_t lastBrow() const {
    return last_brow_;
  }

  /**
   * @return The number of columns in the last column block.
   */
  std::int32_t lastBcol() const {
    return last_bcol_;
  }

  int numGroups() const {
    return G_;
  }

  /**
   * @return True if the last column block has fewer columns than the block
   *         size.
   */
  bool isThereColRemainder() const {
    return last_bcol_ != blockColSize();
  }

  virtual ~PackMatrix() {
    if (bufAllocatedHere_) {
      fbgemmAlignedFree(buf_);
    }
  }

 protected:
  /**
   * Set which block we're packing
   */
  void packedBlock(const block_type_t& block) {
    packedBlock_ = block;
    nbrow_ = (numPackedRows() + blockRowSize() - 1) / blockRowSize();
    nbcol_ = (numPackedCols() + blockColSize() - 1) / blockColSize();

    last_brow_ = ((numPackedRows() % blockRowSize()) == 0)
        ? blockRowSize()
        : (numPackedRows() % blockRowSize());
    last_bcol_ = ((numPackedCols() % blockColSize()) == 0)
        ? blockColSize()
        : (numPackedCols() % blockColSize());
  }

  inpType* buf_;
  std::int32_t brow_; ///< the number of rows in each block
  std::int32_t bcol_; ///< the number of columns in each block
  std::int32_t nbrow_; ///< the number of blocks along rows
  std::int32_t nbcol_; ///< the number of blocks along columns
  bool bufAllocatedHere_;
  const BlockingFactors*
      blocking_params; ///< MCB, KCB, NCB, MR, NR, NR_MIN, ROW_INTERLEAVE;

 private:
  std::int32_t nrows_, ncols_;
  int G_;
  block_type_t packedBlock_; ///< The block in the source matrix just packed
  std::int32_t last_brow_, last_bcol_;
};
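
// For activation matrices the packing buffer can be reused across calls. A
// minimal sketch of pre-allocating it with packedBufferSize() and handing it
// to a packing class declared further below (illustrative only; it assumes
// fbgemmAlignedAlloc from Utils.h with (alignment, size-in-bytes) arguments
// and a 64-byte alignment choice):
//
//   // Query once; rows/cols only matter for full (B-matrix) packing.
//   int buf_elems = PackAWithRowOffset<std::uint8_t>::packedBufferSize();
//   auto* pmat = static_cast<std::uint8_t*>(
//       fbgemmAlignedAlloc(64, buf_elems * sizeof(std::uint8_t)));
//   // ... construct PackAWithRowOffset with this pmat on every iteration ...
//   fbgemmAlignedFree(pmat);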

/**
 * @brief Matrix packed for the first input matrix in GEMM (usually
 *        activation). The source matrix is already quantized. Default
 *        accumulation type is int32.
 */
template <typename T, typename accT = std::int32_t>
class FBGEMM_API PackAMatrix final
    : public PackMatrix<PackAMatrix<T, accT>, T, accT> {
 public:
  using This = PackAMatrix<T, accT>;
  using BaseType = PackMatrix<This, T, accT>;
  using inpType = T;
  using accType = accT;

  PackAMatrix() = delete; // no default constructor

  PackAMatrix(
      matrix_op_t trans,
      std::int32_t nRow,
      std::int32_t nCol,
      const inpType* smat,
      std::int32_t ld,
      inpType* pmat = nullptr,
      int groups = 1,
      const BlockingFactors* params = nullptr);

  /**
   * Activation matrices are not constant, so we cannot amortize the cost of
   * pre-packing.
   */
  bool isPrePacked() const {
    return false;
  }

  /**
   * @return True if this is used as A matrix.
   */
  static constexpr bool isA() {
    return true;
  }

  /**
   * @return A pointer to the row offset buffer. This packing class does not
   *         compute row offsets, hence it returns nullptr.
   */
  std::int32_t* getRowOffsetBuffer() const {
    return nullptr;
  }

  /**
   * @return Offset of the element in the packed matrix that was at (i, j) in
   *         the source matrix.
   */
  std::int32_t addr(std::int32_t i, std::int32_t j) const;

  /**
   * @brief Packs a block of source matrix into pmat buffer.
   */
  void pack(const block_type_t& block);

  /**
   * @brief Print the packed block.
   */
  void printPackedMatrix(std::string name);

 private:
  matrix_op_t trans_;
  const T* smat_;
  std::int32_t ld_;
  std::int32_t row_interleave_B_;
};

/**
 * @brief Matrix packed for the second input matrix in GEMM (usually weight).
 *        The source matrix is already quantized. Default accumulation
 *        type is int32.
 */
template <typename T, typename accT = std::int32_t>
class FBGEMM_API PackBMatrix final
    : public PackMatrix<PackBMatrix<T, accT>, T, accT> {
 public:
  using This = PackBMatrix<T, accT>;
  using BaseType = PackMatrix<This, T, accT>;
  using inpType = T;
  using accType = accT;

  PackBMatrix() = delete; // no default constructor

  /**
   * @param groups if > 1 and trans == NoTranspose, smat is nRow x nCol with
   *               the groups vertically concatenated: each group is
   *               (nRow / groups) x nCol .
   *               if > 1 and trans == Transpose, smat is (nCol * groups) x
   *               (nRow / groups) with the groups horizontally concatenated:
   *               each group is nCol x (nRow / groups) . Each group is
   *               transposed and vertically concatenated to match with the
   *               NoTranspose case.
   */
  PackBMatrix(
      matrix_op_t trans,
      std::int32_t nRow,
      std::int32_t nCol,
      const inpType* smat,
      std::int32_t ld,
      inpType* pmat = nullptr,
      int groups = 1,
      const BlockingFactors* params = nullptr);

  /**
   * Weight matrices are usually constant, so they are worth pre-packing.
   */
  bool isPrePacked() const {
    return true;
  }

  /**
   * @return True if to be used as A matrix, False otherwise.
   */
  static constexpr bool isA() {
    return false;
  }

  /**
   * @brief When the k loop is also tiled/blocked, this function is used to
   *        check if we have executed computations for the last k block so
   *        that we can perform post-GEMM operations.
   */
  bool isThisLastKBlock(int block_id) const {
    return (BaseType::blockRows() - 1) == block_id;
  }

  /**
   * @return Offset of the element in the packed matrix that was at (i, j) in
   *         the source matrix.
   */
  std::int32_t addr(std::int32_t i, std::int32_t j) const;

  /**
   * @brief Packs a block of source matrix into pmat buffer. The blocking
   *        parameters are needed to compute the buffer size of each group.
   *        It will use default blocking parameters if params is not provided.
   */
  void pack(const block_type_t& block, const BlockingFactors* params = nullptr);

  /**
   * @brief Print the packed block.
   */
  void printPackedMatrix(
      std::string name,
      const BlockingFactors* params = nullptr);

  /**
   * @return true if meta information like matrix shape is the same.
   */
  bool metaEquals(const PackBMatrix<T, accT>& that) const;
  /**
   * @return true if matrices are the same.
   */
  bool equals(const PackBMatrix<T, accT>& that) const;

  /**
   * @brief Unpack pmat buffer to the origin_buf (used by serialization to
   *        recover the weight matrix).
   */
  void unpack(T* origin_buf, const BlockingFactors* params = nullptr);

  ~PackBMatrix() {}

 private:
  matrix_op_t trans_;
  const T* smat_;
  std::int32_t ld_;
  std::int32_t row_interleave_;

  /**
   * @brief Internal function performing both pack & unpack
   */
  void pack_unpack_(
      const block_type_t& block,
      T* unpack_buf,
      T* pack_buf,
      bool ispack,
      const BlockingFactors* params = nullptr);
};
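
// Weights are typically packed once, up front, and reused for every GEMM
// call. A minimal sketch of prepacking a quantized K x N weight matrix
// (illustrative only; K, N, and Bint8 are placeholders supplied by the
// caller):
//
//   // B is stored row-major as K x N, so the leading dimension is N.
//   fbgemm::PackBMatrix<std::int8_t> packedB(
//       fbgemm::matrix_op_t::NoTranspose,
//       /*nRow=*/K,
//       /*nCol=*/N,
//       /*smat=*/Bint8,
//       /*ld=*/N);
//   // packedB owns its buffer (pmat == nullptr) and can now be reused
//   // across many fbgemmPacked() calls and threads.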

/**
 * @brief Matrix packed for direct group convolution.
 *        The source matrix is already quantized. Default accumulation
 *        type is int32.
 */
template <typename T, typename accT = std::int32_t, int SPATIAL_DIM = 2>
class FBGEMM_API PackWeightMatrixForGConv {
 public:
  using This = PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>;
  using inpType = T;
  using accType = accT;

  PackWeightMatrixForGConv() = delete; // no default constructor

  /**
   * @param pmat if nullptr, a buffer is allocated and owned by this class.
   */
  PackWeightMatrixForGConv(
      matrix_op_t trans,
      const conv_param_t<SPATIAL_DIM>& conv_param,
      const inpType* sdata,
      inpType* pdata = nullptr);

  /**
   * Number of groups we work on at a time to fill the full SIMD width,
   * e.g., with IC_PER_G = 4 and OC_PER_G = 4 we work on two groups at a time
   * to fill the AVX2 width of 256 bits.
   */
  static int numOfGroupsTogether(const conv_param_t<SPATIAL_DIM>& conv_param);

  /**
   * @brief Packs a block of source matrix into pmat buffer.
   */
  void pack();

  /**
   * @brief Unpacks a pmat buffer into source matrix.
   */
  void unpack(T* origin_buf);

  /**
   * @brief Return packed data
   */
  inpType* getBuf() {
    return pdata_;
  }

  ~PackWeightMatrixForGConv() {
    if (bufAllocatedHere_) {
      fbgemmAlignedFree(pdata_);
    }
  }

 private:
  matrix_op_t trans_;
  const conv_param_t<SPATIAL_DIM> conv_param_;
  const T* sdata_;
  T* pdata_;
  bool bufAllocatedHere_;
  // Number of groups we work on at a time to fill the full SIMD width
  int GTogether_;

  /**
   * @brief Internal function performing both pack & unpack
   */
  void pack_unpack_(const T* src, T* dst, bool ispack);

  /**
   * @brief Get the index of the unpacked data
   */
  int unpacked_index_(int t, int r, int s, int k, int g, int c, bool tr);

  /**
   * @brief Get the index of the packed data
   */
  int packed_index_(int t, int r, int s, int k, int g, int c);
};

/**
 * @brief A container class to keep packed weight tensor for convolution.
 *        The source tensor should already be quantized.
 *
 * @tparam SPATIAL_DIM is equal to 2 for 2D convolutions and 3 for 3D
 *                     convolutions. Default value is 2.
 * @tparam T is the datatype for source tensor. Default value is int8.
 * @tparam accT is the datatype to accumulate into. Default value is int32.
 */
template <
    int SPATIAL_DIM = 2,
    typename T = std::int8_t,
    typename accT = std::int32_t>
class FBGEMM_API PackWeightsForConv {
 public:
  using This = PackWeightsForConv<SPATIAL_DIM, T, accT>;
  using inpType = T;
  using accType = accT;

  PackWeightsForConv() = delete; // no default constructor

  PackWeightsForConv(
      const conv_param_t<SPATIAL_DIM>& conv_param,
      const inpType* sdata,
      const BlockingFactors* blocking_params = nullptr);

  std::shared_ptr<PackBMatrix<T, accT>> getPackedWForIm2col() {
    return W_im2col_packed_;
  }

  std::shared_ptr<PackedDepthWiseConvMatrix> getPackedWForDepthwise() {
    return W_dw_packed_;
  }

  std::shared_ptr<PackedDirectConvMatrix> getPackedWForDirectconv() {
    return W_dc_packed_;
  }

  std::shared_ptr<PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>>
  getPackedWForGroupwise() {
    return W_gconv_packed_;
  }

  std::shared_ptr<PackBMatrix<T, accT>> getPackedWForPointwise() {
    return W_pointwise_packed_;
  }

  int inputChannels() {
    return conv_param_.IC;
  }

  int outputChannels() {
    return conv_param_.OC;
  }

  std::array<int, SPATIAL_DIM> kernelDims() {
    return conv_param_.K;
  }

  int groups() {
    return conv_param_.G;
  }

  /**
   * @brief Returns true if the packed weights would work for the given
   *        convolution parameters, and false otherwise
   */
  bool isPackingCompliant(const conv_param_t<SPATIAL_DIM>& conv_p);

  /**
   * @brief Returns a string of mismatching parameters
   */
  std::string mismatchingParams(const conv_param_t<SPATIAL_DIM>& conv_p);

  /**
   * @brief Unpack the packed matrix into origin_buf (used by serialization to
   *        recover the weight matrix).
   */
  void unpack(T* origin_buf);

 private:
  const conv_param_t<SPATIAL_DIM> conv_param_;
  // Packed weights if we use im2col based convolution implementation
  std::shared_ptr<PackBMatrix<T, accT>> W_im2col_packed_;
  // Packed weights if we use depthwise convolution implementation
  std::shared_ptr<PackedDepthWiseConvMatrix> W_dw_packed_;
  // Packed weights if we use direct convolution implementation
  std::shared_ptr<PackedDirectConvMatrix> W_dc_packed_;
  // Packed weights if we use groupwise (small channels per group) convolution
  // implementation
  std::shared_ptr<PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>>
      W_gconv_packed_;
  // Packed weights if we use direct gemm for pointwise convolution
  std::shared_ptr<PackBMatrix<T, accT>> W_pointwise_packed_;
};
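
// PackWeightsForConv picks the internal layout (depthwise, groupwise, direct,
// pointwise, or im2col/GEMM) from the convolution parameters, so the packed
// object is only valid for shapes that map to the same path. A minimal sketch
// of packing weights once and validating them against the shape used at run
// time (illustrative only; conv_p, runtime_conv_p, and Wint8 are
// placeholders; conv_param_t construction itself is sketched with fbgemmConv
// further below):
//
//   fbgemm::PackWeightsForConv<2> packedW(conv_p, Wint8);
//   if (!packedW.isPackingCompliant(runtime_conv_p)) {
//     throw std::runtime_error(
//         "packed weights do not match: " +
//         packedW.mismatchingParams(runtime_conv_p));
//   }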

/**
 * @brief Matrix packed for the first input matrix in GEMM (usually
 *        activation), where the row offsets used for requantization are
 *        computed during packing. Im2col is fused with packing here. The
 *        source matrix is already quantized.
 */
template <typename T, typename accT = std::int32_t, int SPATIAL_DIM = 2>
class FBGEMM_API PackAWithIm2Col
    : public PackMatrix<PackAWithIm2Col<T, accT, SPATIAL_DIM>, T, accT> {
 public:
  using This = PackAWithIm2Col<T, accT, SPATIAL_DIM>;
  using BaseType = PackMatrix<This, T, accT>;
  using inpType = T;
  using accType = accT;

  PackAWithIm2Col() = delete; // no default constructor
  /**
   * @param a_zero_pt the quantized value that maps to the floating-point
   *                  number 0.0f.
   * @param row_offset If nullptr, this constructor internally allocates a
   *                   buffer and owns it. Otherwise, this class doesn't own
   *                   the buffer. The buffer will be populated when the pack
   *                   function is called.
   * @param b_symmetric if true we skip row offset computation
   */
  PackAWithIm2Col(
      const conv_param_t<SPATIAL_DIM>& conv_param,
      const T* sdata,
      inpType* pmat = nullptr,
      std::int32_t a_zero_pt = 0,
      std::int32_t* row_offset = nullptr,
      bool b_symmetric = false,
      const BlockingFactors* params = nullptr);

  /**
   * Activation matrices are not constant, so we cannot amortize the cost of
   * pre-packing.
   */
  bool isPrePacked() const {
    return false;
  }

  /**
   * @return True if this is used as A matrix.
   */
  static constexpr bool isA() {
    return true;
  }

  /**
   * @brief Packs a block of source matrix into pmat buffer.
   */
  void pack(const block_type_t& block);

  /**
   * @return A pointer to the row offset buffer.
   */
  std::int32_t* getRowOffsetBuffer() const {
    return row_offset_;
  }

  /**
   * @brief Print the packed block.
   */
  void printPackedMatrix(std::string name);

  /**
   * @return Size of row offset buffer in number of elements
   */
  static int rowOffsetBufferSize(const BlockingFactors* params = nullptr);

  ~PackAWithIm2Col() {
    if (rowOffsetAllocatedHere) {
      fbgemmAlignedFree(row_offset_);
    }
  }

 private:
  const conv_param_t<SPATIAL_DIM> conv_p_;
  const T* sdata_;
  std::int32_t a_zero_pt_;
  std::int32_t* row_offset_{nullptr};
  bool rowOffsetAllocatedHere{false};
  std::int32_t row_interleave_B_;
};
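
// With PackAWithIm2Col a convolution can be lowered onto the regular
// fbgemmPacked() GEMM path: the activation tensor is im2col-ed and packed in
// one pass, and its row offsets are produced as a byproduct. A minimal sketch
// (illustrative only; conv_p, Aint8, and a_zero_point are placeholders, and
// the weight side would be a PackBMatrix over the corresponding im2col weight
// layout):
//
//   fbgemm::PackAWithIm2Col<std::uint8_t, std::int32_t, 2> packA(
//       conv_p,
//       Aint8,
//       /*pmat=*/nullptr,       // let the object allocate its pack buffer
//       /*a_zero_pt=*/a_zero_point,
//       /*row_offset=*/nullptr, // let the object allocate row offsets too
//       /*b_symmetric=*/false);
//   // packA.getRowOffsetBuffer() is later handed to ReQuantizeOutput.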

/**
 * @brief Matrix packed for the first input matrix in GEMM (usually
 *        activation), where the row offsets used for requantization are
 *        computed during packing. The source matrix is already quantized.
 */
template <typename T, typename accT = std::int32_t>
class FBGEMM_API PackAWithRowOffset final
    : public PackMatrix<PackAWithRowOffset<T, accT>, T, accT> {
 public:
  using This = PackAWithRowOffset<T, accT>;
  using BaseType = PackMatrix<This, T, accT>;
  using inpType = T;
  using accType = accT;

  PackAWithRowOffset() = delete; // no default constructor
  /**
   * @param row_offset If nullptr, this constructor internally allocates a
   *                   buffer and owns it. Otherwise, this class doesn't own
   *                   the buffer. The buffer will be populated when the pack
   *                   function is called.
   */
  PackAWithRowOffset(
      matrix_op_t trans,
      std::uint32_t nRow,
      std::uint32_t nCol,
      const T* smat,
      std::uint32_t ld,
      inpType* pmat = nullptr,
      int groups = 1,
      std::int32_t* row_offset = nullptr,
      const BlockingFactors* params = nullptr);

  /**
   * Activation matrices are not constant, so we cannot amortize the cost of
   * pre-packing.
   */
  bool isPrePacked() const {
    return false;
  }

  /**
   * @return True if this is used as A matrix.
   */
  static constexpr bool isA() {
    return true;
  }

  /**
   * @return Offset of the element in the packed matrix that was at (i, j) in
   *         the source matrix
   */
  std::int32_t addr(std::int32_t i, std::int32_t j) const;

  /**
   * @brief Packs a block of source matrix into pmat buffer.
   */
  void pack(const block_type_t& block);

  /**
   * @return A pointer to the row offset buffer.
   */
  std::int32_t* getRowOffsetBuffer() const {
    return row_offset_;
  }

  /**
   * @brief Print the packed block.
   */
  void printPackedMatrix(std::string name);

  /**
   * @return size of row offset buffer in number of elements
   */
  static int rowOffsetBufferSize(const BlockingFactors* params = nullptr);

  ~PackAWithRowOffset() {
    if (rowOffsetAllocatedHere) {
      fbgemmAlignedFree(row_offset_);
    }
  }

 private:
  matrix_op_t trans_;
  const T* smat_;
  std::uint32_t ld_;
  std::int32_t* row_offset_;
  bool rowOffsetAllocatedHere;
  std::int32_t row_interleave_B_;
};
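
// When the caller wants to own the row offset buffer (e.g., to reuse one
// allocation per thread), it must be sized with rowOffsetBufferSize(). A
// minimal sketch (illustrative only; M, K, and Auint8 are placeholders
// supplied by the caller):
//
//   std::vector<std::int32_t> row_offsets(
//       fbgemm::PackAWithRowOffset<std::uint8_t>::rowOffsetBufferSize());
//   fbgemm::PackAWithRowOffset<std::uint8_t> packA(
//       fbgemm::matrix_op_t::NoTranspose,
//       /*nRow=*/M,
//       /*nCol=*/K,
//       /*smat=*/Auint8,
//       /*ld=*/K,
//       /*pmat=*/nullptr,
//       /*groups=*/1,
//       /*row_offset=*/row_offsets.data());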

/**
 * @brief Matrix packed for the first input matrix in GEMM (usually
 *        activation), where the row offsets used for requantization are
 *        computed during packing. The source matrix is in fp32 and is
 *        quantized during packing.
 */
template <typename T, typename accT = std::int32_t>
class FBGEMM_API PackAWithQuantRowOffset final
    : public PackMatrix<PackAWithQuantRowOffset<T, accT>, T, accT> {
 public:
  using This = PackAWithQuantRowOffset<T, accT>;
  using BaseType = PackMatrix<This, T, accT>;
  using inpType = T;
  using accType = accT;

  PackAWithQuantRowOffset() = delete; // no default constructor
  /**
   * @param row_offset If nullptr, this constructor internally allocates a
   *                   buffer and owns it. Otherwise, this class doesn't own
   *                   the buffer. The buffer will be populated when the pack
   *                   function is called.
   */
  PackAWithQuantRowOffset(
      matrix_op_t trans,
      std::int32_t nRow,
      std::int32_t nCol,
      const float* smat,
      std::int32_t ld,
      inpType* pmat = nullptr,
      float scale = 1.0f,
      std::int32_t zero_pt = 0,
      int groups = 1,
      std::int32_t* row_offset = nullptr,
      const BlockingFactors* params = nullptr);

  /**
   * Activation matrices are not constant, so we cannot amortize the cost of
   * pre-packing.
   */
  bool isPrePacked() const {
    return false;
  }

  /**
   * @return True if this is used as A matrix.
   */
  static constexpr bool isA() {
    return true;
  }

  /**
   * @return offset of the element in the packed matrix that was at (i, j) in
   *         the source matrix
   */
  std::int32_t addr(std::int32_t i, std::int32_t j) const;

  /**
   * @brief Packs a block of source matrix into pmat buffer.
   */
  void pack(const block_type_t& block);

  /**
   * @return A pointer to the row offset buffer.
   */
  std::int32_t* getRowOffsetBuffer() const {
    return row_offset_;
  }

  /**
   * @brief Print the packed block.
   */
  void printPackedMatrix(std::string name);

  /**
   * @return Size of row offset buffer in number of elements
   */
  static int rowOffsetBufferSize(const BlockingFactors* params = nullptr);

  ~PackAWithQuantRowOffset() {
    if (rowOffsetAllocatedHere) {
      fbgemmAlignedFree(row_offset_);
    }
  }

 private:
  matrix_op_t trans_;
  const float* smat_;
  std::int32_t ld_;
  float scale_;
  std::int32_t zero_pt_;
  std::int32_t* row_offset_;
  bool rowOffsetAllocatedHere;
  std::int32_t row_interleave_B_;
};

/*
 *
 * Post Processing of outputs
 *
 */

/**
 * @brief Does nothing. NoOp. Used as the last operation in the output
 *        processing pipeline.
 *
 */
template <typename outT = std::uint8_t, typename inT = std::uint8_t>
class FBGEMM_API DoNothing {
 public:
  using outType = outT;
  using inpType = inT;
  DoNothing() {}
  template <inst_set_t instSet>
  int f(
      outType* /* unused */,
      inpType* /* unused */,
      const block_type_t& /* unused */,
      int /* unused */,
      int /* unused */) const {
    return 0;
  }
};

/**
 * @brief Copy data pointed by inp ptr to out ptr when
 *        inp ptr and out ptr are not the same.
 *        inp buffer: row and column start points: (0, 0)
 *        output buffer: row and column start points:
 *        (block.row_start, block.col_start)
 *
 * This is the output processing stage that should be passed when there is no
 * requantization and the output is required in the same format as the
 * internal buffer used for accumulation.
 */
template <
    typename outT = std::int32_t,
    typename inT = std::int32_t,
    typename nextOPType = DoNothing<outT, outT>>
class FBGEMM_API memCopy {
 public:
  using outType = outT;
  using inpType = inT;
  explicit memCopy(nextOPType& nextop) : nextop_(nextop) {}
  template <inst_set_t instSet>
  inline int f(
      outType* out,
      inpType* inp,
      const block_type_t& block,
      int ld_out,
      int ld_in) const;

 private:
  nextOPType& nextop_;
};

/**
 * @brief Perform scaling on accumulated data.
 */
template <
    typename outT = std::int32_t,
    typename inT = std::int32_t,
    typename nextOPType = DoNothing<outT, outT>>
class ScaleOP {
 public:
  using outType = outT;
  using inpType = inT;
  explicit ScaleOP(inpType scalingFactor) : scalingFactor_(scalingFactor) {}

  template <inst_set_t instSet>
  inline int f(
      outType* out,
      inpType* inp,
      const block_type_t& block,
      int ld_out,
      int ld_in) const;

 private:
  inpType scalingFactor_;
};

/**
 * @brief Perform Relu on accumulated data.
 */
template <
    typename outT = std::int32_t,
    typename inT = std::int32_t,
    typename nextOPType = DoNothing<outT, outT>>
class ReluOutput {
 public:
  using outType = outT;
  using inpType = inT;
  explicit ReluOutput(inpType zero_pt) : zero_pt_(zero_pt) {}

  template <inst_set_t instSet>
  inline int f(
      outType* out,
      inpType* inp,
      const block_type_t& block,
      int ld_out,
      int ld_in) const;

 private:
  inpType zero_pt_;
};

/**
 * @brief Perform Dense-Matrix * Sparse-Matrix as part of the output
 *        processing pipeline.
 *
 * SPMDM (SParse Matrix times Dense Matrix) is performed in place on the
 * 32-bit input buffer (inp). After modifying the input buffer, pass it to the
 * next op. When groups > 1, each group is a numRows() x (numCols()/groups)
 * matrix.
 */
template <
    typename outT = std::int32_t,
    typename inT = std::int32_t,
    typename nextOPType = DoNothing<inT, inT>>
class FBGEMM_API DoSpmdmOnInpBuffer {
 public:
  using outType = outT;
  using inpType = inT;
  DoSpmdmOnInpBuffer(
      nextOPType& nextop,
      const std::uint8_t* A,
      int lda,
      const CompressedSparseColumn& B_csc,
      int groups = 1)
      : nextop_(nextop), A_(A), lda_(lda), B_csc_(B_csc), groups_(groups) {}

  template <inst_set_t instSet>
  inline int f(
      outT* out,
      inT* inp,
      const block_type_t& block,
      int ld_out,
      int ld_in) const;

 private:
  nextOPType& nextop_;
  const std::uint8_t* A_;
  const int lda_;
  const CompressedSparseColumn& B_csc_;
  const int groups_;
};

/**
 * @brief Perform Dense-Matrix * Sparse-Matrix as part of the output
 *        processing pipeline.
 *
 * SPMDM (SParse Matrix times Dense Matrix) is performed in place on the
 * 32-bit input buffer (inp). After modifying the input buffer, pass it to the
 * next op. When groups > 1, each group is a numRows() x (numCols()/groups)
 * matrix.
 */
template <
    typename outT = std::int32_t,
    typename inT = std::int32_t,
    typename nextOPType = DoNothing<inT, inT>>
class FBGEMM_API DoSConvOnInpBuffer {
 public:
  using outType = outT;
  using inpType = inT;
  DoSConvOnInpBuffer(
      nextOPType& nextop,
      const std::uint8_t* A,
      const conv_param_t<>& conv_p,
      std::int32_t A_zero_point,
      const CompressedSparseColumn& B_csc)
      : nextop_(nextop),
        A_(A),
        conv_p_(conv_p),
        A_zero_point_(A_zero_point),
        B_csc_(B_csc) {}

  template <inst_set_t instSet>
  inline int f(
      outT* out,
      inT* inp,
      const block_type_t& block,
      int ld_out,
      int ld_in) const;

 private:
  nextOPType& nextop_;
  const std::uint8_t* A_;
  const conv_param_t<> conv_p_;
  const std::int32_t A_zero_point_;
  const CompressedSparseColumn& B_csc_;
};

/**
 * @brief Requantize values in the inp buffer and write the result to the out
 *        buffer, then pass the out buffer to the next op for further
 *        processing.
 */
template <
    bool FUSE_RELU,
    QuantizationGranularity Q_GRAN = QuantizationGranularity::TENSOR,
    typename BIAS_TYPE = std::int32_t,
    typename outT = std::uint8_t,
    typename inT = std::int32_t,
    typename nextOPType = DoNothing<outT, outT>>
class FBGEMM_API ReQuantizeOutput {
 public:
  static constexpr int RELU_FUSED = FUSE_RELU;
  static constexpr QuantizationGranularity QGRANType = Q_GRAN;
  using BIAS_T = BIAS_TYPE;
  using outType = outT;
  using inpType = inT;
  /**
   * @param C_multiplier The length of this array is
   *                     1 when Q_GRAN == QuantizationGranularity::TENSOR,
   *                     groups when Q_GRAN == QuantizationGranularity::GROUP,
   *                     nCol if Q_GRAN == QuantizationGranularity::OUT_CHANNEL
   * @param Bq_zero_point The length of this array should be the same as
   *                      C_multiplier.
   * @param row_offsets Typically, this should've been computed by a
   *                    PackAMatrix and should be obtained by
   *                    PackMatrix::getRowOffsetBuffer().
   *                    If Bq_zero_point == 0 (symmetric quantization of the B
   *                    matrix), we can pass nullptr.
   * @param col_offsets This should be pre-computed for example using
   *                    col_offsets_with_zero_pt_s8acc32_ref.
   *                    The length should be nCol.
   *                    See PackedRequantizeTest.cc for an example.
   *                    TODO: if Aq_zero_point == 0, allow passing nullptr.
   * @param bias can be nullptr otherwise the length should be nCol
   * @param act_times_w_scale activation_scale * weight_scale. This is only
   *                          used if bias is unquantized (i.e., float).
   */
  ReQuantizeOutput(
      nextOPType& nextop,
      const float* C_multiplier,
      std::int32_t C_zero_point,
      std::int32_t Aq_zero_point,
      const std::int32_t* Bq_zero_point,
      const std::int32_t* row_offsets,
      const std::int32_t* col_offsets,
      const BIAS_T* bias,
      std::uint32_t nCol,
      int groups = 1,
      const float* act_times_w_scale = nullptr)
      : nextop_(nextop),
        C_multiplier_(C_multiplier),
        C_zero_point_(C_zero_point),
        Aq_zero_point_(Aq_zero_point),
        Bq_zero_point_(Bq_zero_point),
        q_row_offsets_(row_offsets),
        q_col_offsets_(col_offsets),
        bias_(bias),
        ncols_(nCol),
        groups_(groups),
        act_times_w_scale_(act_times_w_scale) {}

  template <inst_set_t instSet>
  inline int f(
      outT* out,
      const inT* inp,
      const block_type_t& block,
      int ld_out,
      int ld_in) const;

  const float* getCMultiplier() const {
    return C_multiplier_;
  }
  std::int32_t getAZeroPoint() const {
    return Aq_zero_point_;
  }
  std::int32_t getCZeroPoint() const {
    return C_zero_point_;
  }
  const std::int32_t* getBZeroPoint() const {
    return Bq_zero_point_;
  }
  const std::int32_t* getRowOffsets() const {
    return q_row_offsets_;
  }
  const std::int32_t* getColOffsets() const {
    return q_col_offsets_;
  }
  const BIAS_T* getBias() const {
    return bias_;
  }
  std::uint32_t getNCols() const {
    return ncols_;
  }
  const float* getActWScale() const {
    return act_times_w_scale_;
  }

  void setRowOffsets(const std::int32_t* row_offsets) {
    q_row_offsets_ = row_offsets;
  }

 private:
  nextOPType& nextop_;
  const float* C_multiplier_;
  std::int32_t C_zero_point_;
  std::int32_t Aq_zero_point_;
  const std::int32_t* Bq_zero_point_;
  const std::int32_t* q_row_offsets_;
  const std::int32_t* q_col_offsets_;
  const BIAS_T* bias_;
  std::uint32_t ncols_;
  int groups_;
  const float* act_times_w_scale_;
};
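
// With per-tensor granularity the requantization multiplier is the usual
// C_multiplier = Aq_scale * Bq_scale / Cq_scale, and the output pipeline is
// built by chaining a terminal DoNothing into ReQuantizeOutput. A minimal
// sketch with no fused ReLU (illustrative only; the scales, zero points,
// col_offsets, N, and packA are placeholders supplied by the caller):
//
//   fbgemm::DoNothing<> doNothing{};
//   float C_multiplier = Aq_scale * Bq_scale / Cq_scale;
//   fbgemm::ReQuantizeOutput<false /* FUSE_RELU */> outputProc(
//       doNothing,
//       &C_multiplier,
//       Cq_zero_point,
//       Aq_zero_point,
//       &Bq_zero_point,
//       packA.getRowOffsetBuffer(),
//       col_offsets.data(), // length N, e.g. from
//                           // col_offsets_with_zero_pt_s8acc32_ref
//       /*bias=*/nullptr,
//       /*nCol=*/N);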

/**
 * @brief Requantize the accumulated int32 values in the inp buffer back to
 *        float, i.e., the output is produced and consumed as float.
 */
template <
    bool FUSE_RELU,
    QuantizationGranularity Q_GRAN = QuantizationGranularity::TENSOR,
    typename outT = float,
    typename inT = std::int32_t,
    typename nextOPType = DoNothing<outT, outT>>
class FBGEMM_API ReQuantizeForFloat {
 public:
  using outType = outT;
  using inpType = inT;
  /**
   * @param Bq_scale The length of this array is
   *                 1 when Q_GRAN == QuantizationGranularity::TENSOR,
   *                 groups when Q_GRAN == QuantizationGranularity::GROUP,
   *                 nCol if Q_GRAN == QuantizationGranularity::OUT_CHANNEL
   * @param Bq_zero_point The length of this array should be the same as
   *                      Bq_scale.
   * @param row_offsets Typically, this should've been computed by a
   *                    PackAMatrix and should be obtained by
   *                    PackMatrix::getRowOffsetBuffer().
   *                    If Bq_zero_point == 0 (symmetric quantization of the B
   *                    matrix), we can pass nullptr.
   * @param col_offsets This should be pre-computed for example using
   *                    col_offsets_with_zero_pt_s8acc32_ref.
   *                    The length should be nCol.
   *                    See PackedRequantizeTest.cc for an example.
   *                    TODO: if Aq_zero_point == 0, allow passing nullptr.
   * @param bias can be nullptr otherwise the length should be nCol
   */
  ReQuantizeForFloat(
      nextOPType& nextop,
      float Aq_scale,
      const float* Bq_scale,
      std::int32_t Aq_zero_point,
      const std::int32_t* Bq_zero_point,
      const std::int32_t* row_offsets,
      const std::int32_t* col_offsets,
      const float* bias,
      std::uint32_t nCol,
      int groups = 1)
      : nextop_(nextop),
        Aq_scale_(Aq_scale),
        Bq_scale_(Bq_scale),
        Aq_zero_point_(Aq_zero_point),
        Bq_zero_point_(Bq_zero_point),
        q_row_offsets_(row_offsets),
        q_col_offsets_(col_offsets),
        bias_(bias),
        ncols_(nCol),
        groups_(groups) {}

  template <inst_set_t instSet>
  inline int f(
      outT* out,
      inT* inp,
      const block_type_t& block,
      int ld_out,
      int ld_in) const;

 private:
  nextOPType& nextop_;
  float Aq_scale_;
  const float* Bq_scale_;
  std::int32_t Aq_zero_point_;
  const std::int32_t* Bq_zero_point_;
  const std::int32_t* q_row_offsets_;
  const std::int32_t* q_col_offsets_;
  const float* bias_;
  std::uint32_t ncols_;
  int groups_;
};

// type specialized implementation in an include file
#include "./OutputProcessing-inl.h"

/*
 *
 * ####### GEMM related functions #######
 *
 */

/**
 * Matrix B must be prepacked. For matrix A, the packA.pack function is called
 * to pack it.
 *
 * @tparam packingAMatrix processing of A matrix while packing,
 *                        e.g., PackAWithQuantRowOffset
 *
 * @tparam packingBMatrix processing of B matrix while packing,
 *                        e.g., pre-multiply by alpha
 * @tparam cT data type of C matrix
 * @tparam processOutputType further processing of outputs, e.g., Relu
 */
template <
    typename packingAMatrix,
    typename packingBMatrix,
    typename cT,
    typename processOutputType>
FBGEMM_API void fbgemmPacked(
    PackMatrix<
        packingAMatrix,
        typename packingAMatrix::inpType,
        typename packingAMatrix::accType>& packA,
    PackMatrix<
        packingBMatrix,
        typename packingBMatrix::inpType,
        typename packingBMatrix::accType>& packB,
    cT* C,
    std::int32_t* C_buffer,
    std::uint32_t ldc,
    const processOutputType& outProcess,
    int thread_id,
    int num_threads,
    const BlockingFactors* blocking_params = nullptr);
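
// Putting the pieces above together: a single-threaded uint8 x int8 -> uint8
// GEMM with per-tensor requantization. A minimal sketch (illustrative only;
// the quantized inputs, scales, zero points, and col_offsets are placeholders
// supplied by the caller, and a real caller would shard thread_id/num_threads
// across a thread pool):
//
//   void int8GemmSketch(
//       int M, int N, int K,
//       const std::uint8_t* Auint8, const std::int8_t* Bint8,
//       std::uint8_t* Cuint8,
//       float Aq_scale, std::int32_t Aq_zero_point,
//       float Bq_scale, std::int32_t Bq_zero_point,
//       float Cq_scale, std::int32_t Cq_zero_point,
//       const std::int32_t* col_offsets /* length N */) {
//     fbgemm::PackBMatrix<std::int8_t> packB(
//         fbgemm::matrix_op_t::NoTranspose, K, N, Bint8, /*ld=*/N);
//     fbgemm::PackAWithRowOffset<std::uint8_t> packA(
//         fbgemm::matrix_op_t::NoTranspose, M, K, Auint8, /*ld=*/K);
//
//     fbgemm::DoNothing<> doNothing{};
//     float C_multiplier = Aq_scale * Bq_scale / Cq_scale;
//     fbgemm::ReQuantizeOutput<false> outputProc(
//         doNothing, &C_multiplier, Cq_zero_point,
//         Aq_zero_point, &Bq_zero_point,
//         packA.getRowOffsetBuffer(), col_offsets,
//         /*bias=*/nullptr, /*nCol=*/N);
//
//     std::vector<std::int32_t> C_buffer(M * N); // int32 accumulation scratch
//     fbgemm::fbgemmPacked(
//         packA, packB, Cuint8, C_buffer.data(), /*ldc=*/N, outputProc,
//         /*thread_id=*/0, /*num_threads=*/1);
//   }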

/**
 * @brief Perform small-channels-per-group groupwise convolution.
 *        Note: Currently threading is not supported. This function does
 *        nothing for thread_ids > 0, i.e., returns early.
 *
 * @param rowOffsetBuf nullptr if B uses symmetric quantization
 */
template <
    typename packed_W,
    typename outType,
    bool FUSE_RELU,
    QuantizationGranularity Q_GRAN,
    int SPATIAL_DIM = 2,
    typename BIAS_TYPE = std::int32_t>
FBGEMM_API void fbgemmGroupwiseConv(
    const conv_param_t<SPATIAL_DIM>& conv_param,
    const std::uint8_t* activations,
    std::int32_t a_zero_point,
    std::int32_t* rowOffsetBuf,
    packed_W& packed_weights,
    outType* out,
    std::int32_t* outBuffer,
    const ReQuantizeOutput<FUSE_RELU, Q_GRAN, BIAS_TYPE>& outProcess,
    int thread_id,
    int num_threads);

template <
    int SPATIAL_DIM,
    QuantizationGranularity Q_GRAN,
    bool FUSE_RELU,
    typename BIAS_TYPE = std::int32_t>
FBGEMM_API void fbgemmDirectConv(
    const conv_param_t<SPATIAL_DIM>& conv_p,
    const uint8_t* Aint8,
    PackedDirectConvMatrix& Bint8_tr,
    uint8_t* C,
    int32_t* C_buffer,
    const ReQuantizeOutput<FUSE_RELU, Q_GRAN, BIAS_TYPE>& outProcess,
    const BIAS_TYPE* bias,
    int thread_id,
    int num_threads);

/**
 * @return Size of row offset buffer in number of elements needed for
 *         fbgemmGroupwiseConv
 */
template <int SPATIAL_DIM = 2>
FBGEMM_API int rowOffsetBufferSizeGConv(
    const conv_param_t<SPATIAL_DIM>& conv_param);

/**
 * @brief Is this depthwise convolution optimized?
 */
template <int SPATIAL_DIM = 2, typename ACC_T = std::int32_t>
bool takeDepthWiseFastPath(const conv_param_t<SPATIAL_DIM>& conv_p);

/**
 * @brief Is this groupwise convolution supported?
 */
template <int SPATIAL_DIM>
FBGEMM_API bool fbgemmOptimizedGConv(const conv_param_t<SPATIAL_DIM>& conv_p);

/**
 * @brief Is this convolution a direct matrix-matrix multiplication, i.e., 1x1
 *        (aka pointwise) with right paddings etc.?
 */
template <int SPATIAL_DIM>
FBGEMM_API bool takePointWiseFastPath(const conv_param_t<SPATIAL_DIM>& conv_p);

/**
 * @brief Are we running on an fbgemm-supported CPU?
 */
FBGEMM_API bool fbgemmSupportedCPU();

/**
 * @brief Performs convolution using the fastest path available.
 *
 * @tparam SPATIAL_DIM It's 2 for 2D convolutions and 3 for 3D convolutions.
 */
template <
    typename processOutputType,
    int SPATIAL_DIM = 2,
    typename ACC_T = std::int32_t>
FBGEMM_API int fbgemmConv(
    const conv_param_t<SPATIAL_DIM>& conv_p,
    const std::uint8_t* activations,
    PackWeightsForConv<SPATIAL_DIM, std::int8_t, ACC_T>& packed_weights,
    typename processOutputType::outType* out,
    std::int32_t* outBuffer,
    processOutputType& outProcess,
    int thread_id,
    int num_threads,
    const BlockingFactors* blocking_params = nullptr);
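
// A minimal end-to-end convolution sketch using fbgemmConv. The conv_param_t
// constructor argument order shown here (batch, IC, OC, input dims, groups,
// kernel, stride, pad) is an assumption to be checked against ConvUtils.h;
// the activations, weights, scales/zero points, and col_offsets are
// placeholders supplied by the caller:
//
//   // 2D conv: N=1, IC=64, OC=64, 56x56 input, groups=1, 3x3 kernel,
//   // stride 1, pad 1 on every side (output stays 56x56).
//   fbgemm::conv_param_t<2> conv_p(
//       1, 64, 64, {56, 56}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1});
//
//   fbgemm::PackWeightsForConv<2> packedW(conv_p, Wint8);
//
//   fbgemm::DoNothing<> doNothing{};
//   float C_multiplier = Aq_scale * Bq_scale / Cq_scale;
//   fbgemm::ReQuantizeOutput<false> outputProc(
//       doNothing, &C_multiplier, Cq_zero_point,
//       Aq_zero_point, &Bq_zero_point,
//       /*row_offsets=*/nullptr, // assumes the chosen conv path supplies row
//                                // offsets via setRowOffsets() internally
//       col_offsets, /*bias=*/nullptr, /*nCol=*/conv_p.OC,
//       /*groups=*/conv_p.G);
//
//   std::vector<std::int32_t> scratch(1 * 56 * 56 * 64); // output elements
//   fbgemm::fbgemmConv(
//       conv_p, Auint8, packedW, Cuint8, scratch.data(), outputProc,
//       /*thread_id=*/0, /*num_threads=*/1);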

/**
 * @brief Returns which fast path to take
 *
 * @tparam SPATIAL_DIM It's 2 for 2D convolutions and 3 for 3D convolutions.
 *
 * @return optimized_conv_t::depthwise, optimized_conv_t::groupwise or
 *         optimized_conv_t::im2col
 *
 */
template <int SPATIAL_DIM = 2, typename ACC_T = std::int32_t>
FBGEMM_API optimized_conv_t
ConvFastPath(const conv_param_t<SPATIAL_DIM>& conv_p);
} // namespace fbgemm