1 | /* |
2 | * Copyright (c) Meta Platforms, Inc. and affiliates. |
3 | * All rights reserved. |
4 | * This source code is licensed under the BSD-style license found in the |
5 | * LICENSE file in the root directory of this source tree. |
6 | */ |
7 | #define FBGEMM_EXPORTS |
#include <cpuinfo.h>
#include <cassert>
#include <cstring>
#include <iomanip>
#include <numeric>
#include "./RefImplementations.h"
#include "fbgemm/Fbgemm.h"
#include "fbgemm/SimdUtils.h"
15 | |
16 | namespace fbgemm { |
17 | |
18 | template <typename T, typename accT, int SPATIAL_DIM> |
19 | PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::PackWeightMatrixForGConv( |
20 | matrix_op_t trans, |
21 | const conv_param_t<SPATIAL_DIM>& conv_param, |
22 | const T* sdata, |
23 | T* pdata) |
24 | : trans_(trans), conv_param_(conv_param), sdata_(sdata) { |
25 | if (!cpuinfo_initialize()) { |
26 | throw std::runtime_error("Failed to initialize cpuinfo!" ); |
27 | } |
28 | GTogether_ = numOfGroupsTogether(conv_param_); |
29 | assert( |
30 | GTogether_ <= conv_param_.G && |
31 | "Number of groups together smaller than total number of groups" ); |
32 | if (!pdata) { |
33 | bufAllocatedHere_ = true; |
34 | int kernel_prod = std::accumulate( |
35 | conv_param.K.begin(), conv_param.K.end(), 1, std::multiplies<int>()); |
36 | // we make it a multiple of 4 |
37 | int paddedICPerG = ((conv_param_.IC / conv_param_.G) + 3) / 4 * 4; |
38 | pdata_ = static_cast<T*>(fbgemmAlignedAlloc( |
39 | 64, |
40 | (conv_param_.G + GTogether_ - 1) / GTogether_ * GTogether_ * |
41 | kernel_prod * (conv_param_.OC / conv_param_.G) * paddedICPerG * |
42 | sizeof(T))); |
43 | } else { |
44 | bufAllocatedHere_ = false; |
45 | pdata_ = pdata; |
46 | } |
47 | |
48 | pack(); |
49 | } |
50 | |
51 | template <typename T, typename accT, int SPATIAL_DIM> |
52 | int PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::numOfGroupsTogether( |
53 | const conv_param_t<SPATIAL_DIM>& conv_param) { |
54 | int OC_per_G = conv_param.OC / conv_param.G; |
55 | int IC_per_G = conv_param.IC / conv_param.G; |
56 | if (fbgemmHasAvx512Support() || fbgemmHasAvx512VnniSupport()) { |
57 | // TODO: change to avx512 when avx512 support is available |
58 | return std::max( |
59 | simd_info<inst_set_t::avx512>::WIDTH_BYTES / OC_per_G / |
60 | std::max(IC_per_G, 4), |
61 | 1); |
62 | } else { |
63 | // avx2 |
64 | // e.g., IC_per_G == 4, we need to work on 2 groups at a time |
65 | return std::max( |
66 | simd_info<inst_set_t::avx2>::WIDTH_BYTES / OC_per_G / |
67 | std::max(IC_per_G, 4), |
68 | 1); |
69 | } |
70 | return 1; |
71 | } |
72 | |
73 | /** |
74 | * @brief Get the index of the unpacked data |
75 | * for a given <t, r, s, k, g, c, tr> |
76 | * |
77 | * Non-transposed: G (T R S C/G) K/G |
78 | * Transposed: G K/G (T R S C/G) |
79 | * Using inline as this will be called frequently |
80 | */ |
81 | template <typename T, typename accT, int SPATIAL_DIM> |
82 | inline int PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::unpacked_index_( |
83 | int t, |
84 | int r, |
85 | int s, |
86 | int k, |
87 | int g, |
88 | int c, |
89 | bool tr) { |
90 | // Get the full dimensions |
91 | // Can't use T as varname because T is a template parameter. |
92 | int F = SPATIAL_DIM <= 2 ? 1 : conv_param_.K[SPATIAL_DIM - 3]; |
93 | int R = SPATIAL_DIM == 1 ? 1 : conv_param_.K[SPATIAL_DIM - 2]; |
94 | int S = conv_param_.K[SPATIAL_DIM - 1]; |
95 | int G = conv_param_.G; |
96 | int IC_per_G = conv_param_.IC / G; |
97 | int OC_per_G = conv_param_.OC / G; |
98 | |
99 | int idx; |
100 | if (tr) { |
101 | idx = ((((g * OC_per_G + k) * F + t) * R + r) * S + s) * IC_per_G + c; |
102 | } else { |
103 | idx = ((((g * F + t) * R + r) * S + s) * IC_per_G + c) * OC_per_G + k; |
104 | } |
105 | return idx; |
106 | } |
107 | |
108 | /** |
109 | * @brief Get the index of the packed data for a given <t, r, s, k, g, c> |
110 | * |
111 | * The index may differ depending on IC_per_G. |
112 | * Using inline as this will be called frequently |
113 | */ |
114 | template <typename T, typename accT, int SPATIAL_DIM> |
115 | inline int PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::packed_index_( |
116 | int t, |
117 | int r, |
118 | int s, |
119 | int k, |
120 | int g, |
121 | int c) { |
122 | // Get the full dimensions |
123 | // Can't use T as varname because T is a template parameter. |
124 | int F = SPATIAL_DIM <= 2 ? 1 : conv_param_.K[SPATIAL_DIM - 3]; |
125 | int R = SPATIAL_DIM == 1 ? 1 : conv_param_.K[SPATIAL_DIM - 2]; |
126 | int S = conv_param_.K[SPATIAL_DIM - 1]; |
127 | int G = conv_param_.G; |
128 | int IC_per_G = conv_param_.IC / G; |
129 | int OC_per_G = conv_param_.OC / G; |
130 | int paddedICPerG = (IC_per_G + 3) / 4 * 4; |
131 | |
132 | int idx = ((((((g / GTogether_) * F + t) * R + r) * S + s) * OC_per_G + k) * |
133 | GTogether_ + |
134 | (g % GTogether_)) * |
135 | paddedICPerG + |
136 | c; |
137 | return idx; |
138 | } |
139 | |
140 | /** |
141 | * @brief Pack or unpack matrix |
142 | * |
143 | * Let IC_per_G be number of input channels per group and OC_per_G be number of |
144 | * output channels per group. |
145 | * |
146 | * For IC_per_G == 4 && OC_per_G == 4 optimized |
147 | * kernel works on 2 groups at a time hence input channels for g and g+1 group |
148 | * are laid out sequentially for each output channel, i.e., the layout is (G/2) |
149 | * R S K (2C) and K (2C) is in each 32B vector. |
150 | * We work on two groups at a time to fully utilize the avx2 SIMD width of |
151 | * 256-bits. |
152 | * |
153 | * For IC_per_G == 8, 16, 32 && OC_per_G == 8, 16, 32 there is no need to work |
154 | * on 2 groups at a time and full SIMD width can be efficiently utilized even |
155 | * while working on 1 group at a time. |
156 | * In this case, the layout is G R S K_per_G paddedICPerG |
157 | */ |
158 | |
159 | template <typename T, typename accT, int SPATIAL_DIM> |
160 | void PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::pack_unpack_( |
161 | const T* src, |
162 | T* dst, |
163 | bool ispack) { |
164 | // Can't use T as varname because T is a template parameter. |
165 | int F = SPATIAL_DIM <= 2 ? 1 : conv_param_.K[SPATIAL_DIM - 3]; |
166 | int R = SPATIAL_DIM == 1 ? 1 : conv_param_.K[SPATIAL_DIM - 2]; |
167 | int S = conv_param_.K[SPATIAL_DIM - 1]; |
168 | int G = conv_param_.G; |
169 | int IC_per_G = conv_param_.IC / G; |
170 | int OC_per_G = conv_param_.OC / G; |
171 | int paddedICPerG = (IC_per_G + 3) / 4 * 4; |
172 | |
173 | // If transpose option is set, the weight matrix is in layout G K/G (T R S |
174 | // C/G) instead of G (T R S C/G) K/G |
175 | bool tr = (trans_ == matrix_op_t::Transpose); |
176 | if (fbgemmOptimizedGConv(conv_param_)) { |
177 | // currently only this case is supported |
178 | for (int t = 0; t < F; ++t) { |
179 | for (int r = 0; r < R; ++r) { |
180 | for (int s = 0; s < S; ++s) { |
181 | for (int k = 0; k < OC_per_G; ++k) { |
182 | for (int g = 0; g < G; ++g) { |
183 | for (int c = 0; c < IC_per_G; ++c) { |
184 | int p_idx = packed_index_(t, r, s, k, g, c); |
185 | int up_idx = unpacked_index_(t, r, s, k, g, c, tr); |
186 | // Pack: src (unpacked) -> dst (packed) |
187 | if (ispack) { |
188 | dst[p_idx] = src[up_idx]; |
189 | } else { |
190 | dst[up_idx] = src[p_idx]; |
191 | } |
192 | } |
193 | if (ispack) { |
194 | for (int c = IC_per_G; c < paddedICPerG; ++c) { |
195 | int p_idx = packed_index_(t, r, s, k, g, c); |
196 | dst[p_idx] = 0; |
197 | } |
198 | } |
199 | } |
200 | } |
201 | } |
202 | } |
203 | } |
204 | } else { |
205 | // For pack & transposed, call transposeConvWeights() |
206 | // G K/G (T R S C/G) => G (T R S C/G) K/G |
207 | if (tr) { |
208 | if (ispack) { |
209 | transposeConvWeights(conv_param_, src, dst); |
210 | } else { |
211 | // TODO: Wrap this as a inverseTransposeConvWeights()? |
212 | // For unpack & transposed, call transposeConvWeights() |
213 | // G (T R S C/G) K/G => G K/G (T R S C/G) |
214 | for (int t = 0; t < F; ++t) { |
215 | for (int r = 0; r < R; ++r) { |
216 | for (int s = 0; s < S; ++s) { |
217 | for (int k = 0; k < OC_per_G; ++k) { |
218 | for (int g = 0; g < G; ++g) { |
219 | for (int c = 0; c < IC_per_G; ++c) { |
220 | dst[((((g * OC_per_G + k) * F + t) * R + r) * S + s) * |
221 | IC_per_G + |
222 | c] = |
223 | src[((((g * F + t) * R + r) * S + s) * IC_per_G + c) * |
224 | OC_per_G + |
225 | k]; |
226 | } |
227 | } |
228 | } |
229 | } |
230 | } |
231 | } |
232 | } // end if(ispack) |
233 | } else { |
234 | // just copy the data for not supported cases |
235 | int kernel_prod = std::accumulate( |
236 | conv_param_.K.begin(), |
237 | conv_param_.K.end(), |
238 | 1, |
239 | std::multiplies<int>()); |
240 | memcpy(dst, src, G * kernel_prod * OC_per_G * IC_per_G * sizeof(inpType)); |
241 | } // end if(tr) |
242 | } // end if(fbgemmOptimizedGConv(conv_param_) |
243 | } |
244 | |
/**
 * @brief Pack weight tensor in a suitable format required for the optimized
 * kernel.
 *
 * Reads the source weights from sdata_ (set at construction) and writes the
 * packed layout into pdata_.
 */
template <typename T, typename accT, int SPATIAL_DIM>
void PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::pack() {
  pack_unpack_(sdata_, pdata_, true);
}
253 | |
254 | /** |
255 | * @brief Unpack the packed weight tensor (for the optimized kernel) |
256 | * to the original form. |
257 | */ |
258 | template <typename T, typename accT, int SPATIAL_DIM> |
259 | void PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::unpack(T* origin_buf) { |
260 | pack_unpack_(const_cast<const T*>(pdata_), origin_buf, false); |
261 | } |
262 | |
263 | template class FBGEMM_API PackWeightMatrixForGConv<int8_t, int32_t, 1>; |
264 | template class FBGEMM_API PackWeightMatrixForGConv<int8_t, int16_t, 1>; |
265 | template class FBGEMM_API PackWeightMatrixForGConv<int8_t, int32_t, 2>; |
266 | template class FBGEMM_API PackWeightMatrixForGConv<int8_t, int16_t, 2>; |
267 | template class FBGEMM_API PackWeightMatrixForGConv<int8_t, int32_t, 3>; |
268 | template class FBGEMM_API PackWeightMatrixForGConv<int8_t, int16_t, 3>; |
269 | } // namespace fbgemm |
270 | |