1 | /* |
2 | * Copyright (c) Meta Platforms, Inc. and affiliates. |
3 | * All rights reserved. |
4 | * This source code is licensed under the BSD-style license found in the |
5 | * LICENSE file in the root directory of this source tree. |
6 | */ |
7 | #define FBGEMM_EXPORTS |
8 | #include <cpuinfo.h> |
9 | #include <cassert> |
10 | #include <cmath> |
11 | #include <cstring> |
12 | #include <iomanip> |
13 | #include <iostream> |
14 | #include <stdexcept> |
15 | #include "./OptimizedKernelsAvx2.h" |
16 | #include "fbgemm/Fbgemm.h" |
17 | #include "fbgemm/QuantUtilsAvx2.h" |
18 | |
19 | namespace fbgemm { |
20 | |
21 | template <typename T, typename accT> |
22 | PackAWithQuantRowOffset<T, accT>::PackAWithQuantRowOffset( |
23 | matrix_op_t trans, |
24 | int32_t nRow, |
25 | int32_t nCol, |
26 | const float* smat, |
27 | int32_t ld, |
28 | inpType* pmat, |
29 | float scale, |
30 | int32_t zero_pt, |
31 | int groups, |
32 | int32_t* row_offset, |
33 | const BlockingFactors* params) |
34 | : PackMatrix<PackAWithQuantRowOffset<T, accT>, T, accT>( |
35 | nRow, |
36 | nCol, |
37 | pmat, |
38 | groups, |
39 | params), |
40 | trans_(trans), |
41 | smat_(smat), |
42 | ld_(ld), |
43 | scale_(scale), |
44 | zero_pt_(zero_pt), |
45 | row_offset_(row_offset) { |
46 | if (!cpuinfo_initialize()) { |
47 | throw std::runtime_error("Failed to initialize cpuinfo!" ); |
48 | } |
49 | if (scale_ == 0.0f) { |
50 | throw std::runtime_error("scale cannot be zero" ); |
51 | } |
52 | if (std::isinf(1.0f / scale_)) { |
53 | throw std::runtime_error("scale's reciprocal cannot be infinity" ); |
54 | } |
55 | if ((!fbgemmHasAvx512VnniSupport() && !fbgemmHasAvx512Support() && |
56 | !fbgemmHasAvx2Support())) { |
57 | assert(0 && "unknown architecure" ); |
58 | } |
59 | |
60 | if (params) { |
61 | BaseType::brow_ = params->MCB; |
62 | BaseType::bcol_ = params->KCB; |
63 | row_interleave_B_ = params->ROW_INTERLEAVE; |
64 | } else { |
65 | const inst_set_t isa = fbgemmInstructionSet(); |
66 | switch (isa) { |
67 | case inst_set_t::avx512_vnni: |
68 | std::tie(BaseType::brow_, BaseType::bcol_, row_interleave_B_) = |
69 | PackingTraits<T, accT, inst_set_t::avx512_vnni>:: |
70 | getMatrixPackAParams(); |
71 | break; |
72 | |
73 | case inst_set_t::avx512_vnni_ymm: |
74 | std::tie(BaseType::brow_, BaseType::bcol_, row_interleave_B_) = |
75 | PackingTraits<T, accT, inst_set_t::avx512_vnni_ymm>:: |
76 | getMatrixPackAParams(); |
77 | break; |
78 | |
79 | case inst_set_t::avx512: |
80 | std::tie(BaseType::brow_, BaseType::bcol_, row_interleave_B_) = |
81 | PackingTraits<T, accT, inst_set_t::avx512>::getMatrixPackAParams(); |
82 | break; |
83 | |
84 | case inst_set_t::avx512_ymm: |
85 | std::tie(BaseType::brow_, BaseType::bcol_, row_interleave_B_) = |
86 | PackingTraits<T, accT, inst_set_t::avx512_ymm>:: |
87 | getMatrixPackAParams(); |
88 | break; |
89 | |
90 | case inst_set_t::avx2: |
91 | std::tie(BaseType::brow_, BaseType::bcol_, row_interleave_B_) = |
92 | PackingTraits<T, accT, inst_set_t::avx2>::getMatrixPackAParams(); |
93 | break; |
94 | |
95 | default: |
96 | assert(0 && "unknown architecure" ); |
97 | throw std::runtime_error("unknown architecure" ); |
98 | } |
99 | } |
100 | |
101 | rowOffsetAllocatedHere = false; |
102 | |
103 | if (BaseType::numCols() % groups != 0) { |
104 | throw std::runtime_error( |
105 | "groups = " + std::to_string(groups) + |
106 | " does not divide numCols = " + std::to_string(BaseType::numCols())); |
107 | } |
108 | if (pmat) { |
109 | BaseType::buf_ = pmat; |
110 | } else { |
111 | BaseType::bufAllocatedHere_ = true; |
112 | BaseType::buf_ = static_cast<T*>( |
113 | fbgemmAlignedAlloc(64, BaseType::brow_ * BaseType::bcol_ * sizeof(T))); |
114 | } |
115 | if (!row_offset_) { |
116 | rowOffsetAllocatedHere = true; |
117 | row_offset_ = static_cast<int32_t*>( |
118 | fbgemmAlignedAlloc(64, BaseType::brow_ * sizeof(accT))); |
119 | } |
120 | } |
121 | |
122 | template <typename T, typename accT> |
123 | void PackAWithQuantRowOffset<T, accT>::pack(const block_type_t& block) { |
124 | // assert(block.row_start % BaseType::blockRowSize() == 0); |
125 | assert(block.row_size <= BaseType::blockRowSize()); |
126 | assert(block.col_size <= BaseType::blockColSize()); |
127 | |
128 | block_type_t block_p = { |
129 | block.row_start, |
130 | block.row_size, |
131 | block.col_start, |
132 | (block.col_size + row_interleave_B_ - 1) / row_interleave_B_ * |
133 | row_interleave_B_}; |
134 | assert(block_p.col_size <= BaseType::blockColSize()); |
135 | BaseType::packedBlock(block_p); |
136 | |
137 | T* out = BaseType::getBuf(); |
138 | bool tr = (trans_ == matrix_op_t::Transpose); |
139 | // accumulate into row offset? |
140 | bool row_offset_acc = |
141 | (block.col_start % (this->numCols() / this->numGroups())) != 0; |
142 | int32_t* row_offset_buf = getRowOffsetBuffer(); |
143 | |
144 | float* smat_transposed = nullptr; |
145 | if (tr) { |
146 | smat_transposed = static_cast<float*>(fbgemmAlignedAlloc( |
147 | 64, block.row_size * block.col_size * sizeof(float))); |
148 | transpose_simd( |
149 | block.col_size, |
150 | block.row_size, |
151 | smat_ + block.col_start * ld_ + block.row_start, |
152 | ld_, |
153 | smat_transposed, |
154 | block.col_size); |
155 | } |
156 | const float* smat_temp = |
157 | tr ? smat_transposed : smat_ + block.row_start * ld_ + block.col_start; |
158 | int32_t ld_temp = tr ? block.col_size : ld_; |
159 | |
160 | static_assert( |
161 | std::is_same<T, uint8_t>::value, |
162 | "PackAWithQuantRowOffset<T, accT>::pack only works for T == uint8_t" ); |
163 | |
164 | // Only scale and zero points are used in QuantizeAvx2 |
165 | TensorQuantizationParams qparams; |
166 | qparams.scale = scale_; |
167 | qparams.zero_point = zero_pt_; |
168 | |
169 | for (int i = 0; i < block.row_size; ++i) { |
170 | QuantizeAvx2( |
171 | smat_temp + i * ld_temp, |
172 | out + i * BaseType::blockColSize(), |
173 | block.col_size, |
174 | qparams); |
175 | int32_t row_sum = row_offset_acc ? row_offset_buf[i] : 0; |
176 | row_sum += reduceAvx2(out + i * BaseType::blockColSize(), block.col_size); |
177 | row_offset_buf[i] = row_sum; |
178 | |
179 | // zero fill |
180 | // Please see the comment in PackAMatrix.cc on zero vs zero_pt fill. |
181 | for (int j = block.col_start + block.col_size; j < block_p.col_size; ++j) { |
182 | out[i * BaseType::blockColSize() + j] = 0; |
183 | } |
184 | } |
185 | if (smat_transposed) { |
186 | fbgemmAlignedFree(smat_transposed); |
187 | } |
188 | } |
189 | |
190 | template <typename T, typename accT> |
191 | int32_t PackAWithQuantRowOffset<T, accT>::addr(int32_t r, int32_t c) const { |
192 | int32_t block_row_id = r / BaseType::blockRowSize(); |
193 | int32_t brow_offset = (block_row_id * BaseType::blockCols()) * |
194 | (BaseType::blockRowSize() * BaseType::blockColSize()); |
195 | |
196 | int32_t block_col_id = c / BaseType::blockColSize(); |
197 | int32_t bcol_offset = |
198 | block_col_id * BaseType::blockRowSize() * BaseType::blockColSize(); |
199 | int32_t block_offset = brow_offset + bcol_offset; |
200 | int32_t inblock_offset = |
201 | (r % BaseType::blockRowSize()) * BaseType::blockColSize() + |
202 | (c % BaseType::blockColSize()); |
203 | |
204 | int32_t index = block_offset + inblock_offset; |
205 | |
206 | return index; |
207 | } |
208 | |
209 | template <typename T, typename accT> |
210 | void PackAWithQuantRowOffset<T, accT>::printPackedMatrix(std::string name) { |
211 | std::cout << name << ":" |
212 | << "[" << BaseType::numPackedRows() << ", " |
213 | << BaseType::numPackedCols() << "]" << std::endl; |
214 | |
215 | T* out = BaseType::getBuf(); |
216 | for (auto r = 0; r < BaseType::numPackedRows(); ++r) { |
217 | for (auto c = 0; c < BaseType::numPackedCols(); ++c) { |
218 | T val = out[addr(r, c)]; |
219 | if (std::is_integral<T>::value) { |
220 | // cast to int64 because cout doesn't print int8_t type directly |
221 | std::cout << std::setw(5) << static_cast<int64_t>(val) << " " ; |
222 | } else { |
223 | std::cout << std::setw(5) << val << " " ; |
224 | } |
225 | } |
226 | std::cout << std::endl; |
227 | } |
228 | std::cout << std::endl; |
229 | } |
230 | |
231 | template <typename T, typename accT> |
232 | int PackAWithQuantRowOffset<T, accT>::rowOffsetBufferSize( |
233 | const BlockingFactors* params) { |
234 | if (cpuinfo_initialize()) { |
235 | if (params) { |
236 | return params->MCB; |
237 | } else { |
238 | if (fbgemmHasAvx512VnniSupport()) { |
239 | return PackingTraits<T, accT, inst_set_t::avx512_vnni>::MCB; |
240 | } else if (fbgemmHasAvx512Support()) { |
241 | return PackingTraits<T, accT, inst_set_t::avx512>::MCB; |
242 | } else if (fbgemmHasAvx2Support()) { |
243 | return PackingTraits<T, accT, inst_set_t::avx2>::MCB; |
244 | } else { |
245 | assert(0 && "unsupported architecture" ); |
246 | return -1; |
247 | } |
248 | } |
249 | } else { |
250 | throw std::runtime_error("Failed to initialize cpuinfo!" ); |
251 | } |
252 | } |
253 | |
// Explicit instantiation: uint8 quantized A with int32 accumulation is the
// only configuration used by the library.
template class PackAWithQuantRowOffset<uint8_t, int32_t>;
255 | |
256 | } // namespace fbgemm |
257 | |