/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */
#define FBGEMM_EXPORTS
#include <cpuinfo.h>
#include <cassert>
#include <cmath>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <stdexcept>
#include "./OptimizedKernelsAvx2.h"
#include "fbgemm/Fbgemm.h"
#include "fbgemm/QuantUtilsAvx2.h"

namespace fbgemm {

template <typename T, typename accT>
PackAWithQuantRowOffset<T, accT>::PackAWithQuantRowOffset(
    matrix_op_t trans,
    int32_t nRow,
    int32_t nCol,
    const float* smat,
    int32_t ld,
    inpType* pmat,
    float scale,
    int32_t zero_pt,
    int groups,
    int32_t* row_offset,
    const BlockingFactors* params)
    : PackMatrix<PackAWithQuantRowOffset<T, accT>, T, accT>(
          nRow,
          nCol,
          pmat,
          groups,
          params),
      trans_(trans),
      smat_(smat),
      ld_(ld),
      scale_(scale),
      zero_pt_(zero_pt),
      row_offset_(row_offset) {
  if (!cpuinfo_initialize()) {
    throw std::runtime_error("Failed to initialize cpuinfo!");
  }
  if (scale_ == 0.0f) {
    throw std::runtime_error("scale cannot be zero");
  }
  if (std::isinf(1.0f / scale_)) {
    throw std::runtime_error("scale's reciprocal cannot be infinity");
  }
  if ((!fbgemmHasAvx512VnniSupport() && !fbgemmHasAvx512Support() &&
       !fbgemmHasAvx2Support())) {
    assert(0 && "unknown architecture");
  }

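  // Pick the MCB x KCB blocking and B's row interleave, either from the
  // caller-provided blocking factors or from the packing traits of the
  // detected instruction set.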
  if (params) {
    BaseType::brow_ = params->MCB;
    BaseType::bcol_ = params->KCB;
    row_interleave_B_ = params->ROW_INTERLEAVE;
  } else {
    const inst_set_t isa = fbgemmInstructionSet();
    switch (isa) {
      case inst_set_t::avx512_vnni:
        std::tie(BaseType::brow_, BaseType::bcol_, row_interleave_B_) =
            PackingTraits<T, accT, inst_set_t::avx512_vnni>::
                getMatrixPackAParams();
        break;

      case inst_set_t::avx512_vnni_ymm:
        std::tie(BaseType::brow_, BaseType::bcol_, row_interleave_B_) =
            PackingTraits<T, accT, inst_set_t::avx512_vnni_ymm>::
                getMatrixPackAParams();
        break;

      case inst_set_t::avx512:
        std::tie(BaseType::brow_, BaseType::bcol_, row_interleave_B_) =
            PackingTraits<T, accT, inst_set_t::avx512>::getMatrixPackAParams();
        break;

      case inst_set_t::avx512_ymm:
        std::tie(BaseType::brow_, BaseType::bcol_, row_interleave_B_) =
            PackingTraits<T, accT, inst_set_t::avx512_ymm>::
                getMatrixPackAParams();
        break;

      case inst_set_t::avx2:
        std::tie(BaseType::brow_, BaseType::bcol_, row_interleave_B_) =
            PackingTraits<T, accT, inst_set_t::avx2>::getMatrixPackAParams();
        break;
      default:
        assert(0 && "unknown architecture");
        throw std::runtime_error("unknown architecture");
    }
  }

  rowOffsetAllocatedHere = false;

  if (BaseType::numCols() % groups != 0) {
    throw std::runtime_error(
        "groups = " + std::to_string(groups) +
        " does not divide numCols = " + std::to_string(BaseType::numCols()));
  }
  if (pmat) {
    BaseType::buf_ = pmat;
  } else {
    BaseType::bufAllocatedHere_ = true;
    BaseType::buf_ = static_cast<T*>(
        fbgemmAlignedAlloc(64, BaseType::brow_ * BaseType::bcol_ * sizeof(T)));
  }
  if (!row_offset_) {
    rowOffsetAllocatedHere = true;
    row_offset_ = static_cast<int32_t*>(
        fbgemmAlignedAlloc(64, BaseType::brow_ * sizeof(accT)));
  }
}

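// Quantizes one block of the fp32 source matrix A into the packed uint8
// buffer and accumulates the per-row sums of the quantized values into the
// row offset buffer (used later during requantization).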
template <typename T, typename accT>
void PackAWithQuantRowOffset<T, accT>::pack(const block_type_t& block) {
  // assert(block.row_start % BaseType::blockRowSize() == 0);
  assert(block.row_size <= BaseType::blockRowSize());
  assert(block.col_size <= BaseType::blockColSize());

  block_type_t block_p = {
      block.row_start,
      block.row_size,
      block.col_start,
      (block.col_size + row_interleave_B_ - 1) / row_interleave_B_ *
          row_interleave_B_};
  assert(block_p.col_size <= BaseType::blockColSize());
  BaseType::packedBlock(block_p);

  T* out = BaseType::getBuf();
  bool tr = (trans_ == matrix_op_t::Transpose);
  // accumulate into row offset?
  bool row_offset_acc =
      (block.col_start % (this->numCols() / this->numGroups())) != 0;
  int32_t* row_offset_buf = getRowOffsetBuffer();

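  // For a transposed A, transpose the fp32 block into a temporary buffer
  // first so the quantization loop below can always read rows contiguously.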
  float* smat_transposed = nullptr;
  if (tr) {
    smat_transposed = static_cast<float*>(fbgemmAlignedAlloc(
        64, block.row_size * block.col_size * sizeof(float)));
    transpose_simd(
        block.col_size,
        block.row_size,
        smat_ + block.col_start * ld_ + block.row_start,
        ld_,
        smat_transposed,
        block.col_size);
  }
  const float* smat_temp =
      tr ? smat_transposed : smat_ + block.row_start * ld_ + block.col_start;
  int32_t ld_temp = tr ? block.col_size : ld_;

  static_assert(
      std::is_same<T, uint8_t>::value,
      "PackAWithQuantRowOffset<T, accT>::pack only works for T == uint8_t");

  // Only scale and zero points are used in QuantizeAvx2
  TensorQuantizationParams qparams;
  qparams.scale = scale_;
  qparams.zero_point = zero_pt_;

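  // Row by row: quantize block.col_size floats into uint8 using the
  // scale/zero_point set above, then sum the quantized row with reduceAvx2,
  // accumulating across column blocks when row_offset_acc is set.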
  for (int i = 0; i < block.row_size; ++i) {
    QuantizeAvx2(
        smat_temp + i * ld_temp,
        out + i * BaseType::blockColSize(),
        block.col_size,
        qparams);
    int32_t row_sum = row_offset_acc ? row_offset_buf[i] : 0;
    row_sum += reduceAvx2(out + i * BaseType::blockColSize(), block.col_size);
    row_offset_buf[i] = row_sum;

    // zero fill
    // Please see the comment in PackAMatrix.cc on zero vs zero_pt fill.
    for (int j = block.col_start + block.col_size; j < block_p.col_size; ++j) {
      out[i * BaseType::blockColSize() + j] = 0;
    }
  }
  if (smat_transposed) {
    fbgemmAlignedFree(smat_transposed);
  }
}

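// Maps an (r, c) coordinate of the unpacked matrix to a linear index into the
// packed buffer: blockRowSize() x blockColSize() tiles are laid out in
// row-major tile order, and elements within a tile are row-major as well.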
template <typename T, typename accT>
int32_t PackAWithQuantRowOffset<T, accT>::addr(int32_t r, int32_t c) const {
  int32_t block_row_id = r / BaseType::blockRowSize();
  int32_t brow_offset = (block_row_id * BaseType::blockCols()) *
      (BaseType::blockRowSize() * BaseType::blockColSize());

  int32_t block_col_id = c / BaseType::blockColSize();
  int32_t bcol_offset =
      block_col_id * BaseType::blockRowSize() * BaseType::blockColSize();
  int32_t block_offset = brow_offset + bcol_offset;
  int32_t inblock_offset =
      (r % BaseType::blockRowSize()) * BaseType::blockColSize() +
      (c % BaseType::blockColSize());

  int32_t index = block_offset + inblock_offset;

  return index;
}

template <typename T, typename accT>
void PackAWithQuantRowOffset<T, accT>::printPackedMatrix(std::string name) {
  std::cout << name << ":"
            << "[" << BaseType::numPackedRows() << ", "
            << BaseType::numPackedCols() << "]" << std::endl;

  T* out = BaseType::getBuf();
  for (auto r = 0; r < BaseType::numPackedRows(); ++r) {
    for (auto c = 0; c < BaseType::numPackedCols(); ++c) {
      T val = out[addr(r, c)];
      if (std::is_integral<T>::value) {
        // cast to int64 because cout doesn't print int8_t type directly
        std::cout << std::setw(5) << static_cast<int64_t>(val) << " ";
      } else {
        std::cout << std::setw(5) << val << " ";
      }
    }
    std::cout << std::endl;
  }
  std::cout << std::endl;
}

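// The row offset buffer needs one entry per row of a packed block, i.e. MCB
// entries for the blocking factors of the target instruction set.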
template <typename T, typename accT>
int PackAWithQuantRowOffset<T, accT>::rowOffsetBufferSize(
    const BlockingFactors* params) {
  if (cpuinfo_initialize()) {
    if (params) {
      return params->MCB;
    } else {
      if (fbgemmHasAvx512VnniSupport()) {
        return PackingTraits<T, accT, inst_set_t::avx512_vnni>::MCB;
      } else if (fbgemmHasAvx512Support()) {
        return PackingTraits<T, accT, inst_set_t::avx512>::MCB;
      } else if (fbgemmHasAvx2Support()) {
        return PackingTraits<T, accT, inst_set_t::avx2>::MCB;
      } else {
        assert(0 && "unsupported architecture");
        return -1;
      }
    }
  } else {
    throw std::runtime_error("Failed to initialize cpuinfo!");
  }
}

template class PackAWithQuantRowOffset<uint8_t, int32_t>;

} // namespace fbgemm