1 | /* |
2 | * Copyright (c) Meta Platforms, Inc. and affiliates. |
3 | * All rights reserved. |
4 | * This source code is licensed under the BSD-style license found in the |
5 | * LICENSE file in the root directory of this source tree. |
6 | */ |
7 | #define FBGEMM_EXPORTS |
8 | #include <cpuinfo.h> |
9 | #include <cassert> |
10 | #include <cmath> |
11 | #include <cstring> |
12 | #include <iomanip> |
13 | #include <iostream> |
14 | #include <stdexcept> |
15 | #include "./OptimizedKernelsAvx2.h" |
16 | #include "fbgemm/Fbgemm.h" |
17 | #include "fbgemm/QuantUtilsAvx2.h" |
18 | |
19 | namespace fbgemm { |
20 | |
21 | template <typename T, typename accT> |
22 | PackAWithQuantRowOffset<T, accT>::PackAWithQuantRowOffset( |
23 | matrix_op_t trans, |
24 | int32_t nRow, |
25 | int32_t nCol, |
26 | const float* smat, |
27 | int32_t ld, |
28 | inpType* pmat, |
29 | float scale, |
30 | int32_t zero_pt, |
31 | int groups, |
32 | int32_t* row_offset, |
33 | const BlockingFactors* params) |
34 | : PackMatrix<PackAWithQuantRowOffset<T, accT>, T, accT>( |
35 | nRow, |
36 | nCol, |
37 | pmat, |
38 | groups, |
39 | params), |
40 | trans_(trans), |
41 | smat_(smat), |
42 | ld_(ld), |
43 | scale_(scale), |
44 | zero_pt_(zero_pt), |
45 | row_offset_(row_offset) { |
46 | if (!cpuinfo_initialize()) { |
47 | throw std::runtime_error("Failed to initialize cpuinfo!" ); |
48 | } |
49 | if (scale_ == 0.0f) { |
50 | throw std::runtime_error("scale cannot be zero" ); |
51 | } |
52 | if (std::isinf(1.0f / scale_)) { |
53 | throw std::runtime_error("scale's reciprocal cannot be infinity" ); |
54 | } |
55 | if ((!fbgemmHasAvx512VnniSupport() && !fbgemmHasAvx512Support() && |
56 | !fbgemmHasAvx2Support())) { |
57 | assert(0 && "unknown architecure" ); |
58 | } |
59 | |
60 | if (params) { |
61 | BaseType::brow_ = params->MCB; |
62 | BaseType::bcol_ = params->KCB; |
63 | row_interleave_B_ = params->ROW_INTERLEAVE; |
64 | } else { |
65 | const inst_set_t isa = fbgemmInstructionSet(); |
66 | switch (isa) { |
67 | case inst_set_t::avx512_vnni: |
68 | std::tie(BaseType::brow_, BaseType::bcol_, row_interleave_B_) = |
69 | PackingTraits<T, accT, inst_set_t::avx512_vnni>:: |
70 | getMatrixPackAParams(); |
71 | break; |
72 | |
73 | case inst_set_t::avx512_vnni_ymm: |
74 | std::tie(BaseType::brow_, BaseType::bcol_, row_interleave_B_) = |
75 | PackingTraits<T, accT, inst_set_t::avx512_vnni_ymm>:: |
76 | getMatrixPackAParams(); |
77 | break; |
78 | |
79 | case inst_set_t::avx512: |
80 | std::tie(BaseType::brow_, BaseType::bcol_, row_interleave_B_) = |
81 | PackingTraits<T, accT, inst_set_t::avx512>::getMatrixPackAParams(); |
82 | break; |
83 | |
84 | case inst_set_t::avx512_ymm: |
85 | std::tie(BaseType::brow_, BaseType::bcol_, row_interleave_B_) = |
86 | PackingTraits<T, accT, inst_set_t::avx512_ymm>:: |
87 | getMatrixPackAParams(); |
88 | break; |
89 | |
90 | case inst_set_t::avx2: |
91 | std::tie(BaseType::brow_, BaseType::bcol_, row_interleave_B_) = |
92 | PackingTraits<T, accT, inst_set_t::avx2>::getMatrixPackAParams(); |
93 | break; |
94 | |
95 | default: |
96 | assert(0 && "unknown architecure" ); |
97 | throw std::runtime_error("unknown architecure" ); |
98 | } |
99 | } |
100 | |
101 | rowOffsetAllocatedHere = false; |
102 | |
103 | if (BaseType::numCols() % groups != 0) { |
104 | throw std::runtime_error( |
105 | "groups = " + std::to_string(groups) + |
106 | " does not divide numCols = " + std::to_string(BaseType::numCols())); |
107 | } |
108 | if (pmat) { |
109 | BaseType::buf_ = pmat; |
110 | } else { |
111 | BaseType::bufAllocatedHere_ = true; |
112 | BaseType::buf_ = static_cast<T*>( |
113 | fbgemmAlignedAlloc(64, BaseType::brow_ * BaseType::bcol_ * sizeof(T))); |
114 | } |
115 | if (!row_offset_) { |
116 | rowOffsetAllocatedHere = true; |
117 | row_offset_ = static_cast<int32_t*>( |
118 | fbgemmAlignedAlloc(64, BaseType::brow_ * sizeof(accT))); |
119 | } |
120 | } |
121 | |
122 | template <typename T, typename accT> |
123 | void PackAWithQuantRowOffset<T, accT>::pack(const block_type_t& block) { |
124 | // assert(block.row_start % BaseType::blockRowSize() == 0); |
125 | assert(block.row_size <= BaseType::blockRowSize()); |
126 | assert(block.col_size <= BaseType::blockColSize()); |
127 | |
128 | block_type_t block_p = { |
129 | block.row_start, |
130 | block.row_size, |
131 | block.col_start, |
132 | (block.col_size + row_interleave_B_ - 1) / row_interleave_B_ * |
133 | row_interleave_B_}; |
134 | assert(block_p.col_size <= BaseType::blockColSize()); |
135 | BaseType::packedBlock(block_p); |
136 | |
137 | T* out = BaseType::getBuf(); |
138 | bool tr = (trans_ == matrix_op_t::Transpose); |
139 | // accumulate into row offset? |
140 | bool row_offset_acc = |
141 | (block.col_start % (this->numCols() / this->numGroups())) != 0; |
142 | int32_t* row_offset_buf = getRowOffsetBuffer(); |
143 | |
144 | float* smat_transposed = nullptr; |
145 | if (tr) { |
146 | smat_transposed = static_cast<float*>(fbgemmAlignedAlloc( |
147 | 64, block.row_size * block.col_size * sizeof(float))); |
148 | transpose_simd( |
149 | block.col_size, |
150 | block.row_size, |
151 | smat_ + block.col_start * ld_ + block.row_start, |
152 | ld_, |
153 | smat_transposed, |
154 | block.col_size); |
155 | } |
156 | const float* smat_temp = |
157 | tr ? smat_transposed : smat_ + block.row_start * ld_ + block.col_start; |
158 | int32_t ld_temp = tr ? block.col_size : ld_; |
159 | |
160 | static_assert( |
161 | std::is_same<T, uint8_t>::value, |
162 | "PackAWithQuantRowOffset<T, accT>::pack only works for T == uint8_t" ); |
163 | |
164 | // Only scale and zero points are used in QuantizeAvx2 |
165 | TensorQuantizationParams qparams; |
166 | qparams.scale = scale_; |
167 | qparams.zero_point = zero_pt_; |
168 | |
169 | for (int i = 0; i < block.row_size; ++i) { |
170 | QuantizeAvx2( |
171 | smat_temp + i * ld_temp, |
172 | out + i * BaseType::blockColSize(), |
173 | block.col_size, |
174 | qparams); |
175 | int32_t row_sum = row_offset_acc ? row_offset_buf[i] : 0; |
176 | row_sum += reduceAvx2(out + i * BaseType::blockColSize(), block.col_size); |
177 | row_offset_buf[i] = row_sum; |
178 | |
179 | // zero fill |
180 | // Please see the comment in PackAMatrix.cc on zero vs zero_pt fill. |
181 | for (int j = block.col_start + block.col_size; j < block_p.col_size; ++j) { |
182 | out[i * BaseType::blockColSize() + j] = 0; |
183 | } |
184 | } |
185 | if (smat_transposed) { |
186 | fbgemmAlignedFree(smat_transposed); |
187 | } |
188 | } |
189 | |
190 | template <typename T, typename accT> |
191 | int32_t PackAWithQuantRowOffset<T, accT>::addr(int32_t r, int32_t c) const { |
192 | int32_t block_row_id = r / BaseType::blockRowSize(); |
193 | int32_t brow_offset = (block_row_id * BaseType::blockCols()) * |
194 | (BaseType::blockRowSize() * BaseType::blockColSize()); |
195 | |
196 | int32_t block_col_id = c / BaseType::blockColSize(); |
197 | int32_t bcol_offset = |
198 | block_col_id * BaseType::blockRowSize() * BaseType::blockColSize(); |
199 | int32_t block_offset = brow_offset + bcol_offset; |
200 | int32_t inblock_offset = |
201 | (r % BaseType::blockRowSize()) * BaseType::blockColSize() + |
202 | (c % BaseType::blockColSize()); |
203 | |
204 | int32_t index = block_offset + inblock_offset; |
205 | |
206 | return index; |
207 | } |
208 | |
209 | template <typename T, typename accT> |
210 | void PackAWithQuantRowOffset<T, accT>::printPackedMatrix(std::string name) { |
211 | std::cout << name << ":" |
212 | << "[" << BaseType::numPackedRows() << ", " |
213 | << BaseType::numPackedCols() << "]" << std::endl; |
214 | |
215 | T* out = BaseType::getBuf(); |
216 | for (auto r = 0; r < BaseType::numPackedRows(); ++r) { |
217 | for (auto c = 0; c < BaseType::numPackedCols(); ++c) { |
218 | T val = out[addr(r, c)]; |
219 | if (std::is_integral<T>::value) { |
220 | // cast to int64 because cout doesn't print int8_t type directly |
221 | std::cout << std::setw(5) << static_cast<int64_t>(val) << " " ; |
222 | } else { |
223 | std::cout << std::setw(5) << val << " " ; |
224 | } |
225 | } |
226 | std::cout << std::endl; |
227 | } |
228 | std::cout << std::endl; |
229 | } |
230 | |
231 | template <typename T, typename accT> |
232 | int PackAWithQuantRowOffset<T, accT>::rowOffsetBufferSize( |
233 | const BlockingFactors* params) { |
234 | if (cpuinfo_initialize()) { |
235 | if (params) { |
236 | return params->MCB; |
237 | } else { |
238 | if (fbgemmHasAvx512VnniSupport()) { |
239 | return PackingTraits<T, accT, inst_set_t::avx512_vnni>::MCB; |
240 | } else if (fbgemmHasAvx512Support()) { |
241 | return PackingTraits<T, accT, inst_set_t::avx512>::MCB; |
242 | } else if (fbgemmHasAvx2Support()) { |
243 | return PackingTraits<T, accT, inst_set_t::avx2>::MCB; |
244 | } else { |
245 | assert(0 && "unsupported architecture" ); |
246 | return -1; |
247 | } |
248 | } |
249 | } else { |
250 | throw std::runtime_error("Failed to initialize cpuinfo!" ); |
251 | } |
252 | } |
253 | |
// Explicit instantiation: uint8 quantized A with int32 accumulation is the
// only configuration used by the library.
template class PackAWithQuantRowOffset<uint8_t, int32_t>;
255 | |
256 | } // namespace fbgemm |
257 | |