1 | /** |
2 | * Copyright (c) Glow Contributors. See CONTRIBUTORS file. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
16 | |
17 | #ifndef GLOW_QUANTIZATION_BASE_BASE_H |
18 | #define GLOW_QUANTIZATION_BASE_BASE_H |
19 | |
20 | #include "glow/Base/Tensor.h" |
21 | #include "glow/Base/Traits.h" |
22 | #include "glow/Base/Type.h" |
23 | |
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/Support/ErrorHandling.h"

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <limits>
#include <string>
#include <utility>
#include <vector>
28 | |
29 | namespace glow { |
30 | |
31 | /// Dummy scale used for representing dummy quantization parameters that have |
32 | /// been loaded in place of real quantization parameters. |
33 | constexpr float dummyScale = 0.123456813395023345947265625; |
34 | |
/// Profiling parameters of a tensor, consisting of the global minimum and
/// maximum values and the histogram obtained during profiling. Note that the
/// histogram is not normalized.
38 | struct TensorProfilingParams { |
39 | float min; |
40 | float max; |
41 | std::vector<float> histogram; |
42 | |
43 | TensorProfilingParams() = default; |
44 | TensorProfilingParams(float min, float max) : min(min), max(max) {} |
45 | TensorProfilingParams(float min, float max, const std::vector<float> &hist) |
46 | : min(min), max(max), histogram(hist) {} |
47 | TensorProfilingParams(float min, float max, const Tensor &hist) |
48 | : min(min), max(max) { |
49 | auto histH = hist.getHandle<float>(); |
50 | histogram = std::vector<float>(histH.size()); |
51 | for (dim_t idx = 0, e = histH.size(); idx < e; idx++) { |
52 | histogram[idx] = histH.raw(idx); |
53 | } |
54 | } |
55 | }; |
56 | |
/// Main attributes of a quantized tensor.
/// Scale and Offset allow quantization of a float tensor and dequantization of
/// an integer tensor back to a float one: float = scale * (quantized - offset).
60 | struct TensorQuantizationParams { |
61 | float scale; |
62 | int32_t offset; |
63 | }; |
64 | |
/// A data structure that represents the 32-bit to 8-bit quantization
/// scaling operation. This data structure represents the transformation:
/// ((((input >> pre) * scale) + rtn) >> post) + offset, where rtn is the
/// round-to-nearest term added before the final right shift.
68 | struct QuantizationTransform32To8 { |
69 | int pre; |
70 | int post; |
71 | int scale; |
72 | int offset; |
73 | |
74 | /// Initializes the transformation based on the conversion formula (above). |
75 | QuantizationTransform32To8(int pre, int post, int scale, int offset) |
76 | : pre(pre), post(post), scale(scale), offset(offset) {} |
77 | |
78 | /// \returns the scaled integer. |
  int32_t transform(int32_t input) const {
    // The operation x >> post rounds down towards negative infinity. To get
    // round-to-nearest we add (1 << (post - 1)) to the value prior to
    // shifting. Rounding is performed only when shifting right (post > 0).
    int rtn = (post > 0) ? (1 << (post - 1)) : 0;
84 | return ((((input >> pre) * scale) + rtn) >> post) + offset; |
85 | } |
86 | }; |
87 | |
88 | /// Tensor profiling parameters for a given node. |
89 | struct NodeProfilingInfo { |
90 | std::string nodeOutputName_; |
91 | TensorProfilingParams tensorProfilingParams_; |
92 | |
93 | NodeProfilingInfo() = default; |
94 | NodeProfilingInfo(const std::string &nodeOutputName, |
95 | const TensorProfilingParams &tensorProfilingParams) |
96 | : nodeOutputName_(nodeOutputName), |
97 | tensorProfilingParams_(tensorProfilingParams) {} |
98 | |
99 | float min() const { return tensorProfilingParams_.min; } |
100 | float max() const { return tensorProfilingParams_.max; } |
101 | const std::vector<float> &histogram() const { |
102 | return tensorProfilingParams_.histogram; |
103 | } |
104 | }; |
105 | |
106 | /// Tensor quantization parameters for a given node. |
107 | struct NodeQuantizationInfo { |
108 | std::string nodeOutputName_; |
109 | TensorQuantizationParams tensorQuantizationParams_; |
110 | |
111 | NodeQuantizationInfo() = default; |
112 | NodeQuantizationInfo(const std::string &nodeOutputName, |
113 | const TensorQuantizationParams &tensorQuantizationParams) |
114 | : nodeOutputName_(nodeOutputName), |
115 | tensorQuantizationParams_(tensorQuantizationParams) {} |
116 | |
117 | float scale() const { return tensorQuantizationParams_.scale; } |
118 | int32_t offset() const { return tensorQuantizationParams_.offset; } |
119 | }; |
120 | |
/// Primitive to encode a floating-point value in 32-bit unsigned fixed-point
/// format.
122 | class FixedPointUInt32 { |
123 | private: |
124 | /// Encoded fixed-point value. |
125 | uint32_t val_; |
126 | /// Number of integer bits. |
127 | unsigned intBits_; |
128 | /// Number of fractional bits. |
129 | unsigned fracBits_; |
130 | |
131 | public: |
132 | /// Default constructor. |
133 | FixedPointUInt32() = default; |
134 | |
  /// Construct a fixed-point representation of the floating-point value
  /// \p floatVal using the configuration with the minimum approximation error,
  /// that is, the fewest integer bits and hence the most fractional bits.
  FixedPointUInt32(float floatVal) {
    assert(floatVal >= 0 && "Floating point value must be non-negative!");
    intBits_ = minBitsIntegerPart(floatVal);
    fracBits_ = 32 - intBits_;
    val_ = floatingToFixedPoint(floatVal, fracBits_);
  }
145 | |
146 | /// Construct a fixed-point representation of the floating-point value |
147 | /// \p floatVal using the given number of integer bits \p intBits. |
  FixedPointUInt32(float floatVal, unsigned intBits) {
    assert(floatVal >= 0 && "Floating point value must be non-negative!");
    assert(intBits <= 32 && "Integer bits must be between 0 and 32!");
    val_ = floatingToFixedPoint(floatVal, 32 - intBits);
    intBits_ = intBits;
    fracBits_ = 32 - intBits_;
  }
156 | |
157 | /// \returns the encoded fixed-point value as integer. |
158 | uint32_t getFixedVal() const { return val_; } |
159 | |
160 | /// \returns the encoded fixed-point value as float. |
161 | float getFloatVal() const { return (float)(val_) / std::exp2(fracBits_); } |
162 | |
163 | /// \returns the number of integer bits. |
164 | unsigned getIntBits() const { return intBits_; } |
165 | |
166 | /// \returns the number of fractional bits. |
167 | unsigned getFracBits() const { return fracBits_; } |
168 | |
169 | private: |
  /// \returns the minimum number of bits required to represent the integer
  /// part of the fixed-point representation of the floating-point value
  /// \p number.
  uint32_t minBitsIntegerPart(float number) const {
    assert(number >= 0 && "Floating point value must be non-negative!");
    uint32_t aux = (uint32_t)number;
    uint32_t integerPart = 0;

    // Count the number of bits needed to represent the truncated integer part.
    while (aux != 0) {
      integerPart += 1;
      aux /= 2;
    }

    assert(integerPart <= 32 && "Overflow caused by input number!");
    return integerPart;
  }
188 | |
  /// \returns the fixed-point representation of the floating-point value
  /// \p elem using \p fracPart fractional bits, i.e. the format
  /// Q(32-fracPart).fracPart.
  uint32_t floatingToFixedPoint(float elem, uint32_t fracPart) const {
    assert(elem >= 0 && "Floating point value must be non-negative!");
    double result = (double)elem * std::exp2((double)fracPart);
    assert(result >= (double)std::numeric_limits<uint32_t>::min() &&
           result <= (double)std::numeric_limits<uint32_t>::max() &&
           "Float to fixed-point conversion overflow!");
    return static_cast<uint32_t>(std::round(result));
  }
201 | |
202 | public: |
203 | /// \returns a string representation of the fixed-point value (e.g. "0.13"). |
204 | std::string toString() const { return std::to_string(getFloatVal()); } |
205 | }; |
206 | |
207 | namespace quantization { |
208 | |
209 | /// Type definition for a float min/max range. |
210 | using FloatRange = std::pair<float, float>; |
211 | |
212 | /// Type definition for a quantized min/max range. |
213 | using QuantizedRange = std::pair<int64_t, int64_t>; |
214 | |
215 | /// Quantization schema which influences the way the quantization parameters |
216 | /// scale and offset are computed based on the target min/max dynamic range. |
217 | enum Schema { |
218 | /// Asymmetric quantization produces ranges not necessarily centered on 0. |
219 | Asymmetric, |
220 | /// Symmetric quantization produces ranges centered on 0. |
221 | Symmetric, |
222 | /// Symmetric quantization produces ranges centered on 0 or -qmin, qmin being |
223 | /// the minimum value of the quantized type. |
224 | /// An offset of qmin (i.e., offset == -128 for int8) represents an unsigned |
225 | /// version of the quantized type with an offset of zero: |
  /// For example, int8 with offset -128 spans [-128; 127] - (-128) == [0; 255],
  /// i.e. the same values as uint8 with offset 0.
227 | SymmetricWithUnsigned, |
228 | /// Quantization schema with: |
229 | /// - range centered on 0 (symmetric): offset == 0. |
230 | /// - scale parameter is a power of 2: scale = 2^E where E is a signed |
  ///   exponent. Since the scale parameter is typically subunitary (less than
  ///   1), the exponent is mostly negative.
233 | /// Since the scale parameter is stored as floating point, the values |
234 | /// of E which are exactly representable range from -126 to 127. |
235 | SymmetricWithPower2Scale, |
236 | }; |
237 | |
/// Calibration mode which influences the way the dynamic range min/max obtained
/// during profiling is narrowed in order to represent the majority of the
/// values more precisely, at the price of saturating the outliers.
241 | enum Calibration { |
242 | /// No calibration. The quantization parameters will be computed using the |
243 | /// unaltered dynamic range min/max obtained during profiling such that all |
244 | /// the profiled dynamic range will be representable without saturation. |
245 | None, |
246 | /// Calibration mode based on minimizing the Kullback-Leibler divergence. |
247 | KLMinimization |
248 | }; |
249 | |
250 | /// Configuration for Profiling, passed into \ref profileQuantization(). |
251 | struct ProfilingConfiguration { |
252 | /// Number of bins used to compute the histogram during profiling. |
253 | unsigned numHistogramBins{10}; |
254 | }; |
255 | |
256 | /// Configuration for Quantization, passed into \ref quantizeFunction(). |
257 | struct QuantizationConfiguration { |
258 | /// Profiling infos to use when computing the scale and offset for all the |
259 | /// Nodes inside the function being quantized, including the referenced |
260 | /// Placeholders and Constants. |
261 | std::vector<NodeProfilingInfo> infos{}; |
262 | |
  /// The hash of the graph obtained during profiling in the pre-lowering stage.
264 | /// This hash is used to verify during quantization that the graph being |
265 | /// compiled matches the graph used for obtaining the profiling information. |
266 | llvm::hash_code graphPreLowerHash{0}; |
267 | |
268 | /// Whether to check the graph hash during quantization. |
269 | bool checkGraphPreLowerHash{false}; |
270 | |
271 | /// Precision to use when quantizing a Function. |
272 | ElemKind precision{ElemKind::Int8QTy}; |
273 | |
274 | /// Schema to use when quantizing a Function. |
275 | Schema schema{Schema::Asymmetric}; |
276 | |
277 | /// Calibration mode used when computing the quantization parameters. |
278 | Calibration calibration{Calibration::None}; |
279 | |
280 | /// Whether to enable the calibration for constant weights. |
281 | bool calibrateConstants{false}; |
282 | |
283 | /// Whether to use rowwise quantization when quantizing a Function. |
284 | bool enableRowwise{false}; |
285 | |
286 | /// Whether to use channelwise quantization when quantizing a Function. |
287 | bool enableChannelwise{false}; |
288 | |
289 | /// New name for the quantized function. If no name is given then |
290 | /// \ref quantizeFunction() will generate a name. |
291 | std::string newFuncName{"" }; |
292 | |
293 | /// If true, the quantizer will abort when encountering a node that it would |
294 | /// like to quantize but the backend cannot support. Note that node kinds in |
295 | /// doNotQuantizeKinds will skip this check and not cause an abort. |
296 | bool assertAllNodesQuantized{false}; |
297 | |
298 | /// Precision used for bias quantization for Convolution and FullyConnected. |
299 | /// This allows specializing the bias quantization. Default is int32. |
300 | ElemKind precisionBias{ElemKind::Int32QTy}; |
301 | |
302 | /// If true, don't apply quantization to FC bias inputs. |
303 | bool skipQuantizeFCBias{false}; |
304 | |
305 | QuantizationConfiguration() = default; |
306 | QuantizationConfiguration(llvm::ArrayRef<NodeProfilingInfo> i) : infos(i) {} |
307 | }; |
308 | |
309 | /// \returns the tensor average value based on the profiling info \p profParams. |
310 | float getTensorAverageValue(const TensorProfilingParams &profParams); |
311 | |
312 | /// \returns the value \p in as clipped to the range of \p DestTy. |
313 | template <class SrcTy, class DestTy> DestTy clip(SrcTy in) { |
  static_assert(sizeof(SrcTy) >= sizeof(DestTy), "Invalid types");
315 | |
316 | auto mx = std::numeric_limits<DestTy>::max(); |
317 | auto mn = std::numeric_limits<DestTy>::min(); |
318 | return std::max<SrcTy>(mn, std::min<SrcTy>(mx, in)); |
319 | } |
320 | |
/// Converts floating point value \p input to \p DestTy (quantized type) based
322 | /// on the quantization parameters \p TQP. |
323 | template <class DestTy = int8_t> |
324 | inline DestTy quantize(float input, const TensorQuantizationParams &TQP) { |
325 | float result = input / TQP.scale + TQP.offset; |
326 | // Note: use int64_t since casts of large values might be wrapped around |
327 | // before clipping, for example for result = 2147483648.00 (float). |
328 | return quantization::clip<int64_t, DestTy>((int64_t)nearbyintf(result)); |
329 | } |
330 | |
331 | /// Converts floating point value \p input to \p DestTy (quantized type) based |
332 | /// on the quantization parameters \p TQP. The value is returned as int64. |
333 | inline int64_t quantize(float input, const TensorQuantizationParams &TQP, |
334 | ElemKind DestTy) { |
335 | if (DestTy == ElemKind::Int8QTy) { |
336 | return quantize<int8_t>(input, TQP); |
337 | } else if (DestTy == ElemKind::Int16QTy) { |
338 | return quantize<int16_t>(input, TQP); |
339 | } else if (DestTy == ElemKind::Int32QTy) { |
340 | return quantize<int32_t>(input, TQP); |
341 | } else if (DestTy == ElemKind::Int64QTy) { |
342 | return quantize<int64_t>(input, TQP); |
343 | } else { |
344 | llvm_unreachable("Precision not supported!" ); |
345 | } |
346 | } |
347 | |
/// Converts a quantized value of type \p eTy to floating point based on the
/// quantization parameters \p TQP.
350 | /// Note: use int64_t to cover the 'symmetric int32 with unsigned' case. |
351 | template <class eTy = int8_t> |
352 | inline float dequantize(eTy input, const TensorQuantizationParams &TQP) { |
353 | return TQP.scale * ((int64_t)input - TQP.offset); |
354 | } |
355 | |
/// Converts the floating point value \p input to \p DestTy (quantized type)
/// based on the quantization parameters \p scale and \p offset. If the
/// destination type is int8_t then a bias of 128 is subtracted to re-center
/// the unsigned intermediate value as int8_t. If the destination type is
/// int16_t then a bias of 32768 is subtracted to re-center it as int16_t.
361 | template <class DestTy> |
362 | inline DestTy quantizeWithFloatOffset(float input, float scale, float offset) { |
363 | uint16_t d = static_cast<uint16_t>(std::round((input - offset) / scale)); |
364 | if (std::is_same<int8_t, DestTy>::value) { |
365 | d -= 128; |
366 | } else if (std::is_same<int16_t, DestTy>::value) { |
367 | d -= 32768; |
368 | } |
369 | return static_cast<DestTy>(d); |
370 | } |
371 | |
372 | /// Converts floating point value \p input to 4-bit quantization based on the |
373 | /// quantization parameters \p scale and \p offset. |
374 | inline uint8_t quantize4BitsWithFloatOffset(float input, float scale, |
375 | float offset) { |
376 | uint8_t d = std::max( |
377 | 0, std::min(static_cast<int>(std::round((input - offset) / scale)), 15)); |
378 | return d; |
379 | } |
380 | |
/// Converts a quantized value of type \p eTy to floating point based on the
/// quantization parameters \p scale and \p offset. If the input type is
/// int8_t, then a bias of 128 is added to convert to uint8_t. If the input
/// type is int16_t, then a bias of 32768 is added to convert to uint16_t.
385 | template <class eTy> |
386 | inline float dequantizeWithFloatOffset(eTy input, float scale, float offset) { |
387 | uint16_t d = static_cast<uint16_t>(input); |
388 | if (std::is_same<int8_t, eTy>::value) { |
389 | d += 128; |
390 | } else if (std::is_same<int16_t, eTy>::value) { |
391 | d += 32768; |
392 | } |
393 | return (d * scale) + offset; |
394 | } |
395 | |
/// Converts a 4-bit quantized value stored in \p input (the high 4 bits if
/// \p isMSB is true, otherwise the low 4 bits) to floating point based on the
/// quantization parameters \p scale and \p offset.
399 | inline float dequantize4BitWithFloatOffset(uint8_t input, float scale, |
400 | float offset, bool isMSB) { |
401 | if (isMSB) { |
402 | input >>= 4; |
403 | } |
404 | input &= 0x0f; |
405 | return (input * scale) + offset; |
406 | } |
407 | |
/// Converts the floating point \p tensor to a quantized tensor based on the
/// quantization parameters \p TQP and type \p Ty.
410 | Tensor quantizeTensor(const Tensor &tensor, const TensorQuantizationParams &TQP, |
411 | ElemKind Ty = ElemKind::Int8QTy); |
412 | |
/// Converts the quantized tensor \p tensor to a floating point tensor of type
/// \p floatKind.
415 | Tensor dequantizeTensor(const Tensor &tensor, ElemKind floatKind); |
416 | |
/// Dequantize the 4-bit fused rowwise quantized tensor \p input. \returns the
/// dequantized float tensor.
419 | Tensor tensor4BitsFusedRowwiseDequantization(const Tensor &input); |
420 | |
/// Convert the floating point quantization parameters \p scale and \p offset
/// into the integer sequence:
/// result = ((((input >> pre) * scale) + rtn) >> post) + offset.
/// This scales a 32-bit signed integer word into an 8-bit signed integer.
/// \returns the transformation parameters.
426 | QuantizationTransform32To8 quantizeScaleOffset32To8(float scale, |
427 | int32_t offset); |
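
/// Example (illustrative sketch; "someInt32Accumulator" is a hypothetical
/// 32-bit value such as a convolution or FC accumulator):
/// \code
///   auto xform = quantizeScaleOffset32To8(0.0117f, /* offset */ 0);
///   int32_t q8 = xform.transform(someInt32Accumulator);
/// \endcode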
428 | |
429 | /// Function to get the quantized range for a given precision type \p qTy. |
430 | /// \returns the range as a (min, max) pair. |
431 | QuantizedRange getQuantizedRange(ElemKind qTy); |
432 | |
433 | /// Function to validate that the given quantization parameters \p qParams |
434 | /// comply with the given quantization \p schema and precision \p qTy. |
435 | void validateQuantizationParams(TensorQuantizationParams qParams, Schema schema, |
436 | ElemKind qTy); |
437 | |
438 | /// Calculate the TensorQuantizationParams from the TensorProfilingParams |
439 | /// \p profParams using the quantization type \p qTy and the quantization |
440 | /// method described by \p schema. The calibration of the quantization |
441 | /// parameters will be done using the method given by \p calibration. |
442 | TensorQuantizationParams |
443 | chooseQuantizationParams(TensorProfilingParams profParams, |
444 | Schema schema = Asymmetric, |
445 | ElemKind qTy = ElemKind::Int8QTy, |
446 | Calibration calibration = Calibration::None); |
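
/// Example (illustrative sketch; the resulting values are approximate):
/// \code
///   TensorProfilingParams prof(/* min */ -1.0f, /* max */ 1.0f);
///   TensorQuantizationParams tqp =
///       chooseQuantizationParams(prof, Schema::Symmetric, ElemKind::Int8QTy);
///   // Symmetric int8: tqp.offset == 0 and tqp.scale ~ 2.0 / 255.
/// \endcode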
447 | |
448 | /// Function to specialize the TensorQuantizationParams of the bias operand |
449 | /// for nodes like Convolution and FullyConnected given the initially computed |
450 | /// parameters \p biasTQP and the parameters of the input \p inputTQP and the |
451 | /// weights \p weightsTQP, for given quantization schema \p schema and bias type |
/// \p biasQTy. The parameter \p biasZero indicates whether the bias data is
/// known to be all zeros. The bias operand requires careful quantization since
454 | /// every bias value has a higher impact on the precision of the output value |
455 | /// than any particular weight value. The specialization logic is: |
456 | /// - for INT32 bias quantization: since the dynamic range of INT32 is large we |
457 | /// can always force symmetric quantization (offset = 0). This allows a faster |
458 | /// implementation since no offset subtraction is required at run-time. |
459 | /// - for INT8/INT16 bias quantization: since the dynamic range is small we |
460 | /// will keep the original offset. |
461 | /// - regardless of precision, we try to force the bias scale parameter to |
462 | /// bias_scale = input_scale * weights_scale since this has a performance |
463 | /// benefit by specializing the parameters to biasPre = 0, biasPost = 0, |
/// biasScale = 1. We must verify that by changing the bias scale we don't
465 | /// saturate the bias data. This is also equivalent to forcing the effective |
466 | /// scale applied at run-time (bias_scale / (input_scale * weights_scale)) |
467 | /// to be always greater than or equal to 1.0 which is a common constraint |
468 | /// for the bias for most libraries with quantized implementations. |
469 | TensorQuantizationParams |
470 | specializeBiasQuantizationParams(const TensorQuantizationParams &biasTQP, |
471 | const TensorQuantizationParams &inputTQP, |
472 | const TensorQuantizationParams &weightsTQP, |
473 | Schema schema, ElemKind biasQTy, |
474 | bool biasZero = false); |
475 | |
476 | /// Function similar to \ref specializeBiasQuantizationParams with the main |
477 | /// distinction that this function is also allowed to change the quantization |
478 | /// parameters of the weights. The modification is done in place. This function |
479 | /// is used for per-channel quantization. When the requested bias precision is |
480 | /// INT32 this function ensures that bias_scale = input_scale * weights_scale |
481 | /// while making sure the bias data is not saturated by changing both the bias |
482 | /// and weights quantization parameters. |
483 | void specializeBiasWeightsQuantizationParams( |
484 | TensorQuantizationParams &biasTQP, const TensorQuantizationParams &inputTQP, |
485 | TensorQuantizationParams &weightsTQP, Schema schema, ElemKind biasQTy, |
486 | bool biasZero = false); |
487 | |
488 | /// \returns an integer mapping from the \p inTy to the \p outTy given the |
489 | /// floating-point function \p func. |
490 | /// \pre inTy and outTy must be quantized types. |
491 | template <typename T = int8_t> |
492 | std::vector<T> createMapping(TypeRef inTy, TypeRef outTy, |
493 | std::function<float(float)> func) { |
  assert(inTy->isQuantizedType() && "Input type must be quantized!");
  assert(outTy->isQuantizedType() && "Output type must be quantized!");
  assert(outTy->isType<T>() && "Output type must match template type!");

  // Calculate the step which is added to currInputVal repeatedly in order to
  // cover the entire range of the input type.
500 | auto inputRange = inTy->getQuantizedValueRange(); |
501 | const float step = inTy->getQuantizedValueStep(); |
502 | float currInputVal = inputRange.first; |
503 | |
504 | // Calculate the output int value for each possible input value. |
505 | std::vector<T> mapping(inTy->getQuantizedValueCount()); |
506 | TensorQuantizationParams outputTQP{outTy->getScale(), outTy->getOffset()}; |
507 | for (size_t i = 0; i < mapping.size(); i++, currInputVal += step) { |
508 | float currOutputVal = func(currInputVal); |
509 | mapping[i] = quantization::quantize<T>(currOutputVal, outputTQP); |
510 | } |
511 | return mapping; |
512 | } |
513 | |
/// Row-wise quantize the tensor \p input. \p scales and \p offsets are
/// generated from each row of \p input; \p output is a tensor of the same
/// shape as \p input, quantized from \p input using \p scales and \p offsets
/// for each row. Note that the shape of input/output can have any non-zero
/// number of dimensions; "row" refers to all data in the first dimension of
/// the shape. Template parameters \p ScaleT and \p OffsetT represent the types
/// to use for the scales and offsets, respectively. Template parameter \p QP
/// represents the quantization precision, typically int8_t or uint8_t.
522 | template <typename ScaleT, typename OffsetT, typename QP> |
523 | void tensorRowwiseQuantization(const Tensor &input, Tensor &output, |
524 | Tensor &scales, Tensor &offsets, |
525 | quantization::Schema schema) { |
526 | constexpr bool offsetIsFP = std::is_same<float, OffsetT>::value || |
527 | std::is_same<float16_t, OffsetT>::value; |
528 | constexpr bool offsetIsInt32 = std::is_same<int32_t, OffsetT>::value; |
529 | static_assert((offsetIsInt32 && std::is_same<float, ScaleT>::value) || |
530 | (offsetIsFP && std::is_same<ScaleT, OffsetT>::value), |
531 | "Invalid combination of Scale/Offset types." ); |
532 | |
533 | const auto fDims = flattenCdr(input.dims()); |
534 | Tensor finalIn = input.getUnowned({fDims.first, fDims.second}); |
535 | Tensor finalOut = output.getUnowned({fDims.first, fDims.second}); |
536 | ShapeHW idim(finalIn.dims()); |
537 | |
538 | auto srcH = finalIn.getHandle<float>(); |
539 | auto destH = finalOut.getHandle<QP>(); |
540 | auto scalesH = scales.getHandle<ScaleT>(); |
541 | auto offsetsH = offsets.getHandle<OffsetT>(); |
542 | for (dim_t i = 0; i < idim.height; i++) { |
543 | auto slice = srcH.extractSlice(i); |
544 | auto rSrc = slice.getHandle<float>(); |
545 | auto res = rSrc.minMaxArg(); |
546 | float min = rSrc.raw(res.first); |
547 | float max = rSrc.raw(res.second); |
548 | |
549 | // Handle rowwise quantization for FCs. |
550 | if (offsetIsInt32) { |
551 | TensorQuantizationParams qParams = |
552 | chooseQuantizationParams({min, max}, schema); |
553 | for (dim_t j = 0; j < idim.width; j++) { |
        destH.at({i, j}) = quantization::quantize<QP>(srcH.at({i, j}), qParams);
555 | } |
556 | scalesH.raw(i) = qParams.scale; |
557 | offsetsH.raw(i) = qParams.offset; |
558 | } else if (offsetIsFP) { |
      // Handle rowwise quantization for rowwise-quantized SLS.
560 | constexpr float kEqualityThreshold = 1e-10f; |
561 | const float scale = ((max - min) < kEqualityThreshold) |
562 | ? 1.0 |
563 | : ((double)max - (double)min) / 255.0; |
564 | float offset = min; |
565 | |
566 | for (dim_t j = 0; j < idim.width; j++) { |
567 | destH.at({i, j}) = quantization::quantizeWithFloatOffset<QP>( |
568 | srcH.at({i, j}), scale, offset); |
569 | } |
570 | scalesH.raw(i) = static_cast<ScaleT>(scale); |
571 | offsetsH.raw(i) = static_cast<OffsetT>(offset); |
572 | } else { |
573 | llvm_unreachable("Unsupported offset type." ); |
574 | } |
575 | } |
576 | } |
577 | |
/// Fused-rowwise quantize the tensor \p input. Scales and offsets are generated
/// from each row of \p input. This function supports 8-bit quantization (i.e.
/// each quantized datum uses 8 bits) and 4-bit quantization (i.e. each
/// quantized datum uses 4 bits).
/// For 8-bit quantization, \p output is a tensor of the same shape as input but
/// with extra columns for storing the fused scale and offset. Template
/// parameter \p T represents the datatype used for storing the scale and
/// offset in each row:
/// | ....   int8 data   ...   |   scale   |  offset   |
/// | num_of_input_columns * 1B| sizeof(T) | sizeof(T) |
/// For 4-bit quantization, in \p output, 1 byte contains 2 quantized values.
/// Template parameter \p T here can be either float or float16_t.
/// | ....   int4 data   ...     |   scale   |  offset   |
/// | num_of_input_columns * 0.5B| sizeof(T) | sizeof(T) |
/// \pre input.dims().size() == 2
/// \pre output.dims().size() == 2
/// For 8-bit quantization:
/// \pre input.dims()[1] + 2 * sizeof(T) == output.dims()[1]
/// For 4-bit quantization:
/// \pre input.dims()[1] % 2 == 0
/// \pre input.dims()[1] / 2 + 2 * sizeof(T) == output.dims()[1]
598 | template <typename T> |
599 | void tensorFusedRowwiseQuantization(const Tensor &input, Tensor &output) { |
600 | // We are fusing the scale and offset onto the end of each row. Thus input and |
601 | // output must both be 2 dimensional, with output having 2*sizeof(T) extra |
602 | // columns for the scale and offset. |
603 | auto outputType = output.getElementType(); |
  assert(input.dims().size() == 2 && output.dims().size() == 2 &&
         "Input and output must be 2 dimensional.");
  if (outputType == ElemKind::UInt8FusedFP16QTy ||
      outputType == ElemKind::UInt8FusedQTy) {
    assert(input.dims()[1] + 2 * sizeof(T) == output.dims()[1] &&
           "Output must have 2*sizeof(T) more columns than input for 8-bit "
           "quantization.");
  } else if (outputType == ElemKind::UInt4FusedFP16QTy ||
             outputType == ElemKind::UInt4FusedQTy) {
    assert(input.dims()[1] % 2 == 0 &&
           "4-bit fused quantization requires the number of input columns "
           "to be a multiple of 2.");
    assert(input.dims()[1] / 2 + 2 * sizeof(T) == output.dims()[1] &&
           "Output must have 2*sizeof(T) more columns than half of the input "
           "columns for 4-bit quantization.");
  }
622 | |
623 | auto srcH = input.getHandle<float>(); |
624 | auto destH = output.getHandle<uint8_t>(); |
625 | for (dim_t i = 0, e = input.dims()[0]; i < e; i++) { |
626 | auto slice = srcH.extractSlice(i); |
627 | auto rSrc = slice.getHandle<float>(); |
628 | auto res = rSrc.minMaxArg(); |
629 | float min = rSrc.raw(res.first); |
630 | float max = rSrc.raw(res.second); |
631 | |
632 | float range; |
633 | switch (outputType) { |
634 | case ElemKind::UInt8FusedQTy: |
635 | case ElemKind::UInt8FusedFP16QTy: |
636 | range = 255.0; |
637 | break; |
638 | case ElemKind::UInt4FusedFP16QTy: |
639 | case ElemKind::UInt4FusedQTy: |
640 | range = 15.0; |
641 | break; |
642 | default: |
643 | llvm_unreachable("Not yet supported" ); |
644 | } |
645 | |
646 | // This matches the Caffe2 implementation for FloatToRowwiseQuantized8BitsOp |
647 | // found in operators/lengths_reducer_rowwise_8bit_ops.h. |
648 | constexpr float kEqualityThreshold = 1e-10f; |
649 | const float scale = ((max - min) < kEqualityThreshold) |
650 | ? 1.0 |
651 | : ((double)max - (double)min) / range; |
652 | const float offset = min; |
653 | |
654 | for (dim_t j = 0, f = input.dims()[1]; j < f; j++) { |
655 | if (outputType == ElemKind::UInt8FusedFP16QTy || |
656 | outputType == ElemKind::UInt8FusedQTy) { |
657 | destH.at({i, j}) = quantization::quantizeWithFloatOffset<uint8_t>( |
658 | srcH.at({i, j}), scale, offset); |
659 | } else if (outputType == ElemKind::UInt4FusedFP16QTy || |
660 | outputType == ElemKind::UInt4FusedQTy) { |
661 | uint8_t quantized = quantization::quantize4BitsWithFloatOffset( |
662 | srcH.at({i, j}), scale, offset); |
663 | if (j % 2 == 0) { |
664 | // Even columns use LSB 4-bit. |
665 | destH.at({i, j / 2}) = quantized; |
666 | } else { |
667 | // Odd columns use MSB 4-bit. |
668 | destH.at({i, j / 2}) |= quantized << 4; |
669 | } |
670 | } else { |
671 | llvm_unreachable("Not yet supported" ); |
672 | } |
673 | } |
674 | |
675 | // Now set the scale/offset at the end of each row. |
676 | destH.setFusedScaleOffsetInRow<T>(i, scale, offset); |
677 | } |
678 | } |
679 | |
680 | /// Generic function to compute the quantization parameters for an input |
681 | /// floating-point tensor \p tensor with given schema \p qSchema and type |
682 | /// \p qTy. A separate set of quantization parameters (scale, offset) will |
683 | /// be computed for each group of \p qStep indices along the \p qDim dimension. |
684 | /// This allows quantizing a given tensor with finer granularity (e.g. rowwise |
685 | /// or channelwise). |
686 | /// For example, for a tensor of size [4, 6, 8, 10], qDim = 1 and qStep = 3: |
687 | /// -> one set of quantization parameters will be computed for [:,0:2,:,:]. |
688 | /// -> one set of quantization parameters will be computed for [:,3:5,:,:]. |
689 | /// The number of sets of computed quantization parameters (scale, offset) is |
690 | /// tensor.dims()[qDim] / qStep. \returns the set of quantization parameters. |
691 | std::vector<TensorQuantizationParams> |
692 | getTensorQuantizationParams(const Tensor &tensor, Schema qSchema = Asymmetric, |
693 | ElemKind qTy = ElemKind::Int8QTy, dim_t qDim = 0, |
694 | dim_t qStep = 1); |
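
/// Example (hypothetical channelwise sketch; "filter" stands for a float
/// [8, 3, 3, 3] convolution filter tensor):
/// \code
///   std::vector<TensorQuantizationParams> TQPs = getTensorQuantizationParams(
///       filter, Schema::Symmetric, ElemKind::Int8QTy, /* qDim */ 0,
///       /* qStep */ 1);
///   // TQPs.size() == 8: one (scale, offset) pair per output channel.
/// \endcode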
695 | |
/// Similar function to the one above with the difference that the quantization
/// parameters scales and offsets are written into separate tensors \p scales
/// and \p offsets which are assumed to be allocated with the correct type and
/// size.
699 | void getTensorQuantizationParams(const Tensor &tensor, Tensor &scales, |
700 | Tensor &offsets, Schema qSchema = Asymmetric, |
701 | ElemKind qTy = ElemKind::Int8QTy, |
702 | dim_t qDim = 0, dim_t qStep = 1); |
703 | |
704 | /// Generic function to quantize a given input floating-point tensor \p tensor |
705 | /// with given tensor quantization parameters \p TQP and type \p qTy. A separate |
706 | /// set of quantization parameters (scale, offset) is provided for each group |
707 | /// of \p qStep indices along the \p qDim dimension and can be obtained using |
708 | /// the function \ref getTensorQuantizationParams. This allows quantizing a |
709 | /// given tensor with finer granularity (e.g. rowwise or channelwise). |
710 | /// For example, for a tensor of size [4, 6, 8, 10], qDim = 1 and qStep = 3: |
711 | /// -> one set of quantization parameters will be provided for [:,0:2,:,:]. |
712 | /// -> one set of quantization parameters will be provided for [:,3:5,:,:]. |
713 | /// The number of sets of provided quantization parameters (scale, offset) is |
714 | /// tensor.dims()[qDim] / qStep. \returns the quantized tensor. |
715 | Tensor quantizeTensor(const Tensor &tensor, |
716 | llvm::ArrayRef<TensorQuantizationParams> TQP, |
717 | ElemKind qTy = ElemKind::Int8QTy, dim_t qDim = 0, |
718 | dim_t qStep = 1); |
719 | |
720 | /// Similar function to the one above with the difference that the quantization |
721 | /// parameters scales and offsets are loaded from separate tensors \p scales |
722 | /// and \p offsets. |
723 | Tensor quantizeTensor(const Tensor &tensor, const Tensor &scales, |
724 | const Tensor &offsets, ElemKind qTy = ElemKind::Int8QTy, |
725 | dim_t qDim = 0, dim_t qStep = 1); |
726 | |
/// \returns true if the float \p val is an exact power of 2 (mantissa is
/// exactly 1.0).
bool isFloatPowerOf2(float val);

/// \returns the base-2 exponent E of the float \p val, i.e. E such that
/// val = 2^E when \p val is a power of 2.
int getFloat2Exp(float val);
732 | |
733 | } // namespace quantization |
734 | } // namespace glow |
735 | |
736 | #endif // GLOW_QUANTIZATION_BASE_BASE_H |
737 | |