/**
 * Copyright (c) Glow Contributors. See CONTRIBUTORS file.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef GLOW_QUANTIZATION_BASE_BASE_H
#define GLOW_QUANTIZATION_BASE_BASE_H

#include "glow/Base/Tensor.h"
#include "glow/Base/Traits.h"
#include "glow/Base/Type.h"

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdlib>
#include <functional>
#include <limits>

namespace glow {

/// Dummy scale used for representing dummy quantization parameters that have
/// been loaded in place of real quantization parameters.
constexpr float dummyScale = 0.123456813395023345947265625;

/// Profiling parameters of a tensor, consisting of the global minimum and
/// maximum values and the histogram obtained during profiling. Note that the
/// histogram is not normalized.
struct TensorProfilingParams {
  float min;
  float max;
  std::vector<float> histogram;

  TensorProfilingParams() = default;
  TensorProfilingParams(float min, float max) : min(min), max(max) {}
  TensorProfilingParams(float min, float max, const std::vector<float> &hist)
      : min(min), max(max), histogram(hist) {}
  TensorProfilingParams(float min, float max, const Tensor &hist)
      : min(min), max(max) {
    auto histH = hist.getHandle<float>();
    histogram = std::vector<float>(histH.size());
    for (dim_t idx = 0, e = histH.size(); idx < e; idx++) {
      histogram[idx] = histH.raw(idx);
    }
  }
};

/// Main attributes of a quantized tensor.
/// Scale and Offset allow quantization of a float tensor and dequantization
/// of an integer tensor back to a float one.
struct TensorQuantizationParams {
  float scale;
  int32_t offset;
};
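
// Note (illustrative): with the quantize()/dequantize() helpers declared
// below, the mapping is q = nearbyint(x / scale + offset) and
// x = scale * (q - offset). For example, scale = 0.1 and offset = -5 map the
// float value 2.0 to the quantized value 15, and 15 back to 2.0.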

/// A data structure that represents the 32-bit to 8-bit quantization
/// scaling operation. This data structure represents the transformation:
/// ((((input >> pre) * scale) + rtn) >> post) + offset, where rtn is the
/// rounding term added to get round-to-nearest behavior.
struct QuantizationTransform32To8 {
  int pre;
  int post;
  int scale;
  int offset;

  /// Initializes the transformation based on the conversion formula (above).
  QuantizationTransform32To8(int pre, int post, int scale, int offset)
      : pre(pre), post(post), scale(scale), offset(offset) {}

  /// \returns the scaled integer.
  int32_t transform(int32_t input) {
    // The operation x >> post rounds down towards negative infinity. To get
    // round-to-nearest we add (1 << (post - 1)) to the value prior to
    // shifting. Rounding is performed only when shifting right (post > 0).
    int rtn = (post > 0) ? (1 << (post - 1)) : 0;
    return ((((input >> pre) * scale) + rtn) >> post) + offset;
  }
};
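
// Worked example (illustrative parameter values only): with pre = 0, post = 8,
// scale = 3 and offset = 0, transform() computes a round-to-nearest version of
// input * 3 / 256:
//
//   QuantizationTransform32To8 tf(/* pre */ 0, /* post */ 8, /* scale */ 3,
//                                 /* offset */ 0);
//   int32_t out = tf.transform(1000); // ((1000 * 3) + 128) >> 8 = 12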

/// Tensor profiling parameters for a given node.
struct NodeProfilingInfo {
  std::string nodeOutputName_;
  TensorProfilingParams tensorProfilingParams_;

  NodeProfilingInfo() = default;
  NodeProfilingInfo(const std::string &nodeOutputName,
                    const TensorProfilingParams &tensorProfilingParams)
      : nodeOutputName_(nodeOutputName),
        tensorProfilingParams_(tensorProfilingParams) {}

  float min() const { return tensorProfilingParams_.min; }
  float max() const { return tensorProfilingParams_.max; }
  const std::vector<float> &histogram() const {
    return tensorProfilingParams_.histogram;
  }
};

/// Tensor quantization parameters for a given node.
struct NodeQuantizationInfo {
  std::string nodeOutputName_;
  TensorQuantizationParams tensorQuantizationParams_;

  NodeQuantizationInfo() = default;
  NodeQuantizationInfo(const std::string &nodeOutputName,
                       const TensorQuantizationParams &tensorQuantizationParams)
      : nodeOutputName_(nodeOutputName),
        tensorQuantizationParams_(tensorQuantizationParams) {}

  float scale() const { return tensorQuantizationParams_.scale; }
  int32_t offset() const { return tensorQuantizationParams_.offset; }
};

/// Primitive to encode an integer in 32-bit unsigned fixed-point format.
class FixedPointUInt32 {
private:
  /// Encoded fixed-point value.
  uint32_t val_;
  /// Number of integer bits.
  unsigned intBits_;
  /// Number of fractional bits.
  unsigned fracBits_;

public:
  /// Default constructor.
  FixedPointUInt32() = default;

  /// Construct a fixed-point representation of the floating-point value
  /// \p floatVal using the configuration with the minimum approximation
  /// error, that is with the fewest integer bits and the most fractional
  /// bits.
  FixedPointUInt32(float floatVal) {
    assert(floatVal >= 0 && "Floating point value must be positive!");
    val_ = floatingToFixedPoint(floatVal, 32 - minBitsIntegerPart(floatVal));
    intBits_ = minBitsIntegerPart(floatVal);
    fracBits_ = 32 - intBits_;
  }

  /// Construct a fixed-point representation of the floating-point value
  /// \p floatVal using the given number of integer bits \p intBits.
  FixedPointUInt32(float floatVal, unsigned intBits) {
    assert(floatVal >= 0 && "Floating point value must be positive!");
    assert(intBits <= 32 && "Integer bits must be between 0 and 32");
    val_ = floatingToFixedPoint(floatVal, 32 - intBits);
    intBits_ = intBits;
    fracBits_ = 32 - intBits_;
  }

  /// \returns the encoded fixed-point value as integer.
  uint32_t getFixedVal() const { return val_; }

  /// \returns the encoded fixed-point value as float.
  float getFloatVal() const { return (float)(val_) / std::exp2(fracBits_); }

  /// \returns the number of integer bits.
  unsigned getIntBits() const { return intBits_; }

  /// \returns the number of fractional bits.
  unsigned getFracBits() const { return fracBits_; }

private:
  /// \returns the minimum number of bits needed to represent the integer part
  /// of the fixed-point representation of the floating-point \p number.
  uint32_t minBitsIntegerPart(float number) {
    assert(number >= 0 && "Floating point value must be positive!");
    uint32_t aux = (uint32_t)number;
    uint32_t integerPart = 0;

    while (aux / 2 != 0 || aux % 2 != 0) {
      integerPart += 1;
      aux /= 2;
    }

    assert(integerPart <= 32 && "Overflow caused by input number\n");
    return integerPart;
  }

  /// \returns the fixed-point representation of the floating-point \p elem
  /// using \p fracPart fractional bits, i.e. the Q(32 - fracPart).fracPart
  /// format.
  uint32_t floatingToFixedPoint(float elem, uint32_t fracPart) {
    assert(elem >= 0 && "Floating point value must be positive!");
    double result = (double)elem * (double)std::exp2((double)fracPart);
    assert(result >= (double)std::numeric_limits<uint32_t>::min() &&
           result <= (double)std::numeric_limits<uint32_t>::max() &&
           "Float to fixed point conversion overflow\n");
    return std::round(result);
  }

public:
  /// \returns a string representation of the fixed-point value (e.g. "0.13").
  std::string toString() const { return std::to_string(getFloatVal()); }
};
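
// Usage sketch (illustrative value): 1.5f needs a single integer bit, so the
// default constructor picks 1 integer bit and 31 fractional bits and stores
// round(1.5 * 2^31).
//
//   FixedPointUInt32 fp(1.5f);
//   unsigned intBits = fp.getIntBits();   // 1
//   unsigned fracBits = fp.getFracBits(); // 31
//   float back = fp.getFloatVal();        // 1.5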

namespace quantization {

/// Type definition for a float min/max range.
using FloatRange = std::pair<float, float>;

/// Type definition for a quantized min/max range.
using QuantizedRange = std::pair<int64_t, int64_t>;

/// Quantization schema which influences the way the quantization parameters
/// scale and offset are computed based on the target min/max dynamic range.
enum Schema {
  /// Asymmetric quantization produces ranges not necessarily centered on 0.
  Asymmetric,
  /// Symmetric quantization produces ranges centered on 0.
  Symmetric,
  /// Symmetric quantization produces ranges centered on 0 or -qmin, qmin being
  /// the minimum value of the quantized type.
  /// An offset of qmin (i.e., offset == -128 for int8) represents an unsigned
  /// version of the quantized type with an offset of zero:
  /// For example, int8 is [-128; 127] - (-128) == uint8 [0; 255] - 0.
  SymmetricWithUnsigned,
  /// Quantization schema with:
  /// - range centered on 0 (symmetric): offset == 0.
  /// - scale parameter is a power of 2: scale = 2^E where E is a signed
  ///   exponent. Since the scale parameter is mostly subunitary, the
  ///   exponent is mostly negative.
  /// Since the scale parameter is stored as floating point, the values
  /// of E which are exactly representable range from -126 to 127.
  SymmetricWithPower2Scale,
};

/// Calibration mode which influences the way the dynamic range min/max
/// obtained during profiling is narrowed in order to represent the majority
/// of the values more precisely, at the price of saturating the outliers.
enum Calibration {
  /// No calibration. The quantization parameters will be computed using the
  /// unaltered dynamic range min/max obtained during profiling such that all
  /// the profiled dynamic range will be representable without saturation.
  None,
  /// Calibration mode based on minimizing the Kullback-Leibler divergence.
  KLMinimization
};

/// Configuration for Profiling, passed into \ref profileQuantization().
struct ProfilingConfiguration {
  /// Number of bins used to compute the histogram during profiling.
  unsigned numHistogramBins{10};
};

/// Configuration for Quantization, passed into \ref quantizeFunction().
struct QuantizationConfiguration {
  /// Profiling infos to use when computing the scale and offset for all the
  /// Nodes inside the function being quantized, including the referenced
  /// Placeholders and Constants.
  std::vector<NodeProfilingInfo> infos{};

  /// The hash of the graph obtained during profiling in the pre-lowering
  /// stage. This hash is used to verify during quantization that the graph
  /// being compiled matches the graph used for obtaining the profiling
  /// information.
  llvm::hash_code graphPreLowerHash{0};

  /// Whether to check the graph hash during quantization.
  bool checkGraphPreLowerHash{false};

  /// Precision to use when quantizing a Function.
  ElemKind precision{ElemKind::Int8QTy};

  /// Schema to use when quantizing a Function.
  Schema schema{Schema::Asymmetric};

  /// Calibration mode used when computing the quantization parameters.
  Calibration calibration{Calibration::None};

  /// Whether to enable the calibration for constant weights.
  bool calibrateConstants{false};

  /// Whether to use rowwise quantization when quantizing a Function.
  bool enableRowwise{false};

  /// Whether to use channelwise quantization when quantizing a Function.
  bool enableChannelwise{false};

  /// New name for the quantized function. If no name is given then
  /// \ref quantizeFunction() will generate a name.
  std::string newFuncName{""};

  /// If true, the quantizer will abort when encountering a node that it would
  /// like to quantize but the backend cannot support. Note that node kinds in
  /// doNotQuantizeKinds will skip this check and not cause an abort.
  bool assertAllNodesQuantized{false};

  /// Precision used for bias quantization for Convolution and FullyConnected.
  /// This allows specializing the bias quantization. Default is int32.
  ElemKind precisionBias{ElemKind::Int32QTy};

  /// If true, don't apply quantization to FC bias inputs.
  bool skipQuantizeFCBias{false};

  QuantizationConfiguration() = default;
  QuantizationConfiguration(llvm::ArrayRef<NodeProfilingInfo> i) : infos(i) {}
};

/// \returns the tensor average value based on the profiling info \p profParams.
float getTensorAverageValue(const TensorProfilingParams &profParams);

/// \returns the value \p in as clipped to the range of \p DestTy.
template <class SrcTy, class DestTy> DestTy clip(SrcTy in) {
  static_assert(sizeof(SrcTy) >= sizeof(DestTy), "Invalid types");

  auto mx = std::numeric_limits<DestTy>::max();
  auto mn = std::numeric_limits<DestTy>::min();
  return std::max<SrcTy>(mn, std::min<SrcTy>(mx, in));
}
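
// For instance (illustrative values), clipping int32 values to the int8 range:
//
//   int8_t a = quantization::clip<int32_t, int8_t>(300);  // 127
//   int8_t b = quantization::clip<int32_t, int8_t>(-300); // -128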

/// Converts floating point value \p input to DestTy (quantized type) based
/// on the quantization parameters \p TQP.
template <class DestTy = int8_t>
inline DestTy quantize(float input, const TensorQuantizationParams &TQP) {
  float result = input / TQP.scale + TQP.offset;
  // Note: use int64_t since casts of large values might be wrapped around
  // before clipping, for example for result = 2147483648.00 (float).
  return quantization::clip<int64_t, DestTy>((int64_t)nearbyintf(result));
}

/// Converts floating point value \p input to \p DestTy (quantized type) based
/// on the quantization parameters \p TQP. The value is returned as int64.
inline int64_t quantize(float input, const TensorQuantizationParams &TQP,
                        ElemKind DestTy) {
  if (DestTy == ElemKind::Int8QTy) {
    return quantize<int8_t>(input, TQP);
  } else if (DestTy == ElemKind::Int16QTy) {
    return quantize<int16_t>(input, TQP);
  } else if (DestTy == ElemKind::Int32QTy) {
    return quantize<int32_t>(input, TQP);
  } else if (DestTy == ElemKind::Int64QTy) {
    return quantize<int64_t>(input, TQP);
  } else {
    llvm_unreachable("Precision not supported!");
  }
}

/// Converts a quantized value (type eTy) to floating point based on the
/// quantization parameters \p TQP.
/// Note: use int64_t to cover the 'symmetric int32 with unsigned' case.
template <class eTy = int8_t>
inline float dequantize(eTy input, const TensorQuantizationParams &TQP) {
  return TQP.scale * ((int64_t)input - TQP.offset);
}
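
// Round-trip sketch (illustrative parameters, int8 quantization):
//
//   TensorQuantizationParams tqp{0.05f, 10};
//   int8_t q = quantization::quantize<int8_t>(1.0f, tqp); // 1.0 / 0.05 + 10 = 30
//   float x = quantization::dequantize<int8_t>(q, tqp);   // 0.05 * (30 - 10) = 1.0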

/// Converts floating point value to DestTy (quantized type) based on the
/// quantization parameters \p scale and \p offset. If the destination type is
/// int8_t then an offset of 128 is subtracted to convert to int8_t. If the
/// destination type is int16_t then an offset of 32768 is subtracted to
/// convert to int16_t.
template <class DestTy>
inline DestTy quantizeWithFloatOffset(float input, float scale, float offset) {
  uint16_t d = static_cast<uint16_t>(std::round((input - offset) / scale));
  if (std::is_same<int8_t, DestTy>::value) {
    d -= 128;
  } else if (std::is_same<int16_t, DestTy>::value) {
    d -= 32768;
  }
  return static_cast<DestTy>(d);
}

/// Converts floating point value \p input to 4-bit quantization based on the
/// quantization parameters \p scale and \p offset.
inline uint8_t quantize4BitsWithFloatOffset(float input, float scale,
                                            float offset) {
  uint8_t d = std::max(
      0, std::min(static_cast<int>(std::round((input - offset) / scale)), 15));
  return d;
}

/// Converts a quantized value (type eTy) to floating point based on the
/// quantization parameters \p scale and \p offset. If the input type is
/// int8_t, then an offset of 128 is added to convert to uint8_t. If the input
/// type is int16_t, then an offset of 32768 is added to convert to uint16_t.
template <class eTy>
inline float dequantizeWithFloatOffset(eTy input, float scale, float offset) {
  uint16_t d = static_cast<uint16_t>(input);
  if (std::is_same<int8_t, eTy>::value) {
    d += 128;
  } else if (std::is_same<int16_t, eTy>::value) {
    d += 32768;
  }
  return (d * scale) + offset;
}

/// Converts a 4-bit quantized value, which is stored in \p input (MSB if \p
/// isMSB is true, otherwise LSB), to floating point based on the quantization
/// parameters \p scale and \p offset.
inline float dequantize4BitWithFloatOffset(uint8_t input, float scale,
                                           float offset, bool isMSB) {
  if (isMSB) {
    input >>= 4;
  }
  input &= 0x0f;
  return (input * scale) + offset;
}
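
// Packing sketch (illustrative values): two 4-bit codes share one byte, the
// even column in the LSB nibble and the odd column in the MSB nibble, as done
// by tensorFusedRowwiseQuantization() below:
//
//   uint8_t lo = quantize4BitsWithFloatOffset(0.3f, 0.1f, 0.0f); // 3
//   uint8_t hi = quantize4BitsWithFloatOffset(1.2f, 0.1f, 0.0f); // 12
//   uint8_t packed = lo | (hi << 4);                             // 0xC3
//   float f0 = dequantize4BitWithFloatOffset(packed, 0.1f, 0.0f,
//                                            /* isMSB */ false); // 0.3
//   float f1 = dequantize4BitWithFloatOffset(packed, 0.1f, 0.0f,
//                                            /* isMSB */ true);  // 1.2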

/// Converts a floating point \p tensor to quantized tensor based on the
/// quantization parameters \p TQP and \p Ty.
Tensor quantizeTensor(const Tensor &tensor, const TensorQuantizationParams &TQP,
                      ElemKind Ty = ElemKind::Int8QTy);

/// Converts quantized tensor \p tensor to a floating point tensor of type
/// \p floatKind.
Tensor dequantizeTensor(const Tensor &tensor, ElemKind floatKind);

/// Dequantize 4-bit fused quantized tensor \p input. \returns the float type
/// output.
Tensor tensor4BitsFusedRowwiseDequantization(const Tensor &input);

/// Convert the floating point quantization parameters \p scale and \p offset
/// into the integer sequence:
/// result = ((((input >> pre) * scale) + rounding) >> post) + offset,
/// which scales a 32-bit signed integer word into an 8-bit signed integer.
/// \returns the transformation parameters.
QuantizationTransform32To8 quantizeScaleOffset32To8(float scale,
                                                    int32_t offset);

/// Function to get the quantized range for a given precision type \p qTy.
/// \returns the range as a (min, max) pair.
QuantizedRange getQuantizedRange(ElemKind qTy);

/// Function to validate that the given quantization parameters \p qParams
/// comply with the given quantization \p schema and precision \p qTy.
void validateQuantizationParams(TensorQuantizationParams qParams, Schema schema,
                                ElemKind qTy);

/// Calculate the TensorQuantizationParams from the TensorProfilingParams
/// \p profParams using the quantization type \p qTy and the quantization
/// method described by \p schema. The calibration of the quantization
/// parameters will be done using the method given by \p calibration.
TensorQuantizationParams
chooseQuantizationParams(TensorProfilingParams profParams,
                         Schema schema = Asymmetric,
                         ElemKind qTy = ElemKind::Int8QTy,
                         Calibration calibration = Calibration::None);
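
// Example (illustrative profile): a profiled range of [-1.0, 3.0] quantized to
// int8 is handled differently by each schema:
//
//   TensorProfilingParams prof(-1.0f, 3.0f);
//   // Asymmetric: maps [-1.0, 3.0] onto [-128, 127]; offset is generally
//   // non-zero.
//   auto asym = chooseQuantizationParams(prof, Asymmetric, ElemKind::Int8QTy);
//   // Symmetric: widens the range to [-3.0, 3.0] so that offset == 0, at the
//   // cost of coarser resolution.
//   auto sym = chooseQuantizationParams(prof, Symmetric, ElemKind::Int8QTy);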

/// Function to specialize the TensorQuantizationParams of the bias operand
/// for nodes like Convolution and FullyConnected given the initially computed
/// parameters \p biasTQP and the parameters of the input \p inputTQP and the
/// weights \p weightsTQP, for given quantization schema \p schema and bias
/// type \p biasQTy. The parameter \p biasZero provides the information whether
/// the bias data is zero. The bias operand requires a more thoughtful
/// quantization since every bias value has a higher impact on the precision
/// of the output value than any particular weight value. The specialization
/// logic is:
/// - for INT32 bias quantization: since the dynamic range of INT32 is large we
///   can always force symmetric quantization (offset = 0). This allows a
///   faster implementation since no offset subtraction is required at
///   run-time.
/// - for INT8/INT16 bias quantization: since the dynamic range is small we
///   will keep the original offset.
/// - regardless of precision, we try to force the bias scale parameter to
///   bias_scale = input_scale * weights_scale since this has a performance
///   benefit by specializing the parameters to biasPre = 0, biasPost = 0,
///   biasScale = 1. We must verify that by changing the bias scale we don't
///   saturate the bias data. This is also equivalent to forcing the effective
///   scale applied at run-time (bias_scale / (input_scale * weights_scale))
///   to be always greater than or equal to 1.0, which is a common constraint
///   on the bias for most libraries with quantized implementations.
TensorQuantizationParams
specializeBiasQuantizationParams(const TensorQuantizationParams &biasTQP,
                                 const TensorQuantizationParams &inputTQP,
                                 const TensorQuantizationParams &weightsTQP,
                                 Schema schema, ElemKind biasQTy,
                                 bool biasZero = false);
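
// For example (illustrative scales): with input_scale = 0.02 and
// weights_scale = 0.5 the preferred bias scale is 0.02 * 0.5 = 0.01, which
// makes the effective run-time rescale bias_scale / (input_scale *
// weights_scale) exactly 1.0.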

/// Function similar to \ref specializeBiasQuantizationParams with the main
/// distinction that this function is also allowed to change the quantization
/// parameters of the weights. The modification is done in place. This function
/// is used for per-channel quantization. When the requested bias precision is
/// INT32 this function ensures that bias_scale = input_scale * weights_scale
/// while making sure the bias data is not saturated by changing both the bias
/// and weights quantization parameters.
void specializeBiasWeightsQuantizationParams(
    TensorQuantizationParams &biasTQP, const TensorQuantizationParams &inputTQP,
    TensorQuantizationParams &weightsTQP, Schema schema, ElemKind biasQTy,
    bool biasZero = false);

/// \returns an integer mapping from the \p inTy to the \p outTy given the
/// floating-point function \p func.
/// \pre inTy and outTy must be quantized types.
template <typename T = int8_t>
std::vector<T> createMapping(TypeRef inTy, TypeRef outTy,
                             std::function<float(float)> func) {
  assert(inTy->isQuantizedType() && "Input type must be quantized!");
  assert(outTy->isQuantizedType() && "Output type must be quantized!");
  assert(outTy->isType<T>() && "Output type must match template type!");

  // Calculate the step which will be added to the currInputVal repeatedly in
  // order to cover the input range of the input type.
  auto inputRange = inTy->getQuantizedValueRange();
  const float step = inTy->getQuantizedValueStep();
  float currInputVal = inputRange.first;

  // Calculate the output int value for each possible input value.
  std::vector<T> mapping(inTy->getQuantizedValueCount());
  TensorQuantizationParams outputTQP{outTy->getScale(), outTy->getOffset()};
  for (size_t i = 0; i < mapping.size(); i++, currInputVal += step) {
    float currOutputVal = func(currInputVal);
    mapping[i] = quantization::quantize<T>(currOutputVal, outputTQP);
  }
  return mapping;
}
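
// Usage sketch (assumes inTy and outTy are existing int8 quantized TypeRefs
// obtained elsewhere; the lambda is only an example):
//
//   std::vector<int8_t> table = quantization::createMapping(
//       inTy, outTy, [](float x) { return std::tanh(x); });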

/// Row-wise quantize the tensor \p input. \p scales and \p offsets are
/// generated from each row of \p input, and \p output is a tensor of the same
/// shape as \p input, quantized from \p input using \p scales and \p offsets
/// for each row. Note that the shape of input/output can be any non-zero
/// number of dimensions; row refers to all data in the first dimension of the
/// shape. Template parameters \p ScaleT and \p OffsetT represent the types to
/// use for the scales and offsets, respectively. Template parameter \p QP
/// represents the quantized precision, typically int8_t or uint8_t.
template <typename ScaleT, typename OffsetT, typename QP>
void tensorRowwiseQuantization(const Tensor &input, Tensor &output,
                               Tensor &scales, Tensor &offsets,
                               quantization::Schema schema) {
  constexpr bool offsetIsFP = std::is_same<float, OffsetT>::value ||
                              std::is_same<float16_t, OffsetT>::value;
  constexpr bool offsetIsInt32 = std::is_same<int32_t, OffsetT>::value;
  static_assert((offsetIsInt32 && std::is_same<float, ScaleT>::value) ||
                    (offsetIsFP && std::is_same<ScaleT, OffsetT>::value),
                "Invalid combination of Scale/Offset types.");

  const auto fDims = flattenCdr(input.dims());
  Tensor finalIn = input.getUnowned({fDims.first, fDims.second});
  Tensor finalOut = output.getUnowned({fDims.first, fDims.second});
  ShapeHW idim(finalIn.dims());

  auto srcH = finalIn.getHandle<float>();
  auto destH = finalOut.getHandle<QP>();
  auto scalesH = scales.getHandle<ScaleT>();
  auto offsetsH = offsets.getHandle<OffsetT>();
  for (dim_t i = 0; i < idim.height; i++) {
    auto slice = srcH.extractSlice(i);
    auto rSrc = slice.getHandle<float>();
    auto res = rSrc.minMaxArg();
    float min = rSrc.raw(res.first);
    float max = rSrc.raw(res.second);

    // Handle rowwise quantization for FCs.
    if (offsetIsInt32) {
      TensorQuantizationParams qParams =
          chooseQuantizationParams({min, max}, schema);
      for (dim_t j = 0; j < idim.width; j++) {
        destH.at({i, j}) = quantization::quantize(srcH.at({i, j}), qParams);
      }
      scalesH.raw(i) = qParams.scale;
      offsetsH.raw(i) = qParams.offset;
    } else if (offsetIsFP) {
      // Handle rowwise quantization for rowwise-quantized SLS.
      constexpr float kEqualityThreshold = 1e-10f;
      const float scale = ((max - min) < kEqualityThreshold)
                              ? 1.0
                              : ((double)max - (double)min) / 255.0;
      float offset = min;

      for (dim_t j = 0; j < idim.width; j++) {
        destH.at({i, j}) = quantization::quantizeWithFloatOffset<QP>(
            srcH.at({i, j}), scale, offset);
      }
      scalesH.raw(i) = static_cast<ScaleT>(scale);
      offsetsH.raw(i) = static_cast<OffsetT>(offset);
    } else {
      llvm_unreachable("Unsupported offset type.");
    }
  }
}
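
// Usage sketch (illustrative shapes; int32 offsets as used for rowwise
// quantized FullyConnected weights; the 1.0/0 scale/offset on the destination
// tensor are placeholders since the real per-row parameters are written into
// the scales and offsets tensors):
//
//   Tensor weights(ElemKind::FloatTy, {64, 128});
//   Tensor quantized(ElemKind::Int8QTy, {64, 128}, 1.0, 0);
//   Tensor scales(ElemKind::FloatTy, {64});
//   Tensor offsets(ElemKind::Int32ITy, {64});
//   quantization::tensorRowwiseQuantization<float, int32_t, int8_t>(
//       weights, quantized, scales, offsets, quantization::Schema::Asymmetric);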

/// Fused-rowwise quantize the tensor \p input. Scales and offsets are
/// generated from each row of \p input. This function supports both 8-bit
/// quantization (i.e. each quantized value uses 8 bits) and 4-bit quantization
/// (i.e. each quantized value uses 4 bits).
/// For 8-bit quantization, \p output is a tensor of the same shape as input
/// but with extra columns for storing the fused scale and offset. Template
/// parameter \p T represents the datatype used for storing the scale and
/// offset in the row:
/// |     ....  int8 data  ...   |   scale   |  offset   |
/// | num_of_input_columns * 1B  | sizeof(T) | sizeof(T) |
/// For 4-bit quantization, 1 byte of \p output holds 2 quantized values.
/// Template parameter \p T here can be either float or float16_t.
/// |     ....  int4 data  ...    |   scale   |  offset   |
/// | num_of_input_columns * 0.5B | sizeof(T) | sizeof(T) |
/// \pre input.dims().size() == 2
/// \pre output.dims().size() == 2
/// For 8-bit quantization:
/// \pre input.dims()[1] + 2 * sizeof(T) == output.dims()[1]
/// For 4-bit quantization:
/// \pre input.dims()[1] % 2 == 0
/// \pre input.dims()[1] / 2 + 2 * sizeof(T) == output.dims()[1]
template <typename T>
void tensorFusedRowwiseQuantization(const Tensor &input, Tensor &output) {
  // We are fusing the scale and offset onto the end of each row. Thus input
  // and output must both be 2 dimensional, with output having 2 * sizeof(T)
  // extra columns for the scale and offset.
  auto outputType = output.getElementType();
  assert(input.dims().size() == 2 && output.dims().size() == 2 &&
         "Input and output must be 2 dimensional.");
  if (outputType == ElemKind::UInt8FusedFP16QTy ||
      outputType == ElemKind::UInt8FusedQTy) {
    assert(input.dims()[1] + 2 * sizeof(T) == output.dims()[1] &&
           "Output must have 2 * sizeof(T) more columns than input for 8-bit "
           "quantization.");
  } else if (outputType == ElemKind::UInt4FusedFP16QTy ||
             outputType == ElemKind::UInt4FusedQTy) {
    assert(input.dims()[1] % 2 == 0 &&
           "4-bit fused quantization requires the number of input columns to "
           "be a multiple of 2.");
    assert(input.dims()[1] / 2 + 2 * sizeof(T) == output.dims()[1] &&
           "Output must have 2 * sizeof(T) more columns than half the input "
           "columns for 4-bit quantization.");
  }

  auto srcH = input.getHandle<float>();
  auto destH = output.getHandle<uint8_t>();
  for (dim_t i = 0, e = input.dims()[0]; i < e; i++) {
    auto slice = srcH.extractSlice(i);
    auto rSrc = slice.getHandle<float>();
    auto res = rSrc.minMaxArg();
    float min = rSrc.raw(res.first);
    float max = rSrc.raw(res.second);

    float range;
    switch (outputType) {
    case ElemKind::UInt8FusedQTy:
    case ElemKind::UInt8FusedFP16QTy:
      range = 255.0;
      break;
    case ElemKind::UInt4FusedFP16QTy:
    case ElemKind::UInt4FusedQTy:
      range = 15.0;
      break;
    default:
      llvm_unreachable("Not yet supported");
    }

    // This matches the Caffe2 implementation for FloatToRowwiseQuantized8BitsOp
    // found in operators/lengths_reducer_rowwise_8bit_ops.h.
    constexpr float kEqualityThreshold = 1e-10f;
    const float scale = ((max - min) < kEqualityThreshold)
                            ? 1.0
                            : ((double)max - (double)min) / range;
    const float offset = min;

    for (dim_t j = 0, f = input.dims()[1]; j < f; j++) {
      if (outputType == ElemKind::UInt8FusedFP16QTy ||
          outputType == ElemKind::UInt8FusedQTy) {
        destH.at({i, j}) = quantization::quantizeWithFloatOffset<uint8_t>(
            srcH.at({i, j}), scale, offset);
      } else if (outputType == ElemKind::UInt4FusedFP16QTy ||
                 outputType == ElemKind::UInt4FusedQTy) {
        uint8_t quantized = quantization::quantize4BitsWithFloatOffset(
            srcH.at({i, j}), scale, offset);
        if (j % 2 == 0) {
          // Even columns use the LSB 4 bits.
          destH.at({i, j / 2}) = quantized;
        } else {
          // Odd columns use the MSB 4 bits.
          destH.at({i, j / 2}) |= quantized << 4;
        }
      } else {
        llvm_unreachable("Not yet supported");
      }
    }

    // Now set the scale/offset at the end of each row.
    destH.setFusedScaleOffsetInRow<T>(i, scale, offset);
  }
}
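
// Usage sketch (illustrative sizes; UInt8FusedQTy fuses a float scale and
// offset at the end of each row, so T = float here; the 1.0/0 scale/offset on
// the fused tensor are placeholders):
//
//   Tensor data(ElemKind::FloatTy, {8, 16});
//   Tensor fused(ElemKind::UInt8FusedQTy,
//                {8, 16 + 2 * (dim_t)sizeof(float)}, 1.0, 0);
//   quantization::tensorFusedRowwiseQuantization<float>(data, fused);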

/// Generic function to compute the quantization parameters for an input
/// floating-point tensor \p tensor with given schema \p qSchema and type
/// \p qTy. A separate set of quantization parameters (scale, offset) will
/// be computed for each group of \p qStep indices along the \p qDim dimension.
/// This allows quantizing a given tensor with finer granularity (e.g. rowwise
/// or channelwise).
/// For example, for a tensor of size [4, 6, 8, 10], qDim = 1 and qStep = 3:
/// -> one set of quantization parameters will be computed for [:,0:2,:,:].
/// -> one set of quantization parameters will be computed for [:,3:5,:,:].
/// The number of sets of computed quantization parameters (scale, offset) is
/// tensor.dims()[qDim] / qStep. \returns the set of quantization parameters.
std::vector<TensorQuantizationParams>
getTensorQuantizationParams(const Tensor &tensor, Schema qSchema = Asymmetric,
                            ElemKind qTy = ElemKind::Int8QTy, dim_t qDim = 0,
                            dim_t qStep = 1);

/// Similar function to the one above with the difference that the quantization
/// parameters scales and offsets are written into separate tensors \p scales
/// and \p offsets which are assumed allocated with the correct type and size.
void getTensorQuantizationParams(const Tensor &tensor, Tensor &scales,
                                 Tensor &offsets, Schema qSchema = Asymmetric,
                                 ElemKind qTy = ElemKind::Int8QTy,
                                 dim_t qDim = 0, dim_t qStep = 1);

/// Generic function to quantize a given input floating-point tensor \p tensor
/// with given tensor quantization parameters \p TQP and type \p qTy. A
/// separate set of quantization parameters (scale, offset) is provided for
/// each group of \p qStep indices along the \p qDim dimension and can be
/// obtained using the function \ref getTensorQuantizationParams. This allows
/// quantizing a given tensor with finer granularity (e.g. rowwise or
/// channelwise).
/// For example, for a tensor of size [4, 6, 8, 10], qDim = 1 and qStep = 3:
/// -> one set of quantization parameters will be provided for [:,0:2,:,:].
/// -> one set of quantization parameters will be provided for [:,3:5,:,:].
/// The number of sets of provided quantization parameters (scale, offset) is
/// tensor.dims()[qDim] / qStep. \returns the quantized tensor.
Tensor quantizeTensor(const Tensor &tensor,
                      llvm::ArrayRef<TensorQuantizationParams> TQP,
                      ElemKind qTy = ElemKind::Int8QTy, dim_t qDim = 0,
                      dim_t qStep = 1);

/// Similar function to the one above with the difference that the quantization
/// parameters scales and offsets are loaded from separate tensors \p scales
/// and \p offsets.
Tensor quantizeTensor(const Tensor &tensor, const Tensor &scales,
                      const Tensor &offsets, ElemKind qTy = ElemKind::Int8QTy,
                      dim_t qDim = 0, dim_t qStep = 1);
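
// Channelwise sketch (illustrative filter shape; one set of quantization
// parameters per output channel along dimension 0):
//
//   Tensor filter(ElemKind::FloatTy, {32, 3, 3, 3});
//   auto TQPs = quantization::getTensorQuantizationParams(
//       filter, quantization::Schema::Symmetric, ElemKind::Int8QTy,
//       /* qDim */ 0, /* qStep */ 1);
//   Tensor qFilter = quantization::quantizeTensor(
//       filter, TQPs, ElemKind::Int8QTy, /* qDim */ 0, /* qStep */ 1);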

/// Verify if float is an exact power of 2 (mantissa is exactly 1.0).
bool isFloatPowerOf2(float val);

/// Get float 2's exponent.
int getFloat2Exp(float val);
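
// For example, isFloatPowerOf2(0.25f) is true and getFloat2Exp(0.25f) is -2,
// while isFloatPowerOf2(0.3f) is false.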

} // namespace quantization
} // namespace glow

#endif // GLOW_QUANTIZATION_BASE_BASE_H