/**
 * Copyright (c) Glow Contributors. See CONTRIBUTORS file.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "glow/Quantization/Base/Base.h"
#include "glow/Base/Tensor.h"
#include "glow/Quantization/Base/Calibration.h"
#include "glow/Quantization/Base/Profile.h"

#include <cmath>

namespace glow {
namespace quantization {

float getTensorAverageValue(const TensorProfilingParams &profParams) {
  size_t numBins = profParams.histogram.size();
  assert(numBins > 0 && "Histogram is empty!");
  float histDelta = (profParams.max - profParams.min) / (float)(numBins);
  float histOff = profParams.min + histDelta / 2.0;
  float histAvg = 0.0;
  float histSum = 0.0;
  for (size_t idx = 0; idx < numBins; ++idx) {
    float histBinCenter = histOff + histDelta * (float)idx;
    float histBinCount = profParams.histogram[idx];
    histAvg += histBinCenter * histBinCount;
    histSum += histBinCount;
  }
  histAvg /= histSum;
  return histAvg;
}
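
// Example (illustrative sketch, with assumed profile values): for a profile
// with min = 0.0, max = 2.0 and histogram {1, 3}, the bin width is 1.0 and the
// two bin centers are 0.5 and 1.5, so the returned average is
// (0.5 * 1 + 1.5 * 3) / (1 + 3) = 1.25.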

template <class eTy = int8_t>
static void quantizeTensorUtil(Tensor *dest, const Tensor &src) {
  auto destH = dest->getHandle<eTy>();
  TensorQuantizationParams TQP{dest->getType().getScale(),
                               dest->getType().getOffset()};
  switch (src.getElementType()) {
  case ElemKind::FloatTy: {
    auto srcHandle = src.getHandle<float>();
    for (size_t i = 0, e = destH.size(); i < e; ++i) {
      destH.raw(i) = quantization::quantize<eTy>(
          static_cast<float>(srcHandle.raw(i)), TQP);
    }
    break;
  }
  case ElemKind::Float16Ty: {
    auto srcHandle = src.getHandle<float16_t>();
    for (size_t i = 0, e = destH.size(); i < e; ++i) {
      destH.raw(i) = quantization::quantize<eTy>(
          static_cast<float>(srcHandle.raw(i)), TQP);
    }
    break;
  }
  case ElemKind::BFloat16Ty: {
    auto srcHandle = src.getHandle<bfloat16_t>();
    for (size_t i = 0, e = destH.size(); i < e; ++i) {
      destH.raw(i) = quantization::quantize<eTy>(
          static_cast<float>(srcHandle.raw(i)), TQP);
    }
    break;
  }
  default:
    llvm_unreachable("Cannot quantize a type");
  }
}

Tensor quantizeTensor(const Tensor &tensor, const TensorQuantizationParams &TQP,
                      ElemKind Ty) {
  Tensor tmp(Ty, tensor.dims(), TQP.scale, TQP.offset);
  assert(tensor.getType().isFPType() && "Type not supported yet");
  if (Ty == ElemKind::Int8QTy) {
    quantizeTensorUtil<int8_t>(&tmp, tensor);
  } else if (Ty == ElemKind::UInt8QTy) {
    quantizeTensorUtil<uint8_t>(&tmp, tensor);
  } else if (Ty == ElemKind::Int16QTy) {
    quantizeTensorUtil<int16_t>(&tmp, tensor);
  } else if (Ty == ElemKind::Int32QTy) {
    quantizeTensorUtil<int32_t>(&tmp, tensor);
  } else {
    llvm_unreachable("Quantized type not supported");
  }
  return tmp;
}
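
// Example (illustrative sketch only; the tensor values and quantization
// parameters below are assumptions chosen for the example): quantizing a small
// float tensor to Int8QTy with scale 0.5 and offset 0.
//
//   Tensor data(ElemKind::FloatTy, {4});
//   auto dataH = data.getHandle<float>();
//   dataH.raw(0) = 0.0f;
//   dataH.raw(1) = 1.0f;
//   dataH.raw(2) = -1.0f;
//   dataH.raw(3) = 2.5f;
//   TensorQuantizationParams exampleTQP{0.5f, 0};
//   Tensor q = quantizeTensor(data, exampleTQP, ElemKind::Int8QTy);
//   // Each element maps to round(x / 0.5) + 0, clipped to [-128, 127],
//   // so q holds {0, 2, -2, 5}.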

template <class eTy = int8_t>
static void dequantizeTensorUtil(Tensor *dest, const Tensor &src) {
  TensorQuantizationParams TQP{src.getType().getScale(),
                               src.getType().getOffset()};
  auto srcHandle = src.getHandle<eTy>();
  switch (dest->getElementType()) {
  case ElemKind::FloatTy: {
    auto destH = dest->getHandle<float>();
    for (size_t i = 0, e = destH.size(); i < e; ++i) {
      destH.raw(i) = quantization::dequantize<eTy>(
          static_cast<eTy>(srcHandle.raw(i)), TQP);
    }
    break;
  }
  case ElemKind::Float16Ty: {
    auto destH = dest->getHandle<float16_t>();
    for (size_t i = 0, e = destH.size(); i < e; ++i) {
      destH.raw(i) = quantization::dequantize<eTy>(
          static_cast<eTy>(srcHandle.raw(i)), TQP);
    }
    break;
  }
  case ElemKind::BFloat16Ty: {
    auto destH = dest->getHandle<bfloat16_t>();
    for (size_t i = 0, e = destH.size(); i < e; ++i) {
      destH.raw(i) = quantization::dequantize<eTy>(
          static_cast<eTy>(srcHandle.raw(i)), TQP);
    }
    break;
  }
  default:
    llvm_unreachable("Cannot dequantize to the given type");
  }
}

/// Helper for dequantizing UInt8FusedQTy \p src to \p dest.
template <typename DeqElemTy, typename ScaleOffsetTy>
static void dequantizeFusedRowwiseTensorUtil(Tensor &dest, const Tensor &src) {
  auto dims = dest.dims();
  auto srcH = src.getHandle<uint8_t>();
  auto destH = dest.getHandle<DeqElemTy>();
  for (dim_t i = 0, e = dims[0]; i < e; ++i) {
    float scale, offset;
    std::tie(scale, offset) = srcH.getFusedScaleOffsetFromRow<ScaleOffsetTy>(i);
    for (dim_t j = 0, f = dims[1]; j < f; ++j) {
      destH.at({i, j}) =
          static_cast<DeqElemTy>(quantization::dequantizeWithFloatOffset(
              srcH.at({i, j}), scale, offset));
    }
  }
}

Tensor dequantizeTensor(const Tensor &tensor, ElemKind floatKind) {
  assert(isFloatElemKind(floatKind) &&
         "Unsupported output floating point type");
  auto Ty = tensor.getType().getElementType();

  if (Ty == ElemKind::UInt8FusedQTy || Ty == ElemKind::UInt8FusedFP16QTy) {
    const bool scaleOffsetFP16 = Ty == ElemKind::UInt8FusedFP16QTy;
    const dim_t scaleOffsetSize =
        scaleOffsetFP16 ? sizeof(float16_t) : sizeof(float);
    assert(tensor.dims().size() == 2 && "Fused tensors should be 2D");
    assert(tensor.dims()[1] > 2 * scaleOffsetSize &&
           "Expected space for per-row scale/offset");
    Tensor tmp(floatKind, {tensor.dims()[0],
                           tensor.dims()[1] - (dim_t)(2 * scaleOffsetSize)});
    switch (floatKind) {
    case ElemKind::FloatTy:
      if (scaleOffsetFP16) {
        dequantizeFusedRowwiseTensorUtil<float, float16_t>(tmp, tensor);
      } else {
        dequantizeFusedRowwiseTensorUtil<float, float>(tmp, tensor);
      }
      break;
    case ElemKind::Float16Ty:
      if (scaleOffsetFP16) {
        dequantizeFusedRowwiseTensorUtil<float16_t, float16_t>(tmp, tensor);
      } else {
        dequantizeFusedRowwiseTensorUtil<float16_t, float>(tmp, tensor);
      }
      break;
    default:
      llvm_unreachable("Cannot dequantize to the given type");
    }
    return tmp;
  }

  Tensor tmp(floatKind, tensor.dims());
  if (Ty == ElemKind::Int8QTy) {
    dequantizeTensorUtil<int8_t>(&tmp, tensor);
  } else if (Ty == ElemKind::UInt8QTy) {
    dequantizeTensorUtil<uint8_t>(&tmp, tensor);
  } else if (Ty == ElemKind::Int16QTy) {
    dequantizeTensorUtil<int16_t>(&tmp, tensor);
  } else if (Ty == ElemKind::Int32QTy) {
    dequantizeTensorUtil<int32_t>(&tmp, tensor);
  } else {
    llvm_unreachable("Input quantized type not supported");
  }
  return tmp;
}
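
// Example (illustrative sketch, with assumed values): a quantize/dequantize
// round trip. With scale 0.5 and offset 0, the float value 2.5f quantizes to
// the int8 value 5, and dequantization applies scale * (X - offset) = 2.5f.
//
//   Tensor data(ElemKind::FloatTy, {1});
//   data.getHandle<float>().raw(0) = 2.5f;
//   TensorQuantizationParams exampleTQP{0.5f, 0};
//   Tensor q = quantizeTensor(data, exampleTQP, ElemKind::Int8QTy);
//   Tensor d = dequantizeTensor(q, ElemKind::FloatTy);
//   // d.getHandle<float>().raw(0) is exactly 2.5f here because 2.5 is an
//   // integer multiple of the scale.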

template <class T = float16_t>
static Tensor tensor4BitsFusedRowwiseDequantizationUtil(const Tensor &input) {
  assert(input.dims().size() == 2 && "Input must be 2 dimensional.");
  // The output tensor has the same number of rows as the input tensor. Each
  // quantized row is laid out as:
  //   | 4-bit quantized data | T scale | T offset |
  // so the number of dequantized float columns is
  // (input.dims()[1] - 2 * sizeof(T)) * 2.
  Tensor output(
      ElemKind::FloatTy,
      {input.dims()[0], (dim_t)(input.dims()[1] - 2 * sizeof(T)) * 2});
  auto srcH = input.getHandle<uint8_t>();
  auto destH = output.getHandle<float>();
  for (dim_t i = 0; i < input.dims()[0]; i++) {
    T scale, offset;
    std::tie(scale, offset) = srcH.getFusedScaleOffsetFromRow<T>(i);
    for (dim_t j = 0; j < output.dims()[1]; j++) {
      bool isMSB = (j % 2 == 1);
      destH.at({i, j}) = dequantize4BitWithFloatOffset(
          srcH.at({i, j / 2}), static_cast<float>(scale),
          static_cast<float>(offset), isMSB);
    }
  }
  return output;
}

Tensor tensor4BitsFusedRowwiseDequantization(const Tensor &input) {
  auto Ty = input.getType().getElementType();
  assert((Ty == ElemKind::UInt4FusedFP16QTy || Ty == ElemKind::UInt4FusedQTy) &&
         "Unsupported 4-bit fused rowwise quantization type.");
  if (Ty == ElemKind::UInt4FusedFP16QTy) {
    return tensor4BitsFusedRowwiseDequantizationUtil<float16_t>(input);
  }
  return tensor4BitsFusedRowwiseDequantizationUtil<float>(input);
}
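
// Example (illustrative, with an assumed shape): for a UInt4FusedFP16QTy input
// of shape {N, 10}, each row holds 6 bytes of packed 4-bit data followed by a
// float16_t scale and a float16_t offset (2 bytes each). The dequantized
// output therefore has shape {N, (10 - 2 * sizeof(float16_t)) * 2} = {N, 12},
// since every packed byte yields two 4-bit values (low nibble first).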

QuantizationTransform32To8 quantizeScaleOffset32To8(float scale,
                                                    int32_t offset) {
  // In this function we compute an efficient way to convert signed 32-bit
  // integers into signed 8-bit integers without the use of floating-point
  // multiplication. Instead, we represent the original calculation:
  //
  //   result = (x * scale + offset)
  //
  // as the following sequence of integer calculations:
  //
  //   (((x >> pre_scale) * integer_scale) >> post_scale) + offset
  //
  // This function converts the floating-point scale and offset values to the
  // constants in the integer formula.
  //
  // In this method we assume that any signed 32-bit integer in the input word
  // must be mapped into an 8-bit integer. If the scale factor is 2X, then the
  // number 1000 won't be a legal input because after scaling the result would
  // fall outside of the signed 8-bit range. Any 32-bit number that falls
  // outside of the signed 8-bit output range will be clipped. This gives us
  // the ability to perform 32-bit arithmetic, as explained below.
  //
  // We can't accurately represent fractional scales (in the range zero to
  // one), because the lowest integer multiplication value is one. For example,
  // the scaling factor 0.25 would have to be represented as an integer
  // multiplication by either zero or one, which would result in a highly
  // inaccurate output. Similarly, rounding the scaling factor of 1.6 to 2.0
  // would produce inaccurate results because we would drop a significant part
  // of the number.
  //
  // The solution here is to scale (increase in size) the signed integer
  // scalar, and divide the result by shifting it to the right. For example,
  // the floating-point scalar 0.41 is multiplied by 32x (to 13.12, rounded to
  // 13). Then the signed 32-bit integer input is multiplied by 13, and then
  // shifted 5 times to the right (to shrink the result back). The output of
  // this calculation is (13.0 / 32), which is about ~0.4.
  //
  // This approach works well for some scale values. Notice that the modified
  // integer multiplication requires more bits because the intermediate result
  // is larger. Notice that it's always safe to promote the scalar value from a
  // fraction up to one. When you multiply by the integer value one, the
  // intermediate result does not overflow (does not require more bits).
  //
  // It is actually always safe to perform 16-bit post-multiplication
  // right-shifts. Let's consider two cases. If the value of the floating-point
  // scale is greater than 1.0 then we know that at most 8 of the 32-bits in
  // the input register are used, because the result must fit in 8-bits. The
  // result of 8-bit times 8-bit multiplication is 16-bits, which leaves
  // another 16 bits that are unused. We can use these 16-bits to increase the
  // size of the integer scale, and shift the result, as described above,
  // without overflowing the register.
  // The second case is where the scalar value is smaller than 1.0.
  // Multiplication of any number by zero or one does not increase the number
  // of bits which are used by the number.
  //
  // Now, we need to consider another problem. In the previous section we
  // described how we scaled small fractions into a number that's close to one.
  // But scaling to around 1.0 is not accurate enough. Rounding a scale factor
  // like 0.6 to an integer would give a very high error rate. Generally, we
  // can't increase the size of the integer multiplier without a limit because
  // this would overflow large values that are close to the upper signed 32-bit
  // limit.
  //
  // To solve the accuracy problem we need to continue to increase the size of
  // the integer scalar without overflowing the signed 32-bit register.
  // The solution here is to perform a right-shift on the input, in addition to
  // the one on the output. The idea here is that by performing the
  // post-multiplication right-shift we pick the high bits from the result of
  // the multiplication, and the low bits are ignored. This means that we can
  // continue to increase the size of the integer multiplier and continue to
  // increase the accuracy of the calculation by pre-shifting the 32-bit input.
  // Shifting the input to the right would flip some input bits to zero, but
  // the accuracy loss would be minimal.
  //
  // If the floating-point scale factor is small then the used input range
  // spans only a small part of the 32-bit word. For example, a scale factor of
  // 0.125 (1/8) maps a range of 8 + 3 bits into the signed 8-bit result. This
  // means that we can shift as much as 32-11 bits without overflowing the
  // register. This is a net win because we get to increase the accuracy of the
  // floating-point scale factor. For very small scale factors, the used range
  // is very large and can take up the whole 32-bit register, so overflow is a
  // real problem. Here we can use the post-shift value to estimate how many
  // bits will be discarded after the multiplication operation and figure out
  // how many bits we can take from the bottom of the input word by shifting it
  // to the right, adding more precision to the integer scale multiplier.
  int preShift = 0;
  int postShift = 0;

  // We treat first the particular case when scale is a power of 2 (2 ^ exp,
  // where exp is a signed integer exponent). The operation is specialized as:
  // - for positive 2's exponent:
  //     x * scale + offset (pre = 0, post = 0, scale = (int)scale).
  // - for negative 2's exponent:
  //     x >> post + offset (pre = 0, post = -exp, scale = 1).
  if (isFloatPowerOf2(scale)) {
    int exp = getFloat2Exp(scale);
    if (exp > 0) {
      return QuantizationTransform32To8(0,                       // pre
                                        0,                       // post
                                        static_cast<int>(scale), // scale
                                        offset);                 // offset
    } else {
      return QuantizationTransform32To8(0,       // pre
                                        -exp,    // post
                                        1,       // scale
                                        offset); // offset
    }
  }

  // Calculate the post-shift value. It's always safe to increase scale as long
  // as it's below one, and it's always legal to shift at least 15 bits for
  // small scale values.
  while (scale < 0.5 || (scale < 256 && postShift < 15)) {
    scale *= 2;
    postShift++;
  }

  // Calculate the pre-multiplication shift. Estimate how many bits we can take
  // from the input number and pass to the integer scale.
  while (scale < 255 && preShift < (postShift / 2)) {
    scale *= 2;
    preShift++;
  }

  return QuantizationTransform32To8(preShift, postShift, std::round(scale),
                                    offset);
}
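
// Worked example (illustrative, following the loops above): for scale = 0.41
// the post-shift loop doubles the scale ten times (0.41 * 2^10 = 419.84), at
// which point it is no longer below 256, so postShift = 10 and preShift stays
// 0 because the scale already exceeds 255. The integer scale rounds to 420,
// and the transform computes ((x * 420) >> 10) + offset, i.e. an effective
// scale of 420 / 1024 = 0.41015625, close to the requested 0.41.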

QuantizedRange getQuantizedRange(ElemKind qTy) {
  // Pick int64_t in order to cover the uint32_t range.
  int64_t qmin;
  int64_t qmax;

  switch (qTy) {
  case ElemKind::Int8QTy: {
    qmin = std::numeric_limits<int8_t>::min();
    qmax = std::numeric_limits<int8_t>::max();
    break;
  }
  case ElemKind::UInt8QTy: {
    qmin = std::numeric_limits<uint8_t>::min();
    qmax = std::numeric_limits<uint8_t>::max();
    break;
  }
  case ElemKind::Int16QTy: {
    qmin = std::numeric_limits<int16_t>::min();
    qmax = std::numeric_limits<int16_t>::max();
    break;
  }
  case ElemKind::Int32QTy: {
    // A corner case is when quantizing the bias tensor which is later used in
    // arithmetic computations as (int32)(bias[idx] - biasOffset) (e.g. in the
    // LIBJIT function "libjit_scale_i32i8"). To avoid overflow we must restrict
    // the quantization range such that the subtraction result fits in int32.
    // Since both bias[idx] and biasOffset are within the range [qmin, qmax] we
    // will impose: min(int32) <= qmin - qmax and qmax - qmin <= max(int32). In
    // other words we will restrict the quantized dynamic range to int31.
    // Furthermore, since scale is computed as scale = (max - min) / (qmax -
    // qmin) where (qmax - qmin) is large (~2^31), the scale computation has
    // large errors. We will further limit the quantized range to int30 (one
    // extra bit) in order for the computed scale to provide safe quantization
    // within the intended range.
    qmin = std::numeric_limits<int32_t>::min() >> 2;
    qmax = std::numeric_limits<int32_t>::max() >> 2;
    break;
  }
  default:
    llvm_unreachable("Quantized type not supported");
  }
  return QuantizedRange(qmin, qmax);
}
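
// For reference (an illustrative note; the values follow directly from the
// shifts above): the Int32QTy case yields qmin = -2^31 >> 2 = -536870912 and
// qmax = (2^31 - 1) >> 2 = 536870911, i.e. roughly an int30 range, while
// Int8QTy yields the usual [-128, 127].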

void validateQuantizationParams(TensorQuantizationParams qParams, Schema schema,
                                ElemKind qTy) {

  // Get the quantized range.
  auto minMaxPair = getQuantizedRange(qTy);
  int64_t qmin = minMaxPair.first;
  int64_t qmax = minMaxPair.second;

  // Validate params.
  (void)(qmin);
  (void)(qmax);
  assert((qmin <= qParams.offset) && (qParams.offset <= qmax) &&
         "The offset must be within the quantized range");
  if (schema == quantization::Schema::Symmetric) {
    assert((qParams.offset == 0) &&
           "Symmetric quantization should have offset 0");
  } else if (schema == quantization::Schema::SymmetricWithUnsigned) {
    assert((qParams.offset == qmin || qParams.offset == 0) &&
           "SymmetricWithUnsigned quantization should have offset 0 or qmin");
  } else if (schema == quantization::Schema::SymmetricWithPower2Scale) {
    assert((qParams.offset == 0) &&
           "SymmetricWithPower2Scale quantization should have offset 0");
    assert(isFloatPowerOf2(qParams.scale) &&
           "SymmetricWithPower2Scale quantization parameter should be a power "
           "of 2");
  }
}

TensorQuantizationParams
chooseQuantizationParams(TensorProfilingParams profParams, Schema schema,
                         ElemKind qTy, Calibration calibration) {
  float min = profParams.min;
  float max = profParams.max;
  assert(min <= max && "min must not be bigger than max");

  // Get the quantized range.
  auto minMaxPair = getQuantizedRange(qTy);
  int64_t qmin = minMaxPair.first;
  int64_t qmax = minMaxPair.second;

  // We extend the [min, max] interval to ensure that it contains 0.
  // Otherwise, we would not meet the requirement that 0 be an exactly
  // representable value.
  min = std::min(min, 0.f);
  max = std::max(max, 0.f);

  if (schema == quantization::Schema::SymmetricWithUnsigned) {
    // Check if the range we try to encode is purely positive.
    // If not, we cannot use the unsigned mapping and we fall back
    // to the symmetric schema.
    if (min >= 0.f) {
      // By construction, zero is always part of our range.
      // Since min is >= 0 and 0 is in our range, min is
      // actually zero.
      // Therefore zero is going to be mapped to the first
      // element of the quantized range qmin and thus the
      // offset is going to be qmin.
      assert(min <= std::numeric_limits<float>::epsilon() &&
             "Our range should start at zero");
    } else {
      schema = quantization::Schema::Symmetric;
    }
  }
  if (schema == quantization::Schema::Symmetric ||
      schema == quantization::Schema::SymmetricWithPower2Scale) {
    // Check which end saturates the output dynamic range earlier
    // and extend the other end to map the zero-point to quantized 0.
    assert(qmin < 0 && "Symmetric schema incompatible with unsigned range");
    double rmin = min / (double)qmin;
    double rmax = max / (double)qmax;
    if (rmin > rmax) {
      max = rmin * qmax;
    } else {
      min = rmax * qmin;
    }
  }

  min = std::max(min, std::numeric_limits<float>::lowest());
  max = std::min(max, std::numeric_limits<float>::max());

  // Calibrate the min/max range (for non-zero ranges only).
  if ((profParams.min != profParams.max) && (min != max) &&
      (calibration == Calibration::KLMinimization)) {

    // Rescale the profiled histogram with the new constrained min/max range.
    auto histRescaled = rescaleHistogram(profParams.histogram, profParams.min,
                                         profParams.max, min, max);

    // Number of quantized bins. Default value from TVM / MXNet.
    const size_t numQuantizedBins = 255;

    // Set symmetric only if the schema is Symmetric or
    // SymmetricWithPower2Scale.
    const bool symmetric =
        (schema == quantization::Schema::Symmetric ||
         schema == quantization::Schema::SymmetricWithPower2Scale);

    // Optimize the range.
    FloatRange rangeOpt =
        optimizeKL(histRescaled, min, max, numQuantizedBins, symmetric);

    // Update the min/max range with the optimized range.
    min = rangeOpt.first;
    max = rangeOpt.second;
  }

  // Compute scale.
  double scale = ((double)max - min) / ((double)qmax - qmin);

  // Dequantization uses the formula scale * (X - offset), so the scale
  // should not be equal to zero.
  // If the scale is 0, we arbitrarily adjust it to 0.1.
  if (scale == 0) {
    scale = 0.1;
  }

  assert(scale > 0 && "Scale must be positive");

  // Zero-point computation.
  // First the initial floating-point computation. The zero-point can be
  // determined from solving an affine equation for any known pair
  // (real value, corresponding quantized value).
  // We know two such pairs: (rmin, qmin) and (rmax, qmax).
  // The arithmetic error on the zero point computed from either pair
  // will be roughly machine_epsilon * (sum of absolute values of terms)
  // so we want to use the variant that adds the smaller terms.
  double zeroPointFromMin = qmin - min / scale;
  double zeroPointFromMax = qmax - max / scale;
  double zeroPointFromMinError = std::abs(qmin) + std::abs(min / scale);
  double zeroPointFromMaxError = std::abs(qmax) + std::abs(max / scale);
  double initialZeroPoint = zeroPointFromMinError < zeroPointFromMaxError
                                ? zeroPointFromMin
                                : zeroPointFromMax;

  // For symmetric quantization, if min == -max, force the zero point to be 0.
  float difference = std::abs(max + min);
  if (difference <= std::numeric_limits<float>::epsilon()) {
    initialZeroPoint = 0;
  }

  // Now we need to nudge the zero point to be an integer (our zero points are
  // integer, and this is motivated by the requirement to be able to represent
  // the real value "0" exactly as a quantized value, which is required in
  // multiple places, for example in Im2col with SAME padding).
  int32_t nudgedZeroPoint = 0;
  if (initialZeroPoint < qmin) {
    nudgedZeroPoint = qmin;
  } else if (initialZeroPoint > qmax) {
    nudgedZeroPoint = qmax;
  } else {
    nudgedZeroPoint = static_cast<int32_t>(round(initialZeroPoint));
  }

  // For SymmetricWithPower2Scale, round scale to the nearest higher power of 2.
  if (schema == quantization::Schema::SymmetricWithPower2Scale) {
    scale = std::exp2(std::ceil(std::log2(scale)));
  }

  TensorQuantizationParams result{static_cast<float>(scale), nudgedZeroPoint};
  validateQuantizationParams(result, schema, qTy);
  return result;
}
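
// Example (illustrative, approximate numbers; the Asymmetric schema below is
// just one possible choice): for a profiled range of [-1.0, 6.0] quantized to
// Int8QTy with no calibration, the scale is (6.0 - (-1.0)) / (127 - (-128)) =
// 7 / 255 ~= 0.0275, and the zero point nudges to -92, so the real value 0.0
// maps exactly to the quantized value -92.
//
//   TensorQuantizationParams tqp = chooseQuantizationParams(
//       {-1.0, 6.0}, quantization::Schema::Asymmetric, ElemKind::Int8QTy);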

TensorQuantizationParams
specializeBiasQuantizationParams(const TensorQuantizationParams &biasTQP,
                                 const TensorQuantizationParams &inputTQP,
                                 const TensorQuantizationParams &weightsTQP,
                                 Schema schema, ElemKind biasQTy,
                                 bool biasZero) {
  // Choose bias offset. For int32 bias we always force offset 0 in order
  // to simplify the implementation since the dynamic range allows it.
  int32_t biasOffset = biasTQP.offset;
  if (biasQTy == ElemKind::Int32QTy) {
    biasOffset = 0;
  }
  // Choose bias scale. We try to force the bias scale value to the product
  // inputScale * weightsScale but only if the resulting scale is larger
  // (in order to avoid bias data saturation).
  float inputScale = inputTQP.scale;
  float weightsScale = weightsTQP.scale;
  float biasScale = biasTQP.scale;
  if (inputScale * weightsScale >= biasScale || biasZero) {
    biasScale = inputScale * weightsScale;
  }
  // Validate new bias TQP and return.
  TensorQuantizationParams biasTQPNew = {biasScale, biasOffset};
  validateQuantizationParams(biasTQPNew, schema, biasQTy);
  return biasTQPNew;
}

void specializeBiasWeightsQuantizationParams(
    TensorQuantizationParams &biasTQP, const TensorQuantizationParams &inputTQP,
    TensorQuantizationParams &weightsTQP, Schema schema, ElemKind biasQTy,
    bool biasZero) {
  // Choose bias offset. For int32 bias we always force offset 0 in order
  // to simplify the implementation since the dynamic range allows it.
  if (biasQTy == ElemKind::Int32QTy) {
    biasTQP.offset = 0;
  }
  // Choose bias scale. We try to force the bias scale value to the product
  // inputScale * weightsScale but only if the resulting scale is larger
  // (in order to avoid bias data saturation). Otherwise, for INT32 bias
  // only, we change the weightsScale to enforce the equality.
  float inputScale = inputTQP.scale;
  float weightsScale = weightsTQP.scale;
  float biasScale = biasTQP.scale;
  if (inputScale * weightsScale >= biasScale || biasZero) {
    biasScale = inputScale * weightsScale;
  } else {
    if (biasQTy == ElemKind::Int32QTy) {
      weightsScale = biasScale / inputScale;
      // The division above does not always ensure that biasScale equals the
      // product inputScale * weightsScale because float32 division is not
      // that accurate. Instead we force the equality explicitly.
      biasScale = inputScale * weightsScale;
    }
  }
  biasTQP.scale = biasScale;
  weightsTQP.scale = weightsScale;
  // Validate new bias and weights TQP.
  if (biasQTy == ElemKind::Int32QTy) {
    assert((biasTQP.scale == (inputTQP.scale * weightsTQP.scale)) &&
           "Bias scale invalid!");
  }
  validateQuantizationParams(biasTQP, schema, biasQTy);
}
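
// Example (illustrative, assumed numbers): with inputScale = 0.1 and
// weightsScale = 0.2 the product is 0.02. If the profiled biasScale is 0.01
// (smaller than the product), the int32 bias simply adopts scale 0.02. If the
// profiled biasScale is instead 0.5 (larger than the product), the weights
// scale is adjusted to 0.5 / 0.1 = 5.0 so that biasScale == inputScale *
// weightsScale still holds after specialization.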

std::vector<TensorQuantizationParams>
getTensorQuantizationParams(const Tensor &tensor, Schema qSchema, ElemKind qTy,
                            dim_t qDim, dim_t qStep) {

  // Validate tensor parameters.
  assert(qDim < tensor.dims().size() &&
         "Quantization dimension exceeds max tensor dimension!");
  assert(qStep > 0 &&
         "Quantization step (granularity) must be greater than 0!");
  assert((tensor.dims()[qDim] % qStep) == 0 &&
         "Quantization step must divide dimension length!");
  assert(tensor.getElementType() == ElemKind::FloatTy &&
         "Tensor type should be float!");
  dim_t groupNum = tensor.dims()[qDim] / qStep;

  // Get tensor view with max of 6 dimensions.
  auto dimsMax = expandDimsToMax(tensor.dims());
  Tensor tensorMax = tensor.getUnowned(dimsMax);
  auto tensorH = tensorMax.getHandle<float>();

  // Find min/max for each quantization group.
  std::vector<float> minArray(groupNum, std::numeric_limits<float>::max());
  std::vector<float> maxArray(groupNum, std::numeric_limits<float>::lowest());
  assert(dimsMax.size() == 6 &&
         "Invalid number of dimensions for tensor expansion!");
  for (dim_t idx0 = 0; idx0 < dimsMax[0]; idx0++) {
    for (dim_t idx1 = 0; idx1 < dimsMax[1]; idx1++) {
      for (dim_t idx2 = 0; idx2 < dimsMax[2]; idx2++) {
        for (dim_t idx3 = 0; idx3 < dimsMax[3]; idx3++) {
          for (dim_t idx4 = 0; idx4 < dimsMax[4]; idx4++) {
            for (dim_t idx5 = 0; idx5 < dimsMax[5]; idx5++) {

              // Current sample multidimensional index.
              std::array<dim_t, 6> sampleIdx{
                  {idx0, idx1, idx2, idx3, idx4, idx5}};

              // Find quantization group to which this sample belongs.
              dim_t groupIdx = (dim_t)(sampleIdx[qDim] / qStep);

              // Adjust min/max for current group.
              if (tensorH.at(sampleIdx) < minArray[groupIdx]) {
                minArray[groupIdx] = tensorH.at(sampleIdx);
              }
              if (tensorH.at(sampleIdx) > maxArray[groupIdx]) {
                maxArray[groupIdx] = tensorH.at(sampleIdx);
              }
            }
          }
        }
      }
    }
  }

  // Compute the quantization parameters for each group.
  std::vector<TensorQuantizationParams> TQP;
  for (dim_t groupIdx = 0; groupIdx < groupNum; groupIdx++) {
    TQP.push_back(chooseQuantizationParams(
        {minArray[groupIdx], maxArray[groupIdx]}, qSchema, qTy));
  }
  return TQP;
}

void getTensorQuantizationParams(const Tensor &tensor, Tensor &scales,
                                 Tensor &offsets, Schema qSchema, ElemKind qTy,
                                 dim_t qDim, dim_t qStep) {
  auto TQP = getTensorQuantizationParams(tensor, qSchema, qTy, qDim, qStep);
  assert(scales.size() == TQP.size() && "Scales tensor size invalid!");
  assert(offsets.size() == TQP.size() && "Offsets tensor size invalid!");
  auto scalesH = scales.getHandle<float>();
  auto offsetsH = offsets.getHandle<int32_t>();
  for (dim_t idx = 0; idx < TQP.size(); idx++) {
    scalesH.raw(idx) = TQP[idx].scale;
    offsetsH.raw(idx) = TQP[idx].offset;
  }
}
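
// Example (illustrative sketch; "weights" is a hypothetical float tensor and
// the Asymmetric schema is just one possible choice): per-channel parameters
// for a {64, 3, 3, 3} weights tensor are obtained with qDim = 0 and qStep = 1,
// which yields one TensorQuantizationParams entry per output channel (64
// groups in total).
//
//   auto perChannelTQP = getTensorQuantizationParams(
//       weights, quantization::Schema::Asymmetric, ElemKind::Int8QTy,
//       /* qDim */ 0, /* qStep */ 1);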

template <class eTy = int8_t>
static void quantizeTensorImpl(Tensor *dest, const Tensor &src,
                               llvm::ArrayRef<TensorQuantizationParams> TQP,
                               dim_t qDim, dim_t qStep) {

  // Validate tensor parameters.
  assert(qDim < src.dims().size() &&
         "Quantization dimension exceeds max tensor dimension!");
  assert(qStep > 0 &&
         "Quantization step (granularity) must be greater than 0!");
  assert((src.dims()[qDim] % qStep) == 0 &&
         "Quantization step must divide dimension length!");
  assert(src.getElementType() == ElemKind::FloatTy &&
         "Tensor type should be float!");
  assert(TQP.size() == (src.dims()[qDim] / qStep) &&
         "TensorQuantizationParams array size invalid!");

  // Get tensor views with maximum dimensions.
  auto dimsMax = expandDimsToMax(src.dims());
  Tensor srcMax = src.getUnowned(dimsMax);
  auto srcH = srcMax.getHandle<float>();
  Tensor destMax = dest->getUnowned(dimsMax);
  auto destH = destMax.getHandle<eTy>();

  // Perform quantization for each group.
  assert(dimsMax.size() == 6 &&
         "Invalid number of dimensions for tensor expansion!");
  for (dim_t idx0 = 0; idx0 < dimsMax[0]; idx0++) {
    for (dim_t idx1 = 0; idx1 < dimsMax[1]; idx1++) {
      for (dim_t idx2 = 0; idx2 < dimsMax[2]; idx2++) {
        for (dim_t idx3 = 0; idx3 < dimsMax[3]; idx3++) {
          for (dim_t idx4 = 0; idx4 < dimsMax[4]; idx4++) {
            for (dim_t idx5 = 0; idx5 < dimsMax[5]; idx5++) {

              // Current sample multidimensional index.
              std::array<dim_t, 6> sampleIdx{
                  {idx0, idx1, idx2, idx3, idx4, idx5}};

              // Find quantization group to which this sample belongs.
              dim_t groupIdx = sampleIdx[qDim] / qStep;

              // Quantize current sample with group specific quantization
              // parameters.
              destH.at(sampleIdx) = quantization::quantize<eTy>(
                  srcH.at(sampleIdx), TQP[groupIdx]);
            }
          }
        }
      }
    }
  }
}

Tensor quantizeTensor(const Tensor &tensor,
                      llvm::ArrayRef<TensorQuantizationParams> TQP,
                      ElemKind qTy, dim_t qDim, dim_t qStep) {
  Tensor tensorQ(qTy, tensor.dims(), 1.0, 0);
  assert(tensor.getType().isFPType() && "Type not supported yet");
  if (qTy == ElemKind::Int8QTy) {
    quantizeTensorImpl<int8_t>(&tensorQ, tensor, TQP, qDim, qStep);
  } else if (qTy == ElemKind::UInt8QTy) {
    quantizeTensorImpl<uint8_t>(&tensorQ, tensor, TQP, qDim, qStep);
  } else if (qTy == ElemKind::Int16QTy) {
    quantizeTensorImpl<int16_t>(&tensorQ, tensor, TQP, qDim, qStep);
  } else if (qTy == ElemKind::Int32QTy) {
    quantizeTensorImpl<int32_t>(&tensorQ, tensor, TQP, qDim, qStep);
  } else {
    llvm_unreachable("Quantization type not supported");
  }
  return tensorQ;
}

Tensor quantizeTensor(const Tensor &tensor, const Tensor &scales,
                      const Tensor &offsets, ElemKind qTy, dim_t qDim,
                      dim_t qStep) {
  assert(scales.size() == offsets.size() &&
         "Scales/Offsets tensor size invalid!");
  auto scalesH = scales.getHandle<float>();
  auto offsetsH = offsets.getHandle<int32_t>();
  std::vector<TensorQuantizationParams> TQP;
  for (dim_t idx = 0; idx < scales.size(); idx++) {
    TQP.push_back({scalesH.raw(idx), offsetsH.raw(idx)});
  }
  return quantizeTensor(tensor, TQP, qTy, qDim, qStep);
}

bool isFloatPowerOf2(float val) {
  // frexp returns the mantissa normalized in [0.5, 1), so compare with 0.5.
  int exp;
  return (std::abs(std::frexp(val, &exp)) == 0.5);
}

int getFloat2Exp(float val) { return std::ilogb(val); }
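
// Example (illustrative): isFloatPowerOf2(0.25f) is true and
// getFloat2Exp(0.25f) returns -2 (0.25 == 2^-2), while isFloatPowerOf2(0.41f)
// is false, which is why quantizeScaleOffset32To8 takes the generic
// shift/scale path for such values.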

} // namespace quantization
} // namespace glow