1 | /** |
2 | * Copyright (c) Glow Contributors. See CONTRIBUTORS file. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
16 | |
17 | #include "glow/Quantization/Base/Base.h" |
18 | #include "glow/Base/Tensor.h" |
19 | #include "glow/Quantization/Base/Calibration.h" |
20 | #include "glow/Quantization/Base/Profile.h" |
21 | |
22 | #include <cmath> |
23 | |
24 | namespace glow { |
25 | namespace quantization { |
26 | |
27 | float getTensorAverageValue(const TensorProfilingParams &profParams) { |
28 | size_t numBins = profParams.histogram.size(); |
29 | assert(numBins > 0 && "Histogram is empty!" ); |
30 | float histDelta = (profParams.max - profParams.min) / (float)(numBins); |
31 | float histOff = profParams.min + histDelta / 2.0; |
32 | float histAvg = 0.0; |
33 | float histSum = 0.0; |
34 | for (size_t idx = 0; idx < numBins; ++idx) { |
35 | float histBinCenter = histOff + histDelta * (float)idx; |
36 | float histBinCount = profParams.histogram[idx]; |
37 | histAvg += histBinCenter * histBinCount; |
38 | histSum += histBinCount; |
39 | } |
40 | histAvg /= histSum; |
41 | return histAvg; |
42 | } |
43 | |
44 | template <class eTy = int8_t> |
45 | static void quantizeTensorUtil(Tensor *dest, const Tensor &src) { |
46 | auto destH = dest->getHandle<eTy>(); |
47 | TensorQuantizationParams TQP{dest->getType().getScale(), |
48 | dest->getType().getOffset()}; |
49 | switch (src.getElementType()) { |
50 | case ElemKind::FloatTy: { |
51 | auto srcHandle = src.getHandle<float>(); |
52 | for (size_t i = 0, e = destH.size(); i < e; ++i) { |
53 | destH.raw(i) = quantization::quantize<eTy>( |
54 | static_cast<float>(srcHandle.raw(i)), TQP); |
55 | } |
56 | break; |
57 | } |
58 | case ElemKind::Float16Ty: { |
59 | auto srcHandle = src.getHandle<float16_t>(); |
60 | for (size_t i = 0, e = destH.size(); i < e; ++i) { |
61 | destH.raw(i) = quantization::quantize<eTy>( |
62 | static_cast<float>(srcHandle.raw(i)), TQP); |
63 | } |
64 | break; |
65 | } |
66 | case ElemKind::BFloat16Ty: { |
67 | auto srcHandle = src.getHandle<bfloat16_t>(); |
68 | for (size_t i = 0, e = destH.size(); i < e; ++i) { |
69 | destH.raw(i) = quantization::quantize<eTy>( |
70 | static_cast<float>(srcHandle.raw(i)), TQP); |
71 | } |
72 | break; |
73 | } |
74 | default: |
75 | llvm_unreachable("Cannot quantize a type" ); |
76 | } |
77 | } |
78 | |
79 | Tensor quantizeTensor(const Tensor &tensor, const TensorQuantizationParams &TQP, |
80 | ElemKind Ty) { |
81 | Tensor tmp(Ty, tensor.dims(), TQP.scale, TQP.offset); |
82 | assert(tensor.getType().isFPType() && "Type not supported yet" ); |
83 | if (Ty == ElemKind::Int8QTy) { |
84 | quantizeTensorUtil<int8_t>(&tmp, tensor); |
85 | } else if (Ty == ElemKind::UInt8QTy) { |
86 | quantizeTensorUtil<uint8_t>(&tmp, tensor); |
87 | } else if (Ty == ElemKind::Int16QTy) { |
88 | quantizeTensorUtil<int16_t>(&tmp, tensor); |
89 | } else if (Ty == ElemKind::Int32QTy) { |
90 | quantizeTensorUtil<int32_t>(&tmp, tensor); |
91 | } else { |
92 | llvm_unreachable("Quantized type not supported" ); |
93 | } |
94 | return tmp; |
95 | } |
96 | |
97 | template <class eTy = int8_t> |
98 | static void dequantizeTensorUtil(Tensor *dest, const Tensor &src) { |
99 | TensorQuantizationParams TQP{src.getType().getScale(), |
100 | src.getType().getOffset()}; |
101 | auto srcHandle = src.getHandle<eTy>(); |
102 | switch (dest->getElementType()) { |
103 | case ElemKind::FloatTy: { |
104 | auto destH = dest->getHandle<float>(); |
105 | for (size_t i = 0, e = destH.size(); i < e; ++i) { |
106 | destH.raw(i) = quantization::dequantize<eTy>( |
107 | static_cast<eTy>(srcHandle.raw(i)), TQP); |
108 | } |
109 | break; |
110 | } |
111 | case ElemKind::Float16Ty: { |
112 | auto destH = dest->getHandle<float16_t>(); |
113 | for (size_t i = 0, e = destH.size(); i < e; ++i) { |
114 | destH.raw(i) = quantization::dequantize<eTy>( |
115 | static_cast<eTy>(srcHandle.raw(i)), TQP); |
116 | } |
117 | break; |
118 | } |
119 | case ElemKind::BFloat16Ty: { |
120 | auto destH = dest->getHandle<bfloat16_t>(); |
121 | for (size_t i = 0, e = destH.size(); i < e; ++i) { |
122 | destH.raw(i) = quantization::dequantize<eTy>( |
123 | static_cast<eTy>(srcHandle.raw(i)), TQP); |
124 | } |
125 | break; |
126 | } |
127 | default: |
128 | llvm_unreachable("Cannot dequantize to the given type" ); |
129 | } |
130 | } |
131 | |
132 | /// Helper for dequantizing UInt8FusedQTy \p src to \p dest. |
133 | template <typename DeqElemTy, typename ScaleOffsetTy> |
134 | static void dequantizeFusedRowwiseTensorUtil(Tensor &dest, const Tensor &src) { |
135 | auto dims = dest.dims(); |
136 | auto srcH = src.getHandle<uint8_t>(); |
137 | auto destH = dest.getHandle<DeqElemTy>(); |
138 | for (dim_t i = 0, e = dims[0]; i < e; ++i) { |
139 | float scale, offset; |
140 | std::tie(scale, offset) = srcH.getFusedScaleOffsetFromRow<ScaleOffsetTy>(i); |
141 | for (dim_t j = 0, f = dims[1]; j < f; ++j) { |
142 | destH.at({i, j}) = |
143 | static_cast<DeqElemTy>(quantization::dequantizeWithFloatOffset( |
144 | srcH.at({i, j}), scale, offset)); |
145 | } |
146 | } |
147 | } |
148 | |
149 | Tensor dequantizeTensor(const Tensor &tensor, ElemKind floatKind) { |
150 | assert(isFloatElemKind(floatKind) && |
151 | "Non supported output floating point type" ); |
152 | auto Ty = tensor.getType().getElementType(); |
153 | |
154 | if (Ty == ElemKind::UInt8FusedQTy || Ty == ElemKind::UInt8FusedFP16QTy) { |
155 | const bool scaleOffsetFP16 = Ty == ElemKind::UInt8FusedFP16QTy; |
156 | const dim_t scaleOffsetSize = |
157 | scaleOffsetFP16 ? sizeof(float16_t) : sizeof(float); |
158 | assert(tensor.dims().size() == 2 && "Fused tensors should be 2D" ); |
159 | assert(tensor.dims()[1] > 2 * scaleOffsetSize && |
160 | "Expected space for per-row scale/offset" ); |
161 | Tensor tmp(floatKind, {tensor.dims()[0], |
162 | tensor.dims()[1] - (dim_t)(2 * scaleOffsetSize)}); |
163 | switch (floatKind) { |
164 | case ElemKind::FloatTy: |
165 | if (scaleOffsetFP16) { |
166 | dequantizeFusedRowwiseTensorUtil<float, float16_t>(tmp, tensor); |
167 | } else { |
168 | dequantizeFusedRowwiseTensorUtil<float, float>(tmp, tensor); |
169 | } |
170 | break; |
171 | case ElemKind::Float16Ty: |
172 | if (scaleOffsetFP16) { |
173 | dequantizeFusedRowwiseTensorUtil<float16_t, float16_t>(tmp, tensor); |
174 | } else { |
175 | dequantizeFusedRowwiseTensorUtil<float16_t, float>(tmp, tensor); |
176 | } |
177 | break; |
178 | default: |
179 | llvm_unreachable("Cannot dequantize to the given type" ); |
180 | } |
181 | return tmp; |
182 | } |
183 | |
184 | Tensor tmp(floatKind, tensor.dims()); |
185 | if (Ty == ElemKind::Int8QTy) { |
186 | dequantizeTensorUtil<int8_t>(&tmp, tensor); |
187 | } else if (Ty == ElemKind::UInt8QTy) { |
188 | dequantizeTensorUtil<uint8_t>(&tmp, tensor); |
189 | } else if (Ty == ElemKind::Int16QTy) { |
190 | dequantizeTensorUtil<int16_t>(&tmp, tensor); |
191 | } else if (Ty == ElemKind::Int32QTy) { |
192 | dequantizeTensorUtil<int32_t>(&tmp, tensor); |
193 | } else { |
194 | llvm_unreachable("Input quantized type not supported" ); |
195 | } |
196 | return tmp; |
197 | } |
198 | |
199 | template <class T = float16_t> |
200 | static Tensor tensor4BitsFusedRowwiseDequantizationUtil(const Tensor &input) { |
201 | assert(input.dims().size() == 2 && "Input must be 2 dimensional." ); |
202 | // The output tensor should have the same raw as input tensor. Since the |
203 | // quantized tensor is in the following format: | 4bit quantized data | |
204 | // T scale | T offset| The columns of dequantized float data |
205 | // should be (input.dims()[1] - 2*sizeof(T)) * 2. |
206 | Tensor output( |
207 | ElemKind::FloatTy, |
208 | {input.dims()[0], (dim_t)(input.dims()[1] - 2 * sizeof(T)) * 2}); |
209 | auto srcH = input.getHandle<uint8_t>(); |
210 | auto destH = output.getHandle<float>(); |
211 | for (dim_t i = 0; i < input.dims()[0]; i++) { |
212 | T scale, offset; |
213 | std::tie(scale, offset) = srcH.getFusedScaleOffsetFromRow<T>(i); |
214 | for (dim_t j = 0; j < output.dims()[1]; j++) { |
215 | bool isMSB = (j % 2 == 1); |
216 | destH.at({i, j}) = dequantize4BitWithFloatOffset( |
217 | srcH.at({i, j / 2}), static_cast<float>(scale), |
218 | static_cast<float>(offset), isMSB); |
219 | } |
220 | } |
221 | return output; |
222 | } |
223 | |
224 | Tensor tensor4BitsFusedRowwiseDequantization(const Tensor &input) { |
225 | auto Ty = input.getType().getElementType(); |
226 | assert((Ty == ElemKind::UInt4FusedFP16QTy || Ty == ElemKind::UInt4FusedQTy) && |
227 | "Unsupported 4bits fused rw quantization type." ); |
228 | if (Ty == ElemKind::UInt4FusedFP16QTy) { |
229 | return tensor4BitsFusedRowwiseDequantizationUtil<float16_t>(input); |
230 | } |
231 | return tensor4BitsFusedRowwiseDequantizationUtil<float>(input); |
232 | } |
233 | |
/// \brief Converts the floating-point \p scale and the integer \p offset of
/// an int32 -> int8 requantization into the integer-only transform
/// ((x >> pre) * integerScale >> post) + offset. See the derivation below.
/// \returns the resulting QuantizationTransform32To8 (pre/post shifts,
/// integer scale and offset).
QuantizationTransform32To8 quantizeScaleOffset32To8(float scale,
                                                    int32_t offset) {
  // In this function we compute an efficient way to convert signed 32-bit
  // integers into signed 8-bit integers without the use of floating-point
  // multiplication. Instead, we represent the original calculation:
  //
  //    result = (x * scale + offset)
  //
  // as the following sequence of integer calculations:
  //
  //    ((x >> pre_scale  * integer_scale) >> post_scale) + offset
  //
  // This function converts the floating-point scale and offset values to the
  // constants in the integer formula.
  //
  // In this method we assume that any signed 32-bit integer in the input word
  // must be mapped into an 8-bit integer. If the scale factor is 2X, then the
  // number 1000 won't be a legal input because after scaling the result would
  // fall outside of the signed 8-bit range. Any 32-bit number that falls
  // outside of signed the 8-bit output integer will be clipped. This gives us
  // the ability to perform 32-bit arithmetic, as explained below.
  //
  // We can't accurately represent fraction scales (in the range zero to one),
  // because the lowest integer multiplication value is one. For example, the
  // scaling factor 0.25 must be represented as integer multiplication of either
  // zero or one, which would result in an highly inaccurate output.
  // Similarly, rounding the scaling factor of 1.6 to 2.0 would produce
  // inaccurate results because drop a significant part of the number.
  //
  // The solution here is to scale (increase in size) the signed integer scalar,
  // and divide the result by shifting it to the right hand side. For example,
  // the floating-point scalar 0.41 is multiplied by 32x (to 13.12, rounded to
  // 13). Then the signed 32-bit integer input is multiplied by 13, and then
  // shifted 5 times to the right (to shrink the result back). The output of
  // this calculation is (13.0 / 32), which is about ~0.4.
  //
  // This approach works well for some scale values. Notice that the modified
  // integer multiplication requires more bits because the intermediate result
  // is larger. Notice that it's always safe to promote the scalar value from a
  // fraction up to one. When you multiply by the integer value one, the
  // intermediate result does not overflow (does not require more bits).
  //
  // It is actually always safe to perform 16-bit post-multiplication
  // right-shifts. Let's consider two cases. If the value of the floating-point
  // scale is greater than 1.0 then we know that at most 8 of the 32-bits in the
  // input register are used, because the result must fit in 8-bits. The result
  // of 8-bit times 8-bit multiplication is 16-bits, which leaves another 16
  // bits that are unused. We can use these 16-bits to increase the size of the
  // integer scale, and shift the result, as described above, without
  // overflowing the register.
  // The second case is where the scalar value is smaller than 1.0.
  // Multiplication of any number by zero or one does not increase the number of
  // bits which are used by the number.
  //
  // Now, we need to consider another problem. In the previous section we
  // described how we scaled small fractions into a number that's close to one.
  // But scaling to around 1.0 is not accurate enough. Rounding a scale factor
  // like 0.6 to integer would give a very high error rate. Generally, we can't
  // increase the size of the integer multiplier without a limit because this
  // would overflow large values that are close to the upper signed 32-bit
  // limit.
  //
  // To solve the accuracy problem we need to continue to increase the size of
  // the integer scalar without overflowing the signed 32-bit register.
  // The solution here is to perform right-shift on the input, in addition to
  // the output. The idea here is that by performing the post-multiplication
  // right-shift we pick the high bits from the result of the multiplication,
  // and the low bits are ignored. This means that we can continue to increase
  // the size of the integer multiplier and continue to increase the accuracy of
  // the calculation by pre-shifting the 32-bit input. Shifting the input to the
  // right would flip some input bits to zero, but the accuracy loss would be
  // minimal.
  //
  // If the floating point scale factor small then it spans a small part of the
  // 32-bit word. For example, a scale factor of 0.125 (1/8) scales some range
  // into the signed 8-bit result. This range is 8 + 3 bits. This means that we
  // can shift as much as 32-11 bits without overflowing the register. This is
  // a net win because we get to increase the accuracy of the floating point
  // scale factor. For very small scale factors, the used range is very large
  // and can take up the whole 32-bit register, so overflow is a real problem.
  // Here we can use the post-shift value to estimate how many bits will be
  // discarded from the after the multiplication operation and figure out how
  // many bits we can take from the bottom of the input word by shifting it to
  // the right and add more precision to the integer scale multiplier.
  int preShift = 0;
  int postShift = 0;

  // We treat first the particular case when scale is a power of 2 (2 ^ exp,
  // where exp is a signed integer exponent). The operation is specialized as:
  // - for positive 2's exponent:
  //     x * scale + offset (pre = 0, post = 0, scale = (int)scale).
  // - for negative 2's exponent:
  //     x >> post + offset (pre = 0, post = -exp, scale = 1).
  if (isFloatPowerOf2(scale)) {
    int exp = getFloat2Exp(scale);
    if (exp > 0) {
      return QuantizationTransform32To8(0,                       // pre
                                        0,                       // post
                                        static_cast<int>(scale), // scale
                                        offset);                 // offset
    } else {
      return QuantizationTransform32To8(0,    // pre
                                        -exp, // post
                                        1,    // scale
                                        offset); // offset
    }
  }

  // Calculate the post-shift value. It's always safe to increase scale as long
  // as it's below one, and it's always legal to shift at least 15 bits for
  // small scale values.
  while (scale < 0.5 || (scale < 256 && postShift < 15)) {
    scale *= 2;
    postShift++;
  }

  // Calculate the pre-multiplication shift. Estimate how many bits we can take
  // from the input number and pass to the integer scale.
  while (scale < 255 && preShift < (postShift / 2)) {
    scale *= 2;
    preShift++;
  }

  // Round the remaining (shift-adjusted) scale to the nearest integer
  // multiplier.
  return QuantizationTransform32To8(preShift, postShift, std::round(scale),
                                    offset);
}
360 | |
361 | QuantizedRange getQuantizedRange(ElemKind qTy) { |
362 | // Pick int64_t in order to cover the uint32_t range. |
363 | int64_t qmin; |
364 | int64_t qmax; |
365 | |
366 | switch (qTy) { |
367 | case ElemKind::Int8QTy: { |
368 | qmin = std::numeric_limits<int8_t>::min(); |
369 | qmax = std::numeric_limits<int8_t>::max(); |
370 | break; |
371 | } |
372 | case ElemKind::UInt8QTy: { |
373 | qmin = std::numeric_limits<uint8_t>::min(); |
374 | qmax = std::numeric_limits<uint8_t>::max(); |
375 | break; |
376 | } |
377 | case ElemKind::Int16QTy: { |
378 | qmin = std::numeric_limits<int16_t>::min(); |
379 | qmax = std::numeric_limits<int16_t>::max(); |
380 | break; |
381 | } |
382 | case ElemKind::Int32QTy: { |
383 | // A corner case is when quantizing the bias tensor which is later used in |
384 | // arithmetic computations as (int32)(bias[idx] - biasOffset) (e.g. in the |
385 | // LIBJIT function "libjit_scale_i32i8"). To avoid overflow we must restrict |
386 | // the quantization range such that the subtraction result fits int32. Since |
387 | // both bias[idx] and biasOffset are within the range [qmin, qmax] we will |
388 | // impose: min(int32) <= qmin - qmax and qmax - qmin <= max(int32). In other |
389 | // words we will restrict the quantized dynamic range to int31. Furthermore, |
390 | // since scale is computed as scale = (max - min) / (qmax - qmin) where |
391 | // (qmax - qmin) is large (~2^31) the scale computation has large errors. |
392 | // We will further limit the quantized range to int30 (one extra bit) in |
393 | // order for the computed scale to provide safe quantization within the |
394 | // intended range. |
395 | qmin = std::numeric_limits<int32_t>::min() >> 2; |
396 | qmax = std::numeric_limits<int32_t>::max() >> 2; |
397 | break; |
398 | } |
399 | default: |
400 | llvm_unreachable("Quantized type not supported" ); |
401 | } |
402 | return QuantizedRange(qmin, qmax); |
403 | } |
404 | |
405 | void validateQuantizationParams(TensorQuantizationParams qParams, Schema schema, |
406 | ElemKind qTy) { |
407 | |
408 | // Get the quantized range. |
409 | auto minMaxPair = getQuantizedRange(qTy); |
410 | int64_t qmin = minMaxPair.first; |
411 | int64_t qmax = minMaxPair.second; |
412 | |
413 | // Validate params. |
414 | (void)(qmin); |
415 | (void)(qmax); |
416 | assert((qmin <= qParams.offset) && (qParams.offset <= qmax) && |
417 | "The offset must be within the quantized range" ); |
418 | if (schema == quantization::Schema::Symmetric) { |
419 | assert((qParams.offset == 0) && |
420 | "Symmetric quantization should have offset 0" ); |
421 | } else if (schema == quantization::Schema::SymmetricWithUnsigned) { |
422 | assert((qParams.offset == qmin || qParams.offset == 0) && |
423 | "SymmetricWithUnsigned quantization should have offset 0 or qmin" ); |
424 | } else if (schema == quantization::Schema::SymmetricWithPower2Scale) { |
425 | assert((qParams.offset == 0) && |
426 | "SymmetricWithPower2Scale quantization should have offset 0" ); |
427 | assert(isFloatPowerOf2(qParams.scale) && |
428 | "SymmetricWithPower2Scale quantization parameter should be a power " |
429 | "of 2" ); |
430 | } |
431 | } |
432 | |
/// \brief Chooses the quantization parameters (scale, offset) which map the
/// floating point range profiled in \p profParams onto the quantized range
/// of type \p qTy, according to the quantization \p schema, optionally
/// refining the range with the given \p calibration method.
/// \returns validated TensorQuantizationParams for the chosen mapping.
TensorQuantizationParams
chooseQuantizationParams(TensorProfilingParams profParams, Schema schema,
                         ElemKind qTy, Calibration calibration) {
  float min = profParams.min;
  float max = profParams.max;
  assert(min <= max && "min must not be bigger than max");

  // Get the quantized range.
  auto minMaxPair = getQuantizedRange(qTy);
  int64_t qmin = minMaxPair.first;
  int64_t qmax = minMaxPair.second;

  // We extend the [min, max] interval to ensure that it contains 0.
  // Otherwise, we would not meet the requirement that 0 be an exactly
  // representable value.
  min = std::min(min, 0.f);
  max = std::max(max, 0.f);

  if (schema == quantization::Schema::SymmetricWithUnsigned) {
    // Check if the range we try to encode is purely positive.
    // If not, we cannot use the Unsigned mapping and we fall back
    // to the symmetric schema.
    if (min >= 0.f) {
      // By construction we always have zero to our range.
      // Since min is >= 0 and 0 is in our range, min is
      // actually zero.
      // Therefore zero is going to be mapped to the first
      // element of the quantized range qmin and thus the
      // offset is going to be qmin.
      assert(min <= std::numeric_limits<float>::epsilon() &&
             "Our range should start at zero");
    } else {
      schema = quantization::Schema::Symmetric;
    }
  }
  if (schema == quantization::Schema::Symmetric ||
      schema == quantization::Schema::SymmetricWithPower2Scale) {
    // Check which end saturates the output dynamic range earlier
    // and extend the other end to map the zero-point to quantized 0.
    assert(qmin < 0 && "Symmetric schema incompatible with unsigned range");
    double rmin = min / (double)qmin;
    double rmax = max / (double)qmax;
    if (rmin > rmax) {
      max = rmin * qmax;
    } else {
      min = rmax * qmin;
    }
  }

  // Clamp the (possibly extended) range to representable float values.
  min = std::max(min, std::numeric_limits<float>::lowest());
  max = std::min(max, std::numeric_limits<float>::max());

  // Calibrate the min/max range (for non-zero ranges only).
  if ((profParams.min != profParams.max) && (min != max) &&
      (calibration == Calibration::KLMinimization)) {

    // Rescale the profiled histogram with the new constrained min/max range.
    auto histRescaled = rescaleHistogram(profParams.histogram, profParams.min,
                                         profParams.max, min, max);

    // Number of quantized bins. Default value from TVM / MXNet.
    const size_t numQuantizedBins = 255;

    // Set symmetric, only if schema is Symmetric or SymmetricWithPower2Scale
    const bool symmetric =
        (schema == quantization::Schema::Symmetric ||
         schema == quantization::Schema::SymmetricWithPower2Scale);

    // Optimize the range.
    FloatRange rangeOpt =
        optimizeKL(histRescaled, min, max, numQuantizedBins, symmetric);

    // Update the min/max range with the optimized range.
    min = rangeOpt.first;
    max = rangeOpt.second;
  }

  // Compute scale.
  double scale = ((double)max - min) / ((double)qmax - qmin);

  // Dequantization uses the following formula scale * (X - offset), so
  // scale should not be equal to zero.
  // If scale is 0, we arbitrary adjust the scale to 0.1.
  if (scale == 0) {
    scale = 0.1;
  }

  assert(scale > 0 && "Scale must be non negative");

  // Zero-point computation.
  // First the initial floating-point computation. The zero-point can be
  // determined from solving an affine equation for any known pair
  // (real value, corresponding quantized value).
  // We know two such pairs: (rmin, qmin) and (rmax, qmax).
  // The arithmetic error on the zero point computed from either pair
  // will be roughly machine_epsilon * (sum of absolute values of terms)
  // so we want to use the variant that adds the smaller terms.
  double zeroPointFromMin = qmin - min / scale;
  double zeroPointFromMax = qmax - max / scale;
  double zeroPointFromMinError = std::abs(qmin) + std::abs(min / scale);
  double zeroPointFromMaxError = std::abs(qmax) + std::abs(max / scale);
  double initialZeroPoint = zeroPointFromMinError < zeroPointFromMaxError
                                ? zeroPointFromMin
                                : zeroPointFromMax;

  // For symmetric quantization, if min == -max, force the zero point to be 0.
  float difference = std::abs(max + min);
  if (difference <= std::numeric_limits<float>::epsilon()) {
    initialZeroPoint = 0;
  }

  // Now we need to nudge the zero point to be an integer (our zero points are
  // integer, and this is motivated by the requirement to be able to represent
  // the real value "0" exactly as a quantized value, which is required in
  // multiple places, for example in Im2col with SAME padding).
  int32_t nudgedZeroPoint = 0;
  if (initialZeroPoint < qmin) {
    nudgedZeroPoint = qmin;
  } else if (initialZeroPoint > qmax) {
    nudgedZeroPoint = qmax;
  } else {
    nudgedZeroPoint = static_cast<int32_t>(round(initialZeroPoint));
  }

  // For SymmetricWithPower2Scale, round scale to nearest higher power of 2.
  if (schema == quantization::Schema::SymmetricWithPower2Scale) {
    scale = std::exp2(std::ceil(std::log2(scale)));
  }

  TensorQuantizationParams result{static_cast<float>(scale), nudgedZeroPoint};
  validateQuantizationParams(result, schema, qTy);
  return result;
}
566 | |
567 | TensorQuantizationParams |
568 | specializeBiasQuantizationParams(const TensorQuantizationParams &biasTQP, |
569 | const TensorQuantizationParams &inputTQP, |
570 | const TensorQuantizationParams &weightsTQP, |
571 | Schema schema, ElemKind biasQTy, |
572 | bool biasZero) { |
573 | // Choose bias offset. For int32 bias we always force offset 0 in order |
574 | // to simplify the implementation since the dynamic range allows it. |
575 | int32_t biasOffset = biasTQP.offset; |
576 | if (biasQTy == ElemKind::Int32QTy) { |
577 | biasOffset = 0; |
578 | } |
579 | // Choose bias scale. We try to force the bias scale value to the product |
580 | // inputScale * weightsScale but only if the resulting scale is larger |
581 | // (in order to avoid bias data saturation). |
582 | float inputScale = inputTQP.scale; |
583 | float weightsScale = weightsTQP.scale; |
584 | float biasScale = biasTQP.scale; |
585 | if (inputScale * weightsScale >= biasScale || biasZero) { |
586 | biasScale = inputScale * weightsScale; |
587 | } |
588 | // Validate new bias TQP and return. |
589 | TensorQuantizationParams biasTQPNew = {biasScale, biasOffset}; |
590 | validateQuantizationParams(biasTQPNew, schema, biasQTy); |
591 | return biasTQPNew; |
592 | } |
593 | |
594 | void specializeBiasWeightsQuantizationParams( |
595 | TensorQuantizationParams &biasTQP, const TensorQuantizationParams &inputTQP, |
596 | TensorQuantizationParams &weightsTQP, Schema schema, ElemKind biasQTy, |
597 | bool biasZero) { |
598 | // Choose bias offset. For int32 bias we always force offset 0 in order |
599 | // to simplify the implementation since the dynamic range allows it. |
600 | if (biasQTy == ElemKind::Int32QTy) { |
601 | biasTQP.offset = 0; |
602 | } |
603 | // Choose bias scale. We try to force the bias scale value to the product |
604 | // inputScale * weightsScale but only if the resulting scale is larger |
605 | // (in order to avoid bias data saturation). Otherwise, for INT32 bias |
606 | // only, we change the weightsScale to enforce the equality. |
607 | float inputScale = inputTQP.scale; |
608 | float weightsScale = weightsTQP.scale; |
609 | float biasScale = biasTQP.scale; |
610 | if (inputScale * weightsScale >= biasScale || biasZero) { |
611 | biasScale = inputScale * weightsScale; |
612 | } else { |
613 | if (biasQTy == ElemKind::Int32QTy) { |
614 | weightsScale = biasScale / inputScale; |
615 | // The division above does not always ensure that biasScale equals the |
616 | // product inputScale * weightsScale because float32 division is not |
617 | // that accurate. Instead we force the equality explicitly. |
618 | biasScale = inputScale * weightsScale; |
619 | } |
620 | } |
621 | biasTQP.scale = biasScale; |
622 | weightsTQP.scale = weightsScale; |
623 | // Validate new bias and weights TQP. |
624 | if (biasQTy == ElemKind::Int32QTy) { |
625 | assert((biasTQP.scale == (inputTQP.scale * weightsTQP.scale)) && |
626 | "Bias scale invalid!" ); |
627 | } |
628 | validateQuantizationParams(biasTQP, schema, biasQTy); |
629 | } |
630 | |
/// \brief Computes per-group quantization parameters for the float \p tensor
/// along dimension \p qDim with group granularity \p qStep, using the
/// quantization schema \p qSchema and quantized type \p qTy.
/// \returns one TensorQuantizationParams per group, i.e.
/// tensor.dims()[qDim] / qStep entries.
std::vector<TensorQuantizationParams>
getTensorQuantizationParams(const Tensor &tensor, Schema qSchema, ElemKind qTy,
                            dim_t qDim, dim_t qStep) {

  // Validate tensor parameters.
  assert(qDim < tensor.dims().size() &&
         "Quantization dimension exceeds max tensor dimension!");
  assert(qStep > 0 &&
         "Quantization step (granularity) must be greater than 0!");
  assert((tensor.dims()[qDim] % qStep) == 0 &&
         "Quantization step must divide dimension length!");
  assert(tensor.getElementType() == ElemKind::FloatTy &&
         "Tensor type should be float!");
  dim_t groupNum = tensor.dims()[qDim] / qStep;

  // Get tensor view with max of 6 dimensions (pads trailing dims with 1 so
  // the fixed 6-level loop nest below covers every element exactly once).
  auto dimsMax = expandDimsToMax(tensor.dims());
  Tensor tensorMax = tensor.getUnowned(dimsMax);
  auto tensorH = tensorMax.getHandle<float>();

  // Find min/max for each quantization group.
  std::vector<float> minArray(groupNum, std::numeric_limits<float>::max());
  std::vector<float> maxArray(groupNum, std::numeric_limits<float>::lowest());
  assert(dimsMax.size() == 6 &&
         "Invalid number of dimensions for tensor expansion!");
  for (dim_t idx0 = 0; idx0 < dimsMax[0]; idx0++) {
    for (dim_t idx1 = 0; idx1 < dimsMax[1]; idx1++) {
      for (dim_t idx2 = 0; idx2 < dimsMax[2]; idx2++) {
        for (dim_t idx3 = 0; idx3 < dimsMax[3]; idx3++) {
          for (dim_t idx4 = 0; idx4 < dimsMax[4]; idx4++) {
            for (dim_t idx5 = 0; idx5 < dimsMax[5]; idx5++) {

              // Current sample multidimensional index.
              std::array<dim_t, 6> sampleIdx{
                  {idx0, idx1, idx2, idx3, idx4, idx5}};

              // Find quantization group to which this sample belongs.
              dim_t groupIdx = (dim_t)(sampleIdx[qDim] / qStep);

              // Adjust min/max for current group.
              if (tensorH.at(sampleIdx) < minArray[groupIdx]) {
                minArray[groupIdx] = tensorH.at(sampleIdx);
              }
              if (tensorH.at(sampleIdx) > maxArray[groupIdx]) {
                maxArray[groupIdx] = tensorH.at(sampleIdx);
              }
            }
          }
        }
      }
    }
  }

  // Compute the quantization parameters for each group from its min/max.
  std::vector<TensorQuantizationParams> TQP;
  for (dim_t groupIdx = 0; groupIdx < groupNum; groupIdx++) {
    TQP.push_back(chooseQuantizationParams(
        {minArray[groupIdx], maxArray[groupIdx]}, qSchema, qTy));
  }
  return TQP;
}
692 | |
693 | void getTensorQuantizationParams(const Tensor &tensor, Tensor &scales, |
694 | Tensor &offsets, Schema qSchema, ElemKind qTy, |
695 | dim_t qDim, dim_t qStep) { |
696 | auto TQP = getTensorQuantizationParams(tensor, qSchema, qTy, qDim, qStep); |
697 | assert(scales.size() == TQP.size() && "Scales tensor size invalid!" ); |
698 | assert(offsets.size() == TQP.size() && "Offsets tensor size invalid!" ); |
699 | auto scalesH = scales.getHandle<float>(); |
700 | auto offsetsH = offsets.getHandle<int32_t>(); |
701 | for (dim_t idx = 0; idx < TQP.size(); idx++) { |
702 | scalesH.raw(idx) = TQP[idx].scale; |
703 | offsetsH.raw(idx) = TQP[idx].offset; |
704 | } |
705 | } |
706 | |
707 | template <class eTy = int8_t> |
708 | static void quantizeTensorImpl(Tensor *dest, const Tensor &src, |
709 | llvm::ArrayRef<TensorQuantizationParams> TQP, |
710 | dim_t qDim, dim_t qStep) { |
711 | |
712 | // Validate tensor parameters. |
713 | assert(qDim < src.dims().size() && |
714 | "Quantization dimension exceeds max tensor dimension!" ); |
715 | assert(qStep > 0 && |
716 | "Quantization step (granularity) must be greater than 0!" ); |
717 | assert((src.dims()[qDim] % qStep) == 0 && |
718 | "Quantization step must divide dimension length!" ); |
719 | assert(src.getElementType() == ElemKind::FloatTy && |
720 | "Tensor type should be float!" ); |
721 | assert(TQP.size() == (src.dims()[qDim] / qStep) && |
722 | "TensorQuantizationParams array size invalid!" ); |
723 | |
724 | // Get tensor views with maximum dimensions. |
725 | auto dimsMax = expandDimsToMax(src.dims()); |
726 | Tensor srcMax = src.getUnowned(dimsMax); |
727 | auto srcH = srcMax.getHandle<float>(); |
728 | Tensor destMax = dest->getUnowned(dimsMax); |
729 | auto destH = destMax.getHandle<eTy>(); |
730 | |
731 | // Perform quantization for each group. |
732 | assert(dimsMax.size() == 6 && |
733 | "Invalid number of dimensions for tensor expansion!" ); |
734 | for (dim_t idx0 = 0; idx0 < dimsMax[0]; idx0++) { |
735 | for (dim_t idx1 = 0; idx1 < dimsMax[1]; idx1++) { |
736 | for (dim_t idx2 = 0; idx2 < dimsMax[2]; idx2++) { |
737 | for (dim_t idx3 = 0; idx3 < dimsMax[3]; idx3++) { |
738 | for (dim_t idx4 = 0; idx4 < dimsMax[4]; idx4++) { |
739 | for (dim_t idx5 = 0; idx5 < dimsMax[5]; idx5++) { |
740 | |
741 | // Current sample multidimensional index. |
742 | std::array<dim_t, 6> sampleIdx{ |
743 | {idx0, idx1, idx2, idx3, idx4, idx5}}; |
744 | |
745 | // Find quantization group to which this sample belongs. |
746 | dim_t groupIdx = sampleIdx[qDim] / qStep; |
747 | |
748 | // Quantize current sample with group specific quantization |
749 | // parameters. |
750 | destH.at(sampleIdx) = quantization::quantize<eTy>( |
751 | srcH.at(sampleIdx), TQP[groupIdx]); |
752 | } |
753 | } |
754 | } |
755 | } |
756 | } |
757 | } |
758 | } |
759 | |
760 | Tensor quantizeTensor(const Tensor &tensor, |
761 | llvm::ArrayRef<TensorQuantizationParams> TQP, |
762 | ElemKind qTy, dim_t qDim, dim_t qStep) { |
763 | Tensor tensorQ(qTy, tensor.dims(), 1.0, 0); |
764 | assert(tensor.getType().isFPType() && "Type not supported yet" ); |
765 | if (qTy == ElemKind::Int8QTy) { |
766 | quantizeTensorImpl<int8_t>(&tensorQ, tensor, TQP, qDim, qStep); |
767 | } else if (qTy == ElemKind::UInt8QTy) { |
768 | quantizeTensorImpl<uint8_t>(&tensorQ, tensor, TQP, qDim, qStep); |
769 | } else if (qTy == ElemKind::Int16QTy) { |
770 | quantizeTensorImpl<int16_t>(&tensorQ, tensor, TQP, qDim, qStep); |
771 | } else if (qTy == ElemKind::Int32QTy) { |
772 | quantizeTensorImpl<int32_t>(&tensorQ, tensor, TQP, qDim, qStep); |
773 | } else { |
774 | llvm_unreachable("Quantization type not supported" ); |
775 | } |
776 | return tensorQ; |
777 | } |
778 | |
779 | Tensor quantizeTensor(const Tensor &tensor, const Tensor &scales, |
780 | const Tensor &offsets, ElemKind qTy, dim_t qDim, |
781 | dim_t qStep) { |
782 | assert(scales.size() == offsets.size() && |
783 | "Scales/Offsets tensor size invalid!" ); |
784 | auto scalesH = scales.getHandle<float>(); |
785 | auto offsetsH = offsets.getHandle<int32_t>(); |
786 | std::vector<TensorQuantizationParams> TQP; |
787 | for (dim_t idx = 0; idx < scales.size(); idx++) { |
788 | TQP.push_back({scalesH.raw(idx), offsetsH.raw(idx)}); |
789 | } |
790 | return quantizeTensor(tensor, TQP, qTy, qDim, qStep); |
791 | } |
792 | |
bool isFloatPowerOf2(float val) {
  // A float is a (signed) power of two exactly when its normalized
  // mantissa is +/-0.5: frexp decomposes val as m * 2^exp with
  // |m| in [0.5, 1), so only exact powers of two yield |m| == 0.5.
  int exp;
  const float mantissa = std::frexp(val, &exp);
  return mantissa == 0.5f || mantissa == -0.5f;
}
798 | |
/// Returns the unbiased base-2 exponent of \p val, i.e.
/// floor(log2(|val|)), as computed by std::ilogb.
/// NOTE(review): for val == 0 this returns FP_ILOGB0 (implementation
/// defined); callers presumably only pass non-zero values — confirm.
int getFloat2Exp(float val) {
  return std::ilogb(val);
}
800 | |
801 | } // namespace quantization |
802 | } // namespace glow |
803 | |