1 | /** |
2 | * Copyright (c) Glow Contributors. See CONTRIBUTORS file. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
16 | |
17 | #include "glow/Quantization/Base/Base.h" |
18 | #include "glow/Base/Tensor.h" |
19 | #include "glow/Quantization/Base/Calibration.h" |
20 | #include "glow/Quantization/Base/Profile.h" |
21 | |
22 | #include <cmath> |
23 | |
24 | namespace glow { |
25 | namespace quantization { |
26 | |
27 | float getTensorAverageValue(const TensorProfilingParams &profParams) { |
28 | size_t numBins = profParams.histogram.size(); |
29 | assert(numBins > 0 && "Histogram is empty!" ); |
30 | float histDelta = (profParams.max - profParams.min) / (float)(numBins); |
31 | float histOff = profParams.min + histDelta / 2.0; |
32 | float histAvg = 0.0; |
33 | float histSum = 0.0; |
34 | for (size_t idx = 0; idx < numBins; ++idx) { |
35 | float histBinCenter = histOff + histDelta * (float)idx; |
36 | float histBinCount = profParams.histogram[idx]; |
37 | histAvg += histBinCenter * histBinCount; |
38 | histSum += histBinCount; |
39 | } |
40 | histAvg /= histSum; |
41 | return histAvg; |
42 | } |
43 | |
44 | template <class eTy = int8_t> |
45 | static void quantizeTensorUtil(Tensor *dest, const Tensor &src) { |
46 | auto destH = dest->getHandle<eTy>(); |
47 | TensorQuantizationParams TQP{dest->getType().getScale(), |
48 | dest->getType().getOffset()}; |
49 | switch (src.getElementType()) { |
50 | case ElemKind::FloatTy: { |
51 | auto srcHandle = src.getHandle<float>(); |
52 | for (size_t i = 0, e = destH.size(); i < e; ++i) { |
53 | destH.raw(i) = quantization::quantize<eTy>( |
54 | static_cast<float>(srcHandle.raw(i)), TQP); |
55 | } |
56 | break; |
57 | } |
58 | case ElemKind::Float16Ty: { |
59 | auto srcHandle = src.getHandle<float16_t>(); |
60 | for (size_t i = 0, e = destH.size(); i < e; ++i) { |
61 | destH.raw(i) = quantization::quantize<eTy>( |
62 | static_cast<float>(srcHandle.raw(i)), TQP); |
63 | } |
64 | break; |
65 | } |
66 | case ElemKind::BFloat16Ty: { |
67 | auto srcHandle = src.getHandle<bfloat16_t>(); |
68 | for (size_t i = 0, e = destH.size(); i < e; ++i) { |
69 | destH.raw(i) = quantization::quantize<eTy>( |
70 | static_cast<float>(srcHandle.raw(i)), TQP); |
71 | } |
72 | break; |
73 | } |
74 | default: |
75 | llvm_unreachable("Cannot quantize a type" ); |
76 | } |
77 | } |
78 | |
79 | Tensor quantizeTensor(const Tensor &tensor, const TensorQuantizationParams &TQP, |
80 | ElemKind Ty) { |
81 | Tensor tmp(Ty, tensor.dims(), TQP.scale, TQP.offset); |
82 | assert(tensor.getType().isFPType() && "Type not supported yet" ); |
83 | if (Ty == ElemKind::Int8QTy) { |
84 | quantizeTensorUtil<int8_t>(&tmp, tensor); |
85 | } else if (Ty == ElemKind::UInt8QTy) { |
86 | quantizeTensorUtil<uint8_t>(&tmp, tensor); |
87 | } else if (Ty == ElemKind::Int16QTy) { |
88 | quantizeTensorUtil<int16_t>(&tmp, tensor); |
89 | } else if (Ty == ElemKind::Int32QTy) { |
90 | quantizeTensorUtil<int32_t>(&tmp, tensor); |
91 | } else { |
92 | llvm_unreachable("Quantized type not supported" ); |
93 | } |
94 | return tmp; |
95 | } |
96 | |
97 | template <class eTy = int8_t> |
98 | static void dequantizeTensorUtil(Tensor *dest, const Tensor &src) { |
99 | TensorQuantizationParams TQP{src.getType().getScale(), |
100 | src.getType().getOffset()}; |
101 | auto srcHandle = src.getHandle<eTy>(); |
102 | switch (dest->getElementType()) { |
103 | case ElemKind::FloatTy: { |
104 | auto destH = dest->getHandle<float>(); |
105 | for (size_t i = 0, e = destH.size(); i < e; ++i) { |
106 | destH.raw(i) = quantization::dequantize<eTy>( |
107 | static_cast<eTy>(srcHandle.raw(i)), TQP); |
108 | } |
109 | break; |
110 | } |
111 | case ElemKind::Float16Ty: { |
112 | auto destH = dest->getHandle<float16_t>(); |
113 | for (size_t i = 0, e = destH.size(); i < e; ++i) { |
114 | destH.raw(i) = quantization::dequantize<eTy>( |
115 | static_cast<eTy>(srcHandle.raw(i)), TQP); |
116 | } |
117 | break; |
118 | } |
119 | case ElemKind::BFloat16Ty: { |
120 | auto destH = dest->getHandle<bfloat16_t>(); |
121 | for (size_t i = 0, e = destH.size(); i < e; ++i) { |
122 | destH.raw(i) = quantization::dequantize<eTy>( |
123 | static_cast<eTy>(srcHandle.raw(i)), TQP); |
124 | } |
125 | break; |
126 | } |
127 | default: |
128 | llvm_unreachable("Cannot dequantize to the given type" ); |
129 | } |
130 | } |
131 | |
132 | /// Helper for dequantizing UInt8FusedQTy \p src to \p dest. |
133 | template <typename DeqElemTy, typename ScaleOffsetTy> |
134 | static void dequantizeFusedRowwiseTensorUtil(Tensor &dest, const Tensor &src) { |
135 | auto dims = dest.dims(); |
136 | auto srcH = src.getHandle<uint8_t>(); |
137 | auto destH = dest.getHandle<DeqElemTy>(); |
138 | for (dim_t i = 0, e = dims[0]; i < e; ++i) { |
139 | float scale, offset; |
140 | std::tie(scale, offset) = srcH.getFusedScaleOffsetFromRow<ScaleOffsetTy>(i); |
141 | for (dim_t j = 0, f = dims[1]; j < f; ++j) { |
142 | destH.at({i, j}) = |
143 | static_cast<DeqElemTy>(quantization::dequantizeWithFloatOffset( |
144 | srcH.at({i, j}), scale, offset)); |
145 | } |
146 | } |
147 | } |
148 | |
149 | Tensor dequantizeTensor(const Tensor &tensor, ElemKind floatKind) { |
150 | assert(isFloatElemKind(floatKind) && |
151 | "Non supported output floating point type" ); |
152 | auto Ty = tensor.getType().getElementType(); |
153 | |
154 | if (Ty == ElemKind::UInt8FusedQTy || Ty == ElemKind::UInt8FusedFP16QTy) { |
155 | const bool scaleOffsetFP16 = Ty == ElemKind::UInt8FusedFP16QTy; |
156 | const dim_t scaleOffsetSize = |
157 | scaleOffsetFP16 ? sizeof(float16_t) : sizeof(float); |
158 | assert(tensor.dims().size() == 2 && "Fused tensors should be 2D" ); |
159 | assert(tensor.dims()[1] > 2 * scaleOffsetSize && |
160 | "Expected space for per-row scale/offset" ); |
161 | Tensor tmp(floatKind, {tensor.dims()[0], |
162 | tensor.dims()[1] - (dim_t)(2 * scaleOffsetSize)}); |
163 | switch (floatKind) { |
164 | case ElemKind::FloatTy: |
165 | if (scaleOffsetFP16) { |
166 | dequantizeFusedRowwiseTensorUtil<float, float16_t>(tmp, tensor); |
167 | } else { |
168 | dequantizeFusedRowwiseTensorUtil<float, float>(tmp, tensor); |
169 | } |
170 | break; |
171 | case ElemKind::Float16Ty: |
172 | if (scaleOffsetFP16) { |
173 | dequantizeFusedRowwiseTensorUtil<float16_t, float16_t>(tmp, tensor); |
174 | } else { |
175 | dequantizeFusedRowwiseTensorUtil<float16_t, float>(tmp, tensor); |
176 | } |
177 | break; |
178 | default: |
179 | llvm_unreachable("Cannot dequantize to the given type" ); |
180 | } |
181 | return tmp; |
182 | } |
183 | |
184 | Tensor tmp(floatKind, tensor.dims()); |
185 | if (Ty == ElemKind::Int8QTy) { |
186 | dequantizeTensorUtil<int8_t>(&tmp, tensor); |
187 | } else if (Ty == ElemKind::UInt8QTy) { |
188 | dequantizeTensorUtil<uint8_t>(&tmp, tensor); |
189 | } else if (Ty == ElemKind::Int16QTy) { |
190 | dequantizeTensorUtil<int16_t>(&tmp, tensor); |
191 | } else if (Ty == ElemKind::Int32QTy) { |
192 | dequantizeTensorUtil<int32_t>(&tmp, tensor); |
193 | } else { |
194 | llvm_unreachable("Input quantized type not supported" ); |
195 | } |
196 | return tmp; |
197 | } |
198 | |
199 | template <class T = float16_t> |
200 | static Tensor tensor4BitsFusedRowwiseDequantizationUtil(const Tensor &input) { |
201 | assert(input.dims().size() == 2 && "Input must be 2 dimensional." ); |
202 | // The output tensor should have the same raw as input tensor. Since the |
203 | // quantized tensor is in the following format: | 4bit quantized data | |
204 | // T scale | T offset| The columns of dequantized float data |
205 | // should be (input.dims()[1] - 2*sizeof(T)) * 2. |
206 | Tensor output( |
207 | ElemKind::FloatTy, |
208 | {input.dims()[0], (dim_t)(input.dims()[1] - 2 * sizeof(T)) * 2}); |
209 | auto srcH = input.getHandle<uint8_t>(); |
210 | auto destH = output.getHandle<float>(); |
211 | for (dim_t i = 0; i < input.dims()[0]; i++) { |
212 | T scale, offset; |
213 | std::tie(scale, offset) = srcH.getFusedScaleOffsetFromRow<T>(i); |
214 | for (dim_t j = 0; j < output.dims()[1]; j++) { |
215 | bool isMSB = (j % 2 == 1); |
216 | destH.at({i, j}) = dequantize4BitWithFloatOffset( |
217 | srcH.at({i, j / 2}), static_cast<float>(scale), |
218 | static_cast<float>(offset), isMSB); |
219 | } |
220 | } |
221 | return output; |
222 | } |
223 | |
224 | Tensor tensor4BitsFusedRowwiseDequantization(const Tensor &input) { |
225 | auto Ty = input.getType().getElementType(); |
226 | assert((Ty == ElemKind::UInt4FusedFP16QTy || Ty == ElemKind::UInt4FusedQTy) && |
227 | "Unsupported 4bits fused rw quantization type." ); |
228 | if (Ty == ElemKind::UInt4FusedFP16QTy) { |
229 | return tensor4BitsFusedRowwiseDequantizationUtil<float16_t>(input); |
230 | } |
231 | return tensor4BitsFusedRowwiseDequantizationUtil<float>(input); |
232 | } |
233 | |
/// \brief Converts the floating-point \p scale and the integer \p offset of
/// an int32 -> int8 requantization into the integer-only transform
/// ((x >> pre) * integerScale >> post) + offset. See the derivation below.
/// \returns the resulting QuantizationTransform32To8 (pre/post shifts,
/// integer scale and offset).
QuantizationTransform32To8 quantizeScaleOffset32To8(float scale,
                                                    int32_t offset) {
  // In this function we compute an efficient way to convert signed 32-bit
  // integers into signed 8-bit integers without the use of floating-point
  // multiplication. Instead, we represent the original calculation:
  //
  //    result = (x * scale + offset)
  //
  // as the following sequence of integer calculations:
  //
  //    ((x >> pre_scale  * integer_scale) >> post_scale) + offset
  //
  // This function converts the floating-point scale and offset values to the
  // constants in the integer formula.
  //
  // In this method we assume that any signed 32-bit integer in the input word
  // must be mapped into an 8-bit integer. If the scale factor is 2X, then the
  // number 1000 won't be a legal input because after scaling the result would
  // fall outside of the signed 8-bit range. Any 32-bit number that falls
  // outside of signed the 8-bit output integer will be clipped. This gives us
  // the ability to perform 32-bit arithmetic, as explained below.
  //
  // We can't accurately represent fraction scales (in the range zero to one),
  // because the lowest integer multiplication value is one. For example, the
  // scaling factor 0.25 must be represented as integer multiplication of either
  // zero or one, which would result in an highly inaccurate output.
  // Similarly, rounding the scaling factor of 1.6 to 2.0 would produce
  // inaccurate results because drop a significant part of the number.
  //
  // The solution here is to scale (increase in size) the signed integer scalar,
  // and divide the result by shifting it to the right hand side. For example,
  // the floating-point scalar 0.41 is multiplied by 32x (to 13.12, rounded to
  // 13). Then the signed 32-bit integer input is multiplied by 13, and then
  // shifted 5 times to the right (to shrink the result back). The output of
  // this calculation is (13.0 / 32), which is about ~0.4.
  //
  // This approach works well for some scale values. Notice that the modified
  // integer multiplication requires more bits because the intermediate result
  // is larger. Notice that it's always safe to promote the scalar value from a
  // fraction up to one. When you multiply by the integer value one, the
  // intermediate result does not overflow (does not require more bits).
  //
  // It is actually always safe to perform 16-bit post-multiplication
  // right-shifts. Let's consider two cases. If the value of the floating-point
  // scale is greater than 1.0 then we know that at most 8 of the 32-bits in the
  // input register are used, because the result must fit in 8-bits. The result
  // of 8-bit times 8-bit multiplication is 16-bits, which leaves another 16
  // bits that are unused. We can use these 16-bits to increase the size of the
  // integer scale, and shift the result, as described above, without
  // overflowing the register.
  // The second case is where the scalar value is smaller than 1.0.
  // Multiplication of any number by zero or one does not increase the number of
  // bits which are used by the number.
  //
  // Now, we need to consider another problem. In the previous section we
  // described how we scaled small fractions into a number that's close to one.
  // But scaling to around 1.0 is not accurate enough. Rounding a scale factor
  // like 0.6 to integer would give a very high error rate. Generally, we can't
  // increase the size of the integer multiplier without a limit because this
  // would overflow large values that are close to the upper signed 32-bit
  // limit.
  //
  // To solve the accuracy problem we need to continue to increase the size of
  // the integer scalar without overflowing the signed 32-bit register.
  // The solution here is to perform right-shift on the input, in addition to
  // the output. The idea here is that by performing the post-multiplication
  // right-shift we pick the high bits from the result of the multiplication,
  // and the low bits are ignored. This means that we can continue to increase
  // the size of the integer multiplier and continue to increase the accuracy of
  // the calculation by pre-shifting the 32-bit input. Shifting the input to the
  // right would flip some input bits to zero, but the accuracy loss would be
  // minimal.
  //
  // If the floating point scale factor small then it spans a small part of the
  // 32-bit word. For example, a scale factor of 0.125 (1/8) scales some range
  // into the signed 8-bit result. This range is 8 + 3 bits. This means that we
  // can shift as much as 32-11 bits without overflowing the register. This is
  // a net win because we get to increase the accuracy of the floating point
  // scale factor. For very small scale factors, the used range is very large
  // and can take up the whole 32-bit register, so overflow is a real problem.
  // Here we can use the post-shift value to estimate how many bits will be
  // discarded from the after the multiplication operation and figure out how
  // many bits we can take from the bottom of the input word by shifting it to
  // the right and add more precision to the integer scale multiplier.
  int preShift = 0;
  int postShift = 0;

  // We treat first the particular case when scale is a power of 2 (2 ^ exp,
  // where exp is a signed integer exponent). The operation is specialized as:
  // - for positive 2's exponent:
  //     x * scale + offset (pre = 0, post = 0, scale = (int)scale).
  // - for negative 2's exponent:
  //     x >> post + offset (pre = 0, post = -exp, scale = 1).
  if (isFloatPowerOf2(scale)) {
    int exp = getFloat2Exp(scale);
    if (exp > 0) {
      return QuantizationTransform32To8(0,                       // pre
                                        0,                       // post
                                        static_cast<int>(scale), // scale
                                        offset);                 // offset
    } else {
      return QuantizationTransform32To8(0,    // pre
                                        -exp, // post
                                        1,    // scale
                                        offset); // offset
    }
  }

  // Calculate the post-shift value. It's always safe to increase scale as long
  // as it's below one, and it's always legal to shift at least 15 bits for
  // small scale values.
  while (scale < 0.5 || (scale < 256 && postShift < 15)) {
    scale *= 2;
    postShift++;
  }

  // Calculate the pre-multiplication shift. Estimate how many bits we can take
  // from the input number and pass to the integer scale.
  while (scale < 255 && preShift < (postShift / 2)) {
    scale *= 2;
    preShift++;
  }

  // Round the remaining (shift-adjusted) scale to the nearest integer
  // multiplier.
  return QuantizationTransform32To8(preShift, postShift, std::round(scale),
                                    offset);
}
360 | |
361 | QuantizedRange getQuantizedRange(ElemKind qTy) { |
362 | // Pick int64_t in order to cover the uint32_t range. |
363 | int64_t qmin; |
364 | int64_t qmax; |
365 | |
366 | switch (qTy) { |
367 | case ElemKind::Int8QTy: { |
368 | qmin = std::numeric_limits<int8_t>::min(); |
369 | qmax = std::numeric_limits<int8_t>::max(); |
370 | break; |
371 | } |
372 | case ElemKind::UInt8QTy: { |
373 | qmin = std::numeric_limits<uint8_t>::min(); |
374 | qmax = std::numeric_limits<uint8_t>::max(); |
375 | break; |
376 | } |
377 | case ElemKind::Int16QTy: { |
378 | qmin = std::numeric_limits<int16_t>::min(); |
379 | qmax = std::numeric_limits<int16_t>::max(); |
380 | break; |
381 | } |
382 | case ElemKind::Int32QTy: { |
383 | // A corner case is when quantizing the bias tensor which is later used in |
384 | // arithmetic computations as (int32)(bias[idx] - biasOffset) (e.g. in the |
385 | // LIBJIT function "libjit_scale_i32i8"). To avoid overflow we must restrict |
386 | // the quantization range such that the subtraction result fits int32. Since |
387 | // both bias[idx] and biasOffset are within the range [qmin, qmax] we will |
388 | // impose: min(int32) <= qmin - qmax and qmax - qmin <= max(int32). In other |
389 | // words we will restrict the quantized dynamic range to int31. Furthermore, |
390 | // since scale is computed as scale = (max - min) / (qmax - qmin) where |
391 | // (qmax - qmin) is large (~2^31) the scale computation has large errors. |
392 | // We will further limit the quantized range to int30 (one extra bit) in |
393 | // order for the computed scale to provide safe quantization within the |
394 | // intended range. |
395 | qmin = std::numeric_limits<int32_t>::min() >> 2; |
396 | qmax = std::numeric_limits<int32_t>::max() >> 2; |
397 | break; |
398 | } |
399 | default: |
400 | llvm_unreachable("Quantized type not supported" ); |
401 | } |
402 | return QuantizedRange(qmin, qmax); |
403 | } |
404 | |
405 | void validateQuantizationParams(TensorQuantizationParams qParams, Schema schema, |
406 | ElemKind qTy) { |
407 | |
408 | // Get the quantized range. |
409 | auto minMaxPair = getQuantizedRange(qTy); |
410 | int64_t qmin = minMaxPair.first; |
411 | int64_t qmax = minMaxPair.second; |
412 | |
413 | // Validate params. |
414 | (void)(qmin); |
415 | (void)(qmax); |
416 | assert((qmin <= qParams.offset) && (qParams.offset <= qmax) && |
417 | "The offset must be within the quantized range" ); |
418 | if (schema == quantization::Schema::Symmetric) { |
419 | assert((qParams.offset == 0) && |
420 | "Symmetric quantization should have offset 0" ); |
421 | } else if (schema == quantization::Schema::SymmetricWithUnsigned) { |
422 | assert((qParams.offset == qmin || qParams.offset == 0) && |
423 | "SymmetricWithUnsigned quantization should have offset 0 or qmin" ); |
424 | } else if (schema == quantization::Schema::SymmetricWithPower2Scale) { |
425 | assert((qParams.offset == 0) && |
426 | "SymmetricWithPower2Scale quantization should have offset 0" ); |
427 | assert(isFloatPowerOf2(qParams.scale) && |
428 | "SymmetricWithPower2Scale quantization parameter should be a power " |
429 | "of 2" ); |
430 | } |
431 | } |
432 | |
/// \brief Chooses the quantization parameters (scale, offset) which map the
/// floating point range profiled in \p profParams onto the quantized range
/// of type \p qTy, according to the quantization \p schema, optionally
/// refining the range with the given \p calibration method.
/// \returns validated TensorQuantizationParams for the chosen mapping.
TensorQuantizationParams
chooseQuantizationParams(TensorProfilingParams profParams, Schema schema,
                         ElemKind qTy, Calibration calibration) {
  float min = profParams.min;
  float max = profParams.max;
  assert(min <= max && "min must not be bigger than max");

  // Get the quantized range.
  auto minMaxPair = getQuantizedRange(qTy);
  int64_t qmin = minMaxPair.first;
  int64_t qmax = minMaxPair.second;

  // We extend the [min, max] interval to ensure that it contains 0.
  // Otherwise, we would not meet the requirement that 0 be an exactly
  // representable value.
  min = std::min(min, 0.f);
  max = std::max(max, 0.f);

  if (schema == quantization::Schema::SymmetricWithUnsigned) {
    // Check if the range we try to encode is purely positive.
    // If not, we cannot use the Unsigned mapping and we fall back
    // to the symmetric schema.
    if (min >= 0.f) {
      // By construction we always have zero to our range.
      // Since min is >= 0 and 0 is in our range, min is
      // actually zero.
      // Therefore zero is going to be mapped to the first
      // element of the quantized range qmin and thus the
      // offset is going to be qmin.
      assert(min <= std::numeric_limits<float>::epsilon() &&
             "Our range should start at zero");
    } else {
      schema = quantization::Schema::Symmetric;
    }
  }
  if (schema == quantization::Schema::Symmetric ||
      schema == quantization::Schema::SymmetricWithPower2Scale) {
    // Check which end saturates the output dynamic range earlier
    // and extend the other end to map the zero-point to quantized 0.
    assert(qmin < 0 && "Symmetric schema incompatible with unsigned range");
    double rmin = min / (double)qmin;
    double rmax = max / (double)qmax;
    if (rmin > rmax) {
      max = rmin * qmax;
    } else {
      min = rmax * qmin;
    }
  }

  // Clamp the (possibly extended) range to representable float values.
  min = std::max(min, std::numeric_limits<float>::lowest());
  max = std::min(max, std::numeric_limits<float>::max());

  // Calibrate the min/max range (for non-zero ranges only).
  if ((profParams.min != profParams.max) && (min != max) &&
      (calibration == Calibration::KLMinimization)) {

    // Rescale the profiled histogram with the new constrained min/max range.
    auto histRescaled = rescaleHistogram(profParams.histogram, profParams.min,
                                         profParams.max, min, max);

    // Number of quantized bins. Default value from TVM / MXNet.
    const size_t numQuantizedBins = 255;

    // Set symmetric, only if schema is Symmetric or SymmetricWithPower2Scale
    const bool symmetric =
        (schema == quantization::Schema::Symmetric ||
         schema == quantization::Schema::SymmetricWithPower2Scale);

    // Optimize the range.
    FloatRange rangeOpt =
        optimizeKL(histRescaled, min, max, numQuantizedBins, symmetric);

    // Update the min/max range with the optimized range.
    min = rangeOpt.first;
    max = rangeOpt.second;
  }

  // Compute scale.
  double scale = ((double)max - min) / ((double)qmax - qmin);

  // Dequantization uses the following formula scale * (X - offset), so
  // scale should not be equal to zero.
  // If scale is 0, we arbitrary adjust the scale to 0.1.
  if (scale == 0) {
    scale = 0.1;
  }

  assert(scale > 0 && "Scale must be non negative");

  // Zero-point computation.
  // First the initial floating-point computation. The zero-point can be
  // determined from solving an affine equation for any known pair
  // (real value, corresponding quantized value).
  // We know two such pairs: (rmin, qmin) and (rmax, qmax).
  // The arithmetic error on the zero point computed from either pair
  // will be roughly machine_epsilon * (sum of absolute values of terms)
  // so we want to use the variant that adds the smaller terms.
  double zeroPointFromMin = qmin - min / scale;
  double zeroPointFromMax = qmax - max / scale;
  double zeroPointFromMinError = std::abs(qmin) + std::abs(min / scale);
  double zeroPointFromMaxError = std::abs(qmax) + std::abs(max / scale);
  double initialZeroPoint = zeroPointFromMinError < zeroPointFromMaxError
                                ? zeroPointFromMin
                                : zeroPointFromMax;

  // For symmetric quantization, if min == -max, force the zero point to be 0.
  float difference = std::abs(max + min);
  if (difference <= std::numeric_limits<float>::epsilon()) {
    initialZeroPoint = 0;
  }

  // Now we need to nudge the zero point to be an integer (our zero points are
  // integer, and this is motivated by the requirement to be able to represent
  // the real value "0" exactly as a quantized value, which is required in
  // multiple places, for example in Im2col with SAME padding).
  int32_t nudgedZeroPoint = 0;
  if (initialZeroPoint < qmin) {
    nudgedZeroPoint = qmin;
  } else if (initialZeroPoint > qmax) {
    nudgedZeroPoint = qmax;
  } else {
    nudgedZeroPoint = static_cast<int32_t>(round(initialZeroPoint));
  }

  // For SymmetricWithPower2Scale, round scale to nearest higher power of 2.
  if (schema == quantization::Schema::SymmetricWithPower2Scale) {
    scale = std::exp2(std::ceil(std::log2(scale)));
  }

  TensorQuantizationParams result{static_cast<float>(scale), nudgedZeroPoint};
  validateQuantizationParams(result, schema, qTy);
  return result;
}
566 | |
567 | TensorQuantizationParams |
568 | specializeBiasQuantizationParams(const TensorQuantizationParams &biasTQP, |
569 | const TensorQuantizationParams &inputTQP, |
570 | const TensorQuantizationParams &weightsTQP, |
571 | Schema schema, ElemKind biasQTy, |
572 | bool biasZero) { |
573 | // Choose bias offset. For int32 bias we always force offset 0 in order |
574 | // to simplify the implementation since the dynamic range allows it. |
575 | int32_t biasOffset = biasTQP.offset; |
576 | if (biasQTy == ElemKind::Int32QTy) { |
577 | biasOffset = 0; |
578 | } |
579 | // Choose bias scale. We try to force the bias scale value to the product |
580 | // inputScale * weightsScale but only if the resulting scale is larger |
581 | // (in order to avoid bias data saturation). |
582 | float inputScale = inputTQP.scale; |
583 | float weightsScale = weightsTQP.scale; |
584 | float biasScale = biasTQP.scale; |
585 | if (inputScale * weightsScale >= biasScale || biasZero) { |
586 | biasScale = inputScale * weightsScale; |
587 | } |
588 | // Validate new bias TQP and return. |
589 | TensorQuantizationParams biasTQPNew = {biasScale, biasOffset}; |
590 | validateQuantizationParams(biasTQPNew, schema, biasQTy); |
591 | return biasTQPNew; |
592 | } |
593 | |
594 | void specializeBiasWeightsQuantizationParams( |
595 | TensorQuantizationParams &biasTQP, const TensorQuantizationParams &inputTQP, |
596 | TensorQuantizationParams &weightsTQP, Schema schema, ElemKind biasQTy, |
597 | bool biasZero) { |
598 | // Choose bias offset. For int32 bias we always force offset 0 in order |
599 | // to simplify the implementation since the dynamic range allows it. |
600 | if (biasQTy == ElemKind::Int32QTy) { |
601 | biasTQP.offset = 0; |
602 | } |
603 | // Choose bias scale. We try to force the bias scale value to the product |
604 | // inputScale * weightsScale but only if the resulting scale is larger |
605 | // (in order to avoid bias data saturation). Otherwise, for INT32 bias |
606 | // only, we change the weightsScale to enforce the equality. |
607 | float inputScale = inputTQP.scale; |
608 | float weightsScale = weightsTQP.scale; |
609 | float biasScale = biasTQP.scale; |
610 | if (inputScale * weightsScale >= biasScale || biasZero) { |
611 | biasScale = inputScale * weightsScale; |
612 | } else { |
613 | if (biasQTy == ElemKind::Int32QTy) { |
614 | weightsScale = biasScale / inputScale; |
615 | // The division above does not always ensure that biasScale equals the |
616 | // product inputScale * weightsScale because float32 division is not |
617 | // that accurate. Instead we force the equality explicitly. |
618 | biasScale = inputScale * weightsScale; |
619 | } |
620 | } |
621 | biasTQP.scale = biasScale; |
622 | weightsTQP.scale = weightsScale; |
623 | // Validate new bias and weights TQP. |
624 | if (biasQTy == ElemKind::Int32QTy) { |
625 | assert((biasTQP.scale == (inputTQP.scale * weightsTQP.scale)) && |
626 | "Bias scale invalid!" ); |
627 | } |
628 | validateQuantizationParams(biasTQP, schema, biasQTy); |
629 | } |
630 | |
/// \brief Computes per-group quantization parameters for the float \p tensor
/// along dimension \p qDim with group granularity \p qStep, using the
/// quantization schema \p qSchema and quantized type \p qTy.
/// \returns one TensorQuantizationParams per group, i.e.
/// tensor.dims()[qDim] / qStep entries.
std::vector<TensorQuantizationParams>
getTensorQuantizationParams(const Tensor &tensor, Schema qSchema, ElemKind qTy,
                            dim_t qDim, dim_t qStep) {

  // Validate tensor parameters.
  assert(qDim < tensor.dims().size() &&
         "Quantization dimension exceeds max tensor dimension!");
  assert(qStep > 0 &&
         "Quantization step (granularity) must be greater than 0!");
  assert((tensor.dims()[qDim] % qStep) == 0 &&
         "Quantization step must divide dimension length!");
  assert(tensor.getElementType() == ElemKind::FloatTy &&
         "Tensor type should be float!");
  dim_t groupNum = tensor.dims()[qDim] / qStep;

  // Get tensor view with max of 6 dimensions (pads trailing dims with 1 so
  // the fixed 6-level loop nest below covers every element exactly once).
  auto dimsMax = expandDimsToMax(tensor.dims());
  Tensor tensorMax = tensor.getUnowned(dimsMax);
  auto tensorH = tensorMax.getHandle<float>();

  // Find min/max for each quantization group.
  std::vector<float> minArray(groupNum, std::numeric_limits<float>::max());
  std::vector<float> maxArray(groupNum, std::numeric_limits<float>::lowest());
  assert(dimsMax.size() == 6 &&
         "Invalid number of dimensions for tensor expansion!");
  for (dim_t idx0 = 0; idx0 < dimsMax[0]; idx0++) {
    for (dim_t idx1 = 0; idx1 < dimsMax[1]; idx1++) {
      for (dim_t idx2 = 0; idx2 < dimsMax[2]; idx2++) {
        for (dim_t idx3 = 0; idx3 < dimsMax[3]; idx3++) {
          for (dim_t idx4 = 0; idx4 < dimsMax[4]; idx4++) {
            for (dim_t idx5 = 0; idx5 < dimsMax[5]; idx5++) {

              // Current sample multidimensional index.
              std::array<dim_t, 6> sampleIdx{
                  {idx0, idx1, idx2, idx3, idx4, idx5}};

              // Find quantization group to which this sample belongs.
              dim_t groupIdx = (dim_t)(sampleIdx[qDim] / qStep);

              // Adjust min/max for current group.
              if (tensorH.at(sampleIdx) < minArray[groupIdx]) {
                minArray[groupIdx] = tensorH.at(sampleIdx);
              }
              if (tensorH.at(sampleIdx) > maxArray[groupIdx]) {
                maxArray[groupIdx] = tensorH.at(sampleIdx);
              }
            }
          }
        }
      }
    }
  }

  // Compute the quantization parameters for each group from its min/max.
  std::vector<TensorQuantizationParams> TQP;
  for (dim_t groupIdx = 0; groupIdx < groupNum; groupIdx++) {
    TQP.push_back(chooseQuantizationParams(
        {minArray[groupIdx], maxArray[groupIdx]}, qSchema, qTy));
  }
  return TQP;
}
692 | |
693 | void getTensorQuantizationParams(const Tensor &tensor, Tensor &scales, |
694 | Tensor &offsets, Schema qSchema, ElemKind qTy, |
695 | dim_t qDim, dim_t qStep) { |
696 | auto TQP = getTensorQuantizationParams(tensor, qSchema, qTy, qDim, qStep); |
697 | assert(scales.size() == TQP.size() && "Scales tensor size invalid!" ); |
698 | assert(offsets.size() == TQP.size() && "Offsets tensor size invalid!" ); |
699 | auto scalesH = scales.getHandle<float>(); |
700 | auto offsetsH = offsets.getHandle<int32_t>(); |
701 | for (dim_t idx = 0; idx < TQP.size(); idx++) { |
702 | scalesH.raw(idx) = TQP[idx].scale; |
703 | offsetsH.raw(idx) = TQP[idx].offset; |
704 | } |
705 | } |
706 | |
707 | template <class eTy = int8_t> |
708 | static void quantizeTensorImpl(Tensor *dest, const Tensor &src, |
709 | llvm::ArrayRef<TensorQuantizationParams> TQP, |
710 | dim_t qDim, dim_t qStep) { |
711 | |
712 | // Validate tensor parameters. |
713 | assert(qDim < src.dims().size() && |
714 | "Quantization dimension exceeds max tensor dimension!" ); |
715 | assert(qStep > 0 && |
716 | "Quantization step (granularity) must be greater than 0!" ); |
717 | assert((src.dims()[qDim] % qStep) == 0 && |
718 | "Quantization step must divide dimension length!" ); |
719 | assert(src.getElementType() == ElemKind::FloatTy && |
720 | "Tensor type should be float!" ); |
721 | assert(TQP.size() == (src.dims()[qDim] / qStep) && |
722 | "TensorQuantizationParams array size invalid!" ); |
723 | |
724 | // Get tensor views with maximum dimensions. |
725 | auto dimsMax = expandDimsToMax(src.dims()); |
726 | Tensor srcMax = src.getUnowned(dimsMax); |
727 | auto srcH = srcMax.getHandle<float>(); |
728 | Tensor destMax = dest->getUnowned(dimsMax); |
729 | auto destH = destMax.getHandle<eTy>(); |
730 | |
731 | // Perform quantization for each group. |
732 | assert(dimsMax.size() == 6 && |
733 | "Invalid number of dimensions for tensor expansion!" ); |
734 | for (dim_t idx0 = 0; idx0 < dimsMax[0]; idx0++) { |
735 | for (dim_t idx1 = 0; idx1 < dimsMax[1]; idx1++) { |
736 | for (dim_t idx2 = 0; idx2 < dimsMax[2]; idx2++) { |
737 | for (dim_t idx3 = 0; idx3 < dimsMax[3]; idx3++) { |
738 | for (dim_t idx4 = 0; idx4 < dimsMax[4]; idx4++) { |
739 | for (dim_t idx5 = 0; idx5 < dimsMax[5]; idx5++) { |
740 | |
741 | // Current sample multidimensional index. |
742 | std::array<dim_t, 6> sampleIdx{ |
743 | {idx0, idx1, idx2, idx3, idx4, idx5}}; |
744 | |
745 | // Find quantization group to which this sample belongs. |
746 | dim_t groupIdx = sampleIdx[qDim] / qStep; |
747 | |
748 | // Quantize current sample with group specific quantization |
749 | // parameters. |
750 | destH.at(sampleIdx) = quantization::quantize<eTy>( |
751 | srcH.at(sampleIdx), TQP[groupIdx]); |
752 | } |
753 | } |
754 | } |
755 | } |
756 | } |
757 | } |
758 | } |
759 | |
760 | Tensor quantizeTensor(const Tensor &tensor, |
761 | llvm::ArrayRef<TensorQuantizationParams> TQP, |
762 | ElemKind qTy, dim_t qDim, dim_t qStep) { |
763 | Tensor tensorQ(qTy, tensor.dims(), 1.0, 0); |
764 | assert(tensor.getType().isFPType() && "Type not supported yet" ); |
765 | if (qTy == ElemKind::Int8QTy) { |
766 | quantizeTensorImpl<int8_t>(&tensorQ, tensor, TQP, qDim, qStep); |
767 | } else if (qTy == ElemKind::UInt8QTy) { |
768 | quantizeTensorImpl<uint8_t>(&tensorQ, tensor, TQP, qDim, qStep); |
769 | } else if (qTy == ElemKind::Int16QTy) { |
770 | quantizeTensorImpl<int16_t>(&tensorQ, tensor, TQP, qDim, qStep); |
771 | } else if (qTy == ElemKind::Int32QTy) { |
772 | quantizeTensorImpl<int32_t>(&tensorQ, tensor, TQP, qDim, qStep); |
773 | } else { |
774 | llvm_unreachable("Quantization type not supported" ); |
775 | } |
776 | return tensorQ; |
777 | } |
778 | |
779 | Tensor quantizeTensor(const Tensor &tensor, const Tensor &scales, |
780 | const Tensor &offsets, ElemKind qTy, dim_t qDim, |
781 | dim_t qStep) { |
782 | assert(scales.size() == offsets.size() && |
783 | "Scales/Offsets tensor size invalid!" ); |
784 | auto scalesH = scales.getHandle<float>(); |
785 | auto offsetsH = offsets.getHandle<int32_t>(); |
786 | std::vector<TensorQuantizationParams> TQP; |
787 | for (dim_t idx = 0; idx < scales.size(); idx++) { |
788 | TQP.push_back({scalesH.raw(idx), offsetsH.raw(idx)}); |
789 | } |
790 | return quantizeTensor(tensor, TQP, qTy, qDim, qStep); |
791 | } |
792 | |
bool isFloatPowerOf2(float val) {
  // A float is a (signed) power of two exactly when its normalized
  // mantissa is +/-0.5: frexp decomposes val as m * 2^exp with
  // |m| in [0.5, 1), so only exact powers of two yield |m| == 0.5.
  int exp;
  const float mantissa = std::frexp(val, &exp);
  return mantissa == 0.5f || mantissa == -0.5f;
}
798 | |
/// Returns the unbiased base-2 exponent of \p val, i.e.
/// floor(log2(|val|)), as computed by std::ilogb.
/// NOTE(review): for val == 0 this returns FP_ILOGB0 (implementation
/// defined); callers presumably only pass non-zero values — confirm.
int getFloat2Exp(float val) {
  return std::ilogb(val);
}
800 | |
801 | } // namespace quantization |
802 | } // namespace glow |
803 | |