1 | /** |
2 | * Copyright (c) Glow Contributors. See CONTRIBUTORS file. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
16 | |
17 | #ifndef GLOW_QUANTIZATION_BASE_BASE_H |
18 | #define GLOW_QUANTIZATION_BASE_BASE_H |
19 | |
20 | #include "glow/Base/Tensor.h" |
21 | #include "glow/Base/Traits.h" |
22 | #include "glow/Base/Type.h" |
23 | |
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/Support/ErrorHandling.h"

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <limits>
#include <string>
#include <utility>
#include <vector>
28 | |
29 | namespace glow { |
30 | |
31 | /// Dummy scale used for representing dummy quantization parameters that have |
32 | /// been loaded in place of real quantization parameters. |
33 | constexpr float dummyScale = 0.123456813395023345947265625; |
34 | |
/// Profiling parameters of a tensor, consisting of the global minimum and
/// maximum values and the histogram obtained during profiling. Note that the
/// histogram is not normalized.
38 | struct TensorProfilingParams { |
39 | float min; |
40 | float max; |
41 | std::vector<float> histogram; |
42 | |
43 | TensorProfilingParams() = default; |
44 | TensorProfilingParams(float min, float max) : min(min), max(max) {} |
45 | TensorProfilingParams(float min, float max, const std::vector<float> &hist) |
46 | : min(min), max(max), histogram(hist) {} |
47 | TensorProfilingParams(float min, float max, const Tensor &hist) |
48 | : min(min), max(max) { |
49 | auto histH = hist.getHandle<float>(); |
50 | histogram = std::vector<float>(histH.size()); |
51 | for (dim_t idx = 0, e = histH.size(); idx < e; idx++) { |
52 | histogram[idx] = histH.raw(idx); |
53 | } |
54 | } |
55 | }; |
56 | |
/// Main attributes of a quantized tensor.
/// Scale and Offset allow quantization of a float tensor and dequantization of
/// an integer tensor back to a float one: float = scale * (quantized - offset).
60 | struct TensorQuantizationParams { |
61 | float scale; |
62 | int32_t offset; |
63 | }; |
64 | |
/// A data structure that represents the 32-bit to 8-bit quantization
/// scaling operation. This data structure represents the transformation:
/// ((((input >> pre) * scale) + rtn) >> post) + offset, where rtn is the
/// round-to-nearest term added before the final right shift.
68 | struct QuantizationTransform32To8 { |
69 | int pre; |
70 | int post; |
71 | int scale; |
72 | int offset; |
73 | |
74 | /// Initializes the transformation based on the conversion formula (above). |
75 | QuantizationTransform32To8(int pre, int post, int scale, int offset) |
76 | : pre(pre), post(post), scale(scale), offset(offset) {} |
77 | |
78 | /// \returns the scaled integer. |
  int32_t transform(int32_t input) const {
    // The operation x >> post rounds down towards negative infinity. To get
    // round-to-nearest we add (1 << (post - 1)) to the value prior to
    // shifting. Rounding is performed only when shifting right (post > 0).
    int rtn = (post > 0) ? (1 << (post - 1)) : 0;
84 | return ((((input >> pre) * scale) + rtn) >> post) + offset; |
85 | } |
86 | }; |
87 | |
88 | /// Tensor profiling parameters for a given node. |
89 | struct NodeProfilingInfo { |
90 | std::string nodeOutputName_; |
91 | TensorProfilingParams tensorProfilingParams_; |
92 | |
93 | NodeProfilingInfo() = default; |
94 | NodeProfilingInfo(const std::string &nodeOutputName, |
95 | const TensorProfilingParams &tensorProfilingParams) |
96 | : nodeOutputName_(nodeOutputName), |
97 | tensorProfilingParams_(tensorProfilingParams) {} |
98 | |
99 | float min() const { return tensorProfilingParams_.min; } |
100 | float max() const { return tensorProfilingParams_.max; } |
101 | const std::vector<float> &histogram() const { |
102 | return tensorProfilingParams_.histogram; |
103 | } |
104 | }; |
105 | |
106 | /// Tensor quantization parameters for a given node. |
107 | struct NodeQuantizationInfo { |
108 | std::string nodeOutputName_; |
109 | TensorQuantizationParams tensorQuantizationParams_; |
110 | |
111 | NodeQuantizationInfo() = default; |
112 | NodeQuantizationInfo(const std::string &nodeOutputName, |
113 | const TensorQuantizationParams &tensorQuantizationParams) |
114 | : nodeOutputName_(nodeOutputName), |
115 | tensorQuantizationParams_(tensorQuantizationParams) {} |
116 | |
117 | float scale() const { return tensorQuantizationParams_.scale; } |
118 | int32_t offset() const { return tensorQuantizationParams_.offset; } |
119 | }; |
120 | |
/// Primitive to encode a floating-point value in 32-bit unsigned fixed-point
/// format.
122 | class FixedPointUInt32 { |
123 | private: |
124 | /// Encoded fixed-point value. |
125 | uint32_t val_; |
126 | /// Number of integer bits. |
127 | unsigned intBits_; |
128 | /// Number of fractional bits. |
129 | unsigned fracBits_; |
130 | |
131 | public: |
132 | /// Default constructor. |
133 | FixedPointUInt32() = default; |
134 | |
  /// Construct a fixed-point representation of the floating-point value
  /// \p floatVal using the configuration with the minimum approximation error,
  /// that is, the fewest integer bits and hence the most fractional bits.
  FixedPointUInt32(float floatVal) {
    assert(floatVal >= 0 && "Floating point value must be non-negative!");
    intBits_ = minBitsIntegerPart(floatVal);
    fracBits_ = 32 - intBits_;
    val_ = floatingToFixedPoint(floatVal, fracBits_);
  }
145 | |
146 | /// Construct a fixed-point representation of the floating-point value |
147 | /// \p floatVal using the given number of integer bits \p intBits. |
  FixedPointUInt32(float floatVal, unsigned intBits) {
    assert(floatVal >= 0 && "Floating point value must be non-negative!");
    assert(intBits <= 32 && "Integer bits must be between 0 and 32!");
    val_ = floatingToFixedPoint(floatVal, 32 - intBits);
    intBits_ = intBits;
    fracBits_ = 32 - intBits_;
  }
156 | |
157 | /// \returns the encoded fixed-point value as integer. |
158 | uint32_t getFixedVal() const { return val_; } |
159 | |
160 | /// \returns the encoded fixed-point value as float. |
161 | float getFloatVal() const { return (float)(val_) / std::exp2(fracBits_); } |
162 | |
163 | /// \returns the number of integer bits. |
164 | unsigned getIntBits() const { return intBits_; } |
165 | |
166 | /// \returns the number of fractional bits. |
167 | unsigned getFracBits() const { return fracBits_; } |
168 | |
169 | private: |
  /// \returns the minimum number of bits required to represent the integer
  /// part of the fixed-point representation of the floating-point value
  /// \p number.
  uint32_t minBitsIntegerPart(float number) const {
    assert(number >= 0 && "Floating point value must be non-negative!");
    uint32_t aux = (uint32_t)number;
    uint32_t integerPart = 0;

    // Count the number of bits needed to represent the truncated integer part.
    while (aux != 0) {
      integerPart += 1;
      aux /= 2;
    }

    assert(integerPart <= 32 && "Overflow caused by input number!");
    return integerPart;
  }
188 | |
  /// \returns the fixed-point representation of the floating-point value
  /// \p elem using \p fracPart fractional bits, i.e. the format
  /// Q(32-fracPart).fracPart.
  uint32_t floatingToFixedPoint(float elem, uint32_t fracPart) const {
    assert(elem >= 0 && "Floating point value must be non-negative!");
    double result = (double)elem * std::exp2((double)fracPart);
    assert(result >= (double)std::numeric_limits<uint32_t>::min() &&
           result <= (double)std::numeric_limits<uint32_t>::max() &&
           "Float to fixed-point conversion overflow!");
    return static_cast<uint32_t>(std::round(result));
  }
201 | |
202 | public: |
203 | /// \returns a string representation of the fixed-point value (e.g. "0.13"). |
204 | std::string toString() const { return std::to_string(getFloatVal()); } |
205 | }; |
206 | |
207 | namespace quantization { |
208 | |
209 | /// Type definition for a float min/max range. |
210 | using FloatRange = std::pair<float, float>; |
211 | |
212 | /// Type definition for a quantized min/max range. |
213 | using QuantizedRange = std::pair<int64_t, int64_t>; |
214 | |
215 | /// Quantization schema which influences the way the quantization parameters |
216 | /// scale and offset are computed based on the target min/max dynamic range. |
217 | enum Schema { |
218 | /// Asymmetric quantization produces ranges not necessarily centered on 0. |
219 | Asymmetric, |
220 | /// Symmetric quantization produces ranges centered on 0. |
221 | Symmetric, |
222 | /// Symmetric quantization produces ranges centered on 0 or -qmin, qmin being |
223 | /// the minimum value of the quantized type. |
224 | /// An offset of qmin (i.e., offset == -128 for int8) represents an unsigned |
225 | /// version of the quantized type with an offset of zero: |
  /// For example, int8 with offset -128 spans [-128; 127] - (-128) == [0; 255],
  /// i.e. the same values as uint8 with offset 0.
227 | SymmetricWithUnsigned, |
228 | /// Quantization schema with: |
229 | /// - range centered on 0 (symmetric): offset == 0. |
230 | /// - scale parameter is a power of 2: scale = 2^E where E is a signed |
  ///   exponent. Since the scale parameter is typically subunitary (less than
  ///   1), the exponent is mostly negative.
233 | /// Since the scale parameter is stored as floating point, the values |
234 | /// of E which are exactly representable range from -126 to 127. |
235 | SymmetricWithPower2Scale, |
236 | }; |
237 | |
/// Calibration mode which influences the way the dynamic range min/max obtained
/// during profiling is narrowed in order to represent the majority of the
/// values more precisely, at the price of saturating the outliers.
241 | enum Calibration { |
242 | /// No calibration. The quantization parameters will be computed using the |
243 | /// unaltered dynamic range min/max obtained during profiling such that all |
244 | /// the profiled dynamic range will be representable without saturation. |
245 | None, |
246 | /// Calibration mode based on minimizing the Kullback-Leibler divergence. |
247 | KLMinimization |
248 | }; |
249 | |
250 | /// Configuration for Profiling, passed into \ref profileQuantization(). |
251 | struct ProfilingConfiguration { |
252 | /// Number of bins used to compute the histogram during profiling. |
253 | unsigned numHistogramBins{10}; |
254 | }; |
255 | |
256 | /// Configuration for Quantization, passed into \ref quantizeFunction(). |
257 | struct QuantizationConfiguration { |
258 | /// Profiling infos to use when computing the scale and offset for all the |
259 | /// Nodes inside the function being quantized, including the referenced |
260 | /// Placeholders and Constants. |
261 | std::vector<NodeProfilingInfo> infos{}; |
262 | |
  /// The hash of the graph obtained during profiling in the pre-lowering stage.
264 | /// This hash is used to verify during quantization that the graph being |
265 | /// compiled matches the graph used for obtaining the profiling information. |
266 | llvm::hash_code graphPreLowerHash{0}; |
267 | |
268 | /// Whether to check the graph hash during quantization. |
269 | bool checkGraphPreLowerHash{false}; |
270 | |
271 | /// Precision to use when quantizing a Function. |
272 | ElemKind precision{ElemKind::Int8QTy}; |
273 | |
274 | /// Schema to use when quantizing a Function. |
275 | Schema schema{Schema::Asymmetric}; |
276 | |
277 | /// Calibration mode used when computing the quantization parameters. |
278 | Calibration calibration{Calibration::None}; |
279 | |
280 | /// Whether to enable the calibration for constant weights. |
281 | bool calibrateConstants{false}; |
282 | |
283 | /// Whether to use rowwise quantization when quantizing a Function. |
284 | bool enableRowwise{false}; |
285 | |
286 | /// Whether to use channelwise quantization when quantizing a Function. |
287 | bool enableChannelwise{false}; |
288 | |
289 | /// New name for the quantized function. If no name is given then |
290 | /// \ref quantizeFunction() will generate a name. |
291 | std::string newFuncName{"" }; |
292 | |
293 | /// If true, the quantizer will abort when encountering a node that it would |
294 | /// like to quantize but the backend cannot support. Note that node kinds in |
295 | /// doNotQuantizeKinds will skip this check and not cause an abort. |
296 | bool assertAllNodesQuantized{false}; |
297 | |
298 | /// Precision used for bias quantization for Convolution and FullyConnected. |
299 | /// This allows specializing the bias quantization. Default is int32. |
300 | ElemKind precisionBias{ElemKind::Int32QTy}; |
301 | |
302 | /// If true, don't apply quantization to FC bias inputs. |
303 | bool skipQuantizeFCBias{false}; |
304 | |
305 | QuantizationConfiguration() = default; |
306 | QuantizationConfiguration(llvm::ArrayRef<NodeProfilingInfo> i) : infos(i) {} |
307 | }; |
308 | |
309 | /// \returns the tensor average value based on the profiling info \p profParams. |
310 | float getTensorAverageValue(const TensorProfilingParams &profParams); |
311 | |
312 | /// \returns the value \p in as clipped to the range of \p DestTy. |
313 | template <class SrcTy, class DestTy> DestTy clip(SrcTy in) { |
  static_assert(sizeof(SrcTy) >= sizeof(DestTy), "Invalid types");
315 | |
316 | auto mx = std::numeric_limits<DestTy>::max(); |
317 | auto mn = std::numeric_limits<DestTy>::min(); |
318 | return std::max<SrcTy>(mn, std::min<SrcTy>(mx, in)); |
319 | } |
320 | |
/// Converts floating point value \p input to \p DestTy (quantized type) based
322 | /// on the quantization parameters \p TQP. |
323 | template <class DestTy = int8_t> |
324 | inline DestTy quantize(float input, const TensorQuantizationParams &TQP) { |
325 | float result = input / TQP.scale + TQP.offset; |
326 | // Note: use int64_t since casts of large values might be wrapped around |
327 | // before clipping, for example for result = 2147483648.00 (float). |
328 | return quantization::clip<int64_t, DestTy>((int64_t)nearbyintf(result)); |
329 | } |
330 | |
331 | /// Converts floating point value \p input to \p DestTy (quantized type) based |
332 | /// on the quantization parameters \p TQP. The value is returned as int64. |
333 | inline int64_t quantize(float input, const TensorQuantizationParams &TQP, |
334 | ElemKind DestTy) { |
335 | if (DestTy == ElemKind::Int8QTy) { |
336 | return quantize<int8_t>(input, TQP); |
337 | } else if (DestTy == ElemKind::Int16QTy) { |
338 | return quantize<int16_t>(input, TQP); |
339 | } else if (DestTy == ElemKind::Int32QTy) { |
340 | return quantize<int32_t>(input, TQP); |
341 | } else if (DestTy == ElemKind::Int64QTy) { |
342 | return quantize<int64_t>(input, TQP); |
343 | } else { |
344 | llvm_unreachable("Precision not supported!" ); |
345 | } |
346 | } |
347 | |
/// Converts a quantized value of type \p eTy to floating point based on the
/// quantization parameters \p TQP.
350 | /// Note: use int64_t to cover the 'symmetric int32 with unsigned' case. |
351 | template <class eTy = int8_t> |
352 | inline float dequantize(eTy input, const TensorQuantizationParams &TQP) { |
353 | return TQP.scale * ((int64_t)input - TQP.offset); |
354 | } |
355 | |
/// Converts the floating point value \p input to \p DestTy (quantized type)
/// based on the quantization parameters \p scale and \p offset. If the
/// destination type is int8_t then a bias of 128 is subtracted to re-center
/// the unsigned intermediate value as int8_t. If the destination type is
/// int16_t then a bias of 32768 is subtracted to re-center it as int16_t.
361 | template <class DestTy> |
362 | inline DestTy quantizeWithFloatOffset(float input, float scale, float offset) { |
363 | uint16_t d = static_cast<uint16_t>(std::round((input - offset) / scale)); |
364 | if (std::is_same<int8_t, DestTy>::value) { |
365 | d -= 128; |
366 | } else if (std::is_same<int16_t, DestTy>::value) { |
367 | d -= 32768; |
368 | } |
369 | return static_cast<DestTy>(d); |
370 | } |
371 | |
372 | /// Converts floating point value \p input to 4-bit quantization based on the |
373 | /// quantization parameters \p scale and \p offset. |
374 | inline uint8_t quantize4BitsWithFloatOffset(float input, float scale, |
375 | float offset) { |
376 | uint8_t d = std::max( |
377 | 0, std::min(static_cast<int>(std::round((input - offset) / scale)), 15)); |
378 | return d; |
379 | } |
380 | |
/// Converts a quantized value of type \p eTy to floating point based on the
/// quantization parameters \p scale and \p offset. If the input type is
/// int8_t, then a bias of 128 is added to convert to uint8_t. If the input
/// type is int16_t, then a bias of 32768 is added to convert to uint16_t.
385 | template <class eTy> |
386 | inline float dequantizeWithFloatOffset(eTy input, float scale, float offset) { |
387 | uint16_t d = static_cast<uint16_t>(input); |
388 | if (std::is_same<int8_t, eTy>::value) { |
389 | d += 128; |
390 | } else if (std::is_same<int16_t, eTy>::value) { |
391 | d += 32768; |
392 | } |
393 | return (d * scale) + offset; |
394 | } |
395 | |
/// Converts a 4-bit quantized value stored in \p input (the high 4 bits if
/// \p isMSB is true, otherwise the low 4 bits) to floating point based on the
/// quantization parameters \p scale and \p offset.
399 | inline float dequantize4BitWithFloatOffset(uint8_t input, float scale, |
400 | float offset, bool isMSB) { |
401 | if (isMSB) { |
402 | input >>= 4; |
403 | } |
404 | input &= 0x0f; |
405 | return (input * scale) + offset; |
406 | } |
407 | |
/// Converts the floating point \p tensor to a quantized tensor based on the
/// quantization parameters \p TQP and type \p Ty.
410 | Tensor quantizeTensor(const Tensor &tensor, const TensorQuantizationParams &TQP, |
411 | ElemKind Ty = ElemKind::Int8QTy); |
412 | |
/// Converts the quantized tensor \p tensor to a floating point tensor of type
/// \p floatKind.
415 | Tensor dequantizeTensor(const Tensor &tensor, ElemKind floatKind); |
416 | |
/// Dequantize the 4-bit fused rowwise quantized tensor \p input. \returns the
/// dequantized float tensor.
419 | Tensor tensor4BitsFusedRowwiseDequantization(const Tensor &input); |
420 | |
/// Convert the floating point quantization parameters \p scale and \p offset
/// into the integer sequence:
/// result = ((((input >> pre) * scale) + rtn) >> post) + offset.
/// This scales a 32-bit signed integer word into an 8-bit signed integer.
/// \returns the transformation parameters.
426 | QuantizationTransform32To8 quantizeScaleOffset32To8(float scale, |
427 | int32_t offset); |
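
/// Example (illustrative sketch; "someInt32Accumulator" is a hypothetical
/// 32-bit value such as a convolution or FC accumulator):
/// \code
///   auto xform = quantizeScaleOffset32To8(0.0117f, /* offset */ 0);
///   int32_t q8 = xform.transform(someInt32Accumulator);
/// \endcode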
428 | |
429 | /// Function to get the quantized range for a given precision type \p qTy. |
430 | /// \returns the range as a (min, max) pair. |
431 | QuantizedRange getQuantizedRange(ElemKind qTy); |
432 | |
433 | /// Function to validate that the given quantization parameters \p qParams |
434 | /// comply with the given quantization \p schema and precision \p qTy. |
435 | void validateQuantizationParams(TensorQuantizationParams qParams, Schema schema, |
436 | ElemKind qTy); |
437 | |
438 | /// Calculate the TensorQuantizationParams from the TensorProfilingParams |
439 | /// \p profParams using the quantization type \p qTy and the quantization |
440 | /// method described by \p schema. The calibration of the quantization |
441 | /// parameters will be done using the method given by \p calibration. |
442 | TensorQuantizationParams |
443 | chooseQuantizationParams(TensorProfilingParams profParams, |
444 | Schema schema = Asymmetric, |
445 | ElemKind qTy = ElemKind::Int8QTy, |
446 | Calibration calibration = Calibration::None); |
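
/// Example (illustrative sketch; the resulting values are approximate):
/// \code
///   TensorProfilingParams prof(/* min */ -1.0f, /* max */ 1.0f);
///   TensorQuantizationParams tqp =
///       chooseQuantizationParams(prof, Schema::Symmetric, ElemKind::Int8QTy);
///   // Symmetric int8: tqp.offset == 0 and tqp.scale ~ 2.0 / 255.
/// \endcode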
447 | |
448 | /// Function to specialize the TensorQuantizationParams of the bias operand |
449 | /// for nodes like Convolution and FullyConnected given the initially computed |
450 | /// parameters \p biasTQP and the parameters of the input \p inputTQP and the |
451 | /// weights \p weightsTQP, for given quantization schema \p schema and bias type |
/// \p biasQTy. The parameter \p biasZero indicates whether the bias data is
/// known to be all zeros. The bias operand requires careful quantization since
454 | /// every bias value has a higher impact on the precision of the output value |
455 | /// than any particular weight value. The specialization logic is: |
456 | /// - for INT32 bias quantization: since the dynamic range of INT32 is large we |
457 | /// can always force symmetric quantization (offset = 0). This allows a faster |
458 | /// implementation since no offset subtraction is required at run-time. |
459 | /// - for INT8/INT16 bias quantization: since the dynamic range is small we |
460 | /// will keep the original offset. |
461 | /// - regardless of precision, we try to force the bias scale parameter to |
462 | /// bias_scale = input_scale * weights_scale since this has a performance |
463 | /// benefit by specializing the parameters to biasPre = 0, biasPost = 0, |
/// biasScale = 1. We must verify that by changing the bias scale we don't
465 | /// saturate the bias data. This is also equivalent to forcing the effective |
466 | /// scale applied at run-time (bias_scale / (input_scale * weights_scale)) |
467 | /// to be always greater than or equal to 1.0 which is a common constraint |
468 | /// for the bias for most libraries with quantized implementations. |
469 | TensorQuantizationParams |
470 | specializeBiasQuantizationParams(const TensorQuantizationParams &biasTQP, |
471 | const TensorQuantizationParams &inputTQP, |
472 | const TensorQuantizationParams &weightsTQP, |
473 | Schema schema, ElemKind biasQTy, |
474 | bool biasZero = false); |
475 | |
476 | /// Function similar to \ref specializeBiasQuantizationParams with the main |
477 | /// distinction that this function is also allowed to change the quantization |
478 | /// parameters of the weights. The modification is done in place. This function |
479 | /// is used for per-channel quantization. When the requested bias precision is |
480 | /// INT32 this function ensures that bias_scale = input_scale * weights_scale |
481 | /// while making sure the bias data is not saturated by changing both the bias |
482 | /// and weights quantization parameters. |
483 | void specializeBiasWeightsQuantizationParams( |
484 | TensorQuantizationParams &biasTQP, const TensorQuantizationParams &inputTQP, |
485 | TensorQuantizationParams &weightsTQP, Schema schema, ElemKind biasQTy, |
486 | bool biasZero = false); |
487 | |
488 | /// \returns an integer mapping from the \p inTy to the \p outTy given the |
489 | /// floating-point function \p func. |
490 | /// \pre inTy and outTy must be quantized types. |
491 | template <typename T = int8_t> |
492 | std::vector<T> createMapping(TypeRef inTy, TypeRef outTy, |
493 | std::function<float(float)> func) { |
  assert(inTy->isQuantizedType() && "Input type must be quantized!");
  assert(outTy->isQuantizedType() && "Output type must be quantized!");
  assert(outTy->isType<T>() && "Output type must match template type!");

  // Calculate the step which is added to currInputVal repeatedly in order to
  // cover the entire range of the input type.
500 | auto inputRange = inTy->getQuantizedValueRange(); |
501 | const float step = inTy->getQuantizedValueStep(); |
502 | float currInputVal = inputRange.first; |
503 | |
504 | // Calculate the output int value for each possible input value. |
505 | std::vector<T> mapping(inTy->getQuantizedValueCount()); |
506 | TensorQuantizationParams outputTQP{outTy->getScale(), outTy->getOffset()}; |
507 | for (size_t i = 0; i < mapping.size(); i++, currInputVal += step) { |
508 | float currOutputVal = func(currInputVal); |
509 | mapping[i] = quantization::quantize<T>(currOutputVal, outputTQP); |
510 | } |
511 | return mapping; |
512 | } |
513 | |
/// Row-wise quantize the tensor \p input. \p scales and \p offsets are
/// generated from each row of \p input; \p output is a tensor of the same
/// shape as \p input, quantized from \p input using \p scales and \p offsets
/// for each row. Note that the shape of input/output can have any non-zero
/// number of dimensions; "row" refers to all data in the first dimension of
/// the shape. Template parameters \p ScaleT and \p OffsetT represent the types
/// to use for the scales and offsets, respectively. Template parameter \p QP
/// represents the quantization precision, typically int8_t or uint8_t.
522 | template <typename ScaleT, typename OffsetT, typename QP> |
523 | void tensorRowwiseQuantization(const Tensor &input, Tensor &output, |
524 | Tensor &scales, Tensor &offsets, |
525 | quantization::Schema schema) { |
526 | constexpr bool offsetIsFP = std::is_same<float, OffsetT>::value || |
527 | std::is_same<float16_t, OffsetT>::value; |
528 | constexpr bool offsetIsInt32 = std::is_same<int32_t, OffsetT>::value; |
529 | static_assert((offsetIsInt32 && std::is_same<float, ScaleT>::value) || |
530 | (offsetIsFP && std::is_same<ScaleT, OffsetT>::value), |
531 | "Invalid combination of Scale/Offset types." ); |
532 | |
533 | const auto fDims = flattenCdr(input.dims()); |
534 | Tensor finalIn = input.getUnowned({fDims.first, fDims.second}); |
535 | Tensor finalOut = output.getUnowned({fDims.first, fDims.second}); |
536 | ShapeHW idim(finalIn.dims()); |
537 | |
538 | auto srcH = finalIn.getHandle<float>(); |
539 | auto destH = finalOut.getHandle<QP>(); |
540 | auto scalesH = scales.getHandle<ScaleT>(); |
541 | auto offsetsH = offsets.getHandle<OffsetT>(); |
542 | for (dim_t i = 0; i < idim.height; i++) { |
543 | auto slice = srcH.extractSlice(i); |
544 | auto rSrc = slice.getHandle<float>(); |
545 | auto res = rSrc.minMaxArg(); |
546 | float min = rSrc.raw(res.first); |
547 | float max = rSrc.raw(res.second); |
548 | |
549 | // Handle rowwise quantization for FCs. |
550 | if (offsetIsInt32) { |
551 | TensorQuantizationParams qParams = |
552 | chooseQuantizationParams({min, max}, schema); |
553 | for (dim_t j = 0; j < idim.width; j++) { |
        destH.at({i, j}) = quantization::quantize<QP>(srcH.at({i, j}), qParams);
555 | } |
556 | scalesH.raw(i) = qParams.scale; |
557 | offsetsH.raw(i) = qParams.offset; |
558 | } else if (offsetIsFP) { |
      // Handle rowwise quantization for rowwise-quantized SLS.
560 | constexpr float kEqualityThreshold = 1e-10f; |
561 | const float scale = ((max - min) < kEqualityThreshold) |
562 | ? 1.0 |
563 | : ((double)max - (double)min) / 255.0; |
564 | float offset = min; |
565 | |
566 | for (dim_t j = 0; j < idim.width; j++) { |
567 | destH.at({i, j}) = quantization::quantizeWithFloatOffset<QP>( |
568 | srcH.at({i, j}), scale, offset); |
569 | } |
570 | scalesH.raw(i) = static_cast<ScaleT>(scale); |
571 | offsetsH.raw(i) = static_cast<OffsetT>(offset); |
572 | } else { |
573 | llvm_unreachable("Unsupported offset type." ); |
574 | } |
575 | } |
576 | } |
577 | |
/// Fused-rowwise quantize the tensor \p input. Scales and offsets are generated
/// from each row of \p input. This function supports 8-bit quantization (i.e.
/// each quantized datum uses 8 bits) and 4-bit quantization (i.e. each
/// quantized datum uses 4 bits).
/// For 8-bit quantization, \p output is a tensor of the same shape as input but
/// with extra columns for storing the fused scale and offset. Template
/// parameter \p T represents the datatype used for storing the scale and
/// offset in each row:
/// | ....   int8 data   ...   |   scale   |  offset   |
/// | num_of_input_columns * 1B| sizeof(T) | sizeof(T) |
/// For 4-bit quantization, in \p output, 1 byte contains 2 quantized values.
/// Template parameter \p T here can be either float or float16_t.
/// | ....   int4 data   ...     |   scale   |  offset   |
/// | num_of_input_columns * 0.5B| sizeof(T) | sizeof(T) |
/// \pre input.dims().size() == 2
/// \pre output.dims().size() == 2
/// For 8-bit quantization:
/// \pre input.dims()[1] + 2 * sizeof(T) == output.dims()[1]
/// For 4-bit quantization:
/// \pre input.dims()[1] % 2 == 0
/// \pre input.dims()[1] / 2 + 2 * sizeof(T) == output.dims()[1]
598 | template <typename T> |
599 | void tensorFusedRowwiseQuantization(const Tensor &input, Tensor &output) { |
600 | // We are fusing the scale and offset onto the end of each row. Thus input and |
601 | // output must both be 2 dimensional, with output having 2*sizeof(T) extra |
602 | // columns for the scale and offset. |
603 | auto outputType = output.getElementType(); |
  assert(input.dims().size() == 2 && output.dims().size() == 2 &&
         "Input and output must be 2 dimensional.");
  if (outputType == ElemKind::UInt8FusedFP16QTy ||
      outputType == ElemKind::UInt8FusedQTy) {
    assert(input.dims()[1] + 2 * sizeof(T) == output.dims()[1] &&
           "Output must have 2*sizeof(T) more columns than input for 8-bit "
           "quantization.");
  } else if (outputType == ElemKind::UInt4FusedFP16QTy ||
             outputType == ElemKind::UInt4FusedQTy) {
    assert(input.dims()[1] % 2 == 0 &&
           "4-bit fused quantization requires the number of input columns "
           "to be a multiple of 2.");
    assert(input.dims()[1] / 2 + 2 * sizeof(T) == output.dims()[1] &&
           "Output must have 2*sizeof(T) more columns than half of the input "
           "columns for 4-bit quantization.");
  }
622 | |
623 | auto srcH = input.getHandle<float>(); |
624 | auto destH = output.getHandle<uint8_t>(); |
625 | for (dim_t i = 0, e = input.dims()[0]; i < e; i++) { |
626 | auto slice = srcH.extractSlice(i); |
627 | auto rSrc = slice.getHandle<float>(); |
628 | auto res = rSrc.minMaxArg(); |
629 | float min = rSrc.raw(res.first); |
630 | float max = rSrc.raw(res.second); |
631 | |
632 | float range; |
633 | switch (outputType) { |
634 | case ElemKind::UInt8FusedQTy: |
635 | case ElemKind::UInt8FusedFP16QTy: |
636 | range = 255.0; |
637 | break; |
638 | case ElemKind::UInt4FusedFP16QTy: |
639 | case ElemKind::UInt4FusedQTy: |
640 | range = 15.0; |
641 | break; |
642 | default: |
643 | llvm_unreachable("Not yet supported" ); |
644 | } |
645 | |
646 | // This matches the Caffe2 implementation for FloatToRowwiseQuantized8BitsOp |
647 | // found in operators/lengths_reducer_rowwise_8bit_ops.h. |
648 | constexpr float kEqualityThreshold = 1e-10f; |
649 | const float scale = ((max - min) < kEqualityThreshold) |
650 | ? 1.0 |
651 | : ((double)max - (double)min) / range; |
652 | const float offset = min; |
653 | |
654 | for (dim_t j = 0, f = input.dims()[1]; j < f; j++) { |
655 | if (outputType == ElemKind::UInt8FusedFP16QTy || |
656 | outputType == ElemKind::UInt8FusedQTy) { |
657 | destH.at({i, j}) = quantization::quantizeWithFloatOffset<uint8_t>( |
658 | srcH.at({i, j}), scale, offset); |
659 | } else if (outputType == ElemKind::UInt4FusedFP16QTy || |
660 | outputType == ElemKind::UInt4FusedQTy) { |
661 | uint8_t quantized = quantization::quantize4BitsWithFloatOffset( |
662 | srcH.at({i, j}), scale, offset); |
663 | if (j % 2 == 0) { |
664 | // Even columns use LSB 4-bit. |
665 | destH.at({i, j / 2}) = quantized; |
666 | } else { |
667 | // Odd columns use MSB 4-bit. |
668 | destH.at({i, j / 2}) |= quantized << 4; |
669 | } |
670 | } else { |
671 | llvm_unreachable("Not yet supported" ); |
672 | } |
673 | } |
674 | |
675 | // Now set the scale/offset at the end of each row. |
676 | destH.setFusedScaleOffsetInRow<T>(i, scale, offset); |
677 | } |
678 | } |
679 | |
680 | /// Generic function to compute the quantization parameters for an input |
681 | /// floating-point tensor \p tensor with given schema \p qSchema and type |
682 | /// \p qTy. A separate set of quantization parameters (scale, offset) will |
683 | /// be computed for each group of \p qStep indices along the \p qDim dimension. |
684 | /// This allows quantizing a given tensor with finer granularity (e.g. rowwise |
685 | /// or channelwise). |
686 | /// For example, for a tensor of size [4, 6, 8, 10], qDim = 1 and qStep = 3: |
687 | /// -> one set of quantization parameters will be computed for [:,0:2,:,:]. |
688 | /// -> one set of quantization parameters will be computed for [:,3:5,:,:]. |
689 | /// The number of sets of computed quantization parameters (scale, offset) is |
690 | /// tensor.dims()[qDim] / qStep. \returns the set of quantization parameters. |
691 | std::vector<TensorQuantizationParams> |
692 | getTensorQuantizationParams(const Tensor &tensor, Schema qSchema = Asymmetric, |
693 | ElemKind qTy = ElemKind::Int8QTy, dim_t qDim = 0, |
694 | dim_t qStep = 1); |
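
/// Example (hypothetical channelwise sketch; "filter" stands for a float
/// [8, 3, 3, 3] convolution filter tensor):
/// \code
///   std::vector<TensorQuantizationParams> TQPs = getTensorQuantizationParams(
///       filter, Schema::Symmetric, ElemKind::Int8QTy, /* qDim */ 0,
///       /* qStep */ 1);
///   // TQPs.size() == 8: one (scale, offset) pair per output channel.
/// \endcode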
695 | |
/// Similar function to the one above with the difference that the quantization
/// parameters scales and offsets are written into separate tensors \p scales
/// and \p offsets which are assumed to be allocated with the correct type and
/// size.
699 | void getTensorQuantizationParams(const Tensor &tensor, Tensor &scales, |
700 | Tensor &offsets, Schema qSchema = Asymmetric, |
701 | ElemKind qTy = ElemKind::Int8QTy, |
702 | dim_t qDim = 0, dim_t qStep = 1); |
703 | |
704 | /// Generic function to quantize a given input floating-point tensor \p tensor |
705 | /// with given tensor quantization parameters \p TQP and type \p qTy. A separate |
706 | /// set of quantization parameters (scale, offset) is provided for each group |
707 | /// of \p qStep indices along the \p qDim dimension and can be obtained using |
708 | /// the function \ref getTensorQuantizationParams. This allows quantizing a |
709 | /// given tensor with finer granularity (e.g. rowwise or channelwise). |
710 | /// For example, for a tensor of size [4, 6, 8, 10], qDim = 1 and qStep = 3: |
711 | /// -> one set of quantization parameters will be provided for [:,0:2,:,:]. |
712 | /// -> one set of quantization parameters will be provided for [:,3:5,:,:]. |
713 | /// The number of sets of provided quantization parameters (scale, offset) is |
714 | /// tensor.dims()[qDim] / qStep. \returns the quantized tensor. |
715 | Tensor quantizeTensor(const Tensor &tensor, |
716 | llvm::ArrayRef<TensorQuantizationParams> TQP, |
717 | ElemKind qTy = ElemKind::Int8QTy, dim_t qDim = 0, |
718 | dim_t qStep = 1); |
719 | |
720 | /// Similar function to the one above with the difference that the quantization |
721 | /// parameters scales and offsets are loaded from separate tensors \p scales |
722 | /// and \p offsets. |
723 | Tensor quantizeTensor(const Tensor &tensor, const Tensor &scales, |
724 | const Tensor &offsets, ElemKind qTy = ElemKind::Int8QTy, |
725 | dim_t qDim = 0, dim_t qStep = 1); |
726 | |
/// \returns true if the float \p val is an exact power of 2 (mantissa is
/// exactly 1.0).
bool isFloatPowerOf2(float val);

/// \returns the base-2 exponent E of the float \p val, i.e. E such that
/// val = 2^E when \p val is a power of 2.
int getFloat2Exp(float val);
732 | |
733 | } // namespace quantization |
734 | } // namespace glow |
735 | |
736 | #endif // GLOW_QUANTIZATION_BASE_BASE_H |
737 | |