1#pragma once
2
3#include <c10/core/QScheme.h>
4#include <c10/core/MemoryFormat.h>
5#include <c10/macros/Macros.h>
6#include <c10/util/Exception.h>
7#include <c10/util/intrusive_ptr.h>
8#include <c10/core/ScalarType.h>
9#include <c10/core/TensorOptions.h>
10
11#include <ATen/Tensor.h>
12#include <ATen/TensorUtils.h>
13
14#include <ATen/core/QuantizerBase.h>
15
16#include <cmath>
17#include <memory>
18#include <utility>
19
20namespace at {
21
22/**
23 * UnknownQuantizer is a placeholder quantizer for functions that implement
24 * quantization in a two step process. First a tensor is allocated but with
25 * unknown quantizer, and then the quantization kernel decides what the final
26 * quantizer will be.
27 */
struct TORCH_API UnknownQuantizer : public Quantizer {
  explicit UnknownQuantizer(ScalarType scalar_type)
    : Quantizer(scalar_type) {}

  // All overrides are defined out-of-line: a tensor holding an
  // UnknownQuantizer has not been assigned its real quantizer yet, so these
  // implementations are expected to act as placeholders (see the class
  // comment above).
  Tensor quantize(const Tensor& tensor) override;
  Tensor dequantize(const Tensor& qtensor) override;
  Tensor& dequantize_out(Tensor& rtensor, const Tensor& qtensor) override;
  QScheme qscheme() const override;
  bool equalTo(QuantizerPtr other) const override;
};
38
/**
 * UniformQuantizer is the parent class for all uniform quantizers.
 * These quantization schemes map float values uniformly to
 * the quantized values. For example, the affine quantizer is
 * the most commonly used scheme in this category.
 */
struct TORCH_API UniformQuantizer : public Quantizer {
  // Adds no state of its own; only forwards the quantized scalar type to
  // Quantizer. Exists purely as a classification point in the hierarchy.
  explicit UniformQuantizer(ScalarType scalar_type) : Quantizer(scalar_type) {}
};
48
/**
 * NonUniformQuantizer is the parent class for all non-uniform quantizers.
 * These quantization schemes may map float values non-uniformly to the
 * quantized values. K-means quantization is a representative example in this
 * category.
 */
struct TORCH_API NonUniformQuantizer : public Quantizer {
  // Adds no state of its own; only forwards the quantized scalar type to
  // Quantizer. Exists purely as a classification point in the hierarchy.
  explicit NonUniformQuantizer(ScalarType scalar_type) : Quantizer(scalar_type) {}
};
57
58// There is also StochasticQuantizer which is uniform but not affine
59
60/**
61 * AffineQuantizer uses affine transformation to do quantization.
62 *
63 * For quantize:
64 * Y = clamp(round(X / scale + zero_point), min, max)
65 * For dequantize:
66 * X = (Y - zero_point) * scale
67 */
struct TORCH_API AffineQuantizer : public UniformQuantizer {
  // Stateless base for affine schemes; the concrete scale/zero_point
  // parameters live in the per-tensor and per-channel subclasses below.
  explicit AffineQuantizer(ScalarType scalar_type) : UniformQuantizer(scalar_type) {}
};
71
72// Note that we will not have Symmetric Quantizer in backend to reduce
73// complications in quantized kernel implementation.
74
75/**
76 * PerTensorAffineQuantizer stores a scale and a zero_point, which is used for
77 * all the values in the Tensor.
78 */
79struct TORCH_API PerTensorAffineQuantizer : public AffineQuantizer {
80 explicit PerTensorAffineQuantizer(ScalarType scalar_type, double scale, int64_t zero_point)
81 : AffineQuantizer(scalar_type),
82 scale_(scale),
83 zero_point_(zero_point) {}
84
85 Tensor quantize(const Tensor& tensor) override;
86 Tensor dequantize(const Tensor& qtensor) override;
87 Tensor& dequantize_out(Tensor& rtensor, const Tensor& qtensor) override;
88
89 QScheme qscheme() const override {
90 return kPerTensorAffine;
91 }
92
93 double scale() const {
94 return scale_;
95 }
96
97 int64_t zero_point() const {
98 return zero_point_;
99 }
100
101 bool equalTo(QuantizerPtr other) const override {
102 if (!other.get() || other->qscheme() != kPerTensorAffine) {
103 return false;
104 }
105 auto* other_per_tensor_affine =
106 static_cast<PerTensorAffineQuantizer*>(other.get());
107 return scalar_type() == other_per_tensor_affine->scalar_type() &&
108 scale() == other_per_tensor_affine->scale() &&
109 zero_point() == other_per_tensor_affine->zero_point();
110 }
111
112 private:
113 const double scale_;
114 // We use int64_t for consistency with Python
115 const int64_t zero_point_;
116};
117
/**
 * PerChannelAffineQuantizer is the same as PerTensorAffineQuantizer
 * except that we have an independent scale and zero_point parameter
 * for each channel.
 *
 * Also note that per-channel quantization is mostly applied to the output
 * channels of weights, since per-input-channel weight quantization and
 * per-channel quantization of activations can't be efficiently supported
 * on most processors: they would require each multiplication result within
 * a single dot-product to have a different scale.
 */
129struct TORCH_API PerChannelAffineQuantizer : public AffineQuantizer {
130 explicit PerChannelAffineQuantizer(
131 ScalarType scalar_type,
132 Tensor scales,
133 Tensor zero_points,
134 int64_t axis)
135 : AffineQuantizer(scalar_type),
136 scales_(std::move(scales)),
137 zero_points_(std::move(zero_points)),
138 axis_(axis) {}
139
140 QScheme qscheme() const override {
141 return kPerChannelAffine;
142 }
143
144 Tensor scales() const {
145 return scales_;
146 }
147
148 Tensor zero_points() const {
149 return zero_points_;
150 }
151
152 int64_t axis() const {
153 return axis_;
154 }
155
156 Tensor quantize(const Tensor& tensor) override;
157 Tensor dequantize(const Tensor& qtensor) override;
158 Tensor& dequantize_out(Tensor& rtensor, const Tensor& qtensor) override;
159
160 bool equalTo(QuantizerPtr other) const override {
161 if (!other.get() || other->qscheme() != kPerChannelAffine) {
162 return false;
163 }
164 auto* other_per_channel_affine =
165 static_cast<PerChannelAffineQuantizer*>(other.get());
166 return scalar_type() == other_per_channel_affine->scalar_type() &&
167 scales().equal(other_per_channel_affine->scales()) &&
168 zero_points().equal(other_per_channel_affine->zero_points()) &&
169 axis() == other_per_channel_affine->axis();
170 }
171
172 protected:
173 Tensor scales_;
174 Tensor zero_points_;
175 const int64_t axis_;
176};
177
178/**
179 * PerChannelAffineFloatQParamsQuantizer is the same as PerChannelAffineQuantizer
180 * except that it expects both scale and zero point to be floating point values.
181 *
182 * This quantizer uses the kPerChannelAffineFloatQParams qscheme which is a variant of
183 * kPerChannelAffine.
184 *
185 * The quantize equation in this case looks like -
186 * Xq = (Xf - zero_point) * inv_scale, where inv_scale = 1.0/scale
187 *
188 * Note: Usage of floating point zero point is useful in cases where 0 doesn't need to
189 * be exactly represented in the quantized space. We can get additional precision by
190 * using floating point values for zero point.
191 */
192struct TORCH_API PerChannelAffineFloatQParamsQuantizer : public PerChannelAffineQuantizer {
193 explicit PerChannelAffineFloatQParamsQuantizer(
194 ScalarType scalar_type,
195 Tensor scales,
196 Tensor zero_points,
197 int64_t axis)
198 : PerChannelAffineQuantizer(scalar_type,
199 scales,
200 zero_points,
201 axis) {}
202
203 QScheme qscheme() const override {
204 return kPerChannelAffineFloatQParams;
205 }
206
207 Tensor quantize(const Tensor& tensor) override;
208 Tensor dequantize(const Tensor& qtensor) override;
209 Tensor& dequantize_out(Tensor& rtensor, const Tensor& qtensor) override;
210
211 bool equalTo(QuantizerPtr other) const override {
212 if (!other.get() || other->qscheme() != kPerChannelAffineFloatQParams) {
213 return false;
214 }
215 auto* other_per_channel_float_qparams =
216 static_cast<PerChannelAffineFloatQParamsQuantizer*>(other.get());
217 return scalar_type() == other_per_channel_float_qparams->scalar_type() &&
218 scales().equal(other_per_channel_float_qparams->scales()) &&
219 zero_points().equal(other_per_channel_float_qparams->zero_points()) &&
220 axis() == other_per_channel_float_qparams->axis();
221 }
222};
223
// This is an internal utility function for getting at the QTensorImpl.
// You should only use this for writing low-level
// setters/getters for QTensorImpl fields; otherwise, you should use
// the low-level setters/getters that were implemented using this.
// This may be called repeatedly, so make sure it's pretty cheap.
TORCH_API QTensorImpl* get_qtensorimpl(const TensorBase& self);

// double and int64_t are used because of the native function API; we only have
// these argument types right now in native functions.
TORCH_API QuantizerPtr
make_per_tensor_affine_quantizer(
    double scale, int64_t zero_point, ScalarType scalar_type);

// Factory for a per-channel affine quantizer. The exact quantizer subclass
// chosen (plain affine vs. float-qparams) is decided in the implementation —
// see the corresponding .cpp.
TORCH_API QuantizerPtr make_per_channel_affine_quantizer(
    const Tensor& scales,
    const Tensor& zero_points,
    int64_t axis,
    ScalarType scalar_type);

// Factory for the placeholder UnknownQuantizer (see class comment above).
TORCH_API QuantizerPtr make_unknown_quantizer(ScalarType scalar_type);

// Create a Quantized Tensor given arguments for normal Tensor and a quantizer
TORCH_API Tensor new_qtensor(
    IntArrayRef sizes,
    const TensorOptions& options,
    QuantizerPtr quantizer);

// In-place (trailing underscore) setter: attaches `quantizer` to `self`.
TORCH_API void set_quantizer_(const Tensor& self, ConstQuantizerPtr quantizer);

// Wrap caller-owned memory as a per-tensor-affine quantized Tensor with
// explicit strides. NOTE(review): `deleter` presumably runs when the tensor
// releases the buffer, matching the at::from_blob convention — confirm in the
// implementation.
TORCH_API Tensor from_blob_quantized_per_tensor_affine(
    void* data,
    IntArrayRef sizes,
    IntArrayRef strides,
    std::function<void(void*)> deleter,
    const float scale,
    const int64_t zeroPoint,
    const TensorOptions& options);

// Same as above, without explicit strides.
TORCH_API Tensor from_blob_quantized_per_tensor_affine(
    void* data,
    IntArrayRef sizes,
    std::function<void(void*)> deleter,
    const float scale,
    const int64_t zeroPoint,
    const TensorOptions& options);

// Wrap caller-owned memory as a per-channel-affine quantized Tensor, with
// per-channel `scales`/`zero_points` along `axis`.
TORCH_API Tensor from_blob_quantized_per_channel_affine(
    void* data,
    IntArrayRef sizes,
    std::function<void(void*)> deleter,
    const Tensor& scales,
    const Tensor& zero_points,
    const int64_t axis,
    const TensorOptions& options);
278
279} // namespace at
280