1 | #pragma once |
2 | |
3 | #include <c10/core/QScheme.h> |
4 | #include <c10/core/MemoryFormat.h> |
5 | #include <c10/macros/Macros.h> |
6 | #include <c10/util/Exception.h> |
7 | #include <c10/util/intrusive_ptr.h> |
8 | #include <c10/core/ScalarType.h> |
9 | #include <c10/core/TensorOptions.h> |
10 | |
11 | #include <ATen/Tensor.h> |
12 | #include <ATen/TensorUtils.h> |
13 | |
14 | #include <ATen/core/QuantizerBase.h> |
15 | |
16 | #include <cmath> |
17 | #include <memory> |
18 | #include <utility> |
19 | |
20 | namespace at { |
21 | |
22 | /** |
23 | * UnknownQuantizer is a placeholder quantizer for functions that implement |
24 | * quantization in a two step process. First a tensor is allocated but with |
25 | * unknown quantizer, and then the quantization kernel decides what the final |
26 | * quantizer will be. |
27 | */ |
struct TORCH_API UnknownQuantizer : public Quantizer {
  explicit UnknownQuantizer(ScalarType scalar_type)
    : Quantizer(scalar_type) {}

  // All overrides are defined out of line (in the .cpp); as a placeholder,
  // this quantizer is expected to be replaced by the quantization kernel
  // before any of these are meaningfully invoked.
  Tensor quantize(const Tensor& tensor) override;
  Tensor dequantize(const Tensor& qtensor) override;
  Tensor& dequantize_out(Tensor& rtensor, const Tensor& qtensor) override;
  QScheme qscheme() const override;
  bool equalTo(QuantizerPtr other) const override;
};
38 | |
39 | /** |
40 | * UniformQuantizer is the parent class for all uniform quantizers. |
41 | * These quantization scheme will map float value uniformly to |
42 | * the quantized value. For example, affine quantizer is |
43 | * the most commonly used scheme in this category. |
44 | */ |
struct TORCH_API UniformQuantizer : public Quantizer {
  // Tag base class: carries no state beyond the quantized dtype held by
  // Quantizer; concrete uniform schemes (e.g. affine) derive from this.
  explicit UniformQuantizer(ScalarType scalar_type) : Quantizer(scalar_type) {}
};
48 | |
49 | /** |
50 | * NonUniformQuantizer is the parent class for all non-uniform quantizers. |
51 | * These quantization scheme may map float value non-uniformly to the quantized |
52 | * value. K-means quantization is a representative example in this category. |
53 | */ |
struct TORCH_API NonUniformQuantizer : public Quantizer {
  // Tag base class for non-uniform schemes; no state beyond the quantized
  // dtype held by Quantizer. No concrete subclass is visible in this header.
  explicit NonUniformQuantizer(ScalarType scalar_type) : Quantizer(scalar_type) {}
};
57 | |
58 | // There is also StochasticQuantizer which is uniform but not affine |
59 | |
60 | /** |
61 | * AffineQuantizer uses affine transformation to do quantization. |
62 | * |
63 | * For quantize: |
64 | * Y = clamp(round(X / scale + zero_point), min, max) |
65 | * For dequantize: |
66 | * X = (Y - zero_point) * scale |
67 | */ |
struct TORCH_API AffineQuantizer : public UniformQuantizer {
  // Tag base class for affine (scale + zero_point) schemes; the parameters
  // themselves live in the per-tensor / per-channel subclasses below.
  explicit AffineQuantizer(ScalarType scalar_type) : UniformQuantizer(scalar_type) {}
};
71 | |
72 | // Note that we will not have Symmetric Quantizer in backend to reduce |
73 | // complications in quantized kernel implementation. |
74 | |
75 | /** |
76 | * PerTensorAffineQuantizer stores a scale and a zero_point, which is used for |
77 | * all the values in the Tensor. |
78 | */ |
79 | struct TORCH_API PerTensorAffineQuantizer : public AffineQuantizer { |
80 | explicit PerTensorAffineQuantizer(ScalarType scalar_type, double scale, int64_t zero_point) |
81 | : AffineQuantizer(scalar_type), |
82 | scale_(scale), |
83 | zero_point_(zero_point) {} |
84 | |
85 | Tensor quantize(const Tensor& tensor) override; |
86 | Tensor dequantize(const Tensor& qtensor) override; |
87 | Tensor& dequantize_out(Tensor& rtensor, const Tensor& qtensor) override; |
88 | |
89 | QScheme qscheme() const override { |
90 | return kPerTensorAffine; |
91 | } |
92 | |
93 | double scale() const { |
94 | return scale_; |
95 | } |
96 | |
97 | int64_t zero_point() const { |
98 | return zero_point_; |
99 | } |
100 | |
101 | bool equalTo(QuantizerPtr other) const override { |
102 | if (!other.get() || other->qscheme() != kPerTensorAffine) { |
103 | return false; |
104 | } |
105 | auto* other_per_tensor_affine = |
106 | static_cast<PerTensorAffineQuantizer*>(other.get()); |
107 | return scalar_type() == other_per_tensor_affine->scalar_type() && |
108 | scale() == other_per_tensor_affine->scale() && |
109 | zero_point() == other_per_tensor_affine->zero_point(); |
110 | } |
111 | |
112 | private: |
113 | const double scale_; |
114 | // We use int64_t for consistency with Python |
115 | const int64_t zero_point_; |
116 | }; |
117 | |
118 | /** |
119 | * PerChannelAffineQuantizer is the same as PerTensorAffineQuantizer |
120 | * except that we have an independent scale and zero_point parameter |
121 | * for each channel. |
122 | * |
123 | * Also note that per channel quantization is mostly applied to output channels |
124 | * of weights since per-input channel of weight quantization or per-channel |
125 | * quantization for activations can't be efficiently supported in most of |
126 | * processors since it requires each multiplication result within a single |
127 | * dot-product to have a different scale. |
128 | */ |
129 | struct TORCH_API PerChannelAffineQuantizer : public AffineQuantizer { |
130 | explicit PerChannelAffineQuantizer( |
131 | ScalarType scalar_type, |
132 | Tensor scales, |
133 | Tensor zero_points, |
134 | int64_t axis) |
135 | : AffineQuantizer(scalar_type), |
136 | scales_(std::move(scales)), |
137 | zero_points_(std::move(zero_points)), |
138 | axis_(axis) {} |
139 | |
140 | QScheme qscheme() const override { |
141 | return kPerChannelAffine; |
142 | } |
143 | |
144 | Tensor scales() const { |
145 | return scales_; |
146 | } |
147 | |
148 | Tensor zero_points() const { |
149 | return zero_points_; |
150 | } |
151 | |
152 | int64_t axis() const { |
153 | return axis_; |
154 | } |
155 | |
156 | Tensor quantize(const Tensor& tensor) override; |
157 | Tensor dequantize(const Tensor& qtensor) override; |
158 | Tensor& dequantize_out(Tensor& rtensor, const Tensor& qtensor) override; |
159 | |
160 | bool equalTo(QuantizerPtr other) const override { |
161 | if (!other.get() || other->qscheme() != kPerChannelAffine) { |
162 | return false; |
163 | } |
164 | auto* other_per_channel_affine = |
165 | static_cast<PerChannelAffineQuantizer*>(other.get()); |
166 | return scalar_type() == other_per_channel_affine->scalar_type() && |
167 | scales().equal(other_per_channel_affine->scales()) && |
168 | zero_points().equal(other_per_channel_affine->zero_points()) && |
169 | axis() == other_per_channel_affine->axis(); |
170 | } |
171 | |
172 | protected: |
173 | Tensor scales_; |
174 | Tensor zero_points_; |
175 | const int64_t axis_; |
176 | }; |
177 | |
178 | /** |
179 | * PerChannelAffineFloatQParamsQuantizer is the same as PerChannelAffineQuantizer |
180 | * except that it expects both scale and zero point to be floating point values. |
181 | * |
182 | * This quantizer uses the kPerChannelAffineFloatQParams qscheme which is a variant of |
183 | * kPerChannelAffine. |
184 | * |
185 | * The quantize equation in this case looks like - |
186 | * Xq = (Xf - zero_point) * inv_scale, where inv_scale = 1.0/scale |
187 | * |
188 | * Note: Usage of floating point zero point is useful in cases where 0 doesn't need to |
189 | * be exactly represented in the quantized space. We can get additional precision by |
190 | * using floating point values for zero point. |
191 | */ |
192 | struct TORCH_API PerChannelAffineFloatQParamsQuantizer : public PerChannelAffineQuantizer { |
193 | explicit PerChannelAffineFloatQParamsQuantizer( |
194 | ScalarType scalar_type, |
195 | Tensor scales, |
196 | Tensor zero_points, |
197 | int64_t axis) |
198 | : PerChannelAffineQuantizer(scalar_type, |
199 | scales, |
200 | zero_points, |
201 | axis) {} |
202 | |
203 | QScheme qscheme() const override { |
204 | return kPerChannelAffineFloatQParams; |
205 | } |
206 | |
207 | Tensor quantize(const Tensor& tensor) override; |
208 | Tensor dequantize(const Tensor& qtensor) override; |
209 | Tensor& dequantize_out(Tensor& rtensor, const Tensor& qtensor) override; |
210 | |
211 | bool equalTo(QuantizerPtr other) const override { |
212 | if (!other.get() || other->qscheme() != kPerChannelAffineFloatQParams) { |
213 | return false; |
214 | } |
215 | auto* other_per_channel_float_qparams = |
216 | static_cast<PerChannelAffineFloatQParamsQuantizer*>(other.get()); |
217 | return scalar_type() == other_per_channel_float_qparams->scalar_type() && |
218 | scales().equal(other_per_channel_float_qparams->scales()) && |
219 | zero_points().equal(other_per_channel_float_qparams->zero_points()) && |
220 | axis() == other_per_channel_float_qparams->axis(); |
221 | } |
222 | }; |
223 | |
// This is an internal utility function for getting at the QTensorImpl,
// You should only use this for writing low level
// setters/getters for QTensorImpl fields; otherwise, you should use
// the low level setters/getters that were implemented using this.
// This may be called repeatedly, so make sure it's pretty cheap.
TORCH_API QTensorImpl* get_qtensorimpl(const TensorBase& self);

// double and int64_t are because of the native function API, we only have these
// argument types right now in native functions
TORCH_API QuantizerPtr
make_per_tensor_affine_quantizer(
    double scale, int64_t zero_point, ScalarType scalar_type);

// Factory for a PerChannelAffineQuantizer (kPerChannelAffine scheme) with
// one (scale, zero_point) pair per slice along `axis`.
TORCH_API QuantizerPtr make_per_channel_affine_quantizer(
    const Tensor& scales,
    const Tensor& zero_points,
    int64_t axis,
    ScalarType scalar_type);

// Factory for the UnknownQuantizer placeholder (see its class comment above).
TORCH_API QuantizerPtr make_unknown_quantizer(ScalarType scalar_type);

// Create a Quantized Tensor given arguments for normal Tensor and a quantizer
TORCH_API Tensor new_qtensor(
    IntArrayRef sizes,
    const TensorOptions& options,
    QuantizerPtr quantizer);

// Attach `quantizer` to an existing quantized tensor in place.
TORCH_API void set_quantizer_(const Tensor& self, ConstQuantizerPtr quantizer);

// Wrap caller-owned memory as a per-tensor-affine quantized tensor; `deleter`
// is invoked when the tensor releases the buffer.
// NOTE(review): `scale` is a float here while PerTensorAffineQuantizer stores
// a double — callers lose precision at this boundary; confirm intentional.
TORCH_API Tensor from_blob_quantized_per_tensor_affine(
    void* data,
    IntArrayRef sizes,
    IntArrayRef strides,
    std::function<void(void*)> deleter,
    const float scale,
    const int64_t zeroPoint,
    const TensorOptions& options);

// Overload without explicit strides (layout presumably derived from `sizes`;
// defined out of line).
TORCH_API Tensor from_blob_quantized_per_tensor_affine(
    void* data,
    IntArrayRef sizes,
    std::function<void(void*)> deleter,
    const float scale,
    const int64_t zeroPoint,
    const TensorOptions& options);

// Wrap caller-owned memory as a per-channel-affine quantized tensor, with
// parameter tensors varying along `axis`.
TORCH_API Tensor from_blob_quantized_per_channel_affine(
    void* data,
    IntArrayRef sizes,
    std::function<void(void*)> deleter,
    const Tensor& scales,
    const Tensor& zero_points,
    const int64_t axis,
    const TensorOptions& options);
278 | |
279 | } // namespace at |
280 | |