#include <ATen/ArrayRef.h>
#include <ATen/ATen.h>
#include <ATen/ceil_div.h>
#include <ATen/core/Tensor.h>
#include <ATen/detail/CUDAHooksInterface.h>
#include <ATen/Dispatch.h>
#include <ATen/native/quantized/AffineQuantizer.h>
#include <ATen/native/TensorFactories.h>
#include <ATen/NativeFunctions.h>
#include <ATen/Parallel.h>
#include <ATen/quantized/QTensorImpl.h>
#include <ATen/quantized/Quantizer.h>
#include <c10/core/CPUAllocator.h>
#include <c10/util/accumulate.h>

#include <cmath>
#include <typeinfo>
#include <utility>

namespace at {

namespace {

void checkPerChannelParamDims(const Tensor& scales, const Tensor& zero_points) {
  TORCH_CHECK(scales.dim() == 1, "scale tensor must have dimension 1");
  TORCH_CHECK(
      zero_points.dim() == 1, "zero_points tensor must have dimension 1");
  TORCH_CHECK(
      scales.numel() == zero_points.numel(),
      "number of elements in scales and zero_points must match");
}

} // anonymous namespace

// Note: this is not a native function as Quantizer is not exposed to Python yet
QuantizerPtr TensorBase::quantizer() const {
  // This is a terrible hack to emulate what VariableType is doing
  at::AutoDispatchBelowAutograd mode;
  return get_qtensorimpl(*this)->quantizer();
}

QuantizerPtr make_per_tensor_affine_quantizer(
    double scale,
    int64_t zero_point,
    ScalarType scalar_type) {
  return c10::make_intrusive<PerTensorAffineQuantizer>(scalar_type,
      scale, zero_point);
}
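// The quantizer built above encodes the usual affine mapping:
//   q = clamp(round(x / scale) + zero_point, qmin, qmax)
//   x ~= (q - zero_point) * scale
// Illustrative only: make_per_tensor_affine_quantizer(0.1, 10, kQUInt8)
// maps 0.5f to the quantized value round(0.5 / 0.1) + 10 == 15.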

QuantizerPtr make_per_channel_affine_quantizer(
    const Tensor& scales,
    const Tensor& zero_points,
    int64_t axis,
    ScalarType scalar_type) {
  checkPerChannelParamDims(scales, zero_points);
  TORCH_CHECK(
      isFloatingType(scales.scalar_type()),
      "scale tensor must be floating point");

  if (isFloatingType(zero_points.scalar_type())) {
    Tensor scales_float = scales.to(kFloat).contiguous();
    Tensor zero_points_float = zero_points.to(kFloat).contiguous();
    return c10::make_intrusive<PerChannelAffineFloatQParamsQuantizer>(scalar_type,
        scales_float,
        zero_points_float,
        axis);
  } else {
    Tensor scales_double = scales.to(kDouble).contiguous();
    Tensor zero_points_int64 = zero_points.to(kLong).contiguous();
    return c10::make_intrusive<PerChannelAffineQuantizer>(scalar_type,
        scales_double,
        zero_points_int64,
        axis);
  }
}
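// The dtype of zero_points selects the quantizer variant above: floating-point
// zero points get PerChannelAffineFloatQParamsQuantizer, integral ones get
// PerChannelAffineQuantizer. A minimal sketch (values are illustrative):
//   auto scales = at::full({2}, 0.1);        // default dtype -> float scales
//   auto zps = at::zeros({2}, at::kLong);    // integral zero points
//   auto q = make_per_channel_affine_quantizer(scales, zps, /*axis=*/0,
//                                              kQUInt8);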

QTensorImpl* get_qtensorimpl(const TensorBase& self) {
  TORCH_CHECK(
      !self.requires_grad(),
      "quantized tensors do not support autograd");
  TORCH_INTERNAL_ASSERT(self.is_quantized(), "get_qtensorimpl: not a quantized tensor");
  return static_cast<QTensorImpl*>(self.unsafeGetTensorImpl());
}

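// Worked example (a sketch; sub-byte qtypes report a 1-byte itemsize): for a
// QUInt4x2 tensor of sizes {2, 5}, element_per_byte == 2, bytes_per_row == 5,
// and each row is padded up to ceil_div(5, 2) == 3 bytes, so the function
// below returns 2 * 3 == 6 bytes.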
int64_t get_sub_byte_tensor_size(IntArrayRef sizes, size_t dtype_itemsize, at::ScalarType t) {
  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
  int64_t element_per_byte;
  switch(t) {
    case at::ScalarType::QUInt4x2:
      element_per_byte = 2;
      break;
    case at::ScalarType::QUInt2x4:
      element_per_byte = 4;
      break;
    default:
      element_per_byte = 1;
  }
  // Zero-dim tensor: multiply_integers over empty sizes is 1, so this is just
  // the itemsize.
  if (sizes.empty()) {
    return c10::multiply_integers(sizes) * dtype_itemsize;
  }
  // Treat the innermost dim as columns.
  int64_t cols = sizes.at(sizes.size() - 1);
  int64_t bytes_per_row = cols * dtype_itemsize;
  // Byte-align the innermost dim of the qtensor: each row takes
  // ceil(bytes_per_row / element_per_byte) bytes.
  return c10::multiply_integers(IntArrayRef(sizes.data(), sizes.size() - 1)) * at::ceil_div(bytes_per_row, element_per_byte);
}

inline Tensor new_qtensor(
    IntArrayRef sizes,
    const TensorOptions& options,
    QuantizerPtr quantizer) {
  auto memory_format = options.memory_format_opt().value_or(MemoryFormat::Contiguous);
  auto device = options.device();
  at::Allocator* allocator = nullptr;
  // TODO: why isn't this just using GetAllocator
  if (device.is_cuda()) {
    allocator = at::detail::getCUDAHooks().getCUDADeviceAllocator();
  } else if (device.is_cpu()) {
    allocator = at::getCPUAllocator();
  } else if (device.is_meta()) {
    allocator = GetAllocator(kMeta);
  } else {
    TORCH_INTERNAL_ASSERT(0, "unrecognized device for new_qtensor: ", device);
  }

#ifdef USE_PYTORCH_QNNPACK
  if (at::globalContext().qEngine() == at::QEngine::QNNPACK) {
    TORCH_CHECK(!device.is_cuda(), "It looks like you are trying to quantize a CUDA tensor ",
        "while the QNNPACK backend is enabled. Although not expected to happen in ",
        "practice, you might have done it for testing purposes. ",
        "Please either change the quantization engine or move the tensor to a CPU.");
    allocator = c10::GetDefaultMobileCPUAllocator();
  }
#endif

  at::DispatchKey tensorDispatchKey = options.computeDispatchKey();
  native::check_size_nonnegative(sizes);
  auto dtype = options.dtype();
  TORCH_CHECK(
      isQIntType(typeMetaToScalarType(dtype)),
      "ScalarType ",
      typeMetaToScalarType(dtype),
      " is not supported in new_qtensor.");
  auto scalar_type = typeMetaToScalarType(dtype);
  int64_t size_bytes = get_sub_byte_tensor_size(sizes, dtype.itemsize(), scalar_type);

  auto storage = c10::make_intrusive<StorageImpl>(
      StorageImpl::use_byte_size_t(),
      size_bytes,
      allocator->allocate(size_bytes),
      allocator,
      /*resizable=*/true);
  auto tensor = detail::make_tensor<QTensorImpl>(
      storage, at::DispatchKeySet(tensorDispatchKey), dtype, quantizer);
  get_qtensorimpl(tensor)->set_sizes_contiguous(sizes);
  get_qtensorimpl(tensor)->empty_tensor_restride(memory_format);
  return tensor;
}
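// The quantize() overrides below are the main callers of new_qtensor: they
// pass the real tensor's sizes plus a quantized dtype, e.g. (illustrative
// only)
//   new_qtensor({2, 3}, at::device(at::kCPU).dtype(at::kQUInt8),
//               make_per_tensor_affine_quantizer(0.1, 0, kQUInt8));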

Tensor PerTensorAffineQuantizer::quantize(const Tensor& rtensor) {
  TORCH_CHECK(
      rtensor.scalar_type() == kFloat,
      "Quantize only works on Float Tensor, got ", rtensor.scalar_type());
  // Here we need a c10::intrusive_ptr<Quantizer>... but actually "this" is the
  // quantizer that can be reused, so I'm using intrusive_from_this here
  Tensor qtensor = new_qtensor(
      rtensor.sizes(),
      rtensor.options()
          .dtype(scalar_type_)
          .memory_format(rtensor.suggest_memory_format()),
      intrusive_from_this());

  auto rtensor_contig = rtensor.expect_contiguous(rtensor.suggest_memory_format());
  native::quantize_tensor_per_tensor_affine(
      *rtensor_contig, qtensor, scale_, zero_point_);
  return qtensor;
}

void per_tensor_affine_dequantize_impl(
    Tensor& rtensor,
    const Tensor& qtensor,
    const double scale,
    const int64_t zero_point) {
  const auto qtensor_contig =
      qtensor.expect_contiguous(qtensor.suggest_memory_format());
  native::dequantize_tensor_per_tensor_affine(
      *qtensor_contig, rtensor, scale, zero_point);
}

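// Dequantization applies the inverse affine map, x = (q - zero_point) * scale,
// elementwise over the quantized tensor.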
Tensor& PerTensorAffineQuantizer::dequantize_out(
    Tensor& rtensor, const Tensor& qtensor) {
  rtensor.resize_(qtensor.sizes());
  TORCH_CHECK(
      rtensor.is_contiguous(qtensor.suggest_memory_format()) &&
      rtensor.scalar_type() == kFloat,
      "Dequantize out should be a contiguous Float Tensor; instead got type ",
      rtensor.scalar_type(),
      ", and is_contiguous ",
      rtensor.is_contiguous(qtensor.suggest_memory_format()));
  per_tensor_affine_dequantize_impl(rtensor, qtensor, scale_, zero_point_);
  return rtensor;
}

Tensor PerTensorAffineQuantizer::dequantize(const Tensor& qtensor) {
  Tensor rtensor = at::empty(
      qtensor.sizes(),
      qtensor.options()
          .dtype(at::kFloat)
          .memory_format(qtensor.suggest_memory_format()));
  per_tensor_affine_dequantize_impl(rtensor, qtensor, scale_, zero_point_);
  return rtensor;
}

Tensor PerChannelAffineQuantizer::quantize(const Tensor& rtensor) {
  // Here we need a c10::intrusive_ptr<Quantizer>... but actually "this" is the
  // quantizer that can be reused, so I'm using intrusive_from_this here
  Tensor qtensor = new_qtensor(
      rtensor.sizes(),
      rtensor.options()
          .dtype(scalar_type_)
          .memory_format(rtensor.suggest_memory_format()),
      intrusive_from_this());
  auto rtensor_contig = rtensor.expect_contiguous(rtensor.suggest_memory_format());
  native::quantize_tensor_per_channel_affine(
      *rtensor_contig, qtensor, scales_, zero_points_, axis_);
  return qtensor;
}

void per_channel_affine_dequantize_impl(
    Tensor& rtensor,
    const Tensor& qtensor,
    const Tensor& scale,
    const Tensor& zero_point,
    const int64_t axis) {
  const auto qtensor_contig =
      qtensor.expect_contiguous(qtensor.suggest_memory_format());
  native::dequantize_tensor_per_channel_affine(
      *qtensor_contig, rtensor, scale, zero_point, axis);
}

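// Per-channel dequantize uses one (scale, zero_point) pair per slice along
// axis: x[..., c, ...] = (q[..., c, ...] - zero_point[c]) * scale[c].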
Tensor PerChannelAffineQuantizer::dequantize(const Tensor& qtensor) {
  Tensor rtensor = at::empty(
      qtensor.sizes(),
      qtensor.options()
          .dtype(at::kFloat)
          .memory_format(qtensor.suggest_memory_format()));
  per_channel_affine_dequantize_impl(rtensor, qtensor, scales_, zero_points_, axis_);
  return rtensor;
}

Tensor& PerChannelAffineQuantizer::dequantize_out(
    Tensor& rtensor, const Tensor& qtensor) {
  rtensor.resize_(qtensor.sizes());
  TORCH_CHECK(
      rtensor.is_contiguous(qtensor.suggest_memory_format()) &&
      rtensor.scalar_type() == kFloat,
      "Dequantize out should be a contiguous Float Tensor; instead got type ",
      rtensor.scalar_type(),
      ", and is_contiguous ",
      rtensor.is_contiguous(qtensor.suggest_memory_format()));
  per_channel_affine_dequantize_impl(rtensor, qtensor, scales_, zero_points_, axis_);
  return rtensor;
}

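// The float-qparams variant below keeps zero_point in floating point (this
// scheme backs quantized embeddings); the mapping is roughly
// x ~= q * scale + zero_point, rather than the integer-zero-point form above.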
Tensor PerChannelAffineFloatQParamsQuantizer::quantize(const Tensor& rtensor) {
  TORCH_CHECK(
      rtensor.scalar_type() == kFloat,
      "Quantize only works on Float Tensor, got ", rtensor.scalar_type());
  Tensor qtensor = new_qtensor(
      rtensor.sizes(),
      rtensor.options().dtype(scalar_type_),
      intrusive_from_this());
  auto rtensor_contig = rtensor.expect_contiguous();
  native::quantize_tensor_per_channel_float_qparams(
      *rtensor_contig, qtensor, scales_, zero_points_, axis_);
  return qtensor;
}

void per_channel_affine_float_q_params_dequantize_impl(
    Tensor& rtensor,
    const Tensor& qtensor,
    const Tensor& scale,
    const Tensor& zero_point,
    const int64_t axis) {
  const auto qtensor_contig =
      qtensor.expect_contiguous(qtensor.suggest_memory_format());
  native::dequantize_tensor_per_channel_float_qparams(
      *qtensor_contig, rtensor, scale, zero_point, axis);
}

Tensor PerChannelAffineFloatQParamsQuantizer::dequantize(const Tensor& qtensor) {
  Tensor rtensor = at::empty(qtensor.sizes(), qtensor.options().dtype(at::kFloat));
  per_channel_affine_float_q_params_dequantize_impl(
      rtensor, qtensor, scales_, zero_points_, axis_);
  return rtensor;
}

Tensor& PerChannelAffineFloatQParamsQuantizer::dequantize_out(
    Tensor& rtensor, const Tensor& qtensor) {
  rtensor.resize_(qtensor.sizes());
  TORCH_CHECK(
      rtensor.is_contiguous(qtensor.suggest_memory_format()) &&
      rtensor.scalar_type() == kFloat,
      "Dequantize out should be a contiguous Float Tensor; instead got type ",
      rtensor.scalar_type(),
      ", and is_contiguous ",
      rtensor.is_contiguous(qtensor.suggest_memory_format()));
  per_channel_affine_float_q_params_dequantize_impl(
      rtensor, qtensor, scales_, zero_points_, axis_);
  return rtensor;
}

Quantizer::~Quantizer() = default;

C10_EXPORT void set_quantizer_(const Tensor& self, ConstQuantizerPtr quantizer) {
  get_qtensorimpl(self)->set_quantizer_(quantizer);
}

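// Usage sketch (illustrative only): wrap an externally owned buffer as a
// quantized tensor without copying; the deleter runs when the storage dies.
//   std::vector<uint8_t> buf(4 * 4);
//   Tensor q = from_blob_quantized_per_tensor_affine(
//       buf.data(), /*sizes=*/{4, 4}, /*strides=*/{4, 1}, [](void*) {},
//       /*scale=*/0.1f, /*zeroPoint=*/0,
//       at::device(at::kCPU).dtype(at::kQUInt8));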
Tensor from_blob_quantized_per_tensor_affine(
    void* data,
    IntArrayRef sizes,
    IntArrayRef strides,
    std::function<void(void*)> deleter,
    const float scale,
    const int64_t zeroPoint,
    const TensorOptions& options) {
  auto dtype = typeMetaToScalarType(options.dtype());
  TORCH_CHECK(
      isQIntType(dtype),
      "from_blob_quantized_per_tensor_affine expects QInt dtypes, got ", dtype);

  const std::size_t itemsize = options.dtype().itemsize();
  std::size_t size = 1;
  for (std::int64_t s : sizes) {
    size *= static_cast<std::size_t>(s);
  }
  const std::size_t datasize = size * itemsize;

  DataPtr data_ptr = InefficientStdFunctionContext::makeDataPtr(
      data, deleter, options.device());

  Storage storage{Storage::use_byte_size_t{}, datasize, std::move(data_ptr)};

  QuantizerPtr quantizer =
      make_per_tensor_affine_quantizer(scale, zeroPoint, dtype);

  Tensor qtensor = at::detail::make_tensor<QTensorImpl>(
      std::move(storage),
      at::DispatchKeySet(options.computeDispatchKey()),
      options.dtype(),
      quantizer);
  get_qtensorimpl(qtensor)->set_sizes_and_strides(sizes, strides);
  return qtensor;
}

Tensor from_blob_quantized_per_tensor_affine(
    void* data,
    IntArrayRef sizes,
    std::function<void(void*)> deleter,
    const float scale,
    const int64_t zeroPoint,
    const TensorOptions& options) {
  std::vector<int64_t> strides;
  const auto ndim = sizes.size();
  if (ndim > 0) {
    strides.resize(ndim);
    // NOLINTNEXTLINE
    int32_t i = ndim - 1;
    // NOLINTNEXTLINE
    strides[i] = 1;
    while (--i >= 0) {
      strides[i] = sizes[i + 1] * strides[i + 1];
    }
  }
  return from_blob_quantized_per_tensor_affine(
      data,
      sizes,
      strides,
      std::move(deleter),
      scale,
      zeroPoint,
      options);
}
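// The overload above computes contiguous (row-major) strides; for example,
// sizes {2, 3, 4} yield strides {12, 4, 1}.

// Usage sketch (illustrative only): wrap an externally owned, contiguous
// buffer of per-channel quantized weights; scales and zero_points must each
// hold sizes[axis] elements.
//   std::vector<int8_t> buf(6 * 4);
//   Tensor q = from_blob_quantized_per_channel_affine(
//       buf.data(), /*sizes=*/{6, 4}, [](void*) {},
//       at::full({6}, 0.1, at::kDouble), at::zeros({6}, at::kLong),
//       /*axis=*/0, at::device(at::kCPU).dtype(at::kQInt8));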
Tensor from_blob_quantized_per_channel_affine(
    void* data,
    IntArrayRef sizes,
    std::function<void(void*)> deleter,
    const Tensor& scales,
    const Tensor& zero_points,
    const int64_t axis,
    const TensorOptions& options) {
  checkPerChannelParamDims(scales, zero_points);
  int64_t channel = sizes[axis];
  TORCH_CHECK(
      channel == int64_t(scales.numel()),
      "number of scales must equal the number of channels; expected ", channel, ", got ", scales.numel());
  TORCH_CHECK(
      channel == int64_t(zero_points.numel()),
      "number of zero_points must equal the number of channels; expected ", channel, ", got ", zero_points.numel());

  auto dtype = typeMetaToScalarType(options.dtype());
  TORCH_CHECK(
      isQIntType(dtype),
      "from_blob_quantized_per_channel_affine expects QInt dtypes, got ", dtype);

  const std::size_t itemsize = options.dtype().itemsize();
  std::size_t size = 1;
  for (std::int64_t s : sizes) {
    size *= static_cast<std::size_t>(s);
  }
  const std::size_t datasize = size * itemsize;

  DataPtr data_ptr = InefficientStdFunctionContext::makeDataPtr(
      data, deleter, options.device());

  Storage storage{Storage::use_byte_size_t{}, datasize, std::move(data_ptr)};

  QuantizerPtr quantizer =
      make_per_channel_affine_quantizer(scales, zero_points, axis, dtype);

  Tensor qtensor = at::detail::make_tensor<QTensorImpl>(
      std::move(storage),
      at::DispatchKeySet(options.computeDispatchKey()),
      options.dtype(),
      quantizer);
  get_qtensorimpl(qtensor)->set_sizes_contiguous(sizes);

  return qtensor;
}

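// UnknownQuantizer acts as a sentinel: every operation below asserts, so a
// tensor carrying it must be given a real quantizer (see set_quantizer_
// above) before it is used.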
Tensor UnknownQuantizer::quantize(const Tensor& tensor) {
  TORCH_INTERNAL_ASSERT(false, "cannot call quantize on UnknownQuantizer");
}
Tensor UnknownQuantizer::dequantize(const Tensor& qtensor) {
  TORCH_INTERNAL_ASSERT(false, "cannot call dequantize on UnknownQuantizer");
}
Tensor& UnknownQuantizer::dequantize_out(Tensor& rtensor, const Tensor& qtensor) {
  TORCH_INTERNAL_ASSERT(false, "cannot call dequantize_out on UnknownQuantizer");
}
QScheme UnknownQuantizer::qscheme() const {
  TORCH_INTERNAL_ASSERT(false, "cannot call qscheme on UnknownQuantizer");
}
bool UnknownQuantizer::equalTo(QuantizerPtr other) const {
  TORCH_INTERNAL_ASSERT(false, "cannot call equalTo on UnknownQuantizer");
}
QuantizerPtr make_unknown_quantizer(ScalarType scalar_type) {
  return c10::make_intrusive<UnknownQuantizer>(scalar_type);
}

} // namespace at