#include <ATen/ArrayRef.h>
#include <ATen/ATen.h>
#include <ATen/ceil_div.h>
#include <ATen/core/Tensor.h>
#include <ATen/detail/CUDAHooksInterface.h>
#include <ATen/Dispatch.h>
#include <ATen/native/quantized/AffineQuantizer.h>
#include <ATen/native/TensorFactories.h>
#include <ATen/NativeFunctions.h>
#include <ATen/Parallel.h>
#include <ATen/quantized/QTensorImpl.h>
#include <ATen/quantized/Quantizer.h>
#include <c10/core/CPUAllocator.h>
#include <c10/util/accumulate.h>

#include <cmath>
#include <typeinfo>
#include <utility>

namespace at {

namespace {

  void checkPerChannelParamDims(const Tensor& scales, const Tensor& zero_points) {
    TORCH_CHECK(scales.dim() == 1, "scale tensor must have dimension 1");
    TORCH_CHECK(
        zero_points.dim() == 1, "zero_points tensor must have dimension 1");
    TORCH_CHECK(
        scales.numel() == zero_points.numel(),
        "number of elements in scales and zero_points must match");
  }

} // anonymous namespace

// Note: this is not a native function as Quantizer is not exposed to python yet
QuantizerPtr TensorBase::quantizer() const {
  // This is a terrible hack to emulate what VariableType is doing
  at::AutoDispatchBelowAutograd mode;
  return get_qtensorimpl(*this)->quantizer();
}

QuantizerPtr make_per_tensor_affine_quantizer(
    double scale,
    int64_t zero_point,
    ScalarType scalar_type) {
  return c10::make_intrusive<PerTensorAffineQuantizer>(scalar_type,
      scale, zero_point);
}
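
// Illustrative usage (a sketch, not part of this file): a per-tensor affine
// quantizer maps real values to a quantized dtype with a single scale and
// zero point, e.g.
//   QuantizerPtr q = make_per_tensor_affine_quantizer(
//       /*scale=*/0.1, /*zero_point=*/10, kQInt8);
//   Tensor qt = q->quantize(rtensor);  // rtensor must be a Float tensor
//   Tensor rt = q->dequantize(qt);     // recovers rtensor up to rounding error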

QuantizerPtr make_per_channel_affine_quantizer(
    const Tensor& scales,
    const Tensor& zero_points,
    int64_t axis,
    ScalarType scalar_type) {
  checkPerChannelParamDims(scales, zero_points);
  TORCH_CHECK(
      isFloatingType(scales.scalar_type()),
      "scale tensor must be floating point");

  if (isFloatingType(zero_points.scalar_type())) {
    Tensor scales_float = scales.to(kFloat).contiguous();
    Tensor zero_points_float = zero_points.to(kFloat).contiguous();
    return c10::make_intrusive<PerChannelAffineFloatQParamsQuantizer>(scalar_type,
                                                                      scales_float,
                                                                      zero_points_float,
                                                                      axis);
  } else {
    Tensor scales_double = scales.to(kDouble).contiguous();
    Tensor zero_points_int64 = zero_points.to(kLong).contiguous();
    return c10::make_intrusive<PerChannelAffineQuantizer>(scalar_type,
                                                          scales_double,
                                                          zero_points_int64,
                                                          axis);
  }
}
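
// Illustrative usage (a sketch, not part of this file): per-channel
// quantization carries one (scale, zero_point) pair per slice along `axis`.
// Integral zero_points produce a PerChannelAffineQuantizer; floating-point
// zero_points produce a PerChannelAffineFloatQParamsQuantizer.
//   Tensor scales = at::tensor({0.1, 0.2, 0.3});
//   Tensor zero_points = at::tensor({0, 5, 10}, at::kLong);
//   QuantizerPtr q = make_per_channel_affine_quantizer(
//       scales, zero_points, /*axis=*/0, kQInt8);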

QTensorImpl* get_qtensorimpl(const TensorBase& self) {
  TORCH_CHECK(
      !self.requires_grad(),
      "quantized tensors do not support autograd");
  TORCH_INTERNAL_ASSERT(self.is_quantized(), "get_qtensorimpl: not a quantized tensor");
  return static_cast<QTensorImpl*>(self.unsafeGetTensorImpl());
}

int64_t get_sub_byte_tensor_size(IntArrayRef sizes, size_t dtype_itemsize, at::ScalarType t) {
  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
  int64_t element_per_byte;
  switch(t) {
    case at::ScalarType::QUInt4x2:
      element_per_byte = 2;
      break;
    case at::ScalarType::QUInt2x4:
      element_per_byte = 4;
      break;
    default:
      element_per_byte = 1;
  }
  // zero-dim tensor
  if (sizes.empty()) {
    return c10::multiply_integers(sizes) * dtype_itemsize;
  }
  // Treat the innermost dim as columns
  int64_t cols = sizes.at(sizes.size()-1);
  int64_t bytes_per_row = cols * dtype_itemsize;
  // Align the qtensor's innermost dim: compute ceil(bytes_per_row / element_per_byte)
  return c10::multiply_integers(IntArrayRef(sizes.data(), sizes.size() - 1)) * at::ceil_div(bytes_per_row, element_per_byte);
}
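
// Worked example (illustrative): for a QUInt4x2 tensor with sizes {2, 5},
// element_per_byte == 2 and dtype_itemsize == 1, so each row of 5 nibbles
// needs ceil(5 / 2) == 3 bytes and the total is 2 * 3 == 6 bytes. The
// innermost dim is rounded up to whole bytes; the outer dims multiply the
// per-row byte count.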

inline Tensor new_qtensor(
    IntArrayRef sizes,
    const TensorOptions& options,
    QuantizerPtr quantizer) {
  auto memory_format = options.memory_format_opt().value_or(MemoryFormat::Contiguous);
  auto device = options.device();
  at::Allocator* allocator = nullptr;
  // TODO: why isn't this just using GetAllocator
  if (device.is_cuda()) {
    allocator = at::detail::getCUDAHooks().getCUDADeviceAllocator();
  } else if (device.is_cpu()) {
    allocator = at::getCPUAllocator();
  } else if (device.is_meta()) {
    allocator = GetAllocator(kMeta);
  } else {
    TORCH_INTERNAL_ASSERT(0, "unrecognized device for new_qtensor: ", device);
  }

#ifdef USE_PYTORCH_QNNPACK
  if (at::globalContext().qEngine() == at::QEngine::QNNPACK) {
    TORCH_CHECK(!device.is_cuda(), "It looks like you are trying to quantize a CUDA tensor ",
                "while QNNPACK backend is enabled. Although not expected to happen in ",
                "practice, you might have done it for testing purposes. ",
                "Please, either change the quantization engine or move the tensor to a CPU.");
    allocator = c10::GetDefaultMobileCPUAllocator();
  }
#endif

  at::DispatchKey tensorDispatchKey = options.computeDispatchKey();
  native::check_size_nonnegative(sizes);
  auto dtype = options.dtype();
  TORCH_CHECK(
      isQIntType(typeMetaToScalarType(dtype)),
      "ScalarType ",
      typeMetaToScalarType(dtype),
      " is not supported in new_qtensor.");
  auto scalar_type = typeMetaToScalarType(dtype);
  int64_t size_bytes = get_sub_byte_tensor_size(sizes, dtype.itemsize(), scalar_type);

  auto storage = c10::make_intrusive<StorageImpl>(
      StorageImpl::use_byte_size_t(),
      size_bytes,
      allocator->allocate(size_bytes),
      allocator,
      /*resizable=*/true);
  auto tensor = detail::make_tensor<QTensorImpl>(
      storage, at::DispatchKeySet(tensorDispatchKey), dtype, quantizer);
  get_qtensorimpl(tensor)->set_sizes_contiguous(sizes);
  get_qtensorimpl(tensor)->empty_tensor_restride(memory_format);
  return tensor;
}
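
// Illustrative call (a sketch, assuming a `quantizer` built by one of the
// factories above): the quantizers below use new_qtensor to allocate
// uninitialized quantized storage before filling it, e.g.
//   Tensor qt = new_qtensor({4, 4}, at::device(at::kCPU).dtype(at::kQInt8), quantizer);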

Tensor PerTensorAffineQuantizer::quantize(const Tensor& rtensor) {
  TORCH_CHECK(
      rtensor.scalar_type() == kFloat,
      "Quantize only works on Float Tensor, got ", rtensor.scalar_type());
  // Here we need a c10::intrusive_ptr<Quantizer>, but "this" is already the
  // quantizer that can be reused, so use intrusive_from_this.
  Tensor qtensor = new_qtensor(
      rtensor.sizes(),
      rtensor.options()
          .dtype(scalar_type_)
          .memory_format(rtensor.suggest_memory_format()),
      intrusive_from_this());

  auto rtensor_contig = rtensor.expect_contiguous(rtensor.suggest_memory_format());
  native::quantize_tensor_per_tensor_affine(
      *rtensor_contig, qtensor, scale_, zero_point_);
  return qtensor;
}

void per_tensor_affine_dequantize_impl(
    Tensor& rtensor,
    const Tensor& qtensor,
    const double scale,
    const int64_t zero_point) {
  const auto qtensor_contig =
      qtensor.expect_contiguous(qtensor.suggest_memory_format());
  native::dequantize_tensor_per_tensor_affine(
      *qtensor_contig, rtensor, scale, zero_point);
}

Tensor& PerTensorAffineQuantizer::dequantize_out(
    Tensor& rtensor, const Tensor& qtensor) {
  rtensor.resize_(qtensor.sizes());
  TORCH_CHECK(
      rtensor.is_contiguous(qtensor.suggest_memory_format()) &&
      rtensor.scalar_type() == kFloat,
      "Dequantize out should be a contiguous Float Tensor; instead got type ",
      rtensor.scalar_type(),
      ", and is_contiguous ",
      rtensor.is_contiguous(qtensor.suggest_memory_format()));
  per_tensor_affine_dequantize_impl(rtensor, qtensor, scale_, zero_point_);
  return rtensor;
}

Tensor PerTensorAffineQuantizer::dequantize(const Tensor& qtensor) {
  Tensor rtensor = at::empty(
      qtensor.sizes(),
      qtensor.options()
          .dtype(at::kFloat)
          .memory_format(qtensor.suggest_memory_format()));
  per_tensor_affine_dequantize_impl(rtensor, qtensor, scale_, zero_point_);
  return rtensor;
}
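
// Illustrative round trip (a sketch, not part of this file), assuming the
// affine mapping q = round(r / scale) + zero_point and r' = (q - zero_point) * scale:
//   auto q = make_per_tensor_affine_quantizer(/*scale=*/0.5, /*zero_point=*/0, kQInt8);
//   Tensor r = at::tensor({1.0f, 2.2f, -3.4f});
//   Tensor qt = q->quantize(r);     // stores int8 values 2, 4, -7
//   Tensor rt = q->dequantize(qt);  // approximately {1.0, 2.0, -3.5}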

Tensor PerChannelAffineQuantizer::quantize(const Tensor& rtensor) {
  // Here we need a c10::intrusive_ptr<Quantizer>, but "this" is already the
  // quantizer that can be reused, so use intrusive_from_this.
  Tensor qtensor = new_qtensor(
      rtensor.sizes(),
      rtensor.options()
          .dtype(scalar_type_)
          .memory_format(rtensor.suggest_memory_format()),
      intrusive_from_this());
  auto rtensor_contig = rtensor.expect_contiguous(rtensor.suggest_memory_format());
  native::quantize_tensor_per_channel_affine(
      *rtensor_contig, qtensor, scales_, zero_points_, axis_);
  return qtensor;
}

void per_channel_affine_dequantize_impl(
    Tensor& rtensor,
    const Tensor& qtensor,
    const Tensor& scale,
    const Tensor& zero_point,
    const int64_t axis) {
  const auto qtensor_contig =
      qtensor.expect_contiguous(qtensor.suggest_memory_format());
  native::dequantize_tensor_per_channel_affine(
      *qtensor_contig, rtensor, scale, zero_point, axis);
}

Tensor PerChannelAffineQuantizer::dequantize(const Tensor& qtensor) {
  Tensor rtensor = at::empty(
      qtensor.sizes(),
      qtensor.options()
          .dtype(at::kFloat)
          .memory_format(qtensor.suggest_memory_format()));
  per_channel_affine_dequantize_impl(rtensor, qtensor, scales_, zero_points_, axis_);
  return rtensor;
}

Tensor& PerChannelAffineQuantizer::dequantize_out(
    Tensor& rtensor, const Tensor& qtensor) {
  rtensor.resize_(qtensor.sizes());
  TORCH_CHECK(
      rtensor.is_contiguous(qtensor.suggest_memory_format()) &&
      rtensor.scalar_type() == kFloat,
      "Dequantize out should be a contiguous Float Tensor; instead got type ",
      rtensor.scalar_type(),
      ", and is_contiguous ",
      rtensor.is_contiguous(qtensor.suggest_memory_format()));
  per_channel_affine_dequantize_impl(rtensor, qtensor, scales_, zero_points_, axis_);
  return rtensor;
}

Tensor PerChannelAffineFloatQParamsQuantizer::quantize(const Tensor& rtensor) {
  TORCH_CHECK(
      rtensor.scalar_type() == kFloat,
      "Quantize only works on Float Tensor, got ", rtensor.scalar_type());
  Tensor qtensor = new_qtensor(
      rtensor.sizes(),
      rtensor.options().dtype(scalar_type_),
      intrusive_from_this());
  auto rtensor_contig = rtensor.expect_contiguous();
  native::quantize_tensor_per_channel_float_qparams(
      *rtensor_contig, qtensor, scales_, zero_points_, axis_);
  return qtensor;
}

void per_channel_affine_float_q_params_dequantize_impl(
    Tensor& rtensor,
    const Tensor& qtensor,
    const Tensor& scale,
    const Tensor& zero_point,
    const int64_t axis) {
  const auto qtensor_contig =
      qtensor.expect_contiguous(qtensor.suggest_memory_format());
  native::dequantize_tensor_per_channel_float_qparams(
      *qtensor_contig, rtensor, scale, zero_point, axis);
}

Tensor PerChannelAffineFloatQParamsQuantizer::dequantize(const Tensor& qtensor) {
  Tensor rtensor = at::empty(qtensor.sizes(), qtensor.options().dtype(at::kFloat));
  per_channel_affine_float_q_params_dequantize_impl(
      rtensor, qtensor, scales_, zero_points_, axis_);
  return rtensor;
}

Tensor& PerChannelAffineFloatQParamsQuantizer::dequantize_out(
    Tensor& rtensor, const Tensor& qtensor) {
  rtensor.resize_(qtensor.sizes());
  TORCH_CHECK(
      rtensor.is_contiguous(qtensor.suggest_memory_format()) &&
      rtensor.scalar_type() == kFloat,
      "Dequantize out should be a contiguous Float Tensor; instead got type ",
      rtensor.scalar_type(),
      ", and is_contiguous ",
      rtensor.is_contiguous(qtensor.suggest_memory_format()));
  per_channel_affine_float_q_params_dequantize_impl(
      rtensor, qtensor, scales_, zero_points_, axis_);
  return rtensor;
}

Quantizer::~Quantizer() = default;

C10_EXPORT void set_quantizer_(const Tensor& self, ConstQuantizerPtr quantizer) {
  get_qtensorimpl(self)->set_quantizer_(quantizer);
}

Tensor from_blob_quantized_per_tensor_affine(
    void* data,
    IntArrayRef sizes,
    IntArrayRef strides,
    std::function<void(void*)> deleter,
    const float scale,
    const int64_t zeroPoint,
    const TensorOptions& options) {
  auto dtype = typeMetaToScalarType(options.dtype());
  TORCH_CHECK(
      isQIntType(dtype),
      "from_blob_quantized_per_tensor_affine expects QInt dtypes, got ", dtype);

  const std::size_t itemsize = options.dtype().itemsize();
  std::size_t size = 1;
  for (std::int64_t s : sizes) {
    size *= static_cast<std::size_t>(s);
  }
  const std::size_t datasize = size * itemsize;

  DataPtr data_ptr = InefficientStdFunctionContext::makeDataPtr(
      data, deleter, options.device());

  Storage storage{Storage::use_byte_size_t{}, datasize, std::move(data_ptr)};

  QuantizerPtr quantizer =
      make_per_tensor_affine_quantizer(scale, zeroPoint, dtype);

  Tensor qtensor = at::detail::make_tensor<QTensorImpl>(
      std::move(storage),
      at::DispatchKeySet(options.computeDispatchKey()),
      options.dtype(),
      quantizer);
  get_qtensorimpl(qtensor)->set_sizes_and_strides(sizes, strides);
  return qtensor;
}
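
// Illustrative usage (a sketch, not part of this file): wrap an existing
// int8 buffer as a quantized tensor without copying. The deleter runs when
// the storage is released; here it is a no-op because `buf` is owned elsewhere.
//   std::vector<int8_t> buf(12, 0);
//   Tensor qt = from_blob_quantized_per_tensor_affine(
//       buf.data(),
//       /*sizes=*/{3, 4},
//       /*strides=*/{4, 1},
//       /*deleter=*/[](void*) {},
//       /*scale=*/0.1f,
//       /*zeroPoint=*/0,
//       at::device(at::kCPU).dtype(at::kQInt8));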

Tensor from_blob_quantized_per_tensor_affine(
    void* data,
    IntArrayRef sizes,
    std::function<void(void*)> deleter,
    const float scale,
    const int64_t zeroPoint,
    const TensorOptions& options) {
  std::vector<int64_t> strides;
  const auto ndim = sizes.size();
  if (ndim > 0) {
    strides.resize(ndim);
    // NOLINTNEXTLINE
    int32_t i = ndim - 1;
    // NOLINTNEXTLINE
    strides[i] = 1;
    while (--i >= 0) {
      strides[i] = sizes[i + 1] * strides[i + 1];
    }
  }
  return from_blob_quantized_per_tensor_affine(
      data,
      sizes,
      strides,
      std::move(deleter),
      scale,
      zeroPoint,
      options);
}
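
// Worked example (illustrative): this overload derives contiguous strides
// from `sizes`, e.g. sizes {2, 3, 4} yield strides {12, 4, 1}, since each
// stride is the product of the sizes to its right.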

Tensor from_blob_quantized_per_channel_affine(
    void* data,
    IntArrayRef sizes,
    std::function<void(void*)> deleter,
    const Tensor& scales,
    const Tensor& zero_points,
    const int64_t axis,
    const TensorOptions& options) {
  checkPerChannelParamDims(scales, zero_points);
  int64_t channel = sizes[axis];
  TORCH_CHECK(
      channel == int64_t(scales.numel()),
      "length of scales must equal the number of channels; expected ", channel, ", got ", scales.numel());
  TORCH_CHECK(
      channel == int64_t(zero_points.numel()),
      "length of zero_points must equal the number of channels; expected ", channel, ", got ", zero_points.numel());

  auto dtype = typeMetaToScalarType(options.dtype());
  TORCH_CHECK(
      isQIntType(dtype),
      "from_blob_quantized_per_channel_affine expects QInt dtypes, got ", dtype);

  const std::size_t itemsize = options.dtype().itemsize();
  std::size_t size = 1;
  for (std::int64_t s : sizes) {
    size *= static_cast<std::size_t>(s);
  }
  const std::size_t datasize = size * itemsize;

  DataPtr data_ptr = InefficientStdFunctionContext::makeDataPtr(
      data, deleter, options.device());

  Storage storage{Storage::use_byte_size_t{}, datasize, std::move(data_ptr)};

  QuantizerPtr quantizer =
      make_per_channel_affine_quantizer(scales, zero_points, axis, dtype);

  Tensor qtensor = at::detail::make_tensor<QTensorImpl>(
      std::move(storage),
      at::DispatchKeySet(options.computeDispatchKey()),
      options.dtype(),
      quantizer);
  get_qtensorimpl(qtensor)->set_sizes_contiguous(sizes);

  return qtensor;
}
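
// Illustrative usage (a sketch, not part of this file): wrap a buffer of
// per-channel quantized weights, one (scale, zero_point) pair per row.
//   std::vector<int8_t> buf(2 * 3, 0);
//   Tensor scales = at::tensor({0.1, 0.2});
//   Tensor zero_points = at::tensor({0, 0}, at::kLong);
//   Tensor qt = from_blob_quantized_per_channel_affine(
//       buf.data(), /*sizes=*/{2, 3}, /*deleter=*/[](void*) {},
//       scales, zero_points, /*axis=*/0,
//       at::device(at::kCPU).dtype(at::kQInt8));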

Tensor UnknownQuantizer::quantize(const Tensor& tensor) {
  TORCH_INTERNAL_ASSERT(false, "cannot call quantize on UnknownQuantizer");
}
Tensor UnknownQuantizer::dequantize(const Tensor& qtensor) {
  TORCH_INTERNAL_ASSERT(false, "cannot call dequantize on UnknownQuantizer");
}
Tensor& UnknownQuantizer::dequantize_out(Tensor& rtensor, const Tensor& qtensor) {
  TORCH_INTERNAL_ASSERT(false, "cannot call dequantize_out on UnknownQuantizer");
}
QScheme UnknownQuantizer::qscheme() const {
  TORCH_INTERNAL_ASSERT(false, "cannot call qscheme on UnknownQuantizer");
}
bool UnknownQuantizer::equalTo(QuantizerPtr other) const {
  TORCH_INTERNAL_ASSERT(false, "cannot call equalTo on UnknownQuantizer");
}
QuantizerPtr make_unknown_quantizer(ScalarType scalar_type) {
  return c10::make_intrusive<UnknownQuantizer>(scalar_type);
}

} // namespace at