// required for old g++ to compile PRId64 macros, see
// https://github.com/pytorch/pytorch/issues/3571
// for context
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif

// An external backend might generate this file within its own code tree
// and check all the source files in that tree with clang-format, so
// disable clang-format here, since the backend might use a different config.
// clang-format off

// NOTE: This condition is true for all PyTorch internal libraries; it
// just excludes external projects such as torch_xla, which
// re-use some of the PyTorch codegen machinery.
#if defined(CAFFE2_BUILD_MAIN_LIB) || \
    defined(TORCH_CUDA_BUILD_MAIN_LIB) || \
    defined(TORCH_HIP_BUILD_MAIN_LIB) || \
    defined(TORCH_CUDA_CU_BUILD_MAIN_LIB) || \
    defined(TORCH_CUDA_CPP_BUILD_MAIN_LIB)
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#endif
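// Defining TORCH_ASSERT_ONLY_METHOD_OPERATORS makes the aggregated ATen
// operator headers (e.g. ATen/Functions.h) refuse to compile, which is why
// this file includes per-operator headers from <ATen/ops/...> below.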

// @generated by torchgen/gen.py from RegisterDispatchKey.cpp

#include <c10/core/TensorImpl.h>
#include <c10/core/Allocator.h>
#include <ATen/DeviceGuard.h>
#include <ATen/NamedTensorUtils.h>
#include <ATen/Utils.h>
#include <ATen/WrapDimUtils.h>
#include <ATen/Dispatch.h>
#include <c10/util/ExclusivelyOwned.h>
#include <c10/util/Half.h>
#include <c10/core/UndefinedTensorImpl.h>
#include <c10/util/Optional.h>
#include <ATen/Tensor.h>
#include <ATen/native/Resize.h>

#include <cstddef>
#include <functional>
#include <memory>
#include <utility>

#include <ATen/Config.h>
#include <ATen/core/op_registration/adaption.h>
#include <torch/library.h>
#include <c10/cuda/CUDAGuard.h>
#include <ATen/cuda/ATenCUDAGeneral.h>
#include <ATen/cuda/CUDADevice.h>
#include <ATen/cuda/CUDAContext.h>

#include <ATen/ops/as_strided_native.h>
#include <ATen/ops/empty.h>
#include <ATen/ops/empty_strided.h>
#include <ATen/ops/_copy_from_and_resize.h>
#include <ATen/ops/_copy_from.h>
#include <ATen/ops/_adaptive_avg_pool2d_native.h>
#include <ATen/ops/_empty_affine_quantized_native.h>
#include <ATen/ops/_empty_per_channel_affine_quantized_native.h>
#include <ATen/ops/_index_put_impl_native.h>
#include <ATen/ops/_reshape_alias_native.h>
#include <ATen/ops/clone_native.h>
#include <ATen/ops/dequantize_native.h>
#include <ATen/ops/empty_like_native.h>
#include <ATen/ops/empty_native.h>
#include <ATen/ops/empty_quantized_native.h>
#include <ATen/ops/empty_strided_native.h>
#include <ATen/ops/fill_native.h>
#include <ATen/ops/flip_native.h>
#include <ATen/ops/gelu_native.h>
#include <ATen/ops/index_select_native.h>
#include <ATen/ops/int_repr_native.h>
#include <ATen/ops/masked_fill_native.h>
#include <ATen/ops/max_native.h>
#include <ATen/ops/min_native.h>
#include <ATen/ops/q_per_channel_axis_native.h>
#include <ATen/ops/q_per_channel_scales_native.h>
#include <ATen/ops/q_per_channel_zero_points_native.h>
#include <ATen/ops/q_scale_native.h>
#include <ATen/ops/q_zero_point_native.h>
#include <ATen/ops/qscheme_native.h>
#include <ATen/ops/quantized_max_pool2d_native.h>
#include <ATen/ops/relu_native.h>
#include <ATen/ops/set_native.h>
#include <ATen/ops/squeeze_native.h>
#include <ATen/ops/unfold_native.h>
#include <ATen/ops/unsqueeze_native.h>
#include <ATen/ops/view_native.h>

// See template file RegisterDispatchDefinitions.ini
namespace at {
// NB: TORCH_LIBRARY_IMPL must be in an anonymous namespace to avoid
// ambiguity with conflicting identifiers that may have already been
// defined in the at namespace.
namespace {
// Allocate a fresh output tensor, honoring the requested strides when given.
Tensor create_out(IntArrayRef sizes, IntArrayRef strides, const TensorOptions &options) {
  if (strides.empty()) {
    return at::empty(sizes, options);
  } else {
    return at::empty_strided(sizes, strides, options);
  }
}
void resize_out(const Tensor &out, IntArrayRef sizes, IntArrayRef strides, const TensorOptions &options) {
  TORCH_CHECK(options.dtype() == out.dtype(),
      "Expected out tensor to have dtype ", options.dtype(), ", but got ", out.dtype(), " instead");
  TORCH_CHECK(options.device() == out.device(),
      "Expected out tensor to have device ", options.device(), ", but got ", out.device(), " instead");
  const bool resized = at::native::resize_output(out, sizes);
  // Only restride if a resize occurred; otherwise we ignore the (advisory)
  // strides from the meta function and directly use the output tensor's
  // preexisting strides
  if (resized) {
    if (!strides.empty()) {
      TORCH_INTERNAL_ASSERT(!options.memory_format_opt().has_value());
      // TODO: avoid the redispatch here
      out.as_strided_(sizes, strides);
    } else if (options.memory_format_opt().has_value()) {
      out.unsafeGetTensorImpl()->empty_tensor_restride(*options.memory_format_opt());
    }
  }
}
void check_inplace(const Tensor &self, IntArrayRef sizes, const TensorOptions &options) {
  // These checks are needed on those operators that:
  // 1) don't use 'TensorIterator' (e.g. 'addmm' and 'baddbmm')
  // 2) have particular typing rules (e.g. 'cumsum' and 'cumprod')
  // For other operators (e.g. 'add'), 'TensorIterator' already checks
  // these things separately.
  TORCH_CHECK(options.dtype() == self.dtype(),
      "Bad in-place call: ",
      "input tensor dtype ", self.dtype(), " and output tensor dtype ", options.dtype(), " should match");
  TORCH_CHECK(options.device() == self.device(),
      "Bad in-place call: ",
      "input tensor device ", self.device(), " and output tensor device ", options.device(), " should match");
  TORCH_CHECK(sizes == self.sizes(),
      "Bad in-place call: ",
      "input tensor size ", self.sizes(), " and output tensor size ", sizes, " should match");
}
// If `out` does not already have the requested strides, allocate a proxy
// tensor with those strides; the caller is expected to copy it back into `out`.
c10::optional<Tensor> maybe_create_proxy(const Tensor &out, IntArrayRef sizes, IntArrayRef strides, const TensorOptions &options) {
  if (out.strides() != strides) {
    return at::empty_strided(sizes, strides, options);
  }
  return c10::nullopt;
}
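// Illustrative sketch only (not generated code): out= wrappers produced from
// this template typically use the helpers above roughly as follows, where
// `sizes`, `strides`, and `options` come from the op's meta function:
//
//   Tensor out = create_out(sizes, strides, options);  // fresh output
//   // ...or, when the caller passed an out= tensor:
//   resize_out(out, sizes, strides, options);          // validate dtype/device, then resize
//   if (auto proxy = maybe_create_proxy(out, sizes, strides, options)) {
//     // compute into *proxy, then copy the result back into `out`
//   }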
namespace {
at::Tensor wrapper_QuantizedCUDA__as_strided(const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, c10::optional<c10::SymInt> storage_offset) {
  // No device check
  // DeviceGuard omitted
  return at::native::as_strided_qtensorimpl(self, C10_AS_INTARRAYREF_SLOW(size), C10_AS_INTARRAYREF_SLOW(stride), storage_offset.has_value() ? c10::make_optional(storage_offset->expect_int()) : c10::nullopt);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_QuantizedCUDA_memory_format_empty(c10::SymIntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
  c10::optional<Device> common_device = nullopt;
  (void)common_device; // Suppress unused variable warning
  globalContext().lazyInitCUDA();
  const DeviceGuard device_guard(device_or_default(device));
  return at::native::empty_unknown_quantized(C10_AS_INTARRAYREF_SLOW(size), dtype, layout, device, pin_memory, memory_format);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_QuantizedCUDA___empty_affine_quantized(at::IntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, double scale, int64_t zero_point, c10::optional<at::MemoryFormat> memory_format) {
  c10::optional<Device> common_device = nullopt;
  (void)common_device; // Suppress unused variable warning
  globalContext().lazyInitCUDA();
  const DeviceGuard device_guard(device_or_default(device));
  return at::native::empty_affine_quantized(size, dtype, layout, device, pin_memory, scale, zero_point, memory_format);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_QuantizedCUDA___empty_per_channel_affine_quantized(at::IntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
  c10::optional<Device> common_device = nullopt;
  (void)common_device; // Suppress unused variable warning
  globalContext().lazyInitCUDA();
  const DeviceGuard device_guard(device_or_default(device));
  return at::native::empty_per_channel_affine_quantized(size, scales, zero_points, axis, dtype, layout, device, pin_memory, memory_format);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_QuantizedCUDA__empty_quantized(at::IntArrayRef size, const at::Tensor & qtensor, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
  c10::optional<Device> common_device = nullopt;
  (void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, qtensor, "wrapper_QuantizedCUDA__empty_quantized", "qtensor");
  globalContext().lazyInitCUDA();
  const DeviceGuard device_guard(device_or_default(device));
  return at::native::empty_quantized(size, qtensor, dtype, layout, device, pin_memory, memory_format);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_QuantizedCUDA__empty_like(const at::Tensor & self, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
  // No device check
  // DeviceGuard omitted
  return at::native::empty_like_quantized(self, dtype, layout, device, pin_memory, memory_format);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_QuantizedCUDA__empty_strided(c10::SymIntArrayRef size, c10::SymIntArrayRef stride, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
  c10::optional<Device> common_device = nullopt;
  (void)common_device; // Suppress unused variable warning
  globalContext().lazyInitCUDA();
  const DeviceGuard device_guard(device_or_default(device));
  return at::native::empty_strided_unknown_quantized(C10_AS_INTARRAYREF_SLOW(size), C10_AS_INTARRAYREF_SLOW(stride), dtype, layout, device, pin_memory);
}
} // anonymous namespace
namespace {
at::Tensor & wrapper_QuantizedCUDA_Scalar_fill_(at::Tensor & self, const at::Scalar & value) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::fill_quantized_(self, value);
}
} // anonymous namespace
namespace {
at::Tensor & wrapper_QuantizedCUDA_Tensor_fill_(at::Tensor & self, const at::Tensor & value) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::fill_quantized_(self, value);
}
} // anonymous namespace
namespace {
at::Tensor & wrapper_QuantizedCUDA___index_put_impl_(at::Tensor & self, const c10::List<c10::optional<at::Tensor>> & indices, const at::Tensor & values, bool accumulate, bool unsafe) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::_index_put_impl_quantized_cuda_(self, indices, values, accumulate, unsafe);
}
} // anonymous namespace
namespace {
::std::tuple<at::Tensor,at::Tensor> wrapper_QuantizedCUDA_dim_max(const at::Tensor & self, int64_t dim, bool keepdim) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::qmax(self, dim, keepdim);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_QuantizedCUDA__quantized_max_pool2d(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode) {
  c10::optional<Device> common_device = nullopt;
  (void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, self, "wrapper_QuantizedCUDA__quantized_max_pool2d", "self");
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::quantized_max_pool2d_cudnn(self, kernel_size, stride, padding, dilation, ceil_mode);
}
} // anonymous namespace
namespace {
::std::tuple<at::Tensor,at::Tensor> wrapper_QuantizedCUDA_dim_min(const at::Tensor & self, int64_t dim, bool keepdim) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::qmin(self, dim, keepdim);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_QuantizedCUDA___reshape_alias(const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride) {
  // No device check
  // DeviceGuard omitted
  return at::native::_reshape_alias(self, C10_AS_INTARRAYREF_SLOW(size), C10_AS_INTARRAYREF_SLOW(stride));
}
} // anonymous namespace
namespace {
at::Tensor wrapper_QuantizedCUDA__relu(const at::Tensor & self) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::relu_quantized_cuda(self);
}
} // anonymous namespace
namespace {
at::Tensor & wrapper_QuantizedCUDA__relu_(at::Tensor & self) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::relu_quantized_cuda_(self);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_QuantizedCUDA__gelu(const at::Tensor & self, c10::string_view approximate) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::gelu_quantized_cuda(self, approximate);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_QuantizedCUDA__squeeze(const at::Tensor & self) {
  // No device check
  // DeviceGuard omitted
  return at::native::squeeze_quantized(self);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_QuantizedCUDA_dim_squeeze(const at::Tensor & self, int64_t dim) {
  // No device check
  // DeviceGuard omitted
  return at::native::squeeze_quantized(self, dim);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_QuantizedCUDA_dims_squeeze(const at::Tensor & self, at::IntArrayRef dim) {
  // No device check
  // DeviceGuard omitted
  return at::native::squeeze_quantized(self, dim);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_QuantizedCUDA__flip(const at::Tensor & self, at::IntArrayRef dims) {
  c10::optional<Device> common_device = nullopt;
  (void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, self, "wrapper_QuantizedCUDA__flip", "self");
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::flip(self, dims);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_QuantizedCUDA__unsqueeze(const at::Tensor & self, int64_t dim) {
  // No device check
  // DeviceGuard omitted
  return at::native::unsqueeze_quantized(self, dim);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_QuantizedCUDA__clone(const at::Tensor & self, c10::optional<at::MemoryFormat> memory_format) {
  c10::optional<Device> common_device = nullopt;
  (void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, self, "wrapper_QuantizedCUDA__clone", "self");
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::quantized_clone(self, memory_format);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_QuantizedCUDA_self_dequantize(const at::Tensor & self) {
  c10::optional<Device> common_device = nullopt;
  (void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, self, "wrapper_QuantizedCUDA_self_dequantize", "self");
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::dequantize_quantized(self);
}
} // anonymous namespace
namespace {
double wrapper_QuantizedCUDA__q_scale(const at::Tensor & self) {
  c10::optional<Device> common_device = nullopt;
  (void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, self, "wrapper_QuantizedCUDA__q_scale", "self");
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::q_scale_quant(self);
}
} // anonymous namespace
namespace {
int64_t wrapper_QuantizedCUDA__q_zero_point(const at::Tensor & self) {
  c10::optional<Device> common_device = nullopt;
  (void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, self, "wrapper_QuantizedCUDA__q_zero_point", "self");
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::q_zero_point_quant(self);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_QuantizedCUDA__q_per_channel_scales(const at::Tensor & self) {
  c10::optional<Device> common_device = nullopt;
  (void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, self, "wrapper_QuantizedCUDA__q_per_channel_scales", "self");
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::q_per_channel_scales(self);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_QuantizedCUDA__q_per_channel_zero_points(const at::Tensor & self) {
  c10::optional<Device> common_device = nullopt;
  (void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, self, "wrapper_QuantizedCUDA__q_per_channel_zero_points", "self");
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::q_per_channel_zero_points(self);
}
} // anonymous namespace
namespace {
int64_t wrapper_QuantizedCUDA__q_per_channel_axis(const at::Tensor & self) {
  c10::optional<Device> common_device = nullopt;
  (void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, self, "wrapper_QuantizedCUDA__q_per_channel_axis", "self");
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::q_per_channel_axis(self);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_QuantizedCUDA__int_repr(const at::Tensor & self) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::int_repr_quantized_cuda(self);
}
} // anonymous namespace
namespace {
at::QScheme wrapper_QuantizedCUDA__qscheme(const at::Tensor & self) {
  c10::optional<Device> common_device = nullopt;
  (void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, self, "wrapper_QuantizedCUDA__qscheme", "self");
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::qscheme_quant(self);
}
} // anonymous namespace
namespace {
at::Tensor & wrapper_QuantizedCUDA_source_Storage_storage_offset_set_(at::Tensor & self, at::Storage source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride) {
  // No device check
  // DeviceGuard omitted
  return at::native::set_storage_quantized_(self, source, storage_offset.expect_int(), C10_AS_INTARRAYREF_SLOW(size), C10_AS_INTARRAYREF_SLOW(stride));
}
} // anonymous namespace
namespace {
at::Tensor & wrapper_QuantizedCUDA_Scalar_masked_fill_(at::Tensor & self, const at::Tensor & mask, const at::Scalar & value) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::masked_fill__quantized_cuda(self, mask, value);
}
} // anonymous namespace
namespace {
at::Tensor & wrapper_QuantizedCUDA_Tensor_masked_fill_(at::Tensor & self, const at::Tensor & mask, const at::Tensor & value) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::masked_fill__quantized_cuda(self, mask, value);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_QuantizedCUDA__view(const at::Tensor & self, c10::SymIntArrayRef size) {
  // No device check
  // DeviceGuard omitted
  return at::native::view(self, C10_AS_INTARRAYREF_SLOW(size));
}
} // anonymous namespace
namespace {
at::Tensor wrapper_QuantizedCUDA__index_select(const at::Tensor & self, int64_t dim, const at::Tensor & index) {
  c10::optional<Device> common_device = nullopt;
  (void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, self, "wrapper_QuantizedCUDA__index_select", "self");
  c10::impl::check_and_update_common_device(common_device, index, "wrapper_QuantizedCUDA__index_select", "index");
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::index_select_quantized_cuda(self, dim, index);
}
} // anonymous namespace
namespace {
at::Tensor & wrapper_QuantizedCUDA_out_index_select_out(const at::Tensor & self, int64_t dim, const at::Tensor & index, at::Tensor & out) {
  c10::optional<Device> common_device = nullopt;
  (void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, out, "wrapper_QuantizedCUDA_out_index_select_out", "out");
  c10::impl::check_and_update_common_device(common_device, self, "wrapper_QuantizedCUDA_out_index_select_out", "self");
  c10::impl::check_and_update_common_device(common_device, index, "wrapper_QuantizedCUDA_out_index_select_out", "index");
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::index_select_out_cuda(self, dim, index, out);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_QuantizedCUDA__unfold(const at::Tensor & self, int64_t dimension, int64_t size, int64_t step) {
  // No device check
  // DeviceGuard omitted
  return at::native::unfold(self, dimension, size, step);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_QuantizedCUDA___adaptive_avg_pool2d(const at::Tensor & self, c10::SymIntArrayRef output_size) {
  c10::optional<Device> common_device = nullopt;
  (void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, self, "wrapper_QuantizedCUDA___adaptive_avg_pool2d", "self");
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::adaptive_avg_pool2d_quantized_cuda(self, C10_AS_INTARRAYREF_SLOW(output_size));
}
} // anonymous namespace
TORCH_LIBRARY_IMPL(aten, QuantizedCUDA, m) {
  m.impl("as_strided", TORCH_FN(wrapper_QuantizedCUDA__as_strided));
  m.impl("empty.memory_format", TORCH_FN(wrapper_QuantizedCUDA_memory_format_empty));
  m.impl("_empty_affine_quantized", TORCH_FN(wrapper_QuantizedCUDA___empty_affine_quantized));
  m.impl("_empty_per_channel_affine_quantized", TORCH_FN(wrapper_QuantizedCUDA___empty_per_channel_affine_quantized));
  m.impl("empty_quantized", TORCH_FN(wrapper_QuantizedCUDA__empty_quantized));
  m.impl("empty_like", TORCH_FN(wrapper_QuantizedCUDA__empty_like));
  m.impl("empty_strided", TORCH_FN(wrapper_QuantizedCUDA__empty_strided));
  m.impl("fill_.Scalar", TORCH_FN(wrapper_QuantizedCUDA_Scalar_fill_));
  m.impl("fill_.Tensor", TORCH_FN(wrapper_QuantizedCUDA_Tensor_fill_));
  m.impl("_index_put_impl_", TORCH_FN(wrapper_QuantizedCUDA___index_put_impl_));
  m.impl("max.dim", TORCH_FN(wrapper_QuantizedCUDA_dim_max));
  m.impl("quantized_max_pool2d", TORCH_FN(wrapper_QuantizedCUDA__quantized_max_pool2d));
  m.impl("min.dim", TORCH_FN(wrapper_QuantizedCUDA_dim_min));
  m.impl("_reshape_alias", TORCH_FN(wrapper_QuantizedCUDA___reshape_alias));
  m.impl("relu", TORCH_FN(wrapper_QuantizedCUDA__relu));
  m.impl("relu_", TORCH_FN(wrapper_QuantizedCUDA__relu_));
  m.impl("gelu", TORCH_FN(wrapper_QuantizedCUDA__gelu));
  m.impl("squeeze", TORCH_FN(wrapper_QuantizedCUDA__squeeze));
  m.impl("squeeze.dim", TORCH_FN(wrapper_QuantizedCUDA_dim_squeeze));
  m.impl("squeeze.dims", TORCH_FN(wrapper_QuantizedCUDA_dims_squeeze));
  m.impl("flip", TORCH_FN(wrapper_QuantizedCUDA__flip));
  m.impl("unsqueeze", TORCH_FN(wrapper_QuantizedCUDA__unsqueeze));
  m.impl("clone", TORCH_FN(wrapper_QuantizedCUDA__clone));
  m.impl("dequantize.self", TORCH_FN(wrapper_QuantizedCUDA_self_dequantize));
  m.impl("q_scale", TORCH_FN(wrapper_QuantizedCUDA__q_scale));
  m.impl("q_zero_point", TORCH_FN(wrapper_QuantizedCUDA__q_zero_point));
  m.impl("q_per_channel_scales", TORCH_FN(wrapper_QuantizedCUDA__q_per_channel_scales));
  m.impl("q_per_channel_zero_points", TORCH_FN(wrapper_QuantizedCUDA__q_per_channel_zero_points));
  m.impl("q_per_channel_axis", TORCH_FN(wrapper_QuantizedCUDA__q_per_channel_axis));
  m.impl("int_repr", TORCH_FN(wrapper_QuantizedCUDA__int_repr));
  m.impl("qscheme", TORCH_FN(wrapper_QuantizedCUDA__qscheme));
  m.impl("set_.source_Storage_storage_offset", TORCH_FN(wrapper_QuantizedCUDA_source_Storage_storage_offset_set_));
  m.impl("masked_fill_.Scalar", TORCH_FN(wrapper_QuantizedCUDA_Scalar_masked_fill_));
  m.impl("masked_fill_.Tensor", TORCH_FN(wrapper_QuantizedCUDA_Tensor_masked_fill_));
  m.impl("view", TORCH_FN(wrapper_QuantizedCUDA__view));
  m.impl("index_select", TORCH_FN(wrapper_QuantizedCUDA__index_select));
  m.impl("index_select.out", TORCH_FN(wrapper_QuantizedCUDA_out_index_select_out));
  m.impl("unfold", TORCH_FN(wrapper_QuantizedCUDA__unfold));
  m.impl("_adaptive_avg_pool2d", TORCH_FN(wrapper_QuantizedCUDA___adaptive_avg_pool2d));
};
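// Illustrative sketch only (not part of the generated registrations): with the
// kernels above registered for DispatchKey::QuantizedCUDA, a call such as the
// following would route through the dispatcher to relu_quantized_cuda,
// assuming a CUDA-enabled build:
//
//   at::Tensor x  = at::rand({4, 4}, at::device(at::kCUDA));
//   at::Tensor qx = at::quantize_per_tensor(x, /*scale=*/0.1, /*zero_point=*/0, at::kQUInt8);
//   at::Tensor qy = at::relu(qx);  // dispatches to wrapper_QuantizedCUDA__relu above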
} // anonymous namespace
namespace quantizedcuda {
at::Tensor as_strided(const at::Tensor & self, at::IntArrayRef size, at::IntArrayRef stride, c10::optional<int64_t> storage_offset) {
  return wrapper_QuantizedCUDA__as_strided(self, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), storage_offset.has_value() ? c10::make_optional(c10::SymInt(*storage_offset)) : c10::nullopt);
}
at::Tensor as_strided_symint(const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, c10::optional<c10::SymInt> storage_offset) {
  return wrapper_QuantizedCUDA__as_strided(self, size, stride, storage_offset);
}
at::Tensor empty(at::IntArrayRef size, at::TensorOptions options, c10::optional<at::MemoryFormat> memory_format) {
  return wrapper_QuantizedCUDA_memory_format_empty(c10::fromIntArrayRefSlow(size), optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
}
at::Tensor empty(at::IntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
  return wrapper_QuantizedCUDA_memory_format_empty(c10::fromIntArrayRefSlow(size), dtype, layout, device, pin_memory, memory_format);
}
at::Tensor empty_symint(c10::SymIntArrayRef size, at::TensorOptions options, c10::optional<at::MemoryFormat> memory_format) {
  return wrapper_QuantizedCUDA_memory_format_empty(size, optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
}
at::Tensor empty_symint(c10::SymIntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
  return wrapper_QuantizedCUDA_memory_format_empty(size, dtype, layout, device, pin_memory, memory_format);
}
at::Tensor _empty_affine_quantized(at::IntArrayRef size, at::TensorOptions options, double scale, int64_t zero_point, c10::optional<at::MemoryFormat> memory_format) {
  return wrapper_QuantizedCUDA___empty_affine_quantized(size, optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), scale, zero_point, c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
}
at::Tensor _empty_affine_quantized(at::IntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, double scale, int64_t zero_point, c10::optional<at::MemoryFormat> memory_format) {
  return wrapper_QuantizedCUDA___empty_affine_quantized(size, dtype, layout, device, pin_memory, scale, zero_point, memory_format);
}
at::Tensor _empty_per_channel_affine_quantized(at::IntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, at::TensorOptions options, c10::optional<at::MemoryFormat> memory_format) {
  return wrapper_QuantizedCUDA___empty_per_channel_affine_quantized(size, scales, zero_points, axis, optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
}
at::Tensor _empty_per_channel_affine_quantized(at::IntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
  return wrapper_QuantizedCUDA___empty_per_channel_affine_quantized(size, scales, zero_points, axis, dtype, layout, device, pin_memory, memory_format);
}
at::Tensor empty_quantized(at::IntArrayRef size, const at::Tensor & qtensor, at::TensorOptions options, c10::optional<at::MemoryFormat> memory_format) {
  return wrapper_QuantizedCUDA__empty_quantized(size, qtensor, optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
}
at::Tensor empty_quantized(at::IntArrayRef size, const at::Tensor & qtensor, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
  return wrapper_QuantizedCUDA__empty_quantized(size, qtensor, dtype, layout, device, pin_memory, memory_format);
}
at::Tensor empty_like(const at::Tensor & self, at::TensorOptions options, c10::optional<at::MemoryFormat> memory_format) {
  return wrapper_QuantizedCUDA__empty_like(self, optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
}
at::Tensor empty_like(const at::Tensor & self, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
  return wrapper_QuantizedCUDA__empty_like(self, dtype, layout, device, pin_memory, memory_format);
}
at::Tensor empty_strided(at::IntArrayRef size, at::IntArrayRef stride, at::TensorOptions options) {
  return wrapper_QuantizedCUDA__empty_strided(c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
}
at::Tensor empty_strided(at::IntArrayRef size, at::IntArrayRef stride, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
  return wrapper_QuantizedCUDA__empty_strided(c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), dtype, layout, device, pin_memory);
}
at::Tensor empty_strided_symint(c10::SymIntArrayRef size, c10::SymIntArrayRef stride, at::TensorOptions options) {
  return wrapper_QuantizedCUDA__empty_strided(size, stride, optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
}
at::Tensor empty_strided_symint(c10::SymIntArrayRef size, c10::SymIntArrayRef stride, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
  return wrapper_QuantizedCUDA__empty_strided(size, stride, dtype, layout, device, pin_memory);
}
at::Tensor & fill_(at::Tensor & self, const at::Scalar & value) {
  return wrapper_QuantizedCUDA_Scalar_fill_(self, value);
}
at::Tensor & fill_(at::Tensor & self, const at::Tensor & value) {
  return wrapper_QuantizedCUDA_Tensor_fill_(self, value);
}
at::Tensor & _index_put_impl_(at::Tensor & self, const c10::List<c10::optional<at::Tensor>> & indices, const at::Tensor & values, bool accumulate, bool unsafe) {
  return wrapper_QuantizedCUDA___index_put_impl_(self, indices, values, accumulate, unsafe);
}
::std::tuple<at::Tensor,at::Tensor> max(const at::Tensor & self, int64_t dim, bool keepdim) {
  return wrapper_QuantizedCUDA_dim_max(self, dim, keepdim);
}
at::Tensor quantized_max_pool2d(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode) {
  return wrapper_QuantizedCUDA__quantized_max_pool2d(self, kernel_size, stride, padding, dilation, ceil_mode);
}
::std::tuple<at::Tensor,at::Tensor> min(const at::Tensor & self, int64_t dim, bool keepdim) {
  return wrapper_QuantizedCUDA_dim_min(self, dim, keepdim);
}
at::Tensor _reshape_alias(const at::Tensor & self, at::IntArrayRef size, at::IntArrayRef stride) {
  return wrapper_QuantizedCUDA___reshape_alias(self, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride));
}
at::Tensor _reshape_alias_symint(const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride) {
  return wrapper_QuantizedCUDA___reshape_alias(self, size, stride);
}
at::Tensor relu(const at::Tensor & self) {
  return wrapper_QuantizedCUDA__relu(self);
}
at::Tensor & relu_(at::Tensor & self) {
  return wrapper_QuantizedCUDA__relu_(self);
}
at::Tensor gelu(const at::Tensor & self, c10::string_view approximate) {
  return wrapper_QuantizedCUDA__gelu(self, approximate);
}
at::Tensor squeeze(const at::Tensor & self) {
  return wrapper_QuantizedCUDA__squeeze(self);
}
at::Tensor squeeze(const at::Tensor & self, int64_t dim) {
  return wrapper_QuantizedCUDA_dim_squeeze(self, dim);
}
at::Tensor squeeze(const at::Tensor & self, at::IntArrayRef dim) {
  return wrapper_QuantizedCUDA_dims_squeeze(self, dim);
}
at::Tensor flip(const at::Tensor & self, at::IntArrayRef dims) {
  return wrapper_QuantizedCUDA__flip(self, dims);
}
at::Tensor unsqueeze(const at::Tensor & self, int64_t dim) {
  return wrapper_QuantizedCUDA__unsqueeze(self, dim);
}
at::Tensor clone(const at::Tensor & self, c10::optional<at::MemoryFormat> memory_format) {
  return wrapper_QuantizedCUDA__clone(self, memory_format);
}
at::Tensor dequantize(const at::Tensor & self) {
  return wrapper_QuantizedCUDA_self_dequantize(self);
}
double q_scale(const at::Tensor & self) {
  return wrapper_QuantizedCUDA__q_scale(self);
}
int64_t q_zero_point(const at::Tensor & self) {
  return wrapper_QuantizedCUDA__q_zero_point(self);
}
at::Tensor q_per_channel_scales(const at::Tensor & self) {
  return wrapper_QuantizedCUDA__q_per_channel_scales(self);
}
at::Tensor q_per_channel_zero_points(const at::Tensor & self) {
  return wrapper_QuantizedCUDA__q_per_channel_zero_points(self);
}
int64_t q_per_channel_axis(const at::Tensor & self) {
  return wrapper_QuantizedCUDA__q_per_channel_axis(self);
}
at::Tensor int_repr(const at::Tensor & self) {
  return wrapper_QuantizedCUDA__int_repr(self);
}
at::QScheme qscheme(const at::Tensor & self) {
  return wrapper_QuantizedCUDA__qscheme(self);
}
at::Tensor & set_(at::Tensor & self, at::Storage source, int64_t storage_offset, at::IntArrayRef size, at::IntArrayRef stride) {
  return wrapper_QuantizedCUDA_source_Storage_storage_offset_set_(self, source, storage_offset, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride));
}
at::Tensor & set__symint(at::Tensor & self, at::Storage source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride) {
  return wrapper_QuantizedCUDA_source_Storage_storage_offset_set_(self, source, storage_offset, size, stride);
}
at::Tensor & masked_fill_(at::Tensor & self, const at::Tensor & mask, const at::Scalar & value) {
  return wrapper_QuantizedCUDA_Scalar_masked_fill_(self, mask, value);
}
at::Tensor & masked_fill_(at::Tensor & self, const at::Tensor & mask, const at::Tensor & value) {
  return wrapper_QuantizedCUDA_Tensor_masked_fill_(self, mask, value);
}
at::Tensor view(const at::Tensor & self, at::IntArrayRef size) {
  return wrapper_QuantizedCUDA__view(self, c10::fromIntArrayRefSlow(size));
}
at::Tensor view_symint(const at::Tensor & self, c10::SymIntArrayRef size) {
  return wrapper_QuantizedCUDA__view(self, size);
}
at::Tensor index_select(const at::Tensor & self, int64_t dim, const at::Tensor & index) {
  return wrapper_QuantizedCUDA__index_select(self, dim, index);
}
at::Tensor & index_select_out(at::Tensor & out, const at::Tensor & self, int64_t dim, const at::Tensor & index) {
  return wrapper_QuantizedCUDA_out_index_select_out(self, dim, index, out);
}
at::Tensor & index_select_outf(const at::Tensor & self, int64_t dim, const at::Tensor & index, at::Tensor & out) {
  return wrapper_QuantizedCUDA_out_index_select_out(self, dim, index, out);
}
at::Tensor unfold(const at::Tensor & self, int64_t dimension, int64_t size, int64_t step) {
  return wrapper_QuantizedCUDA__unfold(self, dimension, size, step);
}
at::Tensor _adaptive_avg_pool2d(const at::Tensor & self, at::IntArrayRef output_size) {
  return wrapper_QuantizedCUDA___adaptive_avg_pool2d(self, c10::fromIntArrayRefSlow(output_size));
}
at::Tensor _adaptive_avg_pool2d_symint(const at::Tensor & self, c10::SymIntArrayRef output_size) {
  return wrapper_QuantizedCUDA___adaptive_avg_pool2d(self, output_size);
}
} // namespace quantizedcuda
} // namespace at