// required for old g++ to compile PRId64 macros, see
// https://github.com/pytorch/pytorch/issues/3571
// for context
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif

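// Illustrative sketch (not part of the generated file): defining
// __STDC_FORMAT_MACROS before <cinttypes> is included is what lets the
// PRId64 format macro be used in C++ on old g++, e.g.
//
//   #include <cinttypes>
//   #include <cstdio>
//   int64_t n = 42;
//   std::printf("n = %" PRId64 "\n", n);
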
// An external backend might generate this file within its own code tree
// and run clang-format over all source files in that tree, so disable
// clang-format here since the backend might use a different config.
// clang-format off

// NOTE: This condition is true for all PyTorch internal libraries; it
// just excludes external projects such as torch_xla which
// re-use some of the PyTorch codegen machinery.
#if defined(CAFFE2_BUILD_MAIN_LIB) || \
    defined(TORCH_CUDA_BUILD_MAIN_LIB) || \
    defined(TORCH_HIP_BUILD_MAIN_LIB) || \
    defined(TORCH_CUDA_CU_BUILD_MAIN_LIB) || \
    defined(TORCH_CUDA_CPP_BUILD_MAIN_LIB)
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#endif

// @generated by torchgen/gen.py from RegisterDispatchKey.cpp

#include <c10/core/TensorImpl.h>
#include <c10/core/Allocator.h>
#include <ATen/DeviceGuard.h>
#include <ATen/NamedTensorUtils.h>
#include <ATen/Utils.h>
#include <ATen/WrapDimUtils.h>
#include <ATen/Dispatch.h>
#include <c10/util/ExclusivelyOwned.h>
#include <c10/util/Half.h>
#include <c10/core/UndefinedTensorImpl.h>
#include <c10/util/Optional.h>
#include <ATen/Tensor.h>
#include <ATen/native/Resize.h>

#include <cstddef>
#include <functional>
#include <memory>
#include <utility>

#include <ATen/Config.h>
#include <ATen/core/op_registration/adaption.h>
#include <torch/library.h>
#include <c10/cuda/CUDAGuard.h>
#include <ATen/cuda/ATenCUDAGeneral.h>
#include <ATen/cuda/CUDADevice.h>
#include <ATen/cuda/CUDAContext.h>

#include <ATen/ops/as_strided_native.h>
#include <ATen/ops/empty.h>
#include <ATen/ops/empty_strided.h>
#include <ATen/ops/_copy_from_and_resize.h>
#include <ATen/ops/_copy_from.h>
#include <ATen/ops/_fused_sdp_choice_native.h>
#include <ATen/ops/_native_decoder_only_multi_head_attention_native.h>
#include <ATen/ops/_native_multi_head_attention_native.h>
#include <ATen/ops/_nested_from_padded_and_nested_example_native.h>
#include <ATen/ops/_nested_select_backward_native.h>
#include <ATen/ops/_nested_tensor_offsets_native.h>
#include <ATen/ops/_nested_tensor_size_native.h>
#include <ATen/ops/_nested_tensor_softmax_with_shape_native.h>
#include <ATen/ops/_nested_tensor_strides_native.h>
#include <ATen/ops/_scaled_dot_product_efficient_attention_native.h>
#include <ATen/ops/_scaled_dot_product_flash_attention_native.h>
#include <ATen/ops/_softmax_backward_data_native.h>
#include <ATen/ops/_softmax_native.h>
#include <ATen/ops/_test_autograd_multiple_dispatch_native.h>
#include <ATen/ops/_to_copy_native.h>
#include <ATen/ops/_transform_bias_rescale_qkv_native.h>
#include <ATen/ops/_transformer_decoder_only_layer_fwd_native.h>
#include <ATen/ops/_transformer_encoder_layer_fwd_native.h>
#include <ATen/ops/add_native.h>
#include <ATen/ops/bmm_native.h>
#include <ATen/ops/chunk_native.h>
#include <ATen/ops/clone_native.h>
#include <ATen/ops/copy_native.h>
#include <ATen/ops/detach_native.h>
#include <ATen/ops/div_native.h>
#include <ATen/ops/embedding_native.h>
#include <ATen/ops/empty_like_native.h>
#include <ATen/ops/fill_native.h>
#include <ATen/ops/gelu_native.h>
#include <ATen/ops/is_same_size_native.h>
#include <ATen/ops/linear_backward_native.h>
#include <ATen/ops/linear_native.h>
#include <ATen/ops/matmul_backward_native.h>
#include <ATen/ops/matmul_native.h>
#include <ATen/ops/mul_native.h>
#include <ATen/ops/native_dropout_backward_native.h>
#include <ATen/ops/native_dropout_native.h>
#include <ATen/ops/native_layer_norm_native.h>
#include <ATen/ops/neg_native.h>
#include <ATen/ops/ones_like_native.h>
#include <ATen/ops/relu_native.h>
#include <ATen/ops/select_native.h>
#include <ATen/ops/squeeze_native.h>
#include <ATen/ops/tanh_native.h>
#include <ATen/ops/to_padded_tensor_native.h>
#include <ATen/ops/transpose_native.h>
#include <ATen/ops/unsqueeze_native.h>
#include <ATen/ops/values_native.h>
#include <ATen/ops/view_native.h>

// See template file RegisterDispatchDefinitions.ini
namespace at {
// NB: TORCH_LIBRARY_IMPL must be in an anonymous namespace to avoid
// ambiguity with conflicting identifiers that may have been defined in
// the at namespace already.
namespace {
void resize_out(const Tensor &out, IntArrayRef sizes, IntArrayRef strides, const TensorOptions &options) {
  TORCH_CHECK(options.dtype() == out.dtype(),
      "Expected out tensor to have dtype ", options.dtype(), ", but got ", out.dtype(), " instead");
  TORCH_CHECK(options.device() == out.device(),
      "Expected out tensor to have device ", options.device(), ", but got ", out.device(), " instead");
  const bool resized = at::native::resize_output(out, sizes);
  // Only restride if a resize occurred; otherwise we ignore the (advisory)
  // strides from the meta function and directly use the output tensor's
  // preexisting strides
  if (resized) {
    if (!strides.empty()) {
      TORCH_INTERNAL_ASSERT(!options.memory_format_opt().has_value());
      // TODO: avoid the redispatch here
      out.as_strided_(sizes, strides);
    } else if (options.memory_format_opt().has_value()) {
      out.unsafeGetTensorImpl()->empty_tensor_restride(*options.memory_format_opt());
    }
  }
}
void check_inplace(const Tensor &self, IntArrayRef sizes, const TensorOptions &options) {
  // These checks are needed on those operators that:
  // 1) don't use 'TensorIterator' (e.g. 'addmm' and 'baddbmm')
  // 2) have particular typing rules (e.g. 'cumsum' and 'cumprod')
  // For other operators (e.g. 'add'), 'TensorIterator' already checks
  // these things separately.
  TORCH_CHECK(options.dtype() == self.dtype(),
      "Bad in-place call: ",
      "input tensor dtype ", self.dtype(), " and output tensor dtype ", options.dtype(), " should match");
  TORCH_CHECK(options.device() == self.device(),
      "Bad in-place call: ",
      "input tensor device ", self.device(), " and output tensor device ", options.device(), " should match");
  TORCH_CHECK(sizes == self.sizes(),
      "Bad in-place call: ",
      "input tensor size ", self.sizes(), " and output tensor size ", sizes, " should match");
}
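// Illustrative sketch (not part of the generated output): wrappers emitted by
// this codegen typically call the helpers above from their out= and in-place
// variants, roughly like this hypothetical `foo` op:
//
//   // out= variant: make `out` conform to the meta-computed sizes/strides first
//   at::Tensor & wrapper_foo_out(const at::Tensor & self, at::Tensor & out) {
//     resize_out(out, self.sizes(), /*strides=*/{}, self.options());
//     return at::native::foo_out(self, out);
//   }
//   // in-place variant: validate dtype/device/size compatibility up front
//   at::Tensor & wrapper_foo_(at::Tensor & self) {
//     check_inplace(self, self.sizes(), self.options());
//     return at::native::foo_(self);
//   }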
namespace {
::std::tuple<at::Tensor,at::Tensor> wrapper_NestedTensorCUDA__native_dropout(const at::Tensor & input, double p, c10::optional<bool> train) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, input, "wrapper_NestedTensorCUDA__native_dropout", "input");
  const OptionalDeviceGuard device_guard(device_of(input));
  return at::native::native_dropout_nested(input, p, train);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA__native_dropout_backward(const at::Tensor & grad_output, const at::Tensor & mask, double scale) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, grad_output, "wrapper_NestedTensorCUDA__native_dropout_backward", "grad_output");
  c10::impl::check_and_update_common_device(common_device, mask, "wrapper_NestedTensorCUDA__native_dropout_backward", "mask");
  const OptionalDeviceGuard device_guard(device_of(grad_output));
  return at::native::native_dropout_backward(grad_output, mask, scale);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA_Tensor_add(const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::NestedTensor_add_Tensor(self, other, alpha);
}
} // anonymous namespace
namespace {
at::Tensor & wrapper_NestedTensorCUDA_Tensor_add_(at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::NestedTensor_add__Tensor(self, other, alpha);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA__bmm(const at::Tensor & self, const at::Tensor & mat2) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA__bmm", "self");
  c10::impl::check_and_update_common_device(common_device, mat2, "wrapper_NestedTensorCUDA__bmm", "mat2");
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::bmm_nested_cuda(self, mat2);
}
} // anonymous namespace
namespace {
::std::vector<at::Tensor> wrapper_NestedTensorCUDA__chunk(const at::Tensor & self, int64_t chunks, int64_t dim) {
  // No device check
  // DeviceGuard omitted
  return at::native::chunk_nested_tensor(self, chunks, dim);
}
} // anonymous namespace
namespace {
at::Tensor & wrapper_NestedTensorCUDA__copy_(at::Tensor & self, const at::Tensor & src, bool non_blocking) {
  // No device check
  // DeviceGuard omitted
  return at::native::copy_nested_(self, src, non_blocking);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA_Tensor_div(const at::Tensor & self, const at::Tensor & other) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::NestedTensor_div_Tensor(self, other);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA_Scalar_div(const at::Tensor & self, const at::Scalar & other) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::NestedTensor_div_Scalar(self, other);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA__embedding(const at::Tensor & weight, const at::Tensor & indices, c10::SymInt padding_idx, bool scale_grad_by_freq, bool sparse) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, weight, "wrapper_NestedTensorCUDA__embedding", "weight");
  c10::impl::check_and_update_common_device(common_device, indices, "wrapper_NestedTensorCUDA__embedding", "indices");
  const OptionalDeviceGuard device_guard(device_of(weight));
  return at::native::NestedTensor_embedding(weight, indices, padding_idx.expect_int(), scale_grad_by_freq, sparse);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA__empty_like(const at::Tensor & self, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
  // No device check
  // DeviceGuard omitted
  return at::native::empty_like_nested(self, dtype, layout, device, pin_memory, memory_format);
}
} // anonymous namespace
namespace {
at::Tensor & wrapper_NestedTensorCUDA_Scalar_fill_(at::Tensor & self, const at::Scalar & value) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::fill_nested_(self, value);
}
} // anonymous namespace
namespace {
at::Tensor & wrapper_NestedTensorCUDA_Tensor_fill_(at::Tensor & self, const at::Tensor & value) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::fill_nested_(self, value);
}
} // anonymous namespace
namespace {
bool wrapper_NestedTensorCUDA__is_same_size(const at::Tensor & self, const at::Tensor & other) {
  // No device check
  // DeviceGuard omitted
  return at::native::nested_is_same_size(self, other);
}
} // anonymous namespace
namespace {
::std::tuple<at::Tensor,at::Tensor,at::Tensor> wrapper_NestedTensorCUDA__native_layer_norm(const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, double eps) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, input, "wrapper_NestedTensorCUDA__native_layer_norm", "input");
  c10::impl::check_and_update_common_device(common_device, weight, "wrapper_NestedTensorCUDA__native_layer_norm", "weight");
  c10::impl::check_and_update_common_device(common_device, bias, "wrapper_NestedTensorCUDA__native_layer_norm", "bias");
  const OptionalDeviceGuard device_guard(device_of(input));
  return at::native::nested_layer_norm(input, C10_AS_INTARRAYREF_SLOW(normalized_shape), weight, bias, eps);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA__linear(const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, input, "wrapper_NestedTensorCUDA__linear", "input");
  c10::impl::check_and_update_common_device(common_device, weight, "wrapper_NestedTensorCUDA__linear", "weight");
  c10::impl::check_and_update_common_device(common_device, bias, "wrapper_NestedTensorCUDA__linear", "bias");
  const OptionalDeviceGuard device_guard(device_of(input));
  return at::native::nested_linear(input, weight, bias);
}
} // anonymous namespace
namespace {
::std::tuple<at::Tensor,at::Tensor,at::Tensor> wrapper_NestedTensorCUDA__linear_backward(const at::Tensor & self, const at::Tensor & grad_output, const at::Tensor & weight, ::std::array<bool,3> output_mask) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA__linear_backward", "self");
  c10::impl::check_and_update_common_device(common_device, grad_output, "wrapper_NestedTensorCUDA__linear_backward", "grad_output");
  c10::impl::check_and_update_common_device(common_device, weight, "wrapper_NestedTensorCUDA__linear_backward", "weight");
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::nested_linear_backward(self, grad_output, weight, output_mask);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA__matmul(const at::Tensor & self, const at::Tensor & other) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA__matmul", "self");
  c10::impl::check_and_update_common_device(common_device, other, "wrapper_NestedTensorCUDA__matmul", "other");
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::matmul_nested(self, other);
}
} // anonymous namespace
namespace {
at::Tensor & wrapper_NestedTensorCUDA_out_matmul_out(const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, out, "wrapper_NestedTensorCUDA_out_matmul_out", "out");
  c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA_out_matmul_out", "self");
  c10::impl::check_and_update_common_device(common_device, other, "wrapper_NestedTensorCUDA_out_matmul_out", "other");
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::matmul_out_nested(self, other, out);
}
} // anonymous namespace
namespace {
::std::tuple<at::Tensor,at::Tensor> wrapper_NestedTensorCUDA__matmul_backward(const at::Tensor & grad, const at::Tensor & self, const at::Tensor & other, ::std::array<bool,2> mask) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, grad, "wrapper_NestedTensorCUDA__matmul_backward", "grad");
  c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA__matmul_backward", "self");
  c10::impl::check_and_update_common_device(common_device, other, "wrapper_NestedTensorCUDA__matmul_backward", "other");
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::matmul_backward_nested(grad, self, other, mask);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA_Tensor_mul(const at::Tensor & self, const at::Tensor & other) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::NestedTensor_mul_Tensor(self, other);
}
} // anonymous namespace
namespace {
at::Tensor & wrapper_NestedTensorCUDA_Tensor_mul_(at::Tensor & self, const at::Tensor & other) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::NestedTensor_mul__Tensor(self, other);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA_Scalar_mul(const at::Tensor & self, const at::Scalar & other) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::NestedTensor_mul_Scalar(self, other);
}
} // anonymous namespace
namespace {
at::Tensor & wrapper_NestedTensorCUDA_Scalar_mul_(at::Tensor & self, const at::Scalar & other) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::NestedTensor_mul__Scalar(self, other);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA__ones_like(const at::Tensor & self, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA__ones_like", "self");
  globalContext().lazyInitCUDA();
  const DeviceGuard device_guard(device_or_default(device));
  return at::native::ones_like(self, dtype, layout, device, pin_memory, memory_format);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA__neg(const at::Tensor & self) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::NestedTensor_neg(self);
}
} // anonymous namespace
namespace {
at::Tensor & wrapper_NestedTensorCUDA__neg_(at::Tensor & self) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::NestedTensor_neg_(self);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA__relu(const at::Tensor & self) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::NestedTensor_relu(self);
}
} // anonymous namespace
namespace {
at::Tensor & wrapper_NestedTensorCUDA__relu_(at::Tensor & self) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::NestedTensor_relu_(self);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA__gelu(const at::Tensor & self, c10::string_view approximate) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::NestedTensor_gelu(self, approximate);
}
} // anonymous namespace
namespace {
at::Tensor & wrapper_NestedTensorCUDA__gelu_(at::Tensor & self, c10::string_view approximate) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::NestedTensor_gelu_(self, approximate);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA_int_select(const at::Tensor & self, int64_t dim, c10::SymInt index) {
  // No device check
  // DeviceGuard omitted
  return at::native::select_nested(self, dim, index.expect_int());
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA___nested_select_backward(const at::Tensor & grad_output, const at::Tensor & self, int64_t dim, c10::SymInt index) {
  // No device check
  // DeviceGuard omitted
  return at::native::_nested_select_backward_symint(grad_output, self, dim, index);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA__detach(const at::Tensor & self) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA__detach", "self");
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::detach(self);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA___softmax(const at::Tensor & self, int64_t dim, bool half_to_float) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA___softmax", "self");
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::softmax_nested(self, dim, half_to_float);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA___softmax_backward_data(const at::Tensor & grad_output, const at::Tensor & output, int64_t dim, at::ScalarType input_dtype) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, grad_output, "wrapper_NestedTensorCUDA___softmax_backward_data", "grad_output");
  c10::impl::check_and_update_common_device(common_device, output, "wrapper_NestedTensorCUDA___softmax_backward_data", "output");
  const OptionalDeviceGuard device_guard(device_of(grad_output));
  return at::native::nested_softmax_backward(grad_output, output, dim, input_dtype);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA__squeeze(const at::Tensor & self) {
  // No device check
  // DeviceGuard omitted
  return at::native::squeeze_nested(self);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA_dim_squeeze(const at::Tensor & self, int64_t dim) {
  // No device check
  // DeviceGuard omitted
  return at::native::squeeze_dim_nested(self, dim);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA_dims_squeeze(const at::Tensor & self, at::IntArrayRef dim) {
  // No device check
  // DeviceGuard omitted
  return at::native::squeeze_dim_nested(self, dim);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA__tanh(const at::Tensor & self) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::NestedTensor_tanh(self);
}
} // anonymous namespace
namespace {
at::Tensor & wrapper_NestedTensorCUDA__tanh_(at::Tensor & self) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::NestedTensor_tanh_(self);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA_int_transpose(const at::Tensor & self, int64_t dim0, int64_t dim1) {
  // No device check
  // DeviceGuard omitted
  return at::native::transpose_nested(self, dim0, dim1);
}
} // anonymous namespace
namespace {
::std::tuple<at::Tensor,at::Tensor,at::Tensor> wrapper_NestedTensorCUDA___transform_bias_rescale_qkv(const at::Tensor & qkv, const at::Tensor & qkv_bias, int64_t num_heads) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, qkv, "wrapper_NestedTensorCUDA___transform_bias_rescale_qkv", "qkv");
  c10::impl::check_and_update_common_device(common_device, qkv_bias, "wrapper_NestedTensorCUDA___transform_bias_rescale_qkv", "qkv_bias");
  const OptionalDeviceGuard device_guard(device_of(qkv));
  return at::native::transform_bias_rescale_qkv_cuda(qkv, qkv_bias, num_heads);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA___nested_tensor_size(const at::Tensor & self) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA___nested_tensor_size", "self");
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::_nested_tensor_size(self);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA___nested_tensor_strides(const at::Tensor & self) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA___nested_tensor_strides", "self");
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::_nested_tensor_strides(self);
}
} // anonymous namespace
namespace {
::std::vector<int64_t> wrapper_NestedTensorCUDA___nested_tensor_offsets(const at::Tensor & self) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA___nested_tensor_offsets", "self");
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::_nested_tensor_offsets(self);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA___nested_from_padded_and_nested_example(const at::Tensor & padded, const at::Tensor & nt_example) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, padded, "wrapper_NestedTensorCUDA___nested_from_padded_and_nested_example", "padded");
  c10::impl::check_and_update_common_device(common_device, nt_example, "wrapper_NestedTensorCUDA___nested_from_padded_and_nested_example", "nt_example");
  const OptionalDeviceGuard device_guard(device_of(padded));
  return at::native::NestedTensor_from_padded_and_nested_example(padded, nt_example);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA__unsqueeze(const at::Tensor & self, int64_t dim) {
  // No device check
  // DeviceGuard omitted
  return at::native::unsqueeze_nested(self, dim);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA__clone(const at::Tensor & self, c10::optional<at::MemoryFormat> memory_format) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA__clone", "self");
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::clone_nested(self, memory_format);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA__values(const at::Tensor & self) {
  // No device check
  // DeviceGuard omitted
  return at::native::values_nested(self);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA___to_copy(const at::Tensor & self, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, bool non_blocking, c10::optional<at::MemoryFormat> memory_format) {
  // No device check
  // DeviceGuard omitted
  return at::native::_to_copy_nested(self, dtype, layout, device, pin_memory, non_blocking, memory_format);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA__view(const at::Tensor & self, c10::SymIntArrayRef size) {
  // No device check
  // DeviceGuard omitted
  return at::native::view_nested(self, C10_AS_INTARRAYREF_SLOW(size));
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA_fullcoverage__test_autograd_multiple_dispatch(const at::Tensor & self) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA_fullcoverage__test_autograd_multiple_dispatch", "self");
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::_test_autograd_multiple_dispatch_fullcoverage(self);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA_ntonly__test_autograd_multiple_dispatch(const at::Tensor & self, bool b) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA_ntonly__test_autograd_multiple_dispatch", "self");
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::_test_autograd_multiple_dispatch_ntonly(self, b);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA__to_padded_tensor(const at::Tensor & self, double padding, at::OptionalSymIntArrayRef output_size) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA__to_padded_tensor", "self");
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::NestedTensor_to_padded_tensor_cuda(self, padding, output_size.has_value() ? c10::make_optional(C10_AS_INTARRAYREF_SLOW(*output_size)) : c10::nullopt);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA___nested_tensor_softmax_with_shape(const at::Tensor & self, const at::Tensor & query) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA___nested_tensor_softmax_with_shape", "self");
  c10::impl::check_and_update_common_device(common_device, query, "wrapper_NestedTensorCUDA___nested_tensor_softmax_with_shape", "query");
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::NestedTensor_softmax_dropout_cuda(self, query);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd(const at::Tensor & src, int64_t embed_dim, int64_t num_heads, const at::Tensor & qkv_weight, const at::Tensor & qkv_bias, const at::Tensor & proj_weight, const at::Tensor & proj_bias, bool use_gelu, bool norm_first, double eps, const at::Tensor & norm_weight_1, const at::Tensor & norm_bias_1, const at::Tensor & norm_weight_2, const at::Tensor & norm_bias_2, const at::Tensor & ffn_weight_1, const at::Tensor & ffn_bias_1, const at::Tensor & ffn_weight_2, const at::Tensor & ffn_bias_2, const c10::optional<at::Tensor> & mask, c10::optional<int64_t> mask_type) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, src, "wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd", "src");
  c10::impl::check_and_update_common_device(common_device, qkv_weight, "wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd", "qkv_weight");
  c10::impl::check_and_update_common_device(common_device, qkv_bias, "wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd", "qkv_bias");
  c10::impl::check_and_update_common_device(common_device, proj_weight, "wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd", "proj_weight");
  c10::impl::check_and_update_common_device(common_device, proj_bias, "wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd", "proj_bias");
  c10::impl::check_and_update_common_device(common_device, norm_weight_1, "wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd", "norm_weight_1");
  c10::impl::check_and_update_common_device(common_device, norm_bias_1, "wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd", "norm_bias_1");
  c10::impl::check_and_update_common_device(common_device, norm_weight_2, "wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd", "norm_weight_2");
  c10::impl::check_and_update_common_device(common_device, norm_bias_2, "wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd", "norm_bias_2");
  c10::impl::check_and_update_common_device(common_device, ffn_weight_1, "wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd", "ffn_weight_1");
  c10::impl::check_and_update_common_device(common_device, ffn_bias_1, "wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd", "ffn_bias_1");
  c10::impl::check_and_update_common_device(common_device, ffn_weight_2, "wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd", "ffn_weight_2");
  c10::impl::check_and_update_common_device(common_device, ffn_bias_2, "wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd", "ffn_bias_2");
  c10::impl::check_and_update_common_device(common_device, mask, "wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd", "mask");
  const OptionalDeviceGuard device_guard(device_of(src));
  return at::native::transformer_encoder_layer_forward(src, embed_dim, num_heads, qkv_weight, qkv_bias, proj_weight, proj_bias, use_gelu, norm_first, eps, norm_weight_1, norm_bias_1, norm_weight_2, norm_bias_2, ffn_weight_1, ffn_bias_1, ffn_weight_2, ffn_bias_2, mask, mask_type);
}
} // anonymous namespace
namespace {
::std::tuple<at::Tensor,at::Tensor> wrapper_NestedTensorCUDA___native_multi_head_attention(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, int64_t embed_dim, int64_t num_head, const at::Tensor & qkv_weight, const at::Tensor & qkv_bias, const at::Tensor & proj_weight, const at::Tensor & proj_bias, const c10::optional<at::Tensor> & mask, bool need_weights, bool average_attn_weights, c10::optional<int64_t> mask_type) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, query, "wrapper_NestedTensorCUDA___native_multi_head_attention", "query");
  c10::impl::check_and_update_common_device(common_device, key, "wrapper_NestedTensorCUDA___native_multi_head_attention", "key");
  c10::impl::check_and_update_common_device(common_device, value, "wrapper_NestedTensorCUDA___native_multi_head_attention", "value");
  c10::impl::check_and_update_common_device(common_device, qkv_weight, "wrapper_NestedTensorCUDA___native_multi_head_attention", "qkv_weight");
  c10::impl::check_and_update_common_device(common_device, qkv_bias, "wrapper_NestedTensorCUDA___native_multi_head_attention", "qkv_bias");
  c10::impl::check_and_update_common_device(common_device, proj_weight, "wrapper_NestedTensorCUDA___native_multi_head_attention", "proj_weight");
  c10::impl::check_and_update_common_device(common_device, proj_bias, "wrapper_NestedTensorCUDA___native_multi_head_attention", "proj_bias");
  c10::impl::check_and_update_common_device(common_device, mask, "wrapper_NestedTensorCUDA___native_multi_head_attention", "mask");
  const OptionalDeviceGuard device_guard(device_of(query));
  return at::native::native_multi_head_attention_cuda(query, key, value, embed_dim, num_head, qkv_weight, qkv_bias, proj_weight, proj_bias, mask, need_weights, average_attn_weights, mask_type);
}
} // anonymous namespace
namespace {
int64_t wrapper_NestedTensorCUDA___fused_sdp_choice(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const c10::optional<at::Tensor> & attn_mask, double dropout_p, bool is_causal) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, query, "wrapper_NestedTensorCUDA___fused_sdp_choice", "query");
  c10::impl::check_and_update_common_device(common_device, key, "wrapper_NestedTensorCUDA___fused_sdp_choice", "key");
  c10::impl::check_and_update_common_device(common_device, value, "wrapper_NestedTensorCUDA___fused_sdp_choice", "value");
  c10::impl::check_and_update_common_device(common_device, attn_mask, "wrapper_NestedTensorCUDA___fused_sdp_choice", "attn_mask");
  const OptionalDeviceGuard device_guard(device_of(query));
  return at::native::_fused_sdp_choice_cuda(query, key, value, attn_mask, dropout_p, is_causal);
}
} // anonymous namespace
namespace {
::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor,int64_t,int64_t,int64_t,int64_t,at::Tensor> wrapper_NestedTensorCUDA___scaled_dot_product_flash_attention(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, double dropout_p, bool is_causal, bool return_debug_mask) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, query, "wrapper_NestedTensorCUDA___scaled_dot_product_flash_attention", "query");
  c10::impl::check_and_update_common_device(common_device, key, "wrapper_NestedTensorCUDA___scaled_dot_product_flash_attention", "key");
  c10::impl::check_and_update_common_device(common_device, value, "wrapper_NestedTensorCUDA___scaled_dot_product_flash_attention", "value");
  const OptionalDeviceGuard device_guard(device_of(query));
  return at::native::_scaled_dot_product_flash_attention_nestedtensor_cuda(query, key, value, dropout_p, is_causal, return_debug_mask);
}
} // anonymous namespace
namespace {
::std::tuple<at::Tensor,at::Tensor> wrapper_NestedTensorCUDA___scaled_dot_product_efficient_attention(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, bool compute_log_sumexp, bool is_causal) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, query, "wrapper_NestedTensorCUDA___scaled_dot_product_efficient_attention", "query");
  c10::impl::check_and_update_common_device(common_device, key, "wrapper_NestedTensorCUDA___scaled_dot_product_efficient_attention", "key");
  c10::impl::check_and_update_common_device(common_device, value, "wrapper_NestedTensorCUDA___scaled_dot_product_efficient_attention", "value");
  const OptionalDeviceGuard device_guard(device_of(query));
  return at::native::_scaled_dot_product_efficient_attention_nestedtensor_cuda(query, key, value, compute_log_sumexp, is_causal);
}
} // anonymous namespace
namespace {
::std::tuple<at::Tensor,at::Tensor,at::Tensor> wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd(const at::Tensor & src, int64_t embed_dim, int64_t num_heads, const at::Tensor & qkv_weight, const at::Tensor & qkv_bias, const at::Tensor & proj_weight, const at::Tensor & proj_bias, bool use_gelu, bool norm_first, double eps, const at::Tensor & norm_weight_1, const at::Tensor & norm_bias_1, const at::Tensor & norm_weight_2, const at::Tensor & norm_bias_2, const at::Tensor & ffn_weight_1, const at::Tensor & ffn_bias_1, const at::Tensor & ffn_weight_2, const at::Tensor & ffn_bias_2, const c10::optional<at::Tensor> & mask, const c10::optional<at::Tensor> & incr_key, const c10::optional<at::Tensor> & incr_value) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, src, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd", "src");
  c10::impl::check_and_update_common_device(common_device, qkv_weight, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd", "qkv_weight");
  c10::impl::check_and_update_common_device(common_device, qkv_bias, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd", "qkv_bias");
  c10::impl::check_and_update_common_device(common_device, proj_weight, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd", "proj_weight");
  c10::impl::check_and_update_common_device(common_device, proj_bias, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd", "proj_bias");
  c10::impl::check_and_update_common_device(common_device, norm_weight_1, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd", "norm_weight_1");
  c10::impl::check_and_update_common_device(common_device, norm_bias_1, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd", "norm_bias_1");
  c10::impl::check_and_update_common_device(common_device, norm_weight_2, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd", "norm_weight_2");
  c10::impl::check_and_update_common_device(common_device, norm_bias_2, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd", "norm_bias_2");
  c10::impl::check_and_update_common_device(common_device, ffn_weight_1, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd", "ffn_weight_1");
  c10::impl::check_and_update_common_device(common_device, ffn_bias_1, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd", "ffn_bias_1");
  c10::impl::check_and_update_common_device(common_device, ffn_weight_2, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd", "ffn_weight_2");
  c10::impl::check_and_update_common_device(common_device, ffn_bias_2, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd", "ffn_bias_2");
  c10::impl::check_and_update_common_device(common_device, mask, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd", "mask");
  c10::impl::check_and_update_common_device(common_device, incr_key, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd", "incr_key");
  c10::impl::check_and_update_common_device(common_device, incr_value, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd", "incr_value");
  const OptionalDeviceGuard device_guard(device_of(src));
  return at::native::transformer_decoder_only_layer_forward(src, embed_dim, num_heads, qkv_weight, qkv_bias, proj_weight, proj_bias, use_gelu, norm_first, eps, norm_weight_1, norm_bias_1, norm_weight_2, norm_bias_2, ffn_weight_1, ffn_bias_1, ffn_weight_2, ffn_bias_2, mask, incr_key, incr_value);
}
} // anonymous namespace
namespace {
::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor> wrapper_NestedTensorCUDA___native_decoder_only_multi_head_attention(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, int64_t embed_dim, int64_t num_head, const at::Tensor & qkv_weight, const at::Tensor & qkv_bias, const at::Tensor & proj_weight, const at::Tensor & proj_bias, const c10::optional<at::Tensor> & mask, const c10::optional<at::Tensor> & incr_key, const c10::optional<at::Tensor> & incr_value, bool need_weights, bool average_attn_weights) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, query, "wrapper_NestedTensorCUDA___native_decoder_only_multi_head_attention", "query");
  c10::impl::check_and_update_common_device(common_device, key, "wrapper_NestedTensorCUDA___native_decoder_only_multi_head_attention", "key");
  c10::impl::check_and_update_common_device(common_device, value, "wrapper_NestedTensorCUDA___native_decoder_only_multi_head_attention", "value");
  c10::impl::check_and_update_common_device(common_device, qkv_weight, "wrapper_NestedTensorCUDA___native_decoder_only_multi_head_attention", "qkv_weight");
  c10::impl::check_and_update_common_device(common_device, qkv_bias, "wrapper_NestedTensorCUDA___native_decoder_only_multi_head_attention", "qkv_bias");
  c10::impl::check_and_update_common_device(common_device, proj_weight, "wrapper_NestedTensorCUDA___native_decoder_only_multi_head_attention", "proj_weight");
  c10::impl::check_and_update_common_device(common_device, proj_bias, "wrapper_NestedTensorCUDA___native_decoder_only_multi_head_attention", "proj_bias");
  c10::impl::check_and_update_common_device(common_device, mask, "wrapper_NestedTensorCUDA___native_decoder_only_multi_head_attention", "mask");
  c10::impl::check_and_update_common_device(common_device, incr_key, "wrapper_NestedTensorCUDA___native_decoder_only_multi_head_attention", "incr_key");
  c10::impl::check_and_update_common_device(common_device, incr_value, "wrapper_NestedTensorCUDA___native_decoder_only_multi_head_attention", "incr_value");
  const OptionalDeviceGuard device_guard(device_of(query));
  return at::native::native_decoder_only_multi_head_attention(query, key, value, embed_dim, num_head, qkv_weight, qkv_bias, proj_weight, proj_bias, mask, incr_key, incr_value, need_weights, average_attn_weights);
}
} // anonymous namespace
TORCH_LIBRARY_IMPL(aten, NestedTensorCUDA, m) {
    m.impl("native_dropout",
TORCH_FN(wrapper_NestedTensorCUDA__native_dropout));
m.impl("native_dropout_backward",
TORCH_FN(wrapper_NestedTensorCUDA__native_dropout_backward));
m.impl("add.Tensor",
TORCH_FN(wrapper_NestedTensorCUDA_Tensor_add));
m.impl("add_.Tensor",
TORCH_FN(wrapper_NestedTensorCUDA_Tensor_add_));
m.impl("bmm",
TORCH_FN(wrapper_NestedTensorCUDA__bmm));
m.impl("chunk",
TORCH_FN(wrapper_NestedTensorCUDA__chunk));
m.impl("copy_",
TORCH_FN(wrapper_NestedTensorCUDA__copy_));
m.impl("div.Tensor",
TORCH_FN(wrapper_NestedTensorCUDA_Tensor_div));
m.impl("div.Scalar",
TORCH_FN(wrapper_NestedTensorCUDA_Scalar_div));
m.impl("embedding",
TORCH_FN(wrapper_NestedTensorCUDA__embedding));
m.impl("empty_like",
TORCH_FN(wrapper_NestedTensorCUDA__empty_like));
m.impl("fill_.Scalar",
TORCH_FN(wrapper_NestedTensorCUDA_Scalar_fill_));
m.impl("fill_.Tensor",
TORCH_FN(wrapper_NestedTensorCUDA_Tensor_fill_));
m.impl("is_same_size",
TORCH_FN(wrapper_NestedTensorCUDA__is_same_size));
m.impl("native_layer_norm",
TORCH_FN(wrapper_NestedTensorCUDA__native_layer_norm));
m.impl("linear",
TORCH_FN(wrapper_NestedTensorCUDA__linear));
m.impl("linear_backward",
TORCH_FN(wrapper_NestedTensorCUDA__linear_backward));
m.impl("matmul",
TORCH_FN(wrapper_NestedTensorCUDA__matmul));
m.impl("matmul.out",
TORCH_FN(wrapper_NestedTensorCUDA_out_matmul_out));
m.impl("matmul_backward",
TORCH_FN(wrapper_NestedTensorCUDA__matmul_backward));
m.impl("mul.Tensor",
TORCH_FN(wrapper_NestedTensorCUDA_Tensor_mul));
m.impl("mul_.Tensor",
TORCH_FN(wrapper_NestedTensorCUDA_Tensor_mul_));
m.impl("mul.Scalar",
TORCH_FN(wrapper_NestedTensorCUDA_Scalar_mul));
m.impl("mul_.Scalar",
TORCH_FN(wrapper_NestedTensorCUDA_Scalar_mul_));
m.impl("ones_like",
TORCH_FN(wrapper_NestedTensorCUDA__ones_like));
m.impl("neg",
TORCH_FN(wrapper_NestedTensorCUDA__neg));
m.impl("neg_",
TORCH_FN(wrapper_NestedTensorCUDA__neg_));
m.impl("relu",
TORCH_FN(wrapper_NestedTensorCUDA__relu));
m.impl("relu_",
TORCH_FN(wrapper_NestedTensorCUDA__relu_));
m.impl("gelu",
TORCH_FN(wrapper_NestedTensorCUDA__gelu));
m.impl("gelu_",
TORCH_FN(wrapper_NestedTensorCUDA__gelu_));
m.impl("select.int",
TORCH_FN(wrapper_NestedTensorCUDA_int_select));
m.impl("_nested_select_backward",
TORCH_FN(wrapper_NestedTensorCUDA___nested_select_backward));
m.impl("detach",
TORCH_FN(wrapper_NestedTensorCUDA__detach));
m.impl("_softmax",
TORCH_FN(wrapper_NestedTensorCUDA___softmax));
m.impl("_softmax_backward_data",
TORCH_FN(wrapper_NestedTensorCUDA___softmax_backward_data));
m.impl("squeeze",
TORCH_FN(wrapper_NestedTensorCUDA__squeeze));
m.impl("squeeze.dim",
TORCH_FN(wrapper_NestedTensorCUDA_dim_squeeze));
m.impl("squeeze.dims",
TORCH_FN(wrapper_NestedTensorCUDA_dims_squeeze));
m.impl("tanh",
TORCH_FN(wrapper_NestedTensorCUDA__tanh));
m.impl("tanh_",
TORCH_FN(wrapper_NestedTensorCUDA__tanh_));
m.impl("transpose.int",
TORCH_FN(wrapper_NestedTensorCUDA_int_transpose));
m.impl("_transform_bias_rescale_qkv",
TORCH_FN(wrapper_NestedTensorCUDA___transform_bias_rescale_qkv));
m.impl("_nested_tensor_size",
TORCH_FN(wrapper_NestedTensorCUDA___nested_tensor_size));
m.impl("_nested_tensor_strides",
TORCH_FN(wrapper_NestedTensorCUDA___nested_tensor_strides));
m.impl("_nested_tensor_offsets",
TORCH_FN(wrapper_NestedTensorCUDA___nested_tensor_offsets));
m.impl("_nested_from_padded_and_nested_example",
TORCH_FN(wrapper_NestedTensorCUDA___nested_from_padded_and_nested_example));
m.impl("unsqueeze",
TORCH_FN(wrapper_NestedTensorCUDA__unsqueeze));
m.impl("clone",
TORCH_FN(wrapper_NestedTensorCUDA__clone));
m.impl("values",
TORCH_FN(wrapper_NestedTensorCUDA__values));
m.impl("_to_copy",
TORCH_FN(wrapper_NestedTensorCUDA___to_copy));
m.impl("view",
TORCH_FN(wrapper_NestedTensorCUDA__view));
m.impl("_test_autograd_multiple_dispatch.fullcoverage",
TORCH_FN(wrapper_NestedTensorCUDA_fullcoverage__test_autograd_multiple_dispatch));
m.impl("_test_autograd_multiple_dispatch.ntonly",
TORCH_FN(wrapper_NestedTensorCUDA_ntonly__test_autograd_multiple_dispatch));
m.impl("to_padded_tensor",
TORCH_FN(wrapper_NestedTensorCUDA__to_padded_tensor));
m.impl("_nested_tensor_softmax_with_shape",
TORCH_FN(wrapper_NestedTensorCUDA___nested_tensor_softmax_with_shape));
m.impl("_transformer_encoder_layer_fwd",
TORCH_FN(wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd));
m.impl("_native_multi_head_attention",
TORCH_FN(wrapper_NestedTensorCUDA___native_multi_head_attention));
m.impl("_fused_sdp_choice",
TORCH_FN(wrapper_NestedTensorCUDA___fused_sdp_choice));
m.impl("_scaled_dot_product_flash_attention",
TORCH_FN(wrapper_NestedTensorCUDA___scaled_dot_product_flash_attention));
m.impl("_scaled_dot_product_efficient_attention",
TORCH_FN(wrapper_NestedTensorCUDA___scaled_dot_product_efficient_attention));
m.impl("_transformer_decoder_only_layer_fwd",
TORCH_FN(wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd));
m.impl("_native_decoder_only_multi_head_attention",
TORCH_FN(wrapper_NestedTensorCUDA___native_decoder_only_multi_head_attention));
};
} // anonymous namespace
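// Illustrative sketch (not part of the generated file): once the registrations
// above are loaded, calls on CUDA nested tensors route here through the
// dispatcher, e.g. (hypothetical usage, `nt` and `nt2` being nested CUDA tensors):
//
//   at::Tensor activated = at::relu(nt);     // -> wrapper_NestedTensorCUDA__relu
//   at::Tensor product = at::matmul(nt, nt2); // -> wrapper_NestedTensorCUDA__matmul
//
// The at::nestedtensorcuda:: functions below are the generated static-dispatch
// entry points that call the same wrappers directly, bypassing the dispatcher.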
851namespace nestedtensorcuda {
852::std::tuple<at::Tensor,at::Tensor> native_dropout(const at::Tensor & input, double p, c10::optional<bool> train) {
853return wrapper_NestedTensorCUDA__native_dropout(input, p, train);
854}
855at::Tensor native_dropout_backward(const at::Tensor & grad_output, const at::Tensor & mask, double scale) {
856return wrapper_NestedTensorCUDA__native_dropout_backward(grad_output, mask, scale);
857}
858at::Tensor add(const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha) {
859return wrapper_NestedTensorCUDA_Tensor_add(self, other, alpha);
860}
861at::Tensor & add_(at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha) {
862return wrapper_NestedTensorCUDA_Tensor_add_(self, other, alpha);
863}
864at::Tensor bmm(const at::Tensor & self, const at::Tensor & mat2) {
865return wrapper_NestedTensorCUDA__bmm(self, mat2);
866}
867::std::vector<at::Tensor> chunk(const at::Tensor & self, int64_t chunks, int64_t dim) {
868return wrapper_NestedTensorCUDA__chunk(self, chunks, dim);
869}
870at::Tensor & copy_(at::Tensor & self, const at::Tensor & src, bool non_blocking) {
871return wrapper_NestedTensorCUDA__copy_(self, src, non_blocking);
872}
873at::Tensor div(const at::Tensor & self, const at::Tensor & other) {
874return wrapper_NestedTensorCUDA_Tensor_div(self, other);
875}
876at::Tensor div(const at::Tensor & self, const at::Scalar & other) {
877return wrapper_NestedTensorCUDA_Scalar_div(self, other);
878}
879at::Tensor embedding(const at::Tensor & weight, const at::Tensor & indices, int64_t padding_idx, bool scale_grad_by_freq, bool sparse) {
880return wrapper_NestedTensorCUDA__embedding(weight, indices, padding_idx, scale_grad_by_freq, sparse);
881}
882at::Tensor embedding_symint(const at::Tensor & weight, const at::Tensor & indices, c10::SymInt padding_idx, bool scale_grad_by_freq, bool sparse) {
883return wrapper_NestedTensorCUDA__embedding(weight, indices, padding_idx, scale_grad_by_freq, sparse);
884}
885at::Tensor empty_like(const at::Tensor & self, at::TensorOptions options, c10::optional<at::MemoryFormat> memory_format) {
886return wrapper_NestedTensorCUDA__empty_like(self, optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
887}
888at::Tensor empty_like(const at::Tensor & self, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
889return wrapper_NestedTensorCUDA__empty_like(self, dtype, layout, device, pin_memory, memory_format);
890}
891at::Tensor & fill_(at::Tensor & self, const at::Scalar & value) {
892return wrapper_NestedTensorCUDA_Scalar_fill_(self, value);
893}
894at::Tensor & fill_(at::Tensor & self, const at::Tensor & value) {
895return wrapper_NestedTensorCUDA_Tensor_fill_(self, value);
896}
897bool is_same_size(const at::Tensor & self, const at::Tensor & other) {
898return wrapper_NestedTensorCUDA__is_same_size(self, other);
899}
900::std::tuple<at::Tensor,at::Tensor,at::Tensor> native_layer_norm(const at::Tensor & input, at::IntArrayRef normalized_shape, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, double eps) {
901return wrapper_NestedTensorCUDA__native_layer_norm(input, c10::fromIntArrayRefSlow(normalized_shape), weight, bias, eps);
902}
903::std::tuple<at::Tensor,at::Tensor,at::Tensor> native_layer_norm_symint(const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, double eps) {
904return wrapper_NestedTensorCUDA__native_layer_norm(input, normalized_shape, weight, bias, eps);
905}
906at::Tensor linear(const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias) {
907return wrapper_NestedTensorCUDA__linear(input, weight, bias);
908}
909::std::tuple<at::Tensor,at::Tensor,at::Tensor> linear_backward(const at::Tensor & self, const at::Tensor & grad_output, const at::Tensor & weight, ::std::array<bool,3> output_mask) {
910return wrapper_NestedTensorCUDA__linear_backward(self, grad_output, weight, output_mask);
911}
912at::Tensor matmul(const at::Tensor & self, const at::Tensor & other) {
913return wrapper_NestedTensorCUDA__matmul(self, other);
914}
915at::Tensor & matmul_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
916return wrapper_NestedTensorCUDA_out_matmul_out(self, other, out);
917}
918at::Tensor & matmul_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
919return wrapper_NestedTensorCUDA_out_matmul_out(self, other, out);
920}
::std::tuple<at::Tensor,at::Tensor> matmul_backward(const at::Tensor & grad, const at::Tensor & self, const at::Tensor & other, ::std::array<bool,2> mask) {
return wrapper_NestedTensorCUDA__matmul_backward(grad, self, other, mask);
}
at::Tensor mul(const at::Tensor & self, const at::Tensor & other) {
return wrapper_NestedTensorCUDA_Tensor_mul(self, other);
}
at::Tensor & mul_(at::Tensor & self, const at::Tensor & other) {
return wrapper_NestedTensorCUDA_Tensor_mul_(self, other);
}
at::Tensor mul(const at::Tensor & self, const at::Scalar & other) {
return wrapper_NestedTensorCUDA_Scalar_mul(self, other);
}
at::Tensor & mul_(at::Tensor & self, const at::Scalar & other) {
return wrapper_NestedTensorCUDA_Scalar_mul_(self, other);
}
at::Tensor ones_like(const at::Tensor & self, at::TensorOptions options, c10::optional<at::MemoryFormat> memory_format) {
return wrapper_NestedTensorCUDA__ones_like(self, optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
}
at::Tensor ones_like(const at::Tensor & self, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
return wrapper_NestedTensorCUDA__ones_like(self, dtype, layout, device, pin_memory, memory_format);
}
at::Tensor neg(const at::Tensor & self) {
return wrapper_NestedTensorCUDA__neg(self);
}
at::Tensor & neg_(at::Tensor & self) {
return wrapper_NestedTensorCUDA__neg_(self);
}
at::Tensor relu(const at::Tensor & self) {
return wrapper_NestedTensorCUDA__relu(self);
}
at::Tensor & relu_(at::Tensor & self) {
return wrapper_NestedTensorCUDA__relu_(self);
}
at::Tensor gelu(const at::Tensor & self, c10::string_view approximate) {
return wrapper_NestedTensorCUDA__gelu(self, approximate);
}
at::Tensor & gelu_(at::Tensor & self, c10::string_view approximate) {
return wrapper_NestedTensorCUDA__gelu_(self, approximate);
}
at::Tensor select(const at::Tensor & self, int64_t dim, int64_t index) {
return wrapper_NestedTensorCUDA_int_select(self, dim, index);
}
at::Tensor select_symint(const at::Tensor & self, int64_t dim, c10::SymInt index) {
return wrapper_NestedTensorCUDA_int_select(self, dim, index);
}
at::Tensor _nested_select_backward(const at::Tensor & grad_output, const at::Tensor & self, int64_t dim, int64_t index) {
return wrapper_NestedTensorCUDA___nested_select_backward(grad_output, self, dim, index);
}
at::Tensor _nested_select_backward_symint(const at::Tensor & grad_output, const at::Tensor & self, int64_t dim, c10::SymInt index) {
return wrapper_NestedTensorCUDA___nested_select_backward(grad_output, self, dim, index);
}
at::Tensor detach(const at::Tensor & self) {
return wrapper_NestedTensorCUDA__detach(self);
}
at::Tensor _softmax(const at::Tensor & self, int64_t dim, bool half_to_float) {
return wrapper_NestedTensorCUDA___softmax(self, dim, half_to_float);
}
at::Tensor _softmax_backward_data(const at::Tensor & grad_output, const at::Tensor & output, int64_t dim, at::ScalarType input_dtype) {
return wrapper_NestedTensorCUDA___softmax_backward_data(grad_output, output, dim, input_dtype);
}
at::Tensor squeeze(const at::Tensor & self) {
return wrapper_NestedTensorCUDA__squeeze(self);
}
at::Tensor squeeze(const at::Tensor & self, int64_t dim) {
return wrapper_NestedTensorCUDA_dim_squeeze(self, dim);
}
at::Tensor squeeze(const at::Tensor & self, at::IntArrayRef dim) {
return wrapper_NestedTensorCUDA_dims_squeeze(self, dim);
}
at::Tensor tanh(const at::Tensor & self) {
return wrapper_NestedTensorCUDA__tanh(self);
}
at::Tensor & tanh_(at::Tensor & self) {
return wrapper_NestedTensorCUDA__tanh_(self);
}
at::Tensor transpose(const at::Tensor & self, int64_t dim0, int64_t dim1) {
return wrapper_NestedTensorCUDA_int_transpose(self, dim0, dim1);
}
::std::tuple<at::Tensor,at::Tensor,at::Tensor> _transform_bias_rescale_qkv(const at::Tensor & qkv, const at::Tensor & qkv_bias, int64_t num_heads) {
return wrapper_NestedTensorCUDA___transform_bias_rescale_qkv(qkv, qkv_bias, num_heads);
}
at::Tensor _nested_tensor_size(const at::Tensor & self) {
return wrapper_NestedTensorCUDA___nested_tensor_size(self);
}
at::Tensor _nested_tensor_strides(const at::Tensor & self) {
return wrapper_NestedTensorCUDA___nested_tensor_strides(self);
}
::std::vector<int64_t> _nested_tensor_offsets(const at::Tensor & self) {
return wrapper_NestedTensorCUDA___nested_tensor_offsets(self);
}
at::Tensor _nested_from_padded_and_nested_example(const at::Tensor & padded, const at::Tensor & nt_example) {
return wrapper_NestedTensorCUDA___nested_from_padded_and_nested_example(padded, nt_example);
}
at::Tensor unsqueeze(const at::Tensor & self, int64_t dim) {
return wrapper_NestedTensorCUDA__unsqueeze(self, dim);
}
at::Tensor clone(const at::Tensor & self, c10::optional<at::MemoryFormat> memory_format) {
return wrapper_NestedTensorCUDA__clone(self, memory_format);
}
at::Tensor values(const at::Tensor & self) {
return wrapper_NestedTensorCUDA__values(self);
}
at::Tensor _to_copy(const at::Tensor & self, at::TensorOptions options, bool non_blocking, c10::optional<at::MemoryFormat> memory_format) {
return wrapper_NestedTensorCUDA___to_copy(self, optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), non_blocking, c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
}
at::Tensor _to_copy(const at::Tensor & self, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, bool non_blocking, c10::optional<at::MemoryFormat> memory_format) {
return wrapper_NestedTensorCUDA___to_copy(self, dtype, layout, device, pin_memory, non_blocking, memory_format);
}
at::Tensor view(const at::Tensor & self, at::IntArrayRef size) {
return wrapper_NestedTensorCUDA__view(self, c10::fromIntArrayRefSlow(size));
}
at::Tensor view_symint(const at::Tensor & self, c10::SymIntArrayRef size) {
return wrapper_NestedTensorCUDA__view(self, size);
}
at::Tensor _test_autograd_multiple_dispatch(const at::Tensor & self) {
return wrapper_NestedTensorCUDA_fullcoverage__test_autograd_multiple_dispatch(self);
}
at::Tensor _test_autograd_multiple_dispatch(const at::Tensor & self, bool b) {
return wrapper_NestedTensorCUDA_ntonly__test_autograd_multiple_dispatch(self, b);
}
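// NOTE: to_padded_tensor below follows the same symint pattern: the int64_t
// overload converts its optional IntArrayRef output_size into an optional
// SymInt array via c10::fromIntArrayRefSlow before calling the shared wrapper,
// while to_padded_tensor_symint passes output_size through unchanged.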
at::Tensor to_padded_tensor(const at::Tensor & self, double padding, at::OptionalIntArrayRef output_size) {
return wrapper_NestedTensorCUDA__to_padded_tensor(self, padding, output_size.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*output_size)) : c10::nullopt);
}
at::Tensor to_padded_tensor_symint(const at::Tensor & self, double padding, at::OptionalSymIntArrayRef output_size) {
return wrapper_NestedTensorCUDA__to_padded_tensor(self, padding, output_size);
}
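// The remaining entry points cover the fused transformer and scaled dot
// product attention kernels registered for NestedTensorCUDA: the encoder and
// decoder-only layer forwards, multi-head attention, _fused_sdp_choice, and
// the flash / efficient attention variants.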
at::Tensor _nested_tensor_softmax_with_shape(const at::Tensor & self, const at::Tensor & query) {
return wrapper_NestedTensorCUDA___nested_tensor_softmax_with_shape(self, query);
}
at::Tensor _transformer_encoder_layer_fwd(const at::Tensor & src, int64_t embed_dim, int64_t num_heads, const at::Tensor & qkv_weight, const at::Tensor & qkv_bias, const at::Tensor & proj_weight, const at::Tensor & proj_bias, bool use_gelu, bool norm_first, double eps, const at::Tensor & norm_weight_1, const at::Tensor & norm_bias_1, const at::Tensor & norm_weight_2, const at::Tensor & norm_bias_2, const at::Tensor & ffn_weight_1, const at::Tensor & ffn_bias_1, const at::Tensor & ffn_weight_2, const at::Tensor & ffn_bias_2, const c10::optional<at::Tensor> & mask, c10::optional<int64_t> mask_type) {
return wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd(src, embed_dim, num_heads, qkv_weight, qkv_bias, proj_weight, proj_bias, use_gelu, norm_first, eps, norm_weight_1, norm_bias_1, norm_weight_2, norm_bias_2, ffn_weight_1, ffn_bias_1, ffn_weight_2, ffn_bias_2, mask, mask_type);
}
::std::tuple<at::Tensor,at::Tensor> _native_multi_head_attention(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, int64_t embed_dim, int64_t num_head, const at::Tensor & qkv_weight, const at::Tensor & qkv_bias, const at::Tensor & proj_weight, const at::Tensor & proj_bias, const c10::optional<at::Tensor> & mask, bool need_weights, bool average_attn_weights, c10::optional<int64_t> mask_type) {
return wrapper_NestedTensorCUDA___native_multi_head_attention(query, key, value, embed_dim, num_head, qkv_weight, qkv_bias, proj_weight, proj_bias, mask, need_weights, average_attn_weights, mask_type);
}
int64_t _fused_sdp_choice(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const c10::optional<at::Tensor> & attn_mask, double dropout_p, bool is_causal) {
return wrapper_NestedTensorCUDA___fused_sdp_choice(query, key, value, attn_mask, dropout_p, is_causal);
}
::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor,int64_t,int64_t,int64_t,int64_t,at::Tensor> _scaled_dot_product_flash_attention(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, double dropout_p, bool is_causal, bool return_debug_mask) {
return wrapper_NestedTensorCUDA___scaled_dot_product_flash_attention(query, key, value, dropout_p, is_causal, return_debug_mask);
}
::std::tuple<at::Tensor,at::Tensor> _scaled_dot_product_efficient_attention(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, bool compute_log_sumexp, bool is_causal) {
return wrapper_NestedTensorCUDA___scaled_dot_product_efficient_attention(query, key, value, compute_log_sumexp, is_causal);
}
::std::tuple<at::Tensor,at::Tensor,at::Tensor> _transformer_decoder_only_layer_fwd(const at::Tensor & src, int64_t embed_dim, int64_t num_heads, const at::Tensor & qkv_weight, const at::Tensor & qkv_bias, const at::Tensor & proj_weight, const at::Tensor & proj_bias, bool use_gelu, bool norm_first, double eps, const at::Tensor & norm_weight_1, const at::Tensor & norm_bias_1, const at::Tensor & norm_weight_2, const at::Tensor & norm_bias_2, const at::Tensor & ffn_weight_1, const at::Tensor & ffn_bias_1, const at::Tensor & ffn_weight_2, const at::Tensor & ffn_bias_2, const c10::optional<at::Tensor> & mask, const c10::optional<at::Tensor> & incr_key, const c10::optional<at::Tensor> & incr_value) {
return wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd(src, embed_dim, num_heads, qkv_weight, qkv_bias, proj_weight, proj_bias, use_gelu, norm_first, eps, norm_weight_1, norm_bias_1, norm_weight_2, norm_bias_2, ffn_weight_1, ffn_bias_1, ffn_weight_2, ffn_bias_2, mask, incr_key, incr_value);
}
::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor> _native_decoder_only_multi_head_attention(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, int64_t embed_dim, int64_t num_head, const at::Tensor & qkv_weight, const at::Tensor & qkv_bias, const at::Tensor & proj_weight, const at::Tensor & proj_bias, const c10::optional<at::Tensor> & mask, const c10::optional<at::Tensor> & incr_key, const c10::optional<at::Tensor> & incr_value, bool need_weights, bool average_attn_weights) {
return wrapper_NestedTensorCUDA___native_decoder_only_multi_head_attention(query, key, value, embed_dim, num_head, qkv_weight, qkv_bias, proj_weight, proj_bias, mask, incr_key, incr_value, need_weights, average_attn_weights);
}
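// Usage sketch (illustrative only, not part of the generated interface): for a
// nested tensor living on CUDA (e.g. one created from Python with
// torch.nested.nested_tensor(..., device="cuda")), ordinary calls such as
// at::matmul or at::gelu are routed by the dispatcher to the NestedTensorCUDA
// kernels in this file; the at::nestedtensorcuda:: functions above additionally
// allow calling the same wrappers directly from C++ without a dispatcher
// round trip.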
} // namespace nestedtensorcuda
} // namespace at