// required for old g++ to compile PRId64 macros, see
// https://github.com/pytorch/pytorch/issues/3571
// for context
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif

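// Illustrative sketch (not part of the generated file): defining
// __STDC_FORMAT_MACROS before <cinttypes> is included is what lets the
// PRId64 format macro be used in C++ on old g++, e.g.
//
//   #include <cinttypes>
//   #include <cstdio>
//   int64_t n = 42;
//   std::printf("n = %" PRId64 "\n", n);
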
// An external backend might generate this file within its own code tree
// and run clang-format over all source files in that tree, so disable
// clang-format here since the backend might use a different config.
// clang-format off

// NOTE: This condition is true for all PyTorch internal libraries; it
// just excludes external projects such as torch_xla which
// re-use some of the PyTorch codegen machinery.
#if defined(CAFFE2_BUILD_MAIN_LIB) || \
    defined(TORCH_CUDA_BUILD_MAIN_LIB) || \
    defined(TORCH_HIP_BUILD_MAIN_LIB) || \
    defined(TORCH_CUDA_CU_BUILD_MAIN_LIB) || \
    defined(TORCH_CUDA_CPP_BUILD_MAIN_LIB)
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#endif

// @generated by torchgen/gen.py from RegisterDispatchKey.cpp

#include <c10/core/TensorImpl.h>
#include <c10/core/Allocator.h>
#include <ATen/DeviceGuard.h>
#include <ATen/NamedTensorUtils.h>
#include <ATen/Utils.h>
#include <ATen/WrapDimUtils.h>
#include <ATen/Dispatch.h>
#include <c10/util/ExclusivelyOwned.h>
#include <c10/util/Half.h>
#include <c10/core/UndefinedTensorImpl.h>
#include <c10/util/Optional.h>
#include <ATen/Tensor.h>
#include <ATen/native/Resize.h>

#include <cstddef>
#include <functional>
#include <memory>
#include <utility>

#include <ATen/Config.h>
#include <ATen/core/op_registration/adaption.h>
#include <torch/library.h>
#include <c10/cuda/CUDAGuard.h>
#include <ATen/cuda/ATenCUDAGeneral.h>
#include <ATen/cuda/CUDADevice.h>
#include <ATen/cuda/CUDAContext.h>

#include <ATen/ops/as_strided_native.h>
#include <ATen/ops/empty.h>
#include <ATen/ops/empty_strided.h>
#include <ATen/ops/_copy_from_and_resize.h>
#include <ATen/ops/_copy_from.h>
#include <ATen/ops/_fused_sdp_choice_native.h>
#include <ATen/ops/_native_decoder_only_multi_head_attention_native.h>
#include <ATen/ops/_native_multi_head_attention_native.h>
#include <ATen/ops/_nested_from_padded_and_nested_example_native.h>
#include <ATen/ops/_nested_select_backward_native.h>
#include <ATen/ops/_nested_tensor_offsets_native.h>
#include <ATen/ops/_nested_tensor_size_native.h>
#include <ATen/ops/_nested_tensor_softmax_with_shape_native.h>
#include <ATen/ops/_nested_tensor_strides_native.h>
#include <ATen/ops/_scaled_dot_product_efficient_attention_native.h>
#include <ATen/ops/_scaled_dot_product_flash_attention_native.h>
#include <ATen/ops/_softmax_backward_data_native.h>
#include <ATen/ops/_softmax_native.h>
#include <ATen/ops/_test_autograd_multiple_dispatch_native.h>
#include <ATen/ops/_to_copy_native.h>
#include <ATen/ops/_transform_bias_rescale_qkv_native.h>
#include <ATen/ops/_transformer_decoder_only_layer_fwd_native.h>
#include <ATen/ops/_transformer_encoder_layer_fwd_native.h>
#include <ATen/ops/add_native.h>
#include <ATen/ops/bmm_native.h>
#include <ATen/ops/chunk_native.h>
#include <ATen/ops/clone_native.h>
#include <ATen/ops/copy_native.h>
#include <ATen/ops/detach_native.h>
#include <ATen/ops/div_native.h>
#include <ATen/ops/embedding_native.h>
#include <ATen/ops/empty_like_native.h>
#include <ATen/ops/fill_native.h>
#include <ATen/ops/gelu_native.h>
#include <ATen/ops/is_same_size_native.h>
#include <ATen/ops/linear_backward_native.h>
#include <ATen/ops/linear_native.h>
#include <ATen/ops/matmul_backward_native.h>
#include <ATen/ops/matmul_native.h>
#include <ATen/ops/mul_native.h>
#include <ATen/ops/native_dropout_backward_native.h>
#include <ATen/ops/native_dropout_native.h>
#include <ATen/ops/native_layer_norm_native.h>
#include <ATen/ops/neg_native.h>
#include <ATen/ops/ones_like_native.h>
#include <ATen/ops/relu_native.h>
#include <ATen/ops/select_native.h>
#include <ATen/ops/squeeze_native.h>
#include <ATen/ops/tanh_native.h>
#include <ATen/ops/to_padded_tensor_native.h>
#include <ATen/ops/transpose_native.h>
#include <ATen/ops/unsqueeze_native.h>
#include <ATen/ops/values_native.h>
#include <ATen/ops/view_native.h>

// See template file RegisterDispatchDefinitions.ini
namespace at {
// NB: TORCH_LIBRARY_IMPL must be in an anonymous namespace to avoid
// ambiguity with conflicting identifiers that may have been defined in
// the at namespace already.
namespace {
void resize_out(const Tensor &out, IntArrayRef sizes, IntArrayRef strides, const TensorOptions &options) {
  TORCH_CHECK(options.dtype() == out.dtype(),
      "Expected out tensor to have dtype ", options.dtype(), ", but got ", out.dtype(), " instead");
  TORCH_CHECK(options.device() == out.device(),
      "Expected out tensor to have device ", options.device(), ", but got ", out.device(), " instead");
  const bool resized = at::native::resize_output(out, sizes);
  // Only restride if a resize occurred; otherwise we ignore the (advisory)
  // strides from the meta function and directly use the output tensor's
  // preexisting strides
  if (resized) {
    if (!strides.empty()) {
      TORCH_INTERNAL_ASSERT(!options.memory_format_opt().has_value());
      // TODO: avoid the redispatch here
      out.as_strided_(sizes, strides);
    } else if (options.memory_format_opt().has_value()) {
      out.unsafeGetTensorImpl()->empty_tensor_restride(*options.memory_format_opt());
    }
  }
}
void check_inplace(const Tensor &self, IntArrayRef sizes, const TensorOptions &options) {
  // These checks are needed on those operators that:
  // 1) don't use 'TensorIterator' (e.g. 'addmm' and 'baddbmm')
  // 2) have particular typing rules (e.g. 'cumsum' and 'cumprod')
  // For other operators (e.g. 'add'), 'TensorIterator' already checks
  // these things separately.
  TORCH_CHECK(options.dtype() == self.dtype(),
      "Bad in-place call: ",
      "input tensor dtype ", self.dtype(), " and output tensor dtype ", options.dtype(), " should match");
  TORCH_CHECK(options.device() == self.device(),
      "Bad in-place call: ",
      "input tensor device ", self.device(), " and output tensor device ", options.device(), " should match");
  TORCH_CHECK(sizes == self.sizes(),
      "Bad in-place call: ",
      "input tensor size ", self.sizes(), " and output tensor size ", sizes, " should match");
}
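// Illustrative sketch (not part of the generated output): wrappers emitted by
// this codegen typically call the helpers above from their out= and in-place
// variants, roughly like this hypothetical `foo` op:
//
//   // out= variant: make `out` conform to the meta-computed sizes/strides first
//   at::Tensor & wrapper_foo_out(const at::Tensor & self, at::Tensor & out) {
//     resize_out(out, self.sizes(), /*strides=*/{}, self.options());
//     return at::native::foo_out(self, out);
//   }
//   // in-place variant: validate dtype/device/size compatibility up front
//   at::Tensor & wrapper_foo_(at::Tensor & self) {
//     check_inplace(self, self.sizes(), self.options());
//     return at::native::foo_(self);
//   }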
namespace {
::std::tuple<at::Tensor,at::Tensor> wrapper_NestedTensorCUDA__native_dropout(const at::Tensor & input, double p, c10::optional<bool> train) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, input, "wrapper_NestedTensorCUDA__native_dropout", "input");
  const OptionalDeviceGuard device_guard(device_of(input));
  return at::native::native_dropout_nested(input, p, train);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA__native_dropout_backward(const at::Tensor & grad_output, const at::Tensor & mask, double scale) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, grad_output, "wrapper_NestedTensorCUDA__native_dropout_backward", "grad_output");
  c10::impl::check_and_update_common_device(common_device, mask, "wrapper_NestedTensorCUDA__native_dropout_backward", "mask");
  const OptionalDeviceGuard device_guard(device_of(grad_output));
  return at::native::native_dropout_backward(grad_output, mask, scale);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA_Tensor_add(const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::NestedTensor_add_Tensor(self, other, alpha);
}
} // anonymous namespace
namespace {
at::Tensor & wrapper_NestedTensorCUDA_Tensor_add_(at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::NestedTensor_add__Tensor(self, other, alpha);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA__bmm(const at::Tensor & self, const at::Tensor & mat2) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA__bmm", "self");
  c10::impl::check_and_update_common_device(common_device, mat2, "wrapper_NestedTensorCUDA__bmm", "mat2");
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::bmm_nested_cuda(self, mat2);
}
} // anonymous namespace
namespace {
::std::vector<at::Tensor> wrapper_NestedTensorCUDA__chunk(const at::Tensor & self, int64_t chunks, int64_t dim) {
  // No device check
  // DeviceGuard omitted
  return at::native::chunk_nested_tensor(self, chunks, dim);
}
} // anonymous namespace
namespace {
at::Tensor & wrapper_NestedTensorCUDA__copy_(at::Tensor & self, const at::Tensor & src, bool non_blocking) {
  // No device check
  // DeviceGuard omitted
  return at::native::copy_nested_(self, src, non_blocking);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA_Tensor_div(const at::Tensor & self, const at::Tensor & other) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::NestedTensor_div_Tensor(self, other);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA_Scalar_div(const at::Tensor & self, const at::Scalar & other) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::NestedTensor_div_Scalar(self, other);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA__embedding(const at::Tensor & weight, const at::Tensor & indices, c10::SymInt padding_idx, bool scale_grad_by_freq, bool sparse) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, weight, "wrapper_NestedTensorCUDA__embedding", "weight");
  c10::impl::check_and_update_common_device(common_device, indices, "wrapper_NestedTensorCUDA__embedding", "indices");
  const OptionalDeviceGuard device_guard(device_of(weight));
  return at::native::NestedTensor_embedding(weight, indices, padding_idx.expect_int(), scale_grad_by_freq, sparse);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA__empty_like(const at::Tensor & self, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
  // No device check
  // DeviceGuard omitted
  return at::native::empty_like_nested(self, dtype, layout, device, pin_memory, memory_format);
}
} // anonymous namespace
namespace {
at::Tensor & wrapper_NestedTensorCUDA_Scalar_fill_(at::Tensor & self, const at::Scalar & value) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::fill_nested_(self, value);
}
} // anonymous namespace
namespace {
at::Tensor & wrapper_NestedTensorCUDA_Tensor_fill_(at::Tensor & self, const at::Tensor & value) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::fill_nested_(self, value);
}
} // anonymous namespace
namespace {
bool wrapper_NestedTensorCUDA__is_same_size(const at::Tensor & self, const at::Tensor & other) {
  // No device check
  // DeviceGuard omitted
  return at::native::nested_is_same_size(self, other);
}
} // anonymous namespace
namespace {
::std::tuple<at::Tensor,at::Tensor,at::Tensor> wrapper_NestedTensorCUDA__native_layer_norm(const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, double eps) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, input, "wrapper_NestedTensorCUDA__native_layer_norm", "input");
  c10::impl::check_and_update_common_device(common_device, weight, "wrapper_NestedTensorCUDA__native_layer_norm", "weight");
  c10::impl::check_and_update_common_device(common_device, bias, "wrapper_NestedTensorCUDA__native_layer_norm", "bias");
  const OptionalDeviceGuard device_guard(device_of(input));
  return at::native::nested_layer_norm(input, C10_AS_INTARRAYREF_SLOW(normalized_shape), weight, bias, eps);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA__linear(const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, input, "wrapper_NestedTensorCUDA__linear", "input");
  c10::impl::check_and_update_common_device(common_device, weight, "wrapper_NestedTensorCUDA__linear", "weight");
  c10::impl::check_and_update_common_device(common_device, bias, "wrapper_NestedTensorCUDA__linear", "bias");
  const OptionalDeviceGuard device_guard(device_of(input));
  return at::native::nested_linear(input, weight, bias);
}
} // anonymous namespace
namespace {
::std::tuple<at::Tensor,at::Tensor,at::Tensor> wrapper_NestedTensorCUDA__linear_backward(const at::Tensor & self, const at::Tensor & grad_output, const at::Tensor & weight, ::std::array<bool,3> output_mask) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA__linear_backward", "self");
  c10::impl::check_and_update_common_device(common_device, grad_output, "wrapper_NestedTensorCUDA__linear_backward", "grad_output");
  c10::impl::check_and_update_common_device(common_device, weight, "wrapper_NestedTensorCUDA__linear_backward", "weight");
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::nested_linear_backward(self, grad_output, weight, output_mask);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA__matmul(const at::Tensor & self, const at::Tensor & other) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA__matmul", "self");
  c10::impl::check_and_update_common_device(common_device, other, "wrapper_NestedTensorCUDA__matmul", "other");
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::matmul_nested(self, other);
}
} // anonymous namespace
namespace {
at::Tensor & wrapper_NestedTensorCUDA_out_matmul_out(const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, out, "wrapper_NestedTensorCUDA_out_matmul_out", "out");
  c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA_out_matmul_out", "self");
  c10::impl::check_and_update_common_device(common_device, other, "wrapper_NestedTensorCUDA_out_matmul_out", "other");
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::matmul_out_nested(self, other, out);
}
} // anonymous namespace
namespace {
::std::tuple<at::Tensor,at::Tensor> wrapper_NestedTensorCUDA__matmul_backward(const at::Tensor & grad, const at::Tensor & self, const at::Tensor & other, ::std::array<bool,2> mask) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, grad, "wrapper_NestedTensorCUDA__matmul_backward", "grad");
  c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA__matmul_backward", "self");
  c10::impl::check_and_update_common_device(common_device, other, "wrapper_NestedTensorCUDA__matmul_backward", "other");
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::matmul_backward_nested(grad, self, other, mask);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA_Tensor_mul(const at::Tensor & self, const at::Tensor & other) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::NestedTensor_mul_Tensor(self, other);
}
} // anonymous namespace
namespace {
at::Tensor & wrapper_NestedTensorCUDA_Tensor_mul_(at::Tensor & self, const at::Tensor & other) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::NestedTensor_mul__Tensor(self, other);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA_Scalar_mul(const at::Tensor & self, const at::Scalar & other) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::NestedTensor_mul_Scalar(self, other);
}
} // anonymous namespace
namespace {
at::Tensor & wrapper_NestedTensorCUDA_Scalar_mul_(at::Tensor & self, const at::Scalar & other) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::NestedTensor_mul__Scalar(self, other);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA__ones_like(const at::Tensor & self, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA__ones_like", "self");
  globalContext().lazyInitCUDA();
  const DeviceGuard device_guard(device_or_default(device));
  return at::native::ones_like(self, dtype, layout, device, pin_memory, memory_format);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA__neg(const at::Tensor & self) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::NestedTensor_neg(self);
}
} // anonymous namespace
namespace {
at::Tensor & wrapper_NestedTensorCUDA__neg_(at::Tensor & self) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::NestedTensor_neg_(self);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA__relu(const at::Tensor & self) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::NestedTensor_relu(self);
}
} // anonymous namespace
namespace {
at::Tensor & wrapper_NestedTensorCUDA__relu_(at::Tensor & self) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::NestedTensor_relu_(self);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA__gelu(const at::Tensor & self, c10::string_view approximate) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::NestedTensor_gelu(self, approximate);
}
} // anonymous namespace
namespace {
at::Tensor & wrapper_NestedTensorCUDA__gelu_(at::Tensor & self, c10::string_view approximate) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::NestedTensor_gelu_(self, approximate);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA_int_select(const at::Tensor & self, int64_t dim, c10::SymInt index) {
  // No device check
  // DeviceGuard omitted
  return at::native::select_nested(self, dim, index.expect_int());
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA___nested_select_backward(const at::Tensor & grad_output, const at::Tensor & self, int64_t dim, c10::SymInt index) {
  // No device check
  // DeviceGuard omitted
  return at::native::_nested_select_backward_symint(grad_output, self, dim, index);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA__detach(const at::Tensor & self) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA__detach", "self");
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::detach(self);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA___softmax(const at::Tensor & self, int64_t dim, bool half_to_float) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA___softmax", "self");
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::softmax_nested(self, dim, half_to_float);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA___softmax_backward_data(const at::Tensor & grad_output, const at::Tensor & output, int64_t dim, at::ScalarType input_dtype) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, grad_output, "wrapper_NestedTensorCUDA___softmax_backward_data", "grad_output");
  c10::impl::check_and_update_common_device(common_device, output, "wrapper_NestedTensorCUDA___softmax_backward_data", "output");
  const OptionalDeviceGuard device_guard(device_of(grad_output));
  return at::native::nested_softmax_backward(grad_output, output, dim, input_dtype);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA__squeeze(const at::Tensor & self) {
  // No device check
  // DeviceGuard omitted
  return at::native::squeeze_nested(self);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA_dim_squeeze(const at::Tensor & self, int64_t dim) {
  // No device check
  // DeviceGuard omitted
  return at::native::squeeze_dim_nested(self, dim);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA_dims_squeeze(const at::Tensor & self, at::IntArrayRef dim) {
  // No device check
  // DeviceGuard omitted
  return at::native::squeeze_dim_nested(self, dim);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA__tanh(const at::Tensor & self) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::NestedTensor_tanh(self);
}
} // anonymous namespace
namespace {
at::Tensor & wrapper_NestedTensorCUDA__tanh_(at::Tensor & self) {
  // No device check
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::NestedTensor_tanh_(self);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA_int_transpose(const at::Tensor & self, int64_t dim0, int64_t dim1) {
  // No device check
  // DeviceGuard omitted
  return at::native::transpose_nested(self, dim0, dim1);
}
} // anonymous namespace
namespace {
::std::tuple<at::Tensor,at::Tensor,at::Tensor> wrapper_NestedTensorCUDA___transform_bias_rescale_qkv(const at::Tensor & qkv, const at::Tensor & qkv_bias, int64_t num_heads) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, qkv, "wrapper_NestedTensorCUDA___transform_bias_rescale_qkv", "qkv");
  c10::impl::check_and_update_common_device(common_device, qkv_bias, "wrapper_NestedTensorCUDA___transform_bias_rescale_qkv", "qkv_bias");
  const OptionalDeviceGuard device_guard(device_of(qkv));
  return at::native::transform_bias_rescale_qkv_cuda(qkv, qkv_bias, num_heads);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA___nested_tensor_size(const at::Tensor & self) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA___nested_tensor_size", "self");
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::_nested_tensor_size(self);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA___nested_tensor_strides(const at::Tensor & self) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA___nested_tensor_strides", "self");
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::_nested_tensor_strides(self);
}
} // anonymous namespace
namespace {
::std::vector<int64_t> wrapper_NestedTensorCUDA___nested_tensor_offsets(const at::Tensor & self) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA___nested_tensor_offsets", "self");
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::_nested_tensor_offsets(self);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA___nested_from_padded_and_nested_example(const at::Tensor & padded, const at::Tensor & nt_example) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, padded, "wrapper_NestedTensorCUDA___nested_from_padded_and_nested_example", "padded");
  c10::impl::check_and_update_common_device(common_device, nt_example, "wrapper_NestedTensorCUDA___nested_from_padded_and_nested_example", "nt_example");
  const OptionalDeviceGuard device_guard(device_of(padded));
  return at::native::NestedTensor_from_padded_and_nested_example(padded, nt_example);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA__unsqueeze(const at::Tensor & self, int64_t dim) {
  // No device check
  // DeviceGuard omitted
  return at::native::unsqueeze_nested(self, dim);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA__clone(const at::Tensor & self, c10::optional<at::MemoryFormat> memory_format) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA__clone", "self");
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::clone_nested(self, memory_format);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA__values(const at::Tensor & self) {
  // No device check
  // DeviceGuard omitted
  return at::native::values_nested(self);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA___to_copy(const at::Tensor & self, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, bool non_blocking, c10::optional<at::MemoryFormat> memory_format) {
  // No device check
  // DeviceGuard omitted
  return at::native::_to_copy_nested(self, dtype, layout, device, pin_memory, non_blocking, memory_format);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA__view(const at::Tensor & self, c10::SymIntArrayRef size) {
  // No device check
  // DeviceGuard omitted
  return at::native::view_nested(self, C10_AS_INTARRAYREF_SLOW(size));
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA_fullcoverage__test_autograd_multiple_dispatch(const at::Tensor & self) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA_fullcoverage__test_autograd_multiple_dispatch", "self");
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::_test_autograd_multiple_dispatch_fullcoverage(self);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA_ntonly__test_autograd_multiple_dispatch(const at::Tensor & self, bool b) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA_ntonly__test_autograd_multiple_dispatch", "self");
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::_test_autograd_multiple_dispatch_ntonly(self, b);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA__to_padded_tensor(const at::Tensor & self, double padding, at::OptionalSymIntArrayRef output_size) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA__to_padded_tensor", "self");
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::NestedTensor_to_padded_tensor_cuda(self, padding, output_size.has_value() ? c10::make_optional(C10_AS_INTARRAYREF_SLOW(*output_size)) : c10::nullopt);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA___nested_tensor_softmax_with_shape(const at::Tensor & self, const at::Tensor & query) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA___nested_tensor_softmax_with_shape", "self");
  c10::impl::check_and_update_common_device(common_device, query, "wrapper_NestedTensorCUDA___nested_tensor_softmax_with_shape", "query");
  const OptionalDeviceGuard device_guard(device_of(self));
  return at::native::NestedTensor_softmax_dropout_cuda(self, query);
}
} // anonymous namespace
namespace {
at::Tensor wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd(const at::Tensor & src, int64_t embed_dim, int64_t num_heads, const at::Tensor & qkv_weight, const at::Tensor & qkv_bias, const at::Tensor & proj_weight, const at::Tensor & proj_bias, bool use_gelu, bool norm_first, double eps, const at::Tensor & norm_weight_1, const at::Tensor & norm_bias_1, const at::Tensor & norm_weight_2, const at::Tensor & norm_bias_2, const at::Tensor & ffn_weight_1, const at::Tensor & ffn_bias_1, const at::Tensor & ffn_weight_2, const at::Tensor & ffn_bias_2, const c10::optional<at::Tensor> & mask, c10::optional<int64_t> mask_type) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, src, "wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd", "src");
  c10::impl::check_and_update_common_device(common_device, qkv_weight, "wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd", "qkv_weight");
  c10::impl::check_and_update_common_device(common_device, qkv_bias, "wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd", "qkv_bias");
  c10::impl::check_and_update_common_device(common_device, proj_weight, "wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd", "proj_weight");
  c10::impl::check_and_update_common_device(common_device, proj_bias, "wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd", "proj_bias");
  c10::impl::check_and_update_common_device(common_device, norm_weight_1, "wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd", "norm_weight_1");
  c10::impl::check_and_update_common_device(common_device, norm_bias_1, "wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd", "norm_bias_1");
  c10::impl::check_and_update_common_device(common_device, norm_weight_2, "wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd", "norm_weight_2");
  c10::impl::check_and_update_common_device(common_device, norm_bias_2, "wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd", "norm_bias_2");
  c10::impl::check_and_update_common_device(common_device, ffn_weight_1, "wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd", "ffn_weight_1");
  c10::impl::check_and_update_common_device(common_device, ffn_bias_1, "wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd", "ffn_bias_1");
  c10::impl::check_and_update_common_device(common_device, ffn_weight_2, "wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd", "ffn_weight_2");
  c10::impl::check_and_update_common_device(common_device, ffn_bias_2, "wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd", "ffn_bias_2");
  c10::impl::check_and_update_common_device(common_device, mask, "wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd", "mask");
  const OptionalDeviceGuard device_guard(device_of(src));
  return at::native::transformer_encoder_layer_forward(src, embed_dim, num_heads, qkv_weight, qkv_bias, proj_weight, proj_bias, use_gelu, norm_first, eps, norm_weight_1, norm_bias_1, norm_weight_2, norm_bias_2, ffn_weight_1, ffn_bias_1, ffn_weight_2, ffn_bias_2, mask, mask_type);
}
} // anonymous namespace
namespace {
::std::tuple<at::Tensor,at::Tensor> wrapper_NestedTensorCUDA___native_multi_head_attention(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, int64_t embed_dim, int64_t num_head, const at::Tensor & qkv_weight, const at::Tensor & qkv_bias, const at::Tensor & proj_weight, const at::Tensor & proj_bias, const c10::optional<at::Tensor> & mask, bool need_weights, bool average_attn_weights, c10::optional<int64_t> mask_type) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, query, "wrapper_NestedTensorCUDA___native_multi_head_attention", "query");
  c10::impl::check_and_update_common_device(common_device, key, "wrapper_NestedTensorCUDA___native_multi_head_attention", "key");
  c10::impl::check_and_update_common_device(common_device, value, "wrapper_NestedTensorCUDA___native_multi_head_attention", "value");
  c10::impl::check_and_update_common_device(common_device, qkv_weight, "wrapper_NestedTensorCUDA___native_multi_head_attention", "qkv_weight");
  c10::impl::check_and_update_common_device(common_device, qkv_bias, "wrapper_NestedTensorCUDA___native_multi_head_attention", "qkv_bias");
  c10::impl::check_and_update_common_device(common_device, proj_weight, "wrapper_NestedTensorCUDA___native_multi_head_attention", "proj_weight");
  c10::impl::check_and_update_common_device(common_device, proj_bias, "wrapper_NestedTensorCUDA___native_multi_head_attention", "proj_bias");
  c10::impl::check_and_update_common_device(common_device, mask, "wrapper_NestedTensorCUDA___native_multi_head_attention", "mask");
  const OptionalDeviceGuard device_guard(device_of(query));
  return at::native::native_multi_head_attention_cuda(query, key, value, embed_dim, num_head, qkv_weight, qkv_bias, proj_weight, proj_bias, mask, need_weights, average_attn_weights, mask_type);
}
} // anonymous namespace
namespace {
int64_t wrapper_NestedTensorCUDA___fused_sdp_choice(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const c10::optional<at::Tensor> & attn_mask, double dropout_p, bool is_causal) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, query, "wrapper_NestedTensorCUDA___fused_sdp_choice", "query");
  c10::impl::check_and_update_common_device(common_device, key, "wrapper_NestedTensorCUDA___fused_sdp_choice", "key");
  c10::impl::check_and_update_common_device(common_device, value, "wrapper_NestedTensorCUDA___fused_sdp_choice", "value");
  c10::impl::check_and_update_common_device(common_device, attn_mask, "wrapper_NestedTensorCUDA___fused_sdp_choice", "attn_mask");
  const OptionalDeviceGuard device_guard(device_of(query));
  return at::native::_fused_sdp_choice_cuda(query, key, value, attn_mask, dropout_p, is_causal);
}
} // anonymous namespace
namespace {
::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor,int64_t,int64_t,int64_t,int64_t,at::Tensor> wrapper_NestedTensorCUDA___scaled_dot_product_flash_attention(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, double dropout_p, bool is_causal, bool return_debug_mask) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, query, "wrapper_NestedTensorCUDA___scaled_dot_product_flash_attention", "query");
  c10::impl::check_and_update_common_device(common_device, key, "wrapper_NestedTensorCUDA___scaled_dot_product_flash_attention", "key");
  c10::impl::check_and_update_common_device(common_device, value, "wrapper_NestedTensorCUDA___scaled_dot_product_flash_attention", "value");
  const OptionalDeviceGuard device_guard(device_of(query));
  return at::native::_scaled_dot_product_flash_attention_nestedtensor_cuda(query, key, value, dropout_p, is_causal, return_debug_mask);
}
} // anonymous namespace
namespace {
::std::tuple<at::Tensor,at::Tensor> wrapper_NestedTensorCUDA___scaled_dot_product_efficient_attention(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, bool compute_log_sumexp, bool is_causal) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, query, "wrapper_NestedTensorCUDA___scaled_dot_product_efficient_attention", "query");
  c10::impl::check_and_update_common_device(common_device, key, "wrapper_NestedTensorCUDA___scaled_dot_product_efficient_attention", "key");
  c10::impl::check_and_update_common_device(common_device, value, "wrapper_NestedTensorCUDA___scaled_dot_product_efficient_attention", "value");
  const OptionalDeviceGuard device_guard(device_of(query));
  return at::native::_scaled_dot_product_efficient_attention_nestedtensor_cuda(query, key, value, compute_log_sumexp, is_causal);
}
} // anonymous namespace
namespace {
::std::tuple<at::Tensor,at::Tensor,at::Tensor> wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd(const at::Tensor & src, int64_t embed_dim, int64_t num_heads, const at::Tensor & qkv_weight, const at::Tensor & qkv_bias, const at::Tensor & proj_weight, const at::Tensor & proj_bias, bool use_gelu, bool norm_first, double eps, const at::Tensor & norm_weight_1, const at::Tensor & norm_bias_1, const at::Tensor & norm_weight_2, const at::Tensor & norm_bias_2, const at::Tensor & ffn_weight_1, const at::Tensor & ffn_bias_1, const at::Tensor & ffn_weight_2, const at::Tensor & ffn_bias_2, const c10::optional<at::Tensor> & mask, const c10::optional<at::Tensor> & incr_key, const c10::optional<at::Tensor> & incr_value) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, src, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd", "src");
  c10::impl::check_and_update_common_device(common_device, qkv_weight, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd", "qkv_weight");
  c10::impl::check_and_update_common_device(common_device, qkv_bias, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd", "qkv_bias");
  c10::impl::check_and_update_common_device(common_device, proj_weight, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd", "proj_weight");
  c10::impl::check_and_update_common_device(common_device, proj_bias, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd", "proj_bias");
  c10::impl::check_and_update_common_device(common_device, norm_weight_1, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd", "norm_weight_1");
  c10::impl::check_and_update_common_device(common_device, norm_bias_1, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd", "norm_bias_1");
  c10::impl::check_and_update_common_device(common_device, norm_weight_2, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd", "norm_weight_2");
  c10::impl::check_and_update_common_device(common_device, norm_bias_2, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd", "norm_bias_2");
  c10::impl::check_and_update_common_device(common_device, ffn_weight_1, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd", "ffn_weight_1");
  c10::impl::check_and_update_common_device(common_device, ffn_bias_1, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd", "ffn_bias_1");
  c10::impl::check_and_update_common_device(common_device, ffn_weight_2, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd", "ffn_weight_2");
  c10::impl::check_and_update_common_device(common_device, ffn_bias_2, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd", "ffn_bias_2");
  c10::impl::check_and_update_common_device(common_device, mask, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd", "mask");
  c10::impl::check_and_update_common_device(common_device, incr_key, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd", "incr_key");
  c10::impl::check_and_update_common_device(common_device, incr_value, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd", "incr_value");
  const OptionalDeviceGuard device_guard(device_of(src));
  return at::native::transformer_decoder_only_layer_forward(src, embed_dim, num_heads, qkv_weight, qkv_bias, proj_weight, proj_bias, use_gelu, norm_first, eps, norm_weight_1, norm_bias_1, norm_weight_2, norm_bias_2, ffn_weight_1, ffn_bias_1, ffn_weight_2, ffn_bias_2, mask, incr_key, incr_value);
}
} // anonymous namespace
namespace {
::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor> wrapper_NestedTensorCUDA___native_decoder_only_multi_head_attention(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, int64_t embed_dim, int64_t num_head, const at::Tensor & qkv_weight, const at::Tensor & qkv_bias, const at::Tensor & proj_weight, const at::Tensor & proj_bias, const c10::optional<at::Tensor> & mask, const c10::optional<at::Tensor> & incr_key, const c10::optional<at::Tensor> & incr_value, bool need_weights, bool average_attn_weights) {
  c10::optional<Device> common_device = nullopt;
(void)common_device; // Suppress unused variable warning
  c10::impl::check_and_update_common_device(common_device, query, "wrapper_NestedTensorCUDA___native_decoder_only_multi_head_attention", "query");
  c10::impl::check_and_update_common_device(common_device, key, "wrapper_NestedTensorCUDA___native_decoder_only_multi_head_attention", "key");
  c10::impl::check_and_update_common_device(common_device, value, "wrapper_NestedTensorCUDA___native_decoder_only_multi_head_attention", "value");
  c10::impl::check_and_update_common_device(common_device, qkv_weight, "wrapper_NestedTensorCUDA___native_decoder_only_multi_head_attention", "qkv_weight");
  c10::impl::check_and_update_common_device(common_device, qkv_bias, "wrapper_NestedTensorCUDA___native_decoder_only_multi_head_attention", "qkv_bias");
  c10::impl::check_and_update_common_device(common_device, proj_weight, "wrapper_NestedTensorCUDA___native_decoder_only_multi_head_attention", "proj_weight");
  c10::impl::check_and_update_common_device(common_device, proj_bias, "wrapper_NestedTensorCUDA___native_decoder_only_multi_head_attention", "proj_bias");
  c10::impl::check_and_update_common_device(common_device, mask, "wrapper_NestedTensorCUDA___native_decoder_only_multi_head_attention", "mask");
  c10::impl::check_and_update_common_device(common_device, incr_key, "wrapper_NestedTensorCUDA___native_decoder_only_multi_head_attention", "incr_key");
  c10::impl::check_and_update_common_device(common_device, incr_value, "wrapper_NestedTensorCUDA___native_decoder_only_multi_head_attention", "incr_value");
  const OptionalDeviceGuard device_guard(device_of(query));
  return at::native::native_decoder_only_multi_head_attention(query, key, value, embed_dim, num_head, qkv_weight, qkv_bias, proj_weight, proj_bias, mask, incr_key, incr_value, need_weights, average_attn_weights);
}
} // anonymous namespace
TORCH_LIBRARY_IMPL(aten, NestedTensorCUDA, m) {
    m.impl("native_dropout",
TORCH_FN(wrapper_NestedTensorCUDA__native_dropout));
m.impl("native_dropout_backward",
TORCH_FN(wrapper_NestedTensorCUDA__native_dropout_backward));
m.impl("add.Tensor",
TORCH_FN(wrapper_NestedTensorCUDA_Tensor_add));
m.impl("add_.Tensor",
TORCH_FN(wrapper_NestedTensorCUDA_Tensor_add_));
m.impl("bmm",
TORCH_FN(wrapper_NestedTensorCUDA__bmm));
m.impl("chunk",
TORCH_FN(wrapper_NestedTensorCUDA__chunk));
m.impl("copy_",
TORCH_FN(wrapper_NestedTensorCUDA__copy_));
m.impl("div.Tensor",
TORCH_FN(wrapper_NestedTensorCUDA_Tensor_div));
m.impl("div.Scalar",
TORCH_FN(wrapper_NestedTensorCUDA_Scalar_div));
m.impl("embedding",
TORCH_FN(wrapper_NestedTensorCUDA__embedding));
m.impl("empty_like",
TORCH_FN(wrapper_NestedTensorCUDA__empty_like));
m.impl("fill_.Scalar",
TORCH_FN(wrapper_NestedTensorCUDA_Scalar_fill_));
m.impl("fill_.Tensor",
TORCH_FN(wrapper_NestedTensorCUDA_Tensor_fill_));
m.impl("is_same_size",
TORCH_FN(wrapper_NestedTensorCUDA__is_same_size));
m.impl("native_layer_norm",
TORCH_FN(wrapper_NestedTensorCUDA__native_layer_norm));
m.impl("linear",
TORCH_FN(wrapper_NestedTensorCUDA__linear));
m.impl("linear_backward",
TORCH_FN(wrapper_NestedTensorCUDA__linear_backward));
m.impl("matmul",
TORCH_FN(wrapper_NestedTensorCUDA__matmul));
m.impl("matmul.out",
TORCH_FN(wrapper_NestedTensorCUDA_out_matmul_out));
m.impl("matmul_backward",
TORCH_FN(wrapper_NestedTensorCUDA__matmul_backward));
m.impl("mul.Tensor",
TORCH_FN(wrapper_NestedTensorCUDA_Tensor_mul));
m.impl("mul_.Tensor",
TORCH_FN(wrapper_NestedTensorCUDA_Tensor_mul_));
m.impl("mul.Scalar",
TORCH_FN(wrapper_NestedTensorCUDA_Scalar_mul));
m.impl("mul_.Scalar",
TORCH_FN(wrapper_NestedTensorCUDA_Scalar_mul_));
m.impl("ones_like",
TORCH_FN(wrapper_NestedTensorCUDA__ones_like));
m.impl("neg",
TORCH_FN(wrapper_NestedTensorCUDA__neg));
m.impl("neg_",
TORCH_FN(wrapper_NestedTensorCUDA__neg_));
m.impl("relu",
TORCH_FN(wrapper_NestedTensorCUDA__relu));
m.impl("relu_",
TORCH_FN(wrapper_NestedTensorCUDA__relu_));
m.impl("gelu",
TORCH_FN(wrapper_NestedTensorCUDA__gelu));
m.impl("gelu_",
TORCH_FN(wrapper_NestedTensorCUDA__gelu_));
m.impl("select.int",
TORCH_FN(wrapper_NestedTensorCUDA_int_select));
m.impl("_nested_select_backward",
TORCH_FN(wrapper_NestedTensorCUDA___nested_select_backward));
m.impl("detach",
TORCH_FN(wrapper_NestedTensorCUDA__detach));
m.impl("_softmax",
TORCH_FN(wrapper_NestedTensorCUDA___softmax));
m.impl("_softmax_backward_data",
TORCH_FN(wrapper_NestedTensorCUDA___softmax_backward_data));
m.impl("squeeze",
TORCH_FN(wrapper_NestedTensorCUDA__squeeze));
m.impl("squeeze.dim",
TORCH_FN(wrapper_NestedTensorCUDA_dim_squeeze));
m.impl("squeeze.dims",
TORCH_FN(wrapper_NestedTensorCUDA_dims_squeeze));
m.impl("tanh",
TORCH_FN(wrapper_NestedTensorCUDA__tanh));
m.impl("tanh_",
TORCH_FN(wrapper_NestedTensorCUDA__tanh_));
m.impl("transpose.int",
TORCH_FN(wrapper_NestedTensorCUDA_int_transpose));
m.impl("_transform_bias_rescale_qkv",
TORCH_FN(wrapper_NestedTensorCUDA___transform_bias_rescale_qkv));
m.impl("_nested_tensor_size",
TORCH_FN(wrapper_NestedTensorCUDA___nested_tensor_size));
m.impl("_nested_tensor_strides",
TORCH_FN(wrapper_NestedTensorCUDA___nested_tensor_strides));
m.impl("_nested_tensor_offsets",
TORCH_FN(wrapper_NestedTensorCUDA___nested_tensor_offsets));
m.impl("_nested_from_padded_and_nested_example",
TORCH_FN(wrapper_NestedTensorCUDA___nested_from_padded_and_nested_example));
m.impl("unsqueeze",
TORCH_FN(wrapper_NestedTensorCUDA__unsqueeze));
m.impl("clone",
TORCH_FN(wrapper_NestedTensorCUDA__clone));
m.impl("values",
TORCH_FN(wrapper_NestedTensorCUDA__values));
m.impl("_to_copy",
TORCH_FN(wrapper_NestedTensorCUDA___to_copy));
m.impl("view",
TORCH_FN(wrapper_NestedTensorCUDA__view));
m.impl("_test_autograd_multiple_dispatch.fullcoverage",
TORCH_FN(wrapper_NestedTensorCUDA_fullcoverage__test_autograd_multiple_dispatch));
m.impl("_test_autograd_multiple_dispatch.ntonly",
TORCH_FN(wrapper_NestedTensorCUDA_ntonly__test_autograd_multiple_dispatch));
m.impl("to_padded_tensor",
TORCH_FN(wrapper_NestedTensorCUDA__to_padded_tensor));
m.impl("_nested_tensor_softmax_with_shape",
TORCH_FN(wrapper_NestedTensorCUDA___nested_tensor_softmax_with_shape));
m.impl("_transformer_encoder_layer_fwd",
TORCH_FN(wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd));
m.impl("_native_multi_head_attention",
TORCH_FN(wrapper_NestedTensorCUDA___native_multi_head_attention));
m.impl("_fused_sdp_choice",
TORCH_FN(wrapper_NestedTensorCUDA___fused_sdp_choice));
m.impl("_scaled_dot_product_flash_attention",
TORCH_FN(wrapper_NestedTensorCUDA___scaled_dot_product_flash_attention));
m.impl("_scaled_dot_product_efficient_attention",
TORCH_FN(wrapper_NestedTensorCUDA___scaled_dot_product_efficient_attention));
m.impl("_transformer_decoder_only_layer_fwd",
TORCH_FN(wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd));
m.impl("_native_decoder_only_multi_head_attention",
TORCH_FN(wrapper_NestedTensorCUDA___native_decoder_only_multi_head_attention));
};
} // anonymous namespace
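// Illustrative sketch (not part of the generated file): once the registrations
// above are loaded, calls on CUDA nested tensors route here through the
// dispatcher, e.g. (hypothetical usage, `nt` and `nt2` being nested CUDA tensors):
//
//   at::Tensor activated = at::relu(nt);     // -> wrapper_NestedTensorCUDA__relu
//   at::Tensor product = at::matmul(nt, nt2); // -> wrapper_NestedTensorCUDA__matmul
//
// The at::nestedtensorcuda:: functions below are the generated static-dispatch
// entry points that call the same wrappers directly, bypassing the dispatcher.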
851namespace nestedtensorcuda {
852::std::tuple<at::Tensor,at::Tensor> native_dropout(const at::Tensor & input, double p, c10::optional<bool> train) {
853return wrapper_NestedTensorCUDA__native_dropout(input, p, train);
854}
855at::Tensor native_dropout_backward(const at::Tensor & grad_output, const at::Tensor & mask, double scale) {
856return wrapper_NestedTensorCUDA__native_dropout_backward(grad_output, mask, scale);
857}
858at::Tensor add(const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha) {
859return wrapper_NestedTensorCUDA_Tensor_add(self, other, alpha);
860}
861at::Tensor & add_(at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha) {
862return wrapper_NestedTensorCUDA_Tensor_add_(self, other, alpha);
863}
864at::Tensor bmm(const at::Tensor & self, const at::Tensor & mat2) {
865return wrapper_NestedTensorCUDA__bmm(self, mat2);
866}
867::std::vector<at::Tensor> chunk(const at::Tensor & self, int64_t chunks, int64_t dim) {
868return wrapper_NestedTensorCUDA__chunk(self, chunks, dim);
869}
870at::Tensor & copy_(at::Tensor & self, const at::Tensor & src, bool non_blocking) {
871return wrapper_NestedTensorCUDA__copy_(self, src, non_blocking);
872}
873at::Tensor div(const at::Tensor & self, const at::Tensor & other) {
874return wrapper_NestedTensorCUDA_Tensor_div(self, other);
875}
876at::Tensor div(const at::Tensor & self, const at::Scalar & other) {
877return wrapper_NestedTensorCUDA_Scalar_div(self, other);
878}
879at::Tensor embedding(const at::Tensor & weight, const at::Tensor & indices, int64_t padding_idx, bool scale_grad_by_freq, bool sparse) {
880return wrapper_NestedTensorCUDA__embedding(weight, indices, padding_idx, scale_grad_by_freq, sparse);
881}
882at::Tensor embedding_symint(const at::Tensor & weight, const at::Tensor & indices, c10::SymInt padding_idx, bool scale_grad_by_freq, bool sparse) {
883return wrapper_NestedTensorCUDA__embedding(weight, indices, padding_idx, scale_grad_by_freq, sparse);
884}
885at::Tensor empty_like(const at::Tensor & self, at::TensorOptions options, c10::optional<at::MemoryFormat> memory_format) {
886return wrapper_NestedTensorCUDA__empty_like(self, optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
887}
888at::Tensor empty_like(const at::Tensor & self, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
889return wrapper_NestedTensorCUDA__empty_like(self, dtype, layout, device, pin_memory, memory_format);
890}
891at::Tensor & fill_(at::Tensor & self, const at::Scalar & value) {
892return wrapper_NestedTensorCUDA_Scalar_fill_(self, value);
893}
894at::Tensor & fill_(at::Tensor & self, const at::Tensor & value) {
895return wrapper_NestedTensorCUDA_Tensor_fill_(self, value);
896}
897bool is_same_size(const at::Tensor & self, const at::Tensor & other) {
898return wrapper_NestedTensorCUDA__is_same_size(self, other);
899}
900::std::tuple<at::Tensor,at::Tensor,at::Tensor> native_layer_norm(const at::Tensor & input, at::IntArrayRef normalized_shape, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, double eps) {
901return wrapper_NestedTensorCUDA__native_layer_norm(input, c10::fromIntArrayRefSlow(normalized_shape), weight, bias, eps);
902}
903::std::tuple<at::Tensor,at::Tensor,at::Tensor> native_layer_norm_symint(const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, double eps) {
904return wrapper_NestedTensorCUDA__native_layer_norm(input, normalized_shape, weight, bias, eps);
905}
906at::Tensor linear(const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias) {
907return wrapper_NestedTensorCUDA__linear(input, weight, bias);
908}
909::std::tuple<at::Tensor,at::Tensor,at::Tensor> linear_backward(const at::Tensor & self, const at::Tensor & grad_output, const at::Tensor & weight, ::std::array<bool,3> output_mask) {
910return wrapper_NestedTensorCUDA__linear_backward(self, grad_output, weight, output_mask);
911}
912at::Tensor matmul(const at::Tensor & self, const at::Tensor & other) {
913return wrapper_NestedTensorCUDA__matmul(self, other);
914}
915at::Tensor & matmul_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
916return wrapper_NestedTensorCUDA_out_matmul_out(self, other, out);
917}
918at::Tensor & matmul_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
919return wrapper_NestedTensorCUDA_out_matmul_out(self, other, out);
920}
::std::tuple<at::Tensor,at::Tensor> matmul_backward(const at::Tensor & grad, const at::Tensor & self, const at::Tensor & other, ::std::array<bool,2> mask) {
return wrapper_NestedTensorCUDA__matmul_backward(grad, self, other, mask);
}
at::Tensor mul(const at::Tensor & self, const at::Tensor & other) {
return wrapper_NestedTensorCUDA_Tensor_mul(self, other);
}
at::Tensor & mul_(at::Tensor & self, const at::Tensor & other) {
return wrapper_NestedTensorCUDA_Tensor_mul_(self, other);
}
at::Tensor mul(const at::Tensor & self, const at::Scalar & other) {
return wrapper_NestedTensorCUDA_Scalar_mul(self, other);
}
at::Tensor & mul_(at::Tensor & self, const at::Scalar & other) {
return wrapper_NestedTensorCUDA_Scalar_mul_(self, other);
}
at::Tensor ones_like(const at::Tensor & self, at::TensorOptions options, c10::optional<at::MemoryFormat> memory_format) {
return wrapper_NestedTensorCUDA__ones_like(self, optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
}
at::Tensor ones_like(const at::Tensor & self, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
return wrapper_NestedTensorCUDA__ones_like(self, dtype, layout, device, pin_memory, memory_format);
}
at::Tensor neg(const at::Tensor & self) {
return wrapper_NestedTensorCUDA__neg(self);
}
at::Tensor & neg_(at::Tensor & self) {
return wrapper_NestedTensorCUDA__neg_(self);
}
at::Tensor relu(const at::Tensor & self) {
return wrapper_NestedTensorCUDA__relu(self);
}
at::Tensor & relu_(at::Tensor & self) {
return wrapper_NestedTensorCUDA__relu_(self);
}
at::Tensor gelu(const at::Tensor & self, c10::string_view approximate) {
return wrapper_NestedTensorCUDA__gelu(self, approximate);
}
at::Tensor & gelu_(at::Tensor & self, c10::string_view approximate) {
return wrapper_NestedTensorCUDA__gelu_(self, approximate);
}
at::Tensor select(const at::Tensor & self, int64_t dim, int64_t index) {
return wrapper_NestedTensorCUDA_int_select(self, dim, index);
}
at::Tensor select_symint(const at::Tensor & self, int64_t dim, c10::SymInt index) {
return wrapper_NestedTensorCUDA_int_select(self, dim, index);
}
at::Tensor _nested_select_backward(const at::Tensor & grad_output, const at::Tensor & self, int64_t dim, int64_t index) {
return wrapper_NestedTensorCUDA___nested_select_backward(grad_output, self, dim, index);
}
at::Tensor _nested_select_backward_symint(const at::Tensor & grad_output, const at::Tensor & self, int64_t dim, c10::SymInt index) {
return wrapper_NestedTensorCUDA___nested_select_backward(grad_output, self, dim, index);
}
at::Tensor detach(const at::Tensor & self) {
return wrapper_NestedTensorCUDA__detach(self);
}
at::Tensor _softmax(const at::Tensor & self, int64_t dim, bool half_to_float) {
return wrapper_NestedTensorCUDA___softmax(self, dim, half_to_float);
}
at::Tensor _softmax_backward_data(const at::Tensor & grad_output, const at::Tensor & output, int64_t dim, at::ScalarType input_dtype) {
return wrapper_NestedTensorCUDA___softmax_backward_data(grad_output, output, dim, input_dtype);
}
at::Tensor squeeze(const at::Tensor & self) {
return wrapper_NestedTensorCUDA__squeeze(self);
}
at::Tensor squeeze(const at::Tensor & self, int64_t dim) {
return wrapper_NestedTensorCUDA_dim_squeeze(self, dim);
}
at::Tensor squeeze(const at::Tensor & self, at::IntArrayRef dim) {
return wrapper_NestedTensorCUDA_dims_squeeze(self, dim);
}
at::Tensor tanh(const at::Tensor & self) {
return wrapper_NestedTensorCUDA__tanh(self);
}
at::Tensor & tanh_(at::Tensor & self) {
return wrapper_NestedTensorCUDA__tanh_(self);
}
at::Tensor transpose(const at::Tensor & self, int64_t dim0, int64_t dim1) {
return wrapper_NestedTensorCUDA_int_transpose(self, dim0, dim1);
}
::std::tuple<at::Tensor,at::Tensor,at::Tensor> _transform_bias_rescale_qkv(const at::Tensor & qkv, const at::Tensor & qkv_bias, int64_t num_heads) {
return wrapper_NestedTensorCUDA___transform_bias_rescale_qkv(qkv, qkv_bias, num_heads);
}
at::Tensor _nested_tensor_size(const at::Tensor & self) {
return wrapper_NestedTensorCUDA___nested_tensor_size(self);
}
at::Tensor _nested_tensor_strides(const at::Tensor & self) {
return wrapper_NestedTensorCUDA___nested_tensor_strides(self);
}
::std::vector<int64_t> _nested_tensor_offsets(const at::Tensor & self) {
return wrapper_NestedTensorCUDA___nested_tensor_offsets(self);
}
at::Tensor _nested_from_padded_and_nested_example(const at::Tensor & padded, const at::Tensor & nt_example) {
return wrapper_NestedTensorCUDA___nested_from_padded_and_nested_example(padded, nt_example);
}
at::Tensor unsqueeze(const at::Tensor & self, int64_t dim) {
return wrapper_NestedTensorCUDA__unsqueeze(self, dim);
}
at::Tensor clone(const at::Tensor & self, c10::optional<at::MemoryFormat> memory_format) {
return wrapper_NestedTensorCUDA__clone(self, memory_format);
}
at::Tensor values(const at::Tensor & self) {
return wrapper_NestedTensorCUDA__values(self);
}
at::Tensor _to_copy(const at::Tensor & self, at::TensorOptions options, bool non_blocking, c10::optional<at::MemoryFormat> memory_format) {
return wrapper_NestedTensorCUDA___to_copy(self, optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), non_blocking, c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
}
at::Tensor _to_copy(const at::Tensor & self, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, bool non_blocking, c10::optional<at::MemoryFormat> memory_format) {
return wrapper_NestedTensorCUDA___to_copy(self, dtype, layout, device, pin_memory, non_blocking, memory_format);
}
at::Tensor view(const at::Tensor & self, at::IntArrayRef size) {
return wrapper_NestedTensorCUDA__view(self, c10::fromIntArrayRefSlow(size));
}
at::Tensor view_symint(const at::Tensor & self, c10::SymIntArrayRef size) {
return wrapper_NestedTensorCUDA__view(self, size);
}
at::Tensor _test_autograd_multiple_dispatch(const at::Tensor & self) {
return wrapper_NestedTensorCUDA_fullcoverage__test_autograd_multiple_dispatch(self);
}
at::Tensor _test_autograd_multiple_dispatch(const at::Tensor & self, bool b) {
return wrapper_NestedTensorCUDA_ntonly__test_autograd_multiple_dispatch(self, b);
}
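// NOTE: to_padded_tensor below follows the same symint pattern: the int64_t
// overload converts its optional IntArrayRef output_size into an optional
// SymInt array via c10::fromIntArrayRefSlow before calling the shared wrapper,
// while to_padded_tensor_symint passes output_size through unchanged.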
at::Tensor to_padded_tensor(const at::Tensor & self, double padding, at::OptionalIntArrayRef output_size) {
return wrapper_NestedTensorCUDA__to_padded_tensor(self, padding, output_size.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*output_size)) : c10::nullopt);
}
at::Tensor to_padded_tensor_symint(const at::Tensor & self, double padding, at::OptionalSymIntArrayRef output_size) {
return wrapper_NestedTensorCUDA__to_padded_tensor(self, padding, output_size);
}
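// The remaining entry points cover the fused transformer and scaled dot
// product attention kernels registered for NestedTensorCUDA: the encoder and
// decoder-only layer forwards, multi-head attention, _fused_sdp_choice, and
// the flash / efficient attention variants.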
at::Tensor _nested_tensor_softmax_with_shape(const at::Tensor & self, const at::Tensor & query) {
return wrapper_NestedTensorCUDA___nested_tensor_softmax_with_shape(self, query);
}
at::Tensor _transformer_encoder_layer_fwd(const at::Tensor & src, int64_t embed_dim, int64_t num_heads, const at::Tensor & qkv_weight, const at::Tensor & qkv_bias, const at::Tensor & proj_weight, const at::Tensor & proj_bias, bool use_gelu, bool norm_first, double eps, const at::Tensor & norm_weight_1, const at::Tensor & norm_bias_1, const at::Tensor & norm_weight_2, const at::Tensor & norm_bias_2, const at::Tensor & ffn_weight_1, const at::Tensor & ffn_bias_1, const at::Tensor & ffn_weight_2, const at::Tensor & ffn_bias_2, const c10::optional<at::Tensor> & mask, c10::optional<int64_t> mask_type) {
return wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd(src, embed_dim, num_heads, qkv_weight, qkv_bias, proj_weight, proj_bias, use_gelu, norm_first, eps, norm_weight_1, norm_bias_1, norm_weight_2, norm_bias_2, ffn_weight_1, ffn_bias_1, ffn_weight_2, ffn_bias_2, mask, mask_type);
}
::std::tuple<at::Tensor,at::Tensor> _native_multi_head_attention(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, int64_t embed_dim, int64_t num_head, const at::Tensor & qkv_weight, const at::Tensor & qkv_bias, const at::Tensor & proj_weight, const at::Tensor & proj_bias, const c10::optional<at::Tensor> & mask, bool need_weights, bool average_attn_weights, c10::optional<int64_t> mask_type) {
return wrapper_NestedTensorCUDA___native_multi_head_attention(query, key, value, embed_dim, num_head, qkv_weight, qkv_bias, proj_weight, proj_bias, mask, need_weights, average_attn_weights, mask_type);
}
int64_t _fused_sdp_choice(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const c10::optional<at::Tensor> & attn_mask, double dropout_p, bool is_causal) {
return wrapper_NestedTensorCUDA___fused_sdp_choice(query, key, value, attn_mask, dropout_p, is_causal);
}
::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor,int64_t,int64_t,int64_t,int64_t,at::Tensor> _scaled_dot_product_flash_attention(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, double dropout_p, bool is_causal, bool return_debug_mask) {
return wrapper_NestedTensorCUDA___scaled_dot_product_flash_attention(query, key, value, dropout_p, is_causal, return_debug_mask);
}
::std::tuple<at::Tensor,at::Tensor> _scaled_dot_product_efficient_attention(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, bool compute_log_sumexp, bool is_causal) {
return wrapper_NestedTensorCUDA___scaled_dot_product_efficient_attention(query, key, value, compute_log_sumexp, is_causal);
}
::std::tuple<at::Tensor,at::Tensor,at::Tensor> _transformer_decoder_only_layer_fwd(const at::Tensor & src, int64_t embed_dim, int64_t num_heads, const at::Tensor & qkv_weight, const at::Tensor & qkv_bias, const at::Tensor & proj_weight, const at::Tensor & proj_bias, bool use_gelu, bool norm_first, double eps, const at::Tensor & norm_weight_1, const at::Tensor & norm_bias_1, const at::Tensor & norm_weight_2, const at::Tensor & norm_bias_2, const at::Tensor & ffn_weight_1, const at::Tensor & ffn_bias_1, const at::Tensor & ffn_weight_2, const at::Tensor & ffn_bias_2, const c10::optional<at::Tensor> & mask, const c10::optional<at::Tensor> & incr_key, const c10::optional<at::Tensor> & incr_value) {
return wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd(src, embed_dim, num_heads, qkv_weight, qkv_bias, proj_weight, proj_bias, use_gelu, norm_first, eps, norm_weight_1, norm_bias_1, norm_weight_2, norm_bias_2, ffn_weight_1, ffn_bias_1, ffn_weight_2, ffn_bias_2, mask, incr_key, incr_value);
}
::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor> _native_decoder_only_multi_head_attention(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, int64_t embed_dim, int64_t num_head, const at::Tensor & qkv_weight, const at::Tensor & qkv_bias, const at::Tensor & proj_weight, const at::Tensor & proj_bias, const c10::optional<at::Tensor> & mask, const c10::optional<at::Tensor> & incr_key, const c10::optional<at::Tensor> & incr_value, bool need_weights, bool average_attn_weights) {
return wrapper_NestedTensorCUDA___native_decoder_only_multi_head_attention(query, key, value, embed_dim, num_head, qkv_weight, qkv_bias, proj_weight, proj_bias, mask, incr_key, incr_value, need_weights, average_attn_weights);
}
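// Usage sketch (illustrative only, not part of the generated interface): for a
// nested tensor living on CUDA (e.g. one created from Python with
// torch.nested.nested_tensor(..., device="cuda")), ordinary calls such as
// at::matmul or at::gelu are routed by the dispatcher to the NestedTensorCUDA
// kernels in this file; the at::nestedtensorcuda:: functions above additionally
// allow calling the same wrappers directly from C++ without a dispatcher
// round trip.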
} // namespace nestedtensorcuda
} // namespace at