// Required for old g++ to compile PRId64 macros; see
// https://github.com/pytorch/pytorch/issues/3571 for context.
4 | #ifndef __STDC_FORMAT_MACROS |
5 | #define __STDC_FORMAT_MACROS |
6 | #endif |
7 | |
// An external backend might generate files within its own code tree
// and run clang-format over all source files in that tree, so disable
// clang-format here since the backend might use a different config.
11 | // clang-format off |
12 | |
// NOTE: This condition is true for all PyTorch internal libraries; it
// only excludes external projects such as torch_xla which reuse some of
// the PyTorch codegen machinery.
16 | #if defined(CAFFE2_BUILD_MAIN_LIB) || \ |
17 | defined(TORCH_CUDA_BUILD_MAIN_LIB) || \ |
18 | defined(TORCH_HIP_BUILD_MAIN_LIB) || \ |
19 | defined(TORCH_CUDA_CU_BUILD_MAIN_LIB) || \ |
20 | defined(TORCH_CUDA_CPP_BUILD_MAIN_LIB) |
21 | #define TORCH_ASSERT_ONLY_METHOD_OPERATORS |
22 | #endif |
23 | |
24 | // @generated by torchgen/gen.py from RegisterDispatchKey.cpp |
25 | |
26 | #include <c10/core/TensorImpl.h> |
27 | #include <c10/core/Allocator.h> |
28 | #include <ATen/DeviceGuard.h> |
29 | #include <ATen/NamedTensorUtils.h> |
30 | #include <ATen/Utils.h> |
31 | #include <ATen/WrapDimUtils.h> |
32 | #include <ATen/Dispatch.h> |
33 | #include <c10/util/ExclusivelyOwned.h> |
34 | #include <c10/util/Half.h> |
35 | #include <c10/core/UndefinedTensorImpl.h> |
36 | #include <c10/util/Optional.h> |
37 | #include <ATen/Tensor.h> |
38 | #include <ATen/native/Resize.h> |
39 | |
40 | #include <cstddef> |
41 | #include <functional> |
42 | #include <memory> |
43 | #include <utility> |
44 | |
45 | #include <ATen/Config.h> |
46 | #include <ATen/core/op_registration/adaption.h> |
47 | #include <torch/library.h> |
48 | #include <c10/cuda/CUDAGuard.h> |
49 | #include <ATen/cuda/ATenCUDAGeneral.h> |
50 | #include <ATen/cuda/CUDADevice.h> |
51 | #include <ATen/cuda/CUDAContext.h> |
52 | |
53 | #include <ATen/ops/as_strided_native.h> |
54 | #include <ATen/ops/empty.h> |
55 | #include <ATen/ops/empty_strided.h> |
56 | #include <ATen/ops/_copy_from_and_resize.h> |
57 | #include <ATen/ops/_copy_from.h> |
58 | #include <ATen/ops/_fused_sdp_choice_native.h> |
59 | #include <ATen/ops/_native_decoder_only_multi_head_attention_native.h> |
60 | #include <ATen/ops/_native_multi_head_attention_native.h> |
61 | #include <ATen/ops/_nested_from_padded_and_nested_example_native.h> |
62 | #include <ATen/ops/_nested_select_backward_native.h> |
63 | #include <ATen/ops/_nested_tensor_offsets_native.h> |
64 | #include <ATen/ops/_nested_tensor_size_native.h> |
65 | #include <ATen/ops/_nested_tensor_softmax_with_shape_native.h> |
66 | #include <ATen/ops/_nested_tensor_strides_native.h> |
67 | #include <ATen/ops/_scaled_dot_product_efficient_attention_native.h> |
68 | #include <ATen/ops/_scaled_dot_product_flash_attention_native.h> |
69 | #include <ATen/ops/_softmax_backward_data_native.h> |
70 | #include <ATen/ops/_softmax_native.h> |
71 | #include <ATen/ops/_test_autograd_multiple_dispatch_native.h> |
72 | #include <ATen/ops/_to_copy_native.h> |
73 | #include <ATen/ops/_transform_bias_rescale_qkv_native.h> |
74 | #include <ATen/ops/_transformer_decoder_only_layer_fwd_native.h> |
75 | #include <ATen/ops/_transformer_encoder_layer_fwd_native.h> |
76 | #include <ATen/ops/add_native.h> |
77 | #include <ATen/ops/bmm_native.h> |
78 | #include <ATen/ops/chunk_native.h> |
79 | #include <ATen/ops/clone_native.h> |
80 | #include <ATen/ops/copy_native.h> |
81 | #include <ATen/ops/detach_native.h> |
82 | #include <ATen/ops/div_native.h> |
83 | #include <ATen/ops/embedding_native.h> |
84 | #include <ATen/ops/empty_like_native.h> |
85 | #include <ATen/ops/fill_native.h> |
86 | #include <ATen/ops/gelu_native.h> |
87 | #include <ATen/ops/is_same_size_native.h> |
88 | #include <ATen/ops/linear_backward_native.h> |
89 | #include <ATen/ops/linear_native.h> |
90 | #include <ATen/ops/matmul_backward_native.h> |
91 | #include <ATen/ops/matmul_native.h> |
92 | #include <ATen/ops/mul_native.h> |
93 | #include <ATen/ops/native_dropout_backward_native.h> |
94 | #include <ATen/ops/native_dropout_native.h> |
95 | #include <ATen/ops/native_layer_norm_native.h> |
96 | #include <ATen/ops/neg_native.h> |
97 | #include <ATen/ops/ones_like_native.h> |
98 | #include <ATen/ops/relu_native.h> |
99 | #include <ATen/ops/select_native.h> |
100 | #include <ATen/ops/squeeze_native.h> |
101 | #include <ATen/ops/tanh_native.h> |
102 | #include <ATen/ops/to_padded_tensor_native.h> |
103 | #include <ATen/ops/transpose_native.h> |
104 | #include <ATen/ops/unsqueeze_native.h> |
105 | #include <ATen/ops/values_native.h> |
106 | #include <ATen/ops/view_native.h> |
107 | |
108 | // See template file RegisterDispatchDefinitions.ini |
109 | namespace at { |
// NB: TORCH_LIBRARY_IMPL must be in an anonymous namespace to avoid
// ambiguity with conflicting identifiers that may already have been
// defined in the at namespace.
113 | namespace { |
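// resize_out resizes `out` to `sizes`, checking that its dtype and device
// match the given TensorOptions. Strides (or an explicit memory format) are
// only applied when a resize actually occurred; otherwise the output keeps
// its preexisting strides.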
114 | void resize_out(const Tensor &out, IntArrayRef sizes, IntArrayRef strides, const TensorOptions &options) { |
TORCH_CHECK(options.dtype() == out.dtype(),
"Expected out tensor to have dtype ", options.dtype(), ", but got ", out.dtype(), " instead");
TORCH_CHECK(options.device() == out.device(),
"Expected out tensor to have device ", options.device(), ", but got ", out.device(), " instead");
119 | const bool resized = at::native::resize_output(out, sizes); |
120 | // Only restride if a resize occurred; otherwise we ignore the (advisory) |
121 | // strides from the meta function and directly use the output tensor's |
122 | // preexisting strides |
123 | if (resized) { |
124 | if (!strides.empty()) { |
125 | TORCH_INTERNAL_ASSERT(!options.memory_format_opt().has_value()); |
126 | // TODO: avoid the redispatch here |
127 | out.as_strided_(sizes, strides); |
128 | } else if (options.memory_format_opt().has_value()) { |
129 | out.unsafeGetTensorImpl()->empty_tensor_restride(*options.memory_format_opt()); |
130 | } |
131 | } |
132 | } |
133 | void check_inplace(const Tensor &self, IntArrayRef sizes, const TensorOptions &options) { |
134 | // These checks are needed on those operators that: |
135 | // 1) don't use 'TensorIterator' (e.g. 'addmm' and 'baddbmm') |
136 | // 2) have particular typing rules (e.g. 'cumsum' and 'cumprod') |
137 | // For other operators (e.g. 'add'), 'TensorIterator' already checks |
138 | // these things separately. |
TORCH_CHECK(options.dtype() == self.dtype(),
"Bad in-place call: ",
"input tensor dtype ", self.dtype(), " and output tensor dtype ", options.dtype(), " should match");
TORCH_CHECK(options.device() == self.device(),
"Bad in-place call: ",
"input tensor device ", self.device(), " and output tensor device ", options.device(), " should match");
TORCH_CHECK(sizes == self.sizes(),
"Bad in-place call: ",
"input tensor size ", self.sizes(), " and output tensor size ", sizes, " should match");
148 | } |
149 | namespace { |
150 | ::std::tuple<at::Tensor,at::Tensor> wrapper_NestedTensorCUDA__native_dropout(const at::Tensor & input, double p, c10::optional<bool> train) { |
151 | c10::optional<Device> common_device = nullopt; |
152 | (void)common_device; // Suppress unused variable warning |
153 | c10::impl::check_and_update_common_device(common_device, input, "wrapper_NestedTensorCUDA__native_dropout" , "input" ); |
154 | const OptionalDeviceGuard device_guard(device_of(input)); |
155 | return at::native::native_dropout_nested(input, p, train); |
156 | } |
157 | } // anonymous namespace |
158 | namespace { |
159 | at::Tensor wrapper_NestedTensorCUDA__native_dropout_backward(const at::Tensor & grad_output, const at::Tensor & mask, double scale) { |
160 | c10::optional<Device> common_device = nullopt; |
161 | (void)common_device; // Suppress unused variable warning |
162 | c10::impl::check_and_update_common_device(common_device, grad_output, "wrapper_NestedTensorCUDA__native_dropout_backward" , "grad_output" ); |
163 | c10::impl::check_and_update_common_device(common_device, mask, "wrapper_NestedTensorCUDA__native_dropout_backward" , "mask" ); |
164 | const OptionalDeviceGuard device_guard(device_of(grad_output)); |
165 | return at::native::native_dropout_backward(grad_output, mask, scale); |
166 | } |
167 | } // anonymous namespace |
168 | namespace { |
169 | at::Tensor wrapper_NestedTensorCUDA_Tensor_add(const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha) { |
170 | // No device check |
171 | const OptionalDeviceGuard device_guard(device_of(self)); |
172 | return at::native::NestedTensor_add_Tensor(self, other, alpha); |
173 | } |
174 | } // anonymous namespace |
175 | namespace { |
176 | at::Tensor & wrapper_NestedTensorCUDA_Tensor_add_(at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha) { |
177 | // No device check |
178 | const OptionalDeviceGuard device_guard(device_of(self)); |
179 | return at::native::NestedTensor_add__Tensor(self, other, alpha); |
180 | } |
181 | } // anonymous namespace |
182 | namespace { |
183 | at::Tensor wrapper_NestedTensorCUDA__bmm(const at::Tensor & self, const at::Tensor & mat2) { |
184 | c10::optional<Device> common_device = nullopt; |
185 | (void)common_device; // Suppress unused variable warning |
186 | c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA__bmm" , "self" ); |
187 | c10::impl::check_and_update_common_device(common_device, mat2, "wrapper_NestedTensorCUDA__bmm" , "mat2" ); |
188 | const OptionalDeviceGuard device_guard(device_of(self)); |
189 | return at::native::bmm_nested_cuda(self, mat2); |
190 | } |
191 | } // anonymous namespace |
192 | namespace { |
193 | ::std::vector<at::Tensor> wrapper_NestedTensorCUDA__chunk(const at::Tensor & self, int64_t chunks, int64_t dim) { |
194 | // No device check |
195 | // DeviceGuard omitted |
196 | return at::native::chunk_nested_tensor(self, chunks, dim); |
197 | } |
198 | } // anonymous namespace |
199 | namespace { |
200 | at::Tensor & wrapper_NestedTensorCUDA__copy_(at::Tensor & self, const at::Tensor & src, bool non_blocking) { |
201 | // No device check |
202 | // DeviceGuard omitted |
203 | return at::native::copy_nested_(self, src, non_blocking); |
204 | } |
205 | } // anonymous namespace |
206 | namespace { |
207 | at::Tensor wrapper_NestedTensorCUDA_Tensor_div(const at::Tensor & self, const at::Tensor & other) { |
208 | // No device check |
209 | const OptionalDeviceGuard device_guard(device_of(self)); |
210 | return at::native::NestedTensor_div_Tensor(self, other); |
211 | } |
212 | } // anonymous namespace |
213 | namespace { |
214 | at::Tensor wrapper_NestedTensorCUDA_Scalar_div(const at::Tensor & self, const at::Scalar & other) { |
215 | // No device check |
216 | const OptionalDeviceGuard device_guard(device_of(self)); |
217 | return at::native::NestedTensor_div_Scalar(self, other); |
218 | } |
219 | } // anonymous namespace |
220 | namespace { |
221 | at::Tensor wrapper_NestedTensorCUDA__embedding(const at::Tensor & weight, const at::Tensor & indices, c10::SymInt padding_idx, bool scale_grad_by_freq, bool sparse) { |
222 | c10::optional<Device> common_device = nullopt; |
223 | (void)common_device; // Suppress unused variable warning |
224 | c10::impl::check_and_update_common_device(common_device, weight, "wrapper_NestedTensorCUDA__embedding" , "weight" ); |
225 | c10::impl::check_and_update_common_device(common_device, indices, "wrapper_NestedTensorCUDA__embedding" , "indices" ); |
226 | const OptionalDeviceGuard device_guard(device_of(weight)); |
227 | return at::native::NestedTensor_embedding(weight, indices, padding_idx.expect_int(), scale_grad_by_freq, sparse); |
228 | } |
229 | } // anonymous namespace |
230 | namespace { |
231 | at::Tensor wrapper_NestedTensorCUDA__empty_like(const at::Tensor & self, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) { |
232 | // No device check |
233 | // DeviceGuard omitted |
234 | return at::native::empty_like_nested(self, dtype, layout, device, pin_memory, memory_format); |
235 | } |
236 | } // anonymous namespace |
237 | namespace { |
238 | at::Tensor & wrapper_NestedTensorCUDA_Scalar_fill_(at::Tensor & self, const at::Scalar & value) { |
239 | // No device check |
240 | const OptionalDeviceGuard device_guard(device_of(self)); |
241 | return at::native::fill_nested_(self, value); |
242 | } |
243 | } // anonymous namespace |
244 | namespace { |
245 | at::Tensor & wrapper_NestedTensorCUDA_Tensor_fill_(at::Tensor & self, const at::Tensor & value) { |
246 | // No device check |
247 | const OptionalDeviceGuard device_guard(device_of(self)); |
248 | return at::native::fill_nested_(self, value); |
249 | } |
250 | } // anonymous namespace |
251 | namespace { |
252 | bool wrapper_NestedTensorCUDA__is_same_size(const at::Tensor & self, const at::Tensor & other) { |
253 | // No device check |
254 | // DeviceGuard omitted |
255 | return at::native::nested_is_same_size(self, other); |
256 | } |
257 | } // anonymous namespace |
258 | namespace { |
259 | ::std::tuple<at::Tensor,at::Tensor,at::Tensor> wrapper_NestedTensorCUDA__native_layer_norm(const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, double eps) { |
260 | c10::optional<Device> common_device = nullopt; |
261 | (void)common_device; // Suppress unused variable warning |
262 | c10::impl::check_and_update_common_device(common_device, input, "wrapper_NestedTensorCUDA__native_layer_norm" , "input" ); |
263 | c10::impl::check_and_update_common_device(common_device, weight, "wrapper_NestedTensorCUDA__native_layer_norm" , "weight" ); |
264 | c10::impl::check_and_update_common_device(common_device, bias, "wrapper_NestedTensorCUDA__native_layer_norm" , "bias" ); |
265 | const OptionalDeviceGuard device_guard(device_of(input)); |
266 | return at::native::nested_layer_norm(input, C10_AS_INTARRAYREF_SLOW(normalized_shape), weight, bias, eps); |
267 | } |
268 | } // anonymous namespace |
269 | namespace { |
270 | at::Tensor wrapper_NestedTensorCUDA__linear(const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias) { |
271 | c10::optional<Device> common_device = nullopt; |
272 | (void)common_device; // Suppress unused variable warning |
273 | c10::impl::check_and_update_common_device(common_device, input, "wrapper_NestedTensorCUDA__linear" , "input" ); |
274 | c10::impl::check_and_update_common_device(common_device, weight, "wrapper_NestedTensorCUDA__linear" , "weight" ); |
275 | c10::impl::check_and_update_common_device(common_device, bias, "wrapper_NestedTensorCUDA__linear" , "bias" ); |
276 | const OptionalDeviceGuard device_guard(device_of(input)); |
277 | return at::native::nested_linear(input, weight, bias); |
278 | } |
279 | } // anonymous namespace |
280 | namespace { |
281 | ::std::tuple<at::Tensor,at::Tensor,at::Tensor> wrapper_NestedTensorCUDA__linear_backward(const at::Tensor & self, const at::Tensor & grad_output, const at::Tensor & weight, ::std::array<bool,3> output_mask) { |
282 | c10::optional<Device> common_device = nullopt; |
283 | (void)common_device; // Suppress unused variable warning |
284 | c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA__linear_backward" , "self" ); |
285 | c10::impl::check_and_update_common_device(common_device, grad_output, "wrapper_NestedTensorCUDA__linear_backward" , "grad_output" ); |
286 | c10::impl::check_and_update_common_device(common_device, weight, "wrapper_NestedTensorCUDA__linear_backward" , "weight" ); |
287 | const OptionalDeviceGuard device_guard(device_of(self)); |
288 | return at::native::nested_linear_backward(self, grad_output, weight, output_mask); |
289 | } |
290 | } // anonymous namespace |
291 | namespace { |
292 | at::Tensor wrapper_NestedTensorCUDA__matmul(const at::Tensor & self, const at::Tensor & other) { |
293 | c10::optional<Device> common_device = nullopt; |
294 | (void)common_device; // Suppress unused variable warning |
295 | c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA__matmul" , "self" ); |
296 | c10::impl::check_and_update_common_device(common_device, other, "wrapper_NestedTensorCUDA__matmul" , "other" ); |
297 | const OptionalDeviceGuard device_guard(device_of(self)); |
298 | return at::native::matmul_nested(self, other); |
299 | } |
300 | } // anonymous namespace |
301 | namespace { |
302 | at::Tensor & wrapper_NestedTensorCUDA_out_matmul_out(const at::Tensor & self, const at::Tensor & other, at::Tensor & out) { |
303 | c10::optional<Device> common_device = nullopt; |
304 | (void)common_device; // Suppress unused variable warning |
305 | c10::impl::check_and_update_common_device(common_device, out, "wrapper_NestedTensorCUDA_out_matmul_out" , "out" ); |
306 | c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA_out_matmul_out" , "self" ); |
307 | c10::impl::check_and_update_common_device(common_device, other, "wrapper_NestedTensorCUDA_out_matmul_out" , "other" ); |
308 | const OptionalDeviceGuard device_guard(device_of(self)); |
309 | return at::native::matmul_out_nested(self, other, out); |
310 | } |
311 | } // anonymous namespace |
312 | namespace { |
313 | ::std::tuple<at::Tensor,at::Tensor> wrapper_NestedTensorCUDA__matmul_backward(const at::Tensor & grad, const at::Tensor & self, const at::Tensor & other, ::std::array<bool,2> mask) { |
314 | c10::optional<Device> common_device = nullopt; |
315 | (void)common_device; // Suppress unused variable warning |
316 | c10::impl::check_and_update_common_device(common_device, grad, "wrapper_NestedTensorCUDA__matmul_backward" , "grad" ); |
317 | c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA__matmul_backward" , "self" ); |
318 | c10::impl::check_and_update_common_device(common_device, other, "wrapper_NestedTensorCUDA__matmul_backward" , "other" ); |
319 | const OptionalDeviceGuard device_guard(device_of(self)); |
320 | return at::native::matmul_backward_nested(grad, self, other, mask); |
321 | } |
322 | } // anonymous namespace |
323 | namespace { |
324 | at::Tensor wrapper_NestedTensorCUDA_Tensor_mul(const at::Tensor & self, const at::Tensor & other) { |
325 | // No device check |
326 | const OptionalDeviceGuard device_guard(device_of(self)); |
327 | return at::native::NestedTensor_mul_Tensor(self, other); |
328 | } |
329 | } // anonymous namespace |
330 | namespace { |
331 | at::Tensor & wrapper_NestedTensorCUDA_Tensor_mul_(at::Tensor & self, const at::Tensor & other) { |
332 | // No device check |
333 | const OptionalDeviceGuard device_guard(device_of(self)); |
334 | return at::native::NestedTensor_mul__Tensor(self, other); |
335 | } |
336 | } // anonymous namespace |
337 | namespace { |
338 | at::Tensor wrapper_NestedTensorCUDA_Scalar_mul(const at::Tensor & self, const at::Scalar & other) { |
339 | // No device check |
340 | const OptionalDeviceGuard device_guard(device_of(self)); |
341 | return at::native::NestedTensor_mul_Scalar(self, other); |
342 | } |
343 | } // anonymous namespace |
344 | namespace { |
345 | at::Tensor & wrapper_NestedTensorCUDA_Scalar_mul_(at::Tensor & self, const at::Scalar & other) { |
346 | // No device check |
347 | const OptionalDeviceGuard device_guard(device_of(self)); |
348 | return at::native::NestedTensor_mul__Scalar(self, other); |
349 | } |
350 | } // anonymous namespace |
351 | namespace { |
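// ones_like is factory-like: it may allocate on an explicitly requested
// device, so CUDA is initialized lazily and the guard uses the requested
// (or default) device rather than the device of `self`.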
352 | at::Tensor wrapper_NestedTensorCUDA__ones_like(const at::Tensor & self, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) { |
353 | c10::optional<Device> common_device = nullopt; |
354 | (void)common_device; // Suppress unused variable warning |
355 | c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA__ones_like" , "self" ); |
356 | globalContext().lazyInitCUDA(); |
357 | const DeviceGuard device_guard(device_or_default(device)); |
358 | return at::native::ones_like(self, dtype, layout, device, pin_memory, memory_format); |
359 | } |
360 | } // anonymous namespace |
361 | namespace { |
362 | at::Tensor wrapper_NestedTensorCUDA__neg(const at::Tensor & self) { |
363 | // No device check |
364 | const OptionalDeviceGuard device_guard(device_of(self)); |
365 | return at::native::NestedTensor_neg(self); |
366 | } |
367 | } // anonymous namespace |
368 | namespace { |
369 | at::Tensor & wrapper_NestedTensorCUDA__neg_(at::Tensor & self) { |
370 | // No device check |
371 | const OptionalDeviceGuard device_guard(device_of(self)); |
372 | return at::native::NestedTensor_neg_(self); |
373 | } |
374 | } // anonymous namespace |
375 | namespace { |
376 | at::Tensor wrapper_NestedTensorCUDA__relu(const at::Tensor & self) { |
377 | // No device check |
378 | const OptionalDeviceGuard device_guard(device_of(self)); |
379 | return at::native::NestedTensor_relu(self); |
380 | } |
381 | } // anonymous namespace |
382 | namespace { |
383 | at::Tensor & wrapper_NestedTensorCUDA__relu_(at::Tensor & self) { |
384 | // No device check |
385 | const OptionalDeviceGuard device_guard(device_of(self)); |
386 | return at::native::NestedTensor_relu_(self); |
387 | } |
388 | } // anonymous namespace |
389 | namespace { |
390 | at::Tensor wrapper_NestedTensorCUDA__gelu(const at::Tensor & self, c10::string_view approximate) { |
391 | // No device check |
392 | const OptionalDeviceGuard device_guard(device_of(self)); |
393 | return at::native::NestedTensor_gelu(self, approximate); |
394 | } |
395 | } // anonymous namespace |
396 | namespace { |
397 | at::Tensor & wrapper_NestedTensorCUDA__gelu_(at::Tensor & self, c10::string_view approximate) { |
398 | // No device check |
399 | const OptionalDeviceGuard device_guard(device_of(self)); |
400 | return at::native::NestedTensor_gelu_(self, approximate); |
401 | } |
402 | } // anonymous namespace |
403 | namespace { |
404 | at::Tensor wrapper_NestedTensorCUDA_int_select(const at::Tensor & self, int64_t dim, c10::SymInt index) { |
405 | // No device check |
406 | // DeviceGuard omitted |
407 | return at::native::select_nested(self, dim, index.expect_int()); |
408 | } |
409 | } // anonymous namespace |
410 | namespace { |
411 | at::Tensor wrapper_NestedTensorCUDA___nested_select_backward(const at::Tensor & grad_output, const at::Tensor & self, int64_t dim, c10::SymInt index) { |
412 | // No device check |
413 | // DeviceGuard omitted |
414 | return at::native::_nested_select_backward_symint(grad_output, self, dim, index); |
415 | } |
416 | } // anonymous namespace |
417 | namespace { |
418 | at::Tensor wrapper_NestedTensorCUDA__detach(const at::Tensor & self) { |
419 | c10::optional<Device> common_device = nullopt; |
420 | (void)common_device; // Suppress unused variable warning |
421 | c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA__detach" , "self" ); |
422 | const OptionalDeviceGuard device_guard(device_of(self)); |
423 | return at::native::detach(self); |
424 | } |
425 | } // anonymous namespace |
426 | namespace { |
427 | at::Tensor wrapper_NestedTensorCUDA___softmax(const at::Tensor & self, int64_t dim, bool half_to_float) { |
428 | c10::optional<Device> common_device = nullopt; |
429 | (void)common_device; // Suppress unused variable warning |
430 | c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA___softmax" , "self" ); |
431 | const OptionalDeviceGuard device_guard(device_of(self)); |
432 | return at::native::softmax_nested(self, dim, half_to_float); |
433 | } |
434 | } // anonymous namespace |
435 | namespace { |
436 | at::Tensor wrapper_NestedTensorCUDA___softmax_backward_data(const at::Tensor & grad_output, const at::Tensor & output, int64_t dim, at::ScalarType input_dtype) { |
437 | c10::optional<Device> common_device = nullopt; |
438 | (void)common_device; // Suppress unused variable warning |
439 | c10::impl::check_and_update_common_device(common_device, grad_output, "wrapper_NestedTensorCUDA___softmax_backward_data" , "grad_output" ); |
440 | c10::impl::check_and_update_common_device(common_device, output, "wrapper_NestedTensorCUDA___softmax_backward_data" , "output" ); |
441 | const OptionalDeviceGuard device_guard(device_of(grad_output)); |
442 | return at::native::nested_softmax_backward(grad_output, output, dim, input_dtype); |
443 | } |
444 | } // anonymous namespace |
445 | namespace { |
446 | at::Tensor wrapper_NestedTensorCUDA__squeeze(const at::Tensor & self) { |
447 | // No device check |
448 | // DeviceGuard omitted |
449 | return at::native::squeeze_nested(self); |
450 | } |
451 | } // anonymous namespace |
452 | namespace { |
453 | at::Tensor wrapper_NestedTensorCUDA_dim_squeeze(const at::Tensor & self, int64_t dim) { |
454 | // No device check |
455 | // DeviceGuard omitted |
456 | return at::native::squeeze_dim_nested(self, dim); |
457 | } |
458 | } // anonymous namespace |
459 | namespace { |
460 | at::Tensor wrapper_NestedTensorCUDA_dims_squeeze(const at::Tensor & self, at::IntArrayRef dim) { |
461 | // No device check |
462 | // DeviceGuard omitted |
463 | return at::native::squeeze_dim_nested(self, dim); |
464 | } |
465 | } // anonymous namespace |
466 | namespace { |
467 | at::Tensor wrapper_NestedTensorCUDA__tanh(const at::Tensor & self) { |
468 | // No device check |
469 | const OptionalDeviceGuard device_guard(device_of(self)); |
470 | return at::native::NestedTensor_tanh(self); |
471 | } |
472 | } // anonymous namespace |
473 | namespace { |
474 | at::Tensor & wrapper_NestedTensorCUDA__tanh_(at::Tensor & self) { |
475 | // No device check |
476 | const OptionalDeviceGuard device_guard(device_of(self)); |
477 | return at::native::NestedTensor_tanh_(self); |
478 | } |
479 | } // anonymous namespace |
480 | namespace { |
481 | at::Tensor wrapper_NestedTensorCUDA_int_transpose(const at::Tensor & self, int64_t dim0, int64_t dim1) { |
482 | // No device check |
483 | // DeviceGuard omitted |
484 | return at::native::transpose_nested(self, dim0, dim1); |
485 | } |
486 | } // anonymous namespace |
487 | namespace { |
488 | ::std::tuple<at::Tensor,at::Tensor,at::Tensor> wrapper_NestedTensorCUDA___transform_bias_rescale_qkv(const at::Tensor & qkv, const at::Tensor & qkv_bias, int64_t num_heads) { |
489 | c10::optional<Device> common_device = nullopt; |
490 | (void)common_device; // Suppress unused variable warning |
491 | c10::impl::check_and_update_common_device(common_device, qkv, "wrapper_NestedTensorCUDA___transform_bias_rescale_qkv" , "qkv" ); |
492 | c10::impl::check_and_update_common_device(common_device, qkv_bias, "wrapper_NestedTensorCUDA___transform_bias_rescale_qkv" , "qkv_bias" ); |
493 | const OptionalDeviceGuard device_guard(device_of(qkv)); |
494 | return at::native::transform_bias_rescale_qkv_cuda(qkv, qkv_bias, num_heads); |
495 | } |
496 | } // anonymous namespace |
497 | namespace { |
498 | at::Tensor wrapper_NestedTensorCUDA___nested_tensor_size(const at::Tensor & self) { |
499 | c10::optional<Device> common_device = nullopt; |
500 | (void)common_device; // Suppress unused variable warning |
501 | c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA___nested_tensor_size" , "self" ); |
502 | const OptionalDeviceGuard device_guard(device_of(self)); |
503 | return at::native::_nested_tensor_size(self); |
504 | } |
505 | } // anonymous namespace |
506 | namespace { |
507 | at::Tensor wrapper_NestedTensorCUDA___nested_tensor_strides(const at::Tensor & self) { |
508 | c10::optional<Device> common_device = nullopt; |
509 | (void)common_device; // Suppress unused variable warning |
510 | c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA___nested_tensor_strides" , "self" ); |
511 | const OptionalDeviceGuard device_guard(device_of(self)); |
512 | return at::native::_nested_tensor_strides(self); |
513 | } |
514 | } // anonymous namespace |
515 | namespace { |
516 | ::std::vector<int64_t> wrapper_NestedTensorCUDA___nested_tensor_offsets(const at::Tensor & self) { |
517 | c10::optional<Device> common_device = nullopt; |
518 | (void)common_device; // Suppress unused variable warning |
519 | c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA___nested_tensor_offsets" , "self" ); |
520 | const OptionalDeviceGuard device_guard(device_of(self)); |
521 | return at::native::_nested_tensor_offsets(self); |
522 | } |
523 | } // anonymous namespace |
524 | namespace { |
525 | at::Tensor wrapper_NestedTensorCUDA___nested_from_padded_and_nested_example(const at::Tensor & padded, const at::Tensor & nt_example) { |
526 | c10::optional<Device> common_device = nullopt; |
527 | (void)common_device; // Suppress unused variable warning |
528 | c10::impl::check_and_update_common_device(common_device, padded, "wrapper_NestedTensorCUDA___nested_from_padded_and_nested_example" , "padded" ); |
529 | c10::impl::check_and_update_common_device(common_device, nt_example, "wrapper_NestedTensorCUDA___nested_from_padded_and_nested_example" , "nt_example" ); |
530 | const OptionalDeviceGuard device_guard(device_of(padded)); |
531 | return at::native::NestedTensor_from_padded_and_nested_example(padded, nt_example); |
532 | } |
533 | } // anonymous namespace |
534 | namespace { |
535 | at::Tensor wrapper_NestedTensorCUDA__unsqueeze(const at::Tensor & self, int64_t dim) { |
536 | // No device check |
537 | // DeviceGuard omitted |
538 | return at::native::unsqueeze_nested(self, dim); |
539 | } |
540 | } // anonymous namespace |
541 | namespace { |
542 | at::Tensor wrapper_NestedTensorCUDA__clone(const at::Tensor & self, c10::optional<at::MemoryFormat> memory_format) { |
543 | c10::optional<Device> common_device = nullopt; |
544 | (void)common_device; // Suppress unused variable warning |
545 | c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA__clone" , "self" ); |
546 | const OptionalDeviceGuard device_guard(device_of(self)); |
547 | return at::native::clone_nested(self, memory_format); |
548 | } |
549 | } // anonymous namespace |
550 | namespace { |
551 | at::Tensor wrapper_NestedTensorCUDA__values(const at::Tensor & self) { |
552 | // No device check |
553 | // DeviceGuard omitted |
554 | return at::native::values_nested(self); |
555 | } |
556 | } // anonymous namespace |
557 | namespace { |
558 | at::Tensor wrapper_NestedTensorCUDA___to_copy(const at::Tensor & self, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, bool non_blocking, c10::optional<at::MemoryFormat> memory_format) { |
559 | // No device check |
560 | // DeviceGuard omitted |
561 | return at::native::_to_copy_nested(self, dtype, layout, device, pin_memory, non_blocking, memory_format); |
562 | } |
563 | } // anonymous namespace |
564 | namespace { |
565 | at::Tensor wrapper_NestedTensorCUDA__view(const at::Tensor & self, c10::SymIntArrayRef size) { |
566 | // No device check |
567 | // DeviceGuard omitted |
568 | return at::native::view_nested(self, C10_AS_INTARRAYREF_SLOW(size)); |
569 | } |
570 | } // anonymous namespace |
571 | namespace { |
572 | at::Tensor wrapper_NestedTensorCUDA_fullcoverage__test_autograd_multiple_dispatch(const at::Tensor & self) { |
573 | c10::optional<Device> common_device = nullopt; |
574 | (void)common_device; // Suppress unused variable warning |
575 | c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA_fullcoverage__test_autograd_multiple_dispatch" , "self" ); |
576 | const OptionalDeviceGuard device_guard(device_of(self)); |
577 | return at::native::_test_autograd_multiple_dispatch_fullcoverage(self); |
578 | } |
579 | } // anonymous namespace |
580 | namespace { |
581 | at::Tensor wrapper_NestedTensorCUDA_ntonly__test_autograd_multiple_dispatch(const at::Tensor & self, bool b) { |
582 | c10::optional<Device> common_device = nullopt; |
583 | (void)common_device; // Suppress unused variable warning |
584 | c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA_ntonly__test_autograd_multiple_dispatch" , "self" ); |
585 | const OptionalDeviceGuard device_guard(device_of(self)); |
586 | return at::native::_test_autograd_multiple_dispatch_ntonly(self, b); |
587 | } |
588 | } // anonymous namespace |
589 | namespace { |
590 | at::Tensor wrapper_NestedTensorCUDA__to_padded_tensor(const at::Tensor & self, double padding, at::OptionalSymIntArrayRef output_size) { |
591 | c10::optional<Device> common_device = nullopt; |
592 | (void)common_device; // Suppress unused variable warning |
593 | c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA__to_padded_tensor" , "self" ); |
594 | const OptionalDeviceGuard device_guard(device_of(self)); |
595 | return at::native::NestedTensor_to_padded_tensor_cuda(self, padding, output_size.has_value() ? c10::make_optional(C10_AS_INTARRAYREF_SLOW(*output_size)) : c10::nullopt); |
596 | } |
597 | } // anonymous namespace |
598 | namespace { |
599 | at::Tensor wrapper_NestedTensorCUDA___nested_tensor_softmax_with_shape(const at::Tensor & self, const at::Tensor & query) { |
600 | c10::optional<Device> common_device = nullopt; |
601 | (void)common_device; // Suppress unused variable warning |
602 | c10::impl::check_and_update_common_device(common_device, self, "wrapper_NestedTensorCUDA___nested_tensor_softmax_with_shape" , "self" ); |
603 | c10::impl::check_and_update_common_device(common_device, query, "wrapper_NestedTensorCUDA___nested_tensor_softmax_with_shape" , "query" ); |
604 | const OptionalDeviceGuard device_guard(device_of(self)); |
605 | return at::native::NestedTensor_softmax_dropout_cuda(self, query); |
606 | } |
607 | } // anonymous namespace |
608 | namespace { |
609 | at::Tensor wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd(const at::Tensor & src, int64_t embed_dim, int64_t num_heads, const at::Tensor & qkv_weight, const at::Tensor & qkv_bias, const at::Tensor & proj_weight, const at::Tensor & proj_bias, bool use_gelu, bool norm_first, double eps, const at::Tensor & norm_weight_1, const at::Tensor & norm_bias_1, const at::Tensor & norm_weight_2, const at::Tensor & norm_bias_2, const at::Tensor & ffn_weight_1, const at::Tensor & ffn_bias_1, const at::Tensor & ffn_weight_2, const at::Tensor & ffn_bias_2, const c10::optional<at::Tensor> & mask, c10::optional<int64_t> mask_type) { |
610 | c10::optional<Device> common_device = nullopt; |
611 | (void)common_device; // Suppress unused variable warning |
612 | c10::impl::check_and_update_common_device(common_device, src, "wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd" , "src" ); |
613 | c10::impl::check_and_update_common_device(common_device, qkv_weight, "wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd" , "qkv_weight" ); |
614 | c10::impl::check_and_update_common_device(common_device, qkv_bias, "wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd" , "qkv_bias" ); |
615 | c10::impl::check_and_update_common_device(common_device, proj_weight, "wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd" , "proj_weight" ); |
616 | c10::impl::check_and_update_common_device(common_device, proj_bias, "wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd" , "proj_bias" ); |
617 | c10::impl::check_and_update_common_device(common_device, norm_weight_1, "wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd" , "norm_weight_1" ); |
618 | c10::impl::check_and_update_common_device(common_device, norm_bias_1, "wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd" , "norm_bias_1" ); |
619 | c10::impl::check_and_update_common_device(common_device, norm_weight_2, "wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd" , "norm_weight_2" ); |
620 | c10::impl::check_and_update_common_device(common_device, norm_bias_2, "wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd" , "norm_bias_2" ); |
621 | c10::impl::check_and_update_common_device(common_device, ffn_weight_1, "wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd" , "ffn_weight_1" ); |
622 | c10::impl::check_and_update_common_device(common_device, ffn_bias_1, "wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd" , "ffn_bias_1" ); |
623 | c10::impl::check_and_update_common_device(common_device, ffn_weight_2, "wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd" , "ffn_weight_2" ); |
624 | c10::impl::check_and_update_common_device(common_device, ffn_bias_2, "wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd" , "ffn_bias_2" ); |
625 | c10::impl::check_and_update_common_device(common_device, mask, "wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd" , "mask" ); |
626 | const OptionalDeviceGuard device_guard(device_of(src)); |
627 | return at::native::transformer_encoder_layer_forward(src, embed_dim, num_heads, qkv_weight, qkv_bias, proj_weight, proj_bias, use_gelu, norm_first, eps, norm_weight_1, norm_bias_1, norm_weight_2, norm_bias_2, ffn_weight_1, ffn_bias_1, ffn_weight_2, ffn_bias_2, mask, mask_type); |
628 | } |
629 | } // anonymous namespace |
630 | namespace { |
631 | ::std::tuple<at::Tensor,at::Tensor> wrapper_NestedTensorCUDA___native_multi_head_attention(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, int64_t embed_dim, int64_t num_head, const at::Tensor & qkv_weight, const at::Tensor & qkv_bias, const at::Tensor & proj_weight, const at::Tensor & proj_bias, const c10::optional<at::Tensor> & mask, bool need_weights, bool average_attn_weights, c10::optional<int64_t> mask_type) { |
632 | c10::optional<Device> common_device = nullopt; |
633 | (void)common_device; // Suppress unused variable warning |
634 | c10::impl::check_and_update_common_device(common_device, query, "wrapper_NestedTensorCUDA___native_multi_head_attention" , "query" ); |
635 | c10::impl::check_and_update_common_device(common_device, key, "wrapper_NestedTensorCUDA___native_multi_head_attention" , "key" ); |
636 | c10::impl::check_and_update_common_device(common_device, value, "wrapper_NestedTensorCUDA___native_multi_head_attention" , "value" ); |
637 | c10::impl::check_and_update_common_device(common_device, qkv_weight, "wrapper_NestedTensorCUDA___native_multi_head_attention" , "qkv_weight" ); |
638 | c10::impl::check_and_update_common_device(common_device, qkv_bias, "wrapper_NestedTensorCUDA___native_multi_head_attention" , "qkv_bias" ); |
639 | c10::impl::check_and_update_common_device(common_device, proj_weight, "wrapper_NestedTensorCUDA___native_multi_head_attention" , "proj_weight" ); |
640 | c10::impl::check_and_update_common_device(common_device, proj_bias, "wrapper_NestedTensorCUDA___native_multi_head_attention" , "proj_bias" ); |
641 | c10::impl::check_and_update_common_device(common_device, mask, "wrapper_NestedTensorCUDA___native_multi_head_attention" , "mask" ); |
642 | const OptionalDeviceGuard device_guard(device_of(query)); |
643 | return at::native::native_multi_head_attention_cuda(query, key, value, embed_dim, num_head, qkv_weight, qkv_bias, proj_weight, proj_bias, mask, need_weights, average_attn_weights, mask_type); |
644 | } |
645 | } // anonymous namespace |
646 | namespace { |
647 | int64_t wrapper_NestedTensorCUDA___fused_sdp_choice(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const c10::optional<at::Tensor> & attn_mask, double dropout_p, bool is_causal) { |
648 | c10::optional<Device> common_device = nullopt; |
649 | (void)common_device; // Suppress unused variable warning |
650 | c10::impl::check_and_update_common_device(common_device, query, "wrapper_NestedTensorCUDA___fused_sdp_choice" , "query" ); |
651 | c10::impl::check_and_update_common_device(common_device, key, "wrapper_NestedTensorCUDA___fused_sdp_choice" , "key" ); |
652 | c10::impl::check_and_update_common_device(common_device, value, "wrapper_NestedTensorCUDA___fused_sdp_choice" , "value" ); |
653 | c10::impl::check_and_update_common_device(common_device, attn_mask, "wrapper_NestedTensorCUDA___fused_sdp_choice" , "attn_mask" ); |
654 | const OptionalDeviceGuard device_guard(device_of(query)); |
655 | return at::native::_fused_sdp_choice_cuda(query, key, value, attn_mask, dropout_p, is_causal); |
656 | } |
657 | } // anonymous namespace |
658 | namespace { |
659 | ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor,int64_t,int64_t,int64_t,int64_t,at::Tensor> wrapper_NestedTensorCUDA___scaled_dot_product_flash_attention(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, double dropout_p, bool is_causal, bool return_debug_mask) { |
660 | c10::optional<Device> common_device = nullopt; |
661 | (void)common_device; // Suppress unused variable warning |
662 | c10::impl::check_and_update_common_device(common_device, query, "wrapper_NestedTensorCUDA___scaled_dot_product_flash_attention" , "query" ); |
663 | c10::impl::check_and_update_common_device(common_device, key, "wrapper_NestedTensorCUDA___scaled_dot_product_flash_attention" , "key" ); |
664 | c10::impl::check_and_update_common_device(common_device, value, "wrapper_NestedTensorCUDA___scaled_dot_product_flash_attention" , "value" ); |
665 | const OptionalDeviceGuard device_guard(device_of(query)); |
666 | return at::native::_scaled_dot_product_flash_attention_nestedtensor_cuda(query, key, value, dropout_p, is_causal, return_debug_mask); |
667 | } |
668 | } // anonymous namespace |
669 | namespace { |
670 | ::std::tuple<at::Tensor,at::Tensor> wrapper_NestedTensorCUDA___scaled_dot_product_efficient_attention(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, bool compute_log_sumexp, bool is_causal) { |
671 | c10::optional<Device> common_device = nullopt; |
672 | (void)common_device; // Suppress unused variable warning |
673 | c10::impl::check_and_update_common_device(common_device, query, "wrapper_NestedTensorCUDA___scaled_dot_product_efficient_attention" , "query" ); |
674 | c10::impl::check_and_update_common_device(common_device, key, "wrapper_NestedTensorCUDA___scaled_dot_product_efficient_attention" , "key" ); |
675 | c10::impl::check_and_update_common_device(common_device, value, "wrapper_NestedTensorCUDA___scaled_dot_product_efficient_attention" , "value" ); |
676 | const OptionalDeviceGuard device_guard(device_of(query)); |
677 | return at::native::_scaled_dot_product_efficient_attention_nestedtensor_cuda(query, key, value, compute_log_sumexp, is_causal); |
678 | } |
679 | } // anonymous namespace |
680 | namespace { |
681 | ::std::tuple<at::Tensor,at::Tensor,at::Tensor> wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd(const at::Tensor & src, int64_t embed_dim, int64_t num_heads, const at::Tensor & qkv_weight, const at::Tensor & qkv_bias, const at::Tensor & proj_weight, const at::Tensor & proj_bias, bool use_gelu, bool norm_first, double eps, const at::Tensor & norm_weight_1, const at::Tensor & norm_bias_1, const at::Tensor & norm_weight_2, const at::Tensor & norm_bias_2, const at::Tensor & ffn_weight_1, const at::Tensor & ffn_bias_1, const at::Tensor & ffn_weight_2, const at::Tensor & ffn_bias_2, const c10::optional<at::Tensor> & mask, const c10::optional<at::Tensor> & incr_key, const c10::optional<at::Tensor> & incr_value) { |
682 | c10::optional<Device> common_device = nullopt; |
683 | (void)common_device; // Suppress unused variable warning |
684 | c10::impl::check_and_update_common_device(common_device, src, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd" , "src" ); |
685 | c10::impl::check_and_update_common_device(common_device, qkv_weight, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd" , "qkv_weight" ); |
686 | c10::impl::check_and_update_common_device(common_device, qkv_bias, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd" , "qkv_bias" ); |
687 | c10::impl::check_and_update_common_device(common_device, proj_weight, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd" , "proj_weight" ); |
688 | c10::impl::check_and_update_common_device(common_device, proj_bias, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd" , "proj_bias" ); |
689 | c10::impl::check_and_update_common_device(common_device, norm_weight_1, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd" , "norm_weight_1" ); |
690 | c10::impl::check_and_update_common_device(common_device, norm_bias_1, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd" , "norm_bias_1" ); |
691 | c10::impl::check_and_update_common_device(common_device, norm_weight_2, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd" , "norm_weight_2" ); |
692 | c10::impl::check_and_update_common_device(common_device, norm_bias_2, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd" , "norm_bias_2" ); |
693 | c10::impl::check_and_update_common_device(common_device, ffn_weight_1, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd" , "ffn_weight_1" ); |
694 | c10::impl::check_and_update_common_device(common_device, ffn_bias_1, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd" , "ffn_bias_1" ); |
695 | c10::impl::check_and_update_common_device(common_device, ffn_weight_2, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd" , "ffn_weight_2" ); |
696 | c10::impl::check_and_update_common_device(common_device, ffn_bias_2, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd" , "ffn_bias_2" ); |
697 | c10::impl::check_and_update_common_device(common_device, mask, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd" , "mask" ); |
698 | c10::impl::check_and_update_common_device(common_device, incr_key, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd" , "incr_key" ); |
699 | c10::impl::check_and_update_common_device(common_device, incr_value, "wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd" , "incr_value" ); |
700 | const OptionalDeviceGuard device_guard(device_of(src)); |
701 | return at::native::transformer_decoder_only_layer_forward(src, embed_dim, num_heads, qkv_weight, qkv_bias, proj_weight, proj_bias, use_gelu, norm_first, eps, norm_weight_1, norm_bias_1, norm_weight_2, norm_bias_2, ffn_weight_1, ffn_bias_1, ffn_weight_2, ffn_bias_2, mask, incr_key, incr_value); |
702 | } |
703 | } // anonymous namespace |
704 | namespace { |
705 | ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor> wrapper_NestedTensorCUDA___native_decoder_only_multi_head_attention(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, int64_t embed_dim, int64_t num_head, const at::Tensor & qkv_weight, const at::Tensor & qkv_bias, const at::Tensor & proj_weight, const at::Tensor & proj_bias, const c10::optional<at::Tensor> & mask, const c10::optional<at::Tensor> & incr_key, const c10::optional<at::Tensor> & incr_value, bool need_weights, bool average_attn_weights) { |
706 | c10::optional<Device> common_device = nullopt; |
707 | (void)common_device; // Suppress unused variable warning |
708 | c10::impl::check_and_update_common_device(common_device, query, "wrapper_NestedTensorCUDA___native_decoder_only_multi_head_attention" , "query" ); |
709 | c10::impl::check_and_update_common_device(common_device, key, "wrapper_NestedTensorCUDA___native_decoder_only_multi_head_attention" , "key" ); |
710 | c10::impl::check_and_update_common_device(common_device, value, "wrapper_NestedTensorCUDA___native_decoder_only_multi_head_attention" , "value" ); |
711 | c10::impl::check_and_update_common_device(common_device, qkv_weight, "wrapper_NestedTensorCUDA___native_decoder_only_multi_head_attention" , "qkv_weight" ); |
712 | c10::impl::check_and_update_common_device(common_device, qkv_bias, "wrapper_NestedTensorCUDA___native_decoder_only_multi_head_attention" , "qkv_bias" ); |
713 | c10::impl::check_and_update_common_device(common_device, proj_weight, "wrapper_NestedTensorCUDA___native_decoder_only_multi_head_attention" , "proj_weight" ); |
714 | c10::impl::check_and_update_common_device(common_device, proj_bias, "wrapper_NestedTensorCUDA___native_decoder_only_multi_head_attention" , "proj_bias" ); |
715 | c10::impl::check_and_update_common_device(common_device, mask, "wrapper_NestedTensorCUDA___native_decoder_only_multi_head_attention" , "mask" ); |
716 | c10::impl::check_and_update_common_device(common_device, incr_key, "wrapper_NestedTensorCUDA___native_decoder_only_multi_head_attention" , "incr_key" ); |
717 | c10::impl::check_and_update_common_device(common_device, incr_value, "wrapper_NestedTensorCUDA___native_decoder_only_multi_head_attention" , "incr_value" ); |
718 | const OptionalDeviceGuard device_guard(device_of(query)); |
719 | return at::native::native_decoder_only_multi_head_attention(query, key, value, embed_dim, num_head, qkv_weight, qkv_bias, proj_weight, proj_bias, mask, incr_key, incr_value, need_weights, average_attn_weights); |
720 | } |
721 | } // anonymous namespace |
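// Register the wrappers above under their aten operator names for the
// NestedTensorCUDA dispatch key; the dispatcher routes calls on nested CUDA
// tensors (e.g. at::matmul on two nested CUDA tensors) to these kernels.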
722 | TORCH_LIBRARY_IMPL(aten, NestedTensorCUDA, m) { |
723 | m.impl("native_dropout" , |
724 | TORCH_FN(wrapper_NestedTensorCUDA__native_dropout)); |
725 | m.impl("native_dropout_backward" , |
726 | TORCH_FN(wrapper_NestedTensorCUDA__native_dropout_backward)); |
727 | m.impl("add.Tensor" , |
728 | TORCH_FN(wrapper_NestedTensorCUDA_Tensor_add)); |
729 | m.impl("add_.Tensor" , |
730 | TORCH_FN(wrapper_NestedTensorCUDA_Tensor_add_)); |
731 | m.impl("bmm" , |
732 | TORCH_FN(wrapper_NestedTensorCUDA__bmm)); |
733 | m.impl("chunk" , |
734 | TORCH_FN(wrapper_NestedTensorCUDA__chunk)); |
735 | m.impl("copy_" , |
736 | TORCH_FN(wrapper_NestedTensorCUDA__copy_)); |
737 | m.impl("div.Tensor" , |
738 | TORCH_FN(wrapper_NestedTensorCUDA_Tensor_div)); |
739 | m.impl("div.Scalar" , |
740 | TORCH_FN(wrapper_NestedTensorCUDA_Scalar_div)); |
741 | m.impl("embedding" , |
742 | TORCH_FN(wrapper_NestedTensorCUDA__embedding)); |
743 | m.impl("empty_like" , |
744 | TORCH_FN(wrapper_NestedTensorCUDA__empty_like)); |
745 | m.impl("fill_.Scalar" , |
746 | TORCH_FN(wrapper_NestedTensorCUDA_Scalar_fill_)); |
747 | m.impl("fill_.Tensor" , |
748 | TORCH_FN(wrapper_NestedTensorCUDA_Tensor_fill_)); |
749 | m.impl("is_same_size" , |
750 | TORCH_FN(wrapper_NestedTensorCUDA__is_same_size)); |
751 | m.impl("native_layer_norm" , |
752 | TORCH_FN(wrapper_NestedTensorCUDA__native_layer_norm)); |
753 | m.impl("linear" , |
754 | TORCH_FN(wrapper_NestedTensorCUDA__linear)); |
755 | m.impl("linear_backward" , |
756 | TORCH_FN(wrapper_NestedTensorCUDA__linear_backward)); |
757 | m.impl("matmul" , |
758 | TORCH_FN(wrapper_NestedTensorCUDA__matmul)); |
759 | m.impl("matmul.out" , |
760 | TORCH_FN(wrapper_NestedTensorCUDA_out_matmul_out)); |
761 | m.impl("matmul_backward" , |
762 | TORCH_FN(wrapper_NestedTensorCUDA__matmul_backward)); |
763 | m.impl("mul.Tensor" , |
764 | TORCH_FN(wrapper_NestedTensorCUDA_Tensor_mul)); |
765 | m.impl("mul_.Tensor" , |
766 | TORCH_FN(wrapper_NestedTensorCUDA_Tensor_mul_)); |
767 | m.impl("mul.Scalar" , |
768 | TORCH_FN(wrapper_NestedTensorCUDA_Scalar_mul)); |
769 | m.impl("mul_.Scalar" , |
770 | TORCH_FN(wrapper_NestedTensorCUDA_Scalar_mul_)); |
771 | m.impl("ones_like" , |
772 | TORCH_FN(wrapper_NestedTensorCUDA__ones_like)); |
773 | m.impl("neg" , |
774 | TORCH_FN(wrapper_NestedTensorCUDA__neg)); |
775 | m.impl("neg_" , |
776 | TORCH_FN(wrapper_NestedTensorCUDA__neg_)); |
777 | m.impl("relu" , |
778 | TORCH_FN(wrapper_NestedTensorCUDA__relu)); |
779 | m.impl("relu_" , |
780 | TORCH_FN(wrapper_NestedTensorCUDA__relu_)); |
781 | m.impl("gelu" , |
782 | TORCH_FN(wrapper_NestedTensorCUDA__gelu)); |
783 | m.impl("gelu_" , |
784 | TORCH_FN(wrapper_NestedTensorCUDA__gelu_)); |
785 | m.impl("select.int" , |
786 | TORCH_FN(wrapper_NestedTensorCUDA_int_select)); |
787 | m.impl("_nested_select_backward" , |
788 | TORCH_FN(wrapper_NestedTensorCUDA___nested_select_backward)); |
789 | m.impl("detach" , |
790 | TORCH_FN(wrapper_NestedTensorCUDA__detach)); |
791 | m.impl("_softmax" , |
792 | TORCH_FN(wrapper_NestedTensorCUDA___softmax)); |
793 | m.impl("_softmax_backward_data" , |
794 | TORCH_FN(wrapper_NestedTensorCUDA___softmax_backward_data)); |
795 | m.impl("squeeze" , |
796 | TORCH_FN(wrapper_NestedTensorCUDA__squeeze)); |
797 | m.impl("squeeze.dim" , |
798 | TORCH_FN(wrapper_NestedTensorCUDA_dim_squeeze)); |
799 | m.impl("squeeze.dims" , |
800 | TORCH_FN(wrapper_NestedTensorCUDA_dims_squeeze)); |
801 | m.impl("tanh" , |
802 | TORCH_FN(wrapper_NestedTensorCUDA__tanh)); |
803 | m.impl("tanh_" , |
804 | TORCH_FN(wrapper_NestedTensorCUDA__tanh_)); |
805 | m.impl("transpose.int" , |
806 | TORCH_FN(wrapper_NestedTensorCUDA_int_transpose)); |
807 | m.impl("_transform_bias_rescale_qkv" , |
808 | TORCH_FN(wrapper_NestedTensorCUDA___transform_bias_rescale_qkv)); |
809 | m.impl("_nested_tensor_size" , |
810 | TORCH_FN(wrapper_NestedTensorCUDA___nested_tensor_size)); |
811 | m.impl("_nested_tensor_strides" , |
812 | TORCH_FN(wrapper_NestedTensorCUDA___nested_tensor_strides)); |
813 | m.impl("_nested_tensor_offsets" , |
814 | TORCH_FN(wrapper_NestedTensorCUDA___nested_tensor_offsets)); |
815 | m.impl("_nested_from_padded_and_nested_example" , |
816 | TORCH_FN(wrapper_NestedTensorCUDA___nested_from_padded_and_nested_example)); |
817 | m.impl("unsqueeze" , |
818 | TORCH_FN(wrapper_NestedTensorCUDA__unsqueeze)); |
819 | m.impl("clone" , |
820 | TORCH_FN(wrapper_NestedTensorCUDA__clone)); |
821 | m.impl("values" , |
822 | TORCH_FN(wrapper_NestedTensorCUDA__values)); |
823 | m.impl("_to_copy" , |
824 | TORCH_FN(wrapper_NestedTensorCUDA___to_copy)); |
825 | m.impl("view" , |
826 | TORCH_FN(wrapper_NestedTensorCUDA__view)); |
827 | m.impl("_test_autograd_multiple_dispatch.fullcoverage" , |
828 | TORCH_FN(wrapper_NestedTensorCUDA_fullcoverage__test_autograd_multiple_dispatch)); |
829 | m.impl("_test_autograd_multiple_dispatch.ntonly" , |
830 | TORCH_FN(wrapper_NestedTensorCUDA_ntonly__test_autograd_multiple_dispatch)); |
831 | m.impl("to_padded_tensor" , |
832 | TORCH_FN(wrapper_NestedTensorCUDA__to_padded_tensor)); |
833 | m.impl("_nested_tensor_softmax_with_shape" , |
834 | TORCH_FN(wrapper_NestedTensorCUDA___nested_tensor_softmax_with_shape)); |
835 | m.impl("_transformer_encoder_layer_fwd" , |
836 | TORCH_FN(wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd)); |
837 | m.impl("_native_multi_head_attention" , |
838 | TORCH_FN(wrapper_NestedTensorCUDA___native_multi_head_attention)); |
839 | m.impl("_fused_sdp_choice" , |
840 | TORCH_FN(wrapper_NestedTensorCUDA___fused_sdp_choice)); |
841 | m.impl("_scaled_dot_product_flash_attention" , |
842 | TORCH_FN(wrapper_NestedTensorCUDA___scaled_dot_product_flash_attention)); |
843 | m.impl("_scaled_dot_product_efficient_attention" , |
844 | TORCH_FN(wrapper_NestedTensorCUDA___scaled_dot_product_efficient_attention)); |
845 | m.impl("_transformer_decoder_only_layer_fwd" , |
846 | TORCH_FN(wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd)); |
847 | m.impl("_native_decoder_only_multi_head_attention" , |
848 | TORCH_FN(wrapper_NestedTensorCUDA___native_decoder_only_multi_head_attention)); |
849 | }; |
850 | } // anonymous namespace |
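// The at::nestedtensorcuda:: functions below call the wrappers directly,
// bypassing the dispatcher, for callers that explicitly want the
// NestedTensorCUDA implementations.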
namespace nestedtensorcuda {
::std::tuple<at::Tensor,at::Tensor> native_dropout(const at::Tensor & input, double p, c10::optional<bool> train) {
return wrapper_NestedTensorCUDA__native_dropout(input, p, train);
}
at::Tensor native_dropout_backward(const at::Tensor & grad_output, const at::Tensor & mask, double scale) {
return wrapper_NestedTensorCUDA__native_dropout_backward(grad_output, mask, scale);
}
at::Tensor add(const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha) {
return wrapper_NestedTensorCUDA_Tensor_add(self, other, alpha);
}
at::Tensor & add_(at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha) {
return wrapper_NestedTensorCUDA_Tensor_add_(self, other, alpha);
}
at::Tensor bmm(const at::Tensor & self, const at::Tensor & mat2) {
return wrapper_NestedTensorCUDA__bmm(self, mat2);
}
::std::vector<at::Tensor> chunk(const at::Tensor & self, int64_t chunks, int64_t dim) {
return wrapper_NestedTensorCUDA__chunk(self, chunks, dim);
}
at::Tensor & copy_(at::Tensor & self, const at::Tensor & src, bool non_blocking) {
return wrapper_NestedTensorCUDA__copy_(self, src, non_blocking);
}
at::Tensor div(const at::Tensor & self, const at::Tensor & other) {
return wrapper_NestedTensorCUDA_Tensor_div(self, other);
}
at::Tensor div(const at::Tensor & self, const at::Scalar & other) {
return wrapper_NestedTensorCUDA_Scalar_div(self, other);
}
at::Tensor embedding(const at::Tensor & weight, const at::Tensor & indices, int64_t padding_idx, bool scale_grad_by_freq, bool sparse) {
return wrapper_NestedTensorCUDA__embedding(weight, indices, padding_idx, scale_grad_by_freq, sparse);
}
at::Tensor embedding_symint(const at::Tensor & weight, const at::Tensor & indices, c10::SymInt padding_idx, bool scale_grad_by_freq, bool sparse) {
return wrapper_NestedTensorCUDA__embedding(weight, indices, padding_idx, scale_grad_by_freq, sparse);
}
at::Tensor empty_like(const at::Tensor & self, at::TensorOptions options, c10::optional<at::MemoryFormat> memory_format) {
return wrapper_NestedTensorCUDA__empty_like(self, optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
}
at::Tensor empty_like(const at::Tensor & self, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
return wrapper_NestedTensorCUDA__empty_like(self, dtype, layout, device, pin_memory, memory_format);
}
at::Tensor & fill_(at::Tensor & self, const at::Scalar & value) {
return wrapper_NestedTensorCUDA_Scalar_fill_(self, value);
}
at::Tensor & fill_(at::Tensor & self, const at::Tensor & value) {
return wrapper_NestedTensorCUDA_Tensor_fill_(self, value);
}
bool is_same_size(const at::Tensor & self, const at::Tensor & other) {
return wrapper_NestedTensorCUDA__is_same_size(self, other);
}
::std::tuple<at::Tensor,at::Tensor,at::Tensor> native_layer_norm(const at::Tensor & input, at::IntArrayRef normalized_shape, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, double eps) {
return wrapper_NestedTensorCUDA__native_layer_norm(input, c10::fromIntArrayRefSlow(normalized_shape), weight, bias, eps);
}
::std::tuple<at::Tensor,at::Tensor,at::Tensor> native_layer_norm_symint(const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, double eps) {
return wrapper_NestedTensorCUDA__native_layer_norm(input, normalized_shape, weight, bias, eps);
}
at::Tensor linear(const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias) {
return wrapper_NestedTensorCUDA__linear(input, weight, bias);
}
::std::tuple<at::Tensor,at::Tensor,at::Tensor> linear_backward(const at::Tensor & self, const at::Tensor & grad_output, const at::Tensor & weight, ::std::array<bool,3> output_mask) {
return wrapper_NestedTensorCUDA__linear_backward(self, grad_output, weight, output_mask);
}
at::Tensor matmul(const at::Tensor & self, const at::Tensor & other) {
return wrapper_NestedTensorCUDA__matmul(self, other);
}
at::Tensor & matmul_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
return wrapper_NestedTensorCUDA_out_matmul_out(self, other, out);
}
at::Tensor & matmul_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
return wrapper_NestedTensorCUDA_out_matmul_out(self, other, out);
}
::std::tuple<at::Tensor,at::Tensor> matmul_backward(const at::Tensor & grad, const at::Tensor & self, const at::Tensor & other, ::std::array<bool,2> mask) {
return wrapper_NestedTensorCUDA__matmul_backward(grad, self, other, mask);
}
at::Tensor mul(const at::Tensor & self, const at::Tensor & other) {
return wrapper_NestedTensorCUDA_Tensor_mul(self, other);
}
at::Tensor & mul_(at::Tensor & self, const at::Tensor & other) {
return wrapper_NestedTensorCUDA_Tensor_mul_(self, other);
}
at::Tensor mul(const at::Tensor & self, const at::Scalar & other) {
return wrapper_NestedTensorCUDA_Scalar_mul(self, other);
}
at::Tensor & mul_(at::Tensor & self, const at::Scalar & other) {
return wrapper_NestedTensorCUDA_Scalar_mul_(self, other);
}
at::Tensor ones_like(const at::Tensor & self, at::TensorOptions options, c10::optional<at::MemoryFormat> memory_format) {
return wrapper_NestedTensorCUDA__ones_like(self, optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
}
at::Tensor ones_like(const at::Tensor & self, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
return wrapper_NestedTensorCUDA__ones_like(self, dtype, layout, device, pin_memory, memory_format);
}
at::Tensor neg(const at::Tensor & self) {
return wrapper_NestedTensorCUDA__neg(self);
}
at::Tensor & neg_(at::Tensor & self) {
return wrapper_NestedTensorCUDA__neg_(self);
}
at::Tensor relu(const at::Tensor & self) {
return wrapper_NestedTensorCUDA__relu(self);
}
at::Tensor & relu_(at::Tensor & self) {
return wrapper_NestedTensorCUDA__relu_(self);
}
at::Tensor gelu(const at::Tensor & self, c10::string_view approximate) {
return wrapper_NestedTensorCUDA__gelu(self, approximate);
}
at::Tensor & gelu_(at::Tensor & self, c10::string_view approximate) {
return wrapper_NestedTensorCUDA__gelu_(self, approximate);
}
at::Tensor select(const at::Tensor & self, int64_t dim, int64_t index) {
return wrapper_NestedTensorCUDA_int_select(self, dim, index);
}
at::Tensor select_symint(const at::Tensor & self, int64_t dim, c10::SymInt index) {
return wrapper_NestedTensorCUDA_int_select(self, dim, index);
}
at::Tensor _nested_select_backward(const at::Tensor & grad_output, const at::Tensor & self, int64_t dim, int64_t index) {
return wrapper_NestedTensorCUDA___nested_select_backward(grad_output, self, dim, index);
}
at::Tensor _nested_select_backward_symint(const at::Tensor & grad_output, const at::Tensor & self, int64_t dim, c10::SymInt index) {
return wrapper_NestedTensorCUDA___nested_select_backward(grad_output, self, dim, index);
}
at::Tensor detach(const at::Tensor & self) {
return wrapper_NestedTensorCUDA__detach(self);
}
at::Tensor _softmax(const at::Tensor & self, int64_t dim, bool half_to_float) {
return wrapper_NestedTensorCUDA___softmax(self, dim, half_to_float);
}
at::Tensor _softmax_backward_data(const at::Tensor & grad_output, const at::Tensor & output, int64_t dim, at::ScalarType input_dtype) {
return wrapper_NestedTensorCUDA___softmax_backward_data(grad_output, output, dim, input_dtype);
}
at::Tensor squeeze(const at::Tensor & self) {
return wrapper_NestedTensorCUDA__squeeze(self);
}
at::Tensor squeeze(const at::Tensor & self, int64_t dim) {
return wrapper_NestedTensorCUDA_dim_squeeze(self, dim);
}
at::Tensor squeeze(const at::Tensor & self, at::IntArrayRef dim) {
return wrapper_NestedTensorCUDA_dims_squeeze(self, dim);
}
at::Tensor tanh(const at::Tensor & self) {
return wrapper_NestedTensorCUDA__tanh(self);
}
at::Tensor & tanh_(at::Tensor & self) {
return wrapper_NestedTensorCUDA__tanh_(self);
}
at::Tensor transpose(const at::Tensor & self, int64_t dim0, int64_t dim1) {
return wrapper_NestedTensorCUDA_int_transpose(self, dim0, dim1);
}
::std::tuple<at::Tensor,at::Tensor,at::Tensor> _transform_bias_rescale_qkv(const at::Tensor & qkv, const at::Tensor & qkv_bias, int64_t num_heads) {
return wrapper_NestedTensorCUDA___transform_bias_rescale_qkv(qkv, qkv_bias, num_heads);
}
at::Tensor _nested_tensor_size(const at::Tensor & self) {
return wrapper_NestedTensorCUDA___nested_tensor_size(self);
}
at::Tensor _nested_tensor_strides(const at::Tensor & self) {
return wrapper_NestedTensorCUDA___nested_tensor_strides(self);
}
::std::vector<int64_t> _nested_tensor_offsets(const at::Tensor & self) {
return wrapper_NestedTensorCUDA___nested_tensor_offsets(self);
}
at::Tensor _nested_from_padded_and_nested_example(const at::Tensor & padded, const at::Tensor & nt_example) {
return wrapper_NestedTensorCUDA___nested_from_padded_and_nested_example(padded, nt_example);
}
at::Tensor unsqueeze(const at::Tensor & self, int64_t dim) {
return wrapper_NestedTensorCUDA__unsqueeze(self, dim);
}
at::Tensor clone(const at::Tensor & self, c10::optional<at::MemoryFormat> memory_format) {
return wrapper_NestedTensorCUDA__clone(self, memory_format);
}
at::Tensor values(const at::Tensor & self) {
return wrapper_NestedTensorCUDA__values(self);
}
at::Tensor _to_copy(const at::Tensor & self, at::TensorOptions options, bool non_blocking, c10::optional<at::MemoryFormat> memory_format) {
return wrapper_NestedTensorCUDA___to_copy(self, optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), non_blocking, c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
}
at::Tensor _to_copy(const at::Tensor & self, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, bool non_blocking, c10::optional<at::MemoryFormat> memory_format) {
return wrapper_NestedTensorCUDA___to_copy(self, dtype, layout, device, pin_memory, non_blocking, memory_format);
}
at::Tensor view(const at::Tensor & self, at::IntArrayRef size) {
return wrapper_NestedTensorCUDA__view(self, c10::fromIntArrayRefSlow(size));
}
at::Tensor view_symint(const at::Tensor & self, c10::SymIntArrayRef size) {
return wrapper_NestedTensorCUDA__view(self, size);
}
at::Tensor _test_autograd_multiple_dispatch(const at::Tensor & self) {
return wrapper_NestedTensorCUDA_fullcoverage__test_autograd_multiple_dispatch(self);
}
at::Tensor _test_autograd_multiple_dispatch(const at::Tensor & self, bool b) {
return wrapper_NestedTensorCUDA_ntonly__test_autograd_multiple_dispatch(self, b);
}
at::Tensor to_padded_tensor(const at::Tensor & self, double padding, at::OptionalIntArrayRef output_size) {
return wrapper_NestedTensorCUDA__to_padded_tensor(self, padding, output_size.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*output_size)) : c10::nullopt);
}
at::Tensor to_padded_tensor_symint(const at::Tensor & self, double padding, at::OptionalSymIntArrayRef output_size) {
return wrapper_NestedTensorCUDA__to_padded_tensor(self, padding, output_size);
}
at::Tensor _nested_tensor_softmax_with_shape(const at::Tensor & self, const at::Tensor & query) {
return wrapper_NestedTensorCUDA___nested_tensor_softmax_with_shape(self, query);
}
at::Tensor _transformer_encoder_layer_fwd(const at::Tensor & src, int64_t embed_dim, int64_t num_heads, const at::Tensor & qkv_weight, const at::Tensor & qkv_bias, const at::Tensor & proj_weight, const at::Tensor & proj_bias, bool use_gelu, bool norm_first, double eps, const at::Tensor & norm_weight_1, const at::Tensor & norm_bias_1, const at::Tensor & norm_weight_2, const at::Tensor & norm_bias_2, const at::Tensor & ffn_weight_1, const at::Tensor & ffn_bias_1, const at::Tensor & ffn_weight_2, const at::Tensor & ffn_bias_2, const c10::optional<at::Tensor> & mask, c10::optional<int64_t> mask_type) {
return wrapper_NestedTensorCUDA___transformer_encoder_layer_fwd(src, embed_dim, num_heads, qkv_weight, qkv_bias, proj_weight, proj_bias, use_gelu, norm_first, eps, norm_weight_1, norm_bias_1, norm_weight_2, norm_bias_2, ffn_weight_1, ffn_bias_1, ffn_weight_2, ffn_bias_2, mask, mask_type);
}
::std::tuple<at::Tensor,at::Tensor> _native_multi_head_attention(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, int64_t embed_dim, int64_t num_head, const at::Tensor & qkv_weight, const at::Tensor & qkv_bias, const at::Tensor & proj_weight, const at::Tensor & proj_bias, const c10::optional<at::Tensor> & mask, bool need_weights, bool average_attn_weights, c10::optional<int64_t> mask_type) {
return wrapper_NestedTensorCUDA___native_multi_head_attention(query, key, value, embed_dim, num_head, qkv_weight, qkv_bias, proj_weight, proj_bias, mask, need_weights, average_attn_weights, mask_type);
}
int64_t _fused_sdp_choice(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const c10::optional<at::Tensor> & attn_mask, double dropout_p, bool is_causal) {
return wrapper_NestedTensorCUDA___fused_sdp_choice(query, key, value, attn_mask, dropout_p, is_causal);
}
::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor,int64_t,int64_t,int64_t,int64_t,at::Tensor> _scaled_dot_product_flash_attention(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, double dropout_p, bool is_causal, bool return_debug_mask) {
return wrapper_NestedTensorCUDA___scaled_dot_product_flash_attention(query, key, value, dropout_p, is_causal, return_debug_mask);
}
::std::tuple<at::Tensor,at::Tensor> _scaled_dot_product_efficient_attention(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, bool compute_log_sumexp, bool is_causal) {
return wrapper_NestedTensorCUDA___scaled_dot_product_efficient_attention(query, key, value, compute_log_sumexp, is_causal);
}
::std::tuple<at::Tensor,at::Tensor,at::Tensor> _transformer_decoder_only_layer_fwd(const at::Tensor & src, int64_t embed_dim, int64_t num_heads, const at::Tensor & qkv_weight, const at::Tensor & qkv_bias, const at::Tensor & proj_weight, const at::Tensor & proj_bias, bool use_gelu, bool norm_first, double eps, const at::Tensor & norm_weight_1, const at::Tensor & norm_bias_1, const at::Tensor & norm_weight_2, const at::Tensor & norm_bias_2, const at::Tensor & ffn_weight_1, const at::Tensor & ffn_bias_1, const at::Tensor & ffn_weight_2, const at::Tensor & ffn_bias_2, const c10::optional<at::Tensor> & mask, const c10::optional<at::Tensor> & incr_key, const c10::optional<at::Tensor> & incr_value) {
return wrapper_NestedTensorCUDA___transformer_decoder_only_layer_fwd(src, embed_dim, num_heads, qkv_weight, qkv_bias, proj_weight, proj_bias, use_gelu, norm_first, eps, norm_weight_1, norm_bias_1, norm_weight_2, norm_bias_2, ffn_weight_1, ffn_bias_1, ffn_weight_2, ffn_bias_2, mask, incr_key, incr_value);
}
::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor> _native_decoder_only_multi_head_attention(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, int64_t embed_dim, int64_t num_head, const at::Tensor & qkv_weight, const at::Tensor & qkv_bias, const at::Tensor & proj_weight, const at::Tensor & proj_bias, const c10::optional<at::Tensor> & mask, const c10::optional<at::Tensor> & incr_key, const c10::optional<at::Tensor> & incr_value, bool need_weights, bool average_attn_weights) {
return wrapper_NestedTensorCUDA___native_decoder_only_multi_head_attention(query, key, value, embed_dim, num_head, qkv_weight, qkv_bias, proj_weight, proj_bias, mask, incr_key, incr_value, need_weights, average_attn_weights);
}
} // namespace nestedtensorcuda
} // namespace at
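// Illustrative usage (a sketch, not part of the generated registrations): the
// redirect functions above can be called directly through the
// at::nestedtensorcuda:: namespace. Assuming `nt` is a hypothetical nested
// tensor resident on a CUDA device:
//
//   at::Tensor nested_sizes = at::nestedtensorcuda::_nested_tensor_size(nt);
//   at::Tensor padded = at::nestedtensorcuda::to_padded_tensor(nt, /*padding=*/0.0, c10::nullopt);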