#include <manager.h>
#include <parser.h>
#include <partition.h>
#include <register_interface.h>

#include <ATen/core/dispatch/OperatorOptions.h>
#include <ATen/native/NonSymbolicBC.h>
#include <ATen/native/TensorShape.h>
#include <c10/util/CallOnce.h>
#include <c10/util/irange.h>
#include <torch/csrc/jit/runtime/custom_operator.h>
#include <torch/csrc/jit/runtime/profiling_record.h>
#include <torch/csrc/jit/runtime/register_ops_utils.h>

/*
 * Registers function pointers in interface.h
 */

namespace torch {
namespace jit {
namespace fuser {
namespace cuda {

namespace {
class RegisterInterface {
 public:
  RegisterInterface() {
    auto ptr = getFuserInterface();
    ptr->fn_compile_n = &compileCudaFusionGroup;
    ptr->fn_run_n_s = &runCudaFusionGroup;
    ptr->fn_fuse_graph = &CudaFuseGraph;
    ptr->fn_can_fuse_n = &isFusibleCudaFusionGroup;
    ptr->fn_insert_profile_inodes = &InsertProfileNodes;
    ptr->fn_profile_n = &shouldProfileNode;
    ptr->fn_skip_n = &skipNodeKind;
  }
};

static RegisterInterface register_interface_;

class RegisterNVFuserPass {
 public:
  RegisterNVFuserPass() {
    NVFuserPassManager::registerPass(true);
  }
};

static RegisterNVFuserPass register_nvfuser_pass_;

} // namespace

//! [ Note -- type guard logic in CudaFusionGuard ]
//!
//! CudaFusionGuard is used to guard input tensors to `CudaFusionGroup` so
//! that we do not feed inputs that violate the graph defined in `GraphCache`.
//!
//! see [ Note -- 2 level cache implementation ] for the definition of a
//! unique computational graph.
//! see [ Note -- CudaFusionGuard implementation ] for details on how the
//! guard works in the profiling executor
//!
//! Type guard logic is used to query whether a runtime input `tensor`
//! complies with the profiled `guard_tensor_type`. `guard_tensor_type` is the
//! tensor type observed during profiling runs.
//!
//! At this moment, we only do a single profiling run, so `guard_tensor_type`
//! has static shape / stride / scalarType. This might be a little confusing,
//! as our implementation is actually more relaxed.
//!
//! Things that we check:
//!   a. identical rank & scalar type
//!   b. stride check:
//!        b.1. identical stride order
//!        b.2. identical contiguity
//!             note that contiguity here is used for tensor collapsing. So
//!             extra attention should be paid to contiguity across size-1
//!             dimensions.
//!   c. size check:
//!        c.1. broadcast check:
//!             making sure that broadcast semantics are identical. We want a
//!             given dimension to either be size-1 for both `tensor` and
//!             `guard_tensor_type`, or non-size-1 for both.
//!             This is due to the fact that we specialize a size-1 dimension
//!             as a broadcasted dimension while translating a PyTorch tensor
//!             to Fusion IR.
//!        c.2. size-0 check:
//!             we don't specialize this in codegen, but we do specialize
//!             fusion logic for size-0 on reductions, hence the check
//!
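//! A hypothetical illustration of the checks above (not taken from a real
//! profile): if `guard_tensor_type` was profiled as a contiguous
//! Float[8, 4] (strides [4, 1]), then a contiguous Float[2, 6] tensor
//! passes (rank, scalar type, stride order, contiguity, and size-1 pattern
//! all match); a Float[2, 1] tensor fails check c.1 (dim-1 became size-1,
//! i.e. broadcasted); and a column-major Float[2, 6] tensor with strides
//! [1, 2] fails check b.1 (different stride order).
//!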
bool complyWith(
    const at::Tensor& tensor,
    const c10::TensorTypePtr& guard_tensor_type) {
  // guard broadcast semantics, contiguity & stride order;
  TORCH_INTERNAL_ASSERT(
      guard_tensor_type && guard_tensor_type->dim().has_value());

  // check a. fail if the rank, scalar type, device, or requires_grad check
  // fails
  if (*guard_tensor_type->dim() != static_cast<size_t>(tensor.ndimension()) ||
      (guard_tensor_type->scalarType().has_value() &&
       (guard_tensor_type->scalarType().value() != tensor.scalar_type())) ||
      (guard_tensor_type->device().has_value() &&
       (guard_tensor_type->device().value() != tensor.device())) ||
      (guard_tensor_type->requiresGrad().has_value() &&
       guard_tensor_type->requiresGrad().value() !=
           (tensor.requires_grad() && at::GradMode::is_enabled()))) {
    return false;
  }

  // TODO: should we get symbolic_size instead and check for size
  // consistency across tensors as well?
  const auto& sizes = guard_tensor_type->sizes();
  // see [ Note -- stride_properties in tensor type ]
  const auto& stride_properties = guard_tensor_type->stride_properties();

  const auto& t_sizes = tensor.sizes();
  const auto& t_strides = tensor.strides();
  int inner_dim = -1;
  for (const auto j : c10::irange(*guard_tensor_type->dim())) {
    // check b. for the stride check, we go along dimensions from the fastest
    // stride to the slowest stride
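    // (hypothetical illustration: a contiguous [2, 3, 4] tensor has strides
    // [12, 4, 1], so profiling records stride_index_ such that j == 0 maps
    // to dim 2, j == 1 to dim 1, and j == 2 to dim 0)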
    int sorted_index = stride_properties[j]->stride_index_
        ? static_cast<int>(*stride_properties[j]->stride_index_)
        : -1;

    // only apply the stride check when we have stride_properties
    if (sorted_index != -1) {
      // check b.1. stride order [current dimension has stride larger
      // than its inner dimension(s)]; the check only applies when both:
      //   i. we have already encountered an inner dimension
      //   ii. we are not at the fastest dimension
      if (j != 0 && inner_dim != -1) {
        // we are not looking at dim-j, but dim-sorted_index, which
        // is the j-th fastest dim;
        // Note: we ignore 0-stride dimensions, since eager logic on stride
        // indices is ambiguous
        if (t_strides[sorted_index] != 0 && t_strides[inner_dim] != 0 &&
            t_strides[sorted_index] < t_strides[inner_dim]) {
          return false;
        }
      }

      // check b.2. contiguity; we only check when it's marked as
      // contiguous.
      if (stride_properties[j]->contiguous_ &&
          *stride_properties[j]->contiguous_) {
        if (j != 0) {
          // we use contiguity to collapse dimensions; if size == 1, it is
          // always collapsible
          // computeStrideProps also defaults to contiguous when stride == 1
          if (t_sizes[sorted_index] != 1 && t_strides[sorted_index] != 1) {
            TORCH_INTERNAL_ASSERT(
                stride_properties[j - 1]->stride_index_.has_value(),
                "a contiguity check against an unknown inner index is meaningless");
            // TODO: merge this check up
            if (t_strides[sorted_index] !=
                t_strides[inner_dim] * t_sizes[inner_dim]) {
              return false;
            }
          }
        } else {
          // TODO: merge this check up
          if (t_strides[sorted_index] != 1) {
            return false;
          }
        }
      }

      // update inner_dim to be the current dim. Note that we skip the
      // update when `t_sizes[sorted_index] == 1`, because:
      //   1. stride comparison on a size-1 dimension is meaningless
      //      [check b.1]
      //   2. contiguity on a size-1 dimension is misleading. For collapsing,
      //      we should actually look at the next non-size-1 dimension
      //      [check b.2]
      if (inner_dim == -1 || t_sizes[sorted_index] != 1) {
        inner_dim = sorted_index;
      }
    }

    // check c.1, we go along semantically ordered dimensions
    // check broadcast / size-1:
    bool guard_bcast = sizes[j].has_value() && sizes[j].value() == 1;
    if (guard_bcast != (t_sizes[j] == 1)) {
      return false;
    }

    // check c.2, check for size-0
    bool guard_size_0 = sizes[j].has_value() && sizes[j].value() == 0;
    if (guard_size_0 != (t_sizes[j] == 0)) {
      return false;
    }
  }

  return true;
}

} // namespace cuda
} // namespace fuser

namespace {

// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
RegisterOperators size_eq_guard({
    Operator(
        //"prim::CudaFusionSizeEq(int[] size, int[] ref) -> bool",
        "prim::CudaFusionSizeEq(...) -> bool",
        // prim::CudaFusionSizeEq returns a fresh Boolean type without
        // aliasing. If we ever return a refined tensor, which would change
        // aliasing analysis, we should update the aliasdb pass.
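        // Hypothetical example of the semantics implemented below: sizes
        // [2, 1, 4] compared against ref [5, 1, 3] yield true, since only
        // the size-1 (broadcast) pattern is compared, not the actual
        // extents.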
        [](const Node* node) -> Operation {
          return [](Stack& stack) {
            at::ArrayRef<IValue> inputs = last(stack, 2);
            drop(stack, 2);

            if (!fuser::cuda::getCudaFusionGuardMode()) {
              push(stack, IValue(true));
              return;
            }

            // auto inp = inputs[0].toIntList();
            TORCH_INTERNAL_ASSERT(
                inputs[1].isIntList(), "reference needs to be an int list");
            auto ref = inputs[1].toIntList();

            auto ret = true;
            if (ref.empty()) {
              ret = inputs[0].isNone();
            } else {
              if (inputs[0].isIntList()) {
                auto inp = inputs[0].toIntList();
                if (inp.size() != ref.size()) {
                  push(stack, IValue(false));
                  return;
                }

                for (const auto i : c10::irange(inp.size())) {
                  if ((inp[i] == 1) != (ref[i] == 1)) {
                    ret = false;
                    break;
                  }
                }
              } else {
                ret = false;
              }
            }

            push(stack, IValue(ret));
            return;
          };
        },
        aliasAnalysisFromSchema()),
});

// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
RegisterOperators reg_fusion({
    Operator(
        prim::CudaFusionGroup,
        [](const Node* node) -> Operation {
          return [node](Stack& stack) {
            fuser::cuda::runFusionGroup(node, stack);
          };
        },
        aliasAnalysisSpecialCase()),
});

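// prim::CudaFusionGuard checks each runtime tensor input against the
// TensorType recorded during profiling (stored in the node's `types`
// attribute) via fuser::cuda::complyWith above, producing a single bool
// that selects between the fused kernel and the fallback path.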
RegisterOperators reg_guard({
    Operator(
        "prim::CudaFusionGuard(...) -> bool",
        // prim::CudaFusionGuard returns a fresh Boolean type without
        // aliasing. If we ever return a refined tensor, which would change
        // aliasing analysis, we should update the aliasdb pass.
        [](const Node* node) -> Operation {
          return [node](Stack& stack) {
            // TODO: check latency here!!!!
            std::vector<TypePtr> types = node->tys(attr::types);
            const auto num_inputs = types.size();
            at::ArrayRef<IValue> inputs = last(stack, num_inputs);
            drop(stack, num_inputs);

            if (!fuser::cuda::getCudaFusionGuardMode()) {
              push(stack, IValue(true));
              return;
            }

            for (const auto i : c10::irange(num_inputs)) {
              const c10::TensorTypePtr& guard_tensor_type =
                  types[i]->cast<TensorType>();

              // TODO: maybe we should just push false and fall back
              TORCH_INTERNAL_ASSERT(inputs[i].isTensor());
              const at::Tensor& tensor = inputs[i].toTensor();

              if (!fuser::cuda::complyWith(tensor, guard_tensor_type)) {
                push(stack, IValue(false));
                return;
              }
            }

            // TODO: check type and return the right flag
            // naively return true;
            push(stack, IValue(true));
            return;
          };
        },
        aliasAnalysisFromSchema()),
});

// Infer the dynamic axis (-1) in view_sizes given tensor_sizes
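// Worked example: given tensor_sizes = [2, 3, 4] (24 elements) and
// view_sizes = [-1, 6], the dynamic axis is inferred as 24 / 6 = 4 and
// view_sizes becomes [4, 6]; with view_sizes = [-1, 5], 24 is not divisible
// by 5, so the function returns false.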
bool inferViewShape(
    c10::List<int64_t> tensor_sizes,
    c10::List<int64_t> view_sizes) {
  int64_t dynamic_index = -1;
  size_t view_size_num_elements = 1;
  for (size_t idx = 0; idx < view_sizes.size(); ++idx) {
    if (view_sizes[idx] == -1) {
      TORCH_INTERNAL_ASSERT(
          dynamic_index == -1, "Only one dimension can be inferred.");
      dynamic_index = idx;
    } else {
      TORCH_INTERNAL_ASSERT(view_sizes[idx] > 0);
      view_size_num_elements *= view_sizes[idx];
    }
  }
  const size_t kNumElements = std::accumulate(
      tensor_sizes.begin(),
      tensor_sizes.end(),
      static_cast<size_t>(1),
      std::multiplies<>());

  if (kNumElements % view_size_num_elements != 0) {
    return false;
  }

  if (dynamic_index != -1) {
    view_sizes[dynamic_index] = kNumElements / view_size_num_elements;
  }

  return true;
}

//!
//! CudaFusionViewGuard Example Graph:
//!
//! graph(%self : __torch__.BiasViewRelu,
//!       %inputs.1 : Tensor):
//!   %2 : int = prim::Constant[value=-1]() # dynamic_bvg.py:50:40
//!   %3 : int = prim::Constant[value=1]() # dynamic_bvg.py:50:25
//!   %4 : NoneType = prim::Constant()
//!   %5 : int[] = prim::Constant[value=[2, 3]]()
//!   %6 : int[] = aten::size(%inputs.1) # dynamic_bvg.py:50:25
//!   %7 : int[] = aten::slice(%6, %4, %2, %3) # dynamic_bvg.py:50:25
//!   %view_shape.1 : int[] = aten::add(%7, %5) # dynamic_bvg.py:50:25
//!   %bias : Tensor = prim::GetAttr[name="bias"](%self)
//!   %10 : int[] = aten::size(%bias)
//!   %11 : int[] = prim::BroadcastSizes(%6, %10)
//!   %12 : bool = prim::CudaFusionGuard[types=[...]](%inputs.1, %bias)
//!   %13 : int[] = prim::Constant[value=[-1, -1, -1, 6]]()
//!   %14 : int[] = prim::Constant[value=[-1, -1, -1, 2, 3]]()
//!   %15 : bool = prim::CudaFusionViewGuard(%11, %view_shape.1, %13, %14)
//!   %16 : bool[] = prim::ListConstruct(%15, %12)
//!   %17 : bool = aten::all(%16)
//!   %18 : Tensor = prim::If(%17)
//!     block0():
//!       %19 : Tensor = prim::CudaFusionGroup_0[cache_id=0](%inputs.1, %bias)
//!       -> (%19)
//!     block1():
//!       %20 : Function = prim::Constant[name="fallback_fn", fallback=1]()
//!       %21 : (...) = prim::CallFunction(%20, %inputs.1, %bias, %view_shape.1)
//!       %22 : Float(...) = prim::TupleUnpack(%21)
//!       -> (%22)
//!   return (%18)
//! with prim::CudaFusionGroup_0 = graph(%0 : Float(...),
//!                                      %1 : Float(...)):
//!   %2 : int[] = prim::Constant[value=[2, 3, 4, 2, 3]]()
//!   %3 : int = prim::Constant[value=1]() # dynamic_bvg.py:50:25
//!   %o.1 : Float(...) = aten::add(%0, %1, %3) # dynamic_bvg.py:51:16
//!   %5 : Float(...) = prim::view_copy(%o.1, %2)
//!   %6 : Float(...) = aten::relu(%5) # dynamic_bvg.py:53:19
//!   return (%6)
//!
RegisterOperators view_guard({
    Operator(
        "prim::CudaFusionViewGuard(...) -> bool",
        // prim::CudaFusionViewGuard returns a fresh Boolean type without
        // aliasing. If we ever return a refined tensor, which would change
        // aliasing analysis, we should update the aliasdb pass.
        [](const Node* node) -> Operation {
          return [](Stack& stack) {
            // view_sizes_constraint - constant List[Int]
            at::ArrayRef<IValue> inputs = last(stack, 3);

            // tensor_sizes is the runtime size of the self tensor
            // tensor_sizes - dynamic size List[Int]
            TORCH_INTERNAL_ASSERT(
                inputs[0].isIntList(), "tensor_sizes needs to be an Int List");
            auto tensor_sizes = inputs[0].toIntList();

            // profiled_view_sizes is the runtime view size
            // profiled_view_sizes - profile_ivalue List[Int]
            TORCH_INTERNAL_ASSERT(
                inputs[1].isIntList(),
                "profiled_view_sizes needs to be an Int List");
            auto profiled_view_sizes = inputs[1].toIntList();

            // tensor_constraints is a constant List[Int]
            // used to guard tensor_sizes
            TORCH_INTERNAL_ASSERT(
                inputs[2].isIntList(),
                "tensor_constraints needs to be an Int List");
            auto tensor_constraints = inputs[2].toIntList();

            // Drop the stack only after gathering all input arguments;
            // a moved argument would be destroyed when dropped from the stack
            drop(stack, 3);

            auto status = inferViewShape(tensor_sizes, profiled_view_sizes);
            if (!status) {
              push(stack, IValue(false));
              return;
            }

            if (!fuser::cuda::getCudaFusionGuardMode()) {
              push(stack, IValue(true));
              return;
            }
            std::vector<int64_t> tensor_sizes_int_vec = tensor_sizes.vec();
            std::vector<int64_t> view_sizes_int_vec =
                profiled_view_sizes.vec();
            std::vector<int64_t> previous_constraints =
                tensor_constraints.vec();
            auto new_constraints =
                torch::jit::fuser::cuda::analyzeViewConstraint(
                    tensor_sizes_int_vec, view_sizes_int_vec);
            bool guard_status =
                (new_constraints.conglomerateString() == previous_constraints);
            push(stack, IValue(guard_status));
            return;
          };
        },
        aliasAnalysisFromSchema()),
});

RegisterOperators ivalue_guard({
    Operator(
        "prim::CudaFusionIvalGuard(...) -> bool",
        [](const Node* node) -> Operation {
          return [](Stack& stack) {
            at::ArrayRef<IValue> inputs = last(stack, 2);
            drop(stack, 2);
            if (!fuser::cuda::getCudaFusionGuardMode()) {
              push(stack, IValue(true));
              return;
            }
            push(stack, inputs[0].equals(inputs[1]));
            return;
          };
        },
        aliasAnalysisFromSchema()),
});

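// prim::add_optional forwards `input` unchanged when `bias` is None (hence
// the `Tensor(a)` alias annotation on both input and output); e.g.
// add_optional(x, None) yields x itself, not a copy.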
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
RegisterOperators reg_add_optional({
    Operator(
        "prim::add_optional(Tensor(a) input, Tensor? bias) -> Tensor(a)",
        [](const Node* node) -> Operation {
          return [](Stack& stack) {
            IValue input, bias;
            pop(stack, input, bias);
            if (bias.isNone()) {
              push(stack, std::move(input));
            } else {
              push(stack, at::add(input.toTensor(), bias.toTensor(), 1.0));
            }
          };
        },
        aliasAnalysisFromSchema()),
});

// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
RegisterOperators reg_permute_copy({
    Operator(
        "prim::permute_copy(Tensor(a) self, int[] dims) -> Tensor",
        [](const Node* node) -> Operation {
          return [node](Stack& stack) {
            TORCH_CHECK(
                node->s(attr::name) == "CudaFusionGroup",
                "permute_copy is only used by nvfuser to identify non-mutating ",
                "alias ops, should be restored after fusion pass!");
            IValue self, dims;
            pop(stack, self, dims);
            push(stack, at::permute(self.toTensor(), dims.toIntVector()));
          };
        },
        aliasAnalysisFromSchema()),
});

// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
RegisterOperators reg_transpose_copy({
    Operator(
        "prim::transpose_copy.int(Tensor(a) self, int dim0, int dim1) -> Tensor",
        [](const Node* node) -> Operation {
          return [node](Stack& stack) {
            TORCH_CHECK(
                node->s(attr::name) == "CudaFusionGroup",
                "transpose_copy is only used by nvfuser to identify non-mutating ",
                "alias ops, should be restored after fusion pass!");
            IValue self, dim0, dim1;
            pop(stack, self, dim0, dim1);
            push(
                stack,
                at::transpose(self.toTensor(), dim0.toInt(), dim1.toInt()));
          };
        },
        aliasAnalysisFromSchema()),
});

// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
RegisterOperators reg_t_copy({
    Operator(
        "prim::t_copy(Tensor(a) self) -> Tensor",
        [](const Node* node) -> Operation {
          return [node](Stack& stack) {
            TORCH_CHECK(
                node->s(attr::name) == "CudaFusionGroup",
                "t_copy is only used by nvfuser to identify non-mutating ",
                "alias ops, should be restored after fusion pass!");
            IValue self;
            pop(stack, self);
            push(stack, at::t(self.toTensor()));
          };
        },
        aliasAnalysisFromSchema()),
});

// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
RegisterOperators reg_view_copy({
    Operator(
        "prim::view_copy(Tensor self, int[] size) -> Tensor",
        [](const Node* node) -> Operation {
          return [node](Stack& stack) {
            TORCH_CHECK(
                node->s(attr::name) == "CudaFusionGroup",
                "view_copy is only used by nvfuser to identify non-mutating ",
                "alias ops, should be restored after fusion pass!");
            IValue self, size;
            pop(stack, self, size);
            push(stack, at::native::view(self.toTensor(), size.toIntVector()));
          };
        },
        aliasAnalysisFromSchema()),
});

// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
RegisterOperators reg_flatten_copy({
    Operator(
        "prim::flatten_copy(Tensor self, int start_dim, int end_dim) -> Tensor",
        [](const Node* node) -> Operation {
          return [node](Stack& stack) {
            TORCH_CHECK(
                node->s(attr::name) == "CudaFusionGroup",
                "flatten_copy is only used by nvfuser to identify non-mutating ",
                "alias ops, should be restored after fusion pass!");
            IValue self, start_dim, end_dim;
            pop(stack, self, start_dim, end_dim);
            push(
                stack,
                at::native::flatten(
                    self.toTensor(), start_dim.toInt(), end_dim.toInt()));
          };
        },
        aliasAnalysisFromSchema()),
});

// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
RegisterOperators reg_reshape_copy({
    Operator(
        "prim::reshape_copy(Tensor self, int[] shape) -> Tensor",
        [](const Node* node) -> Operation {
          return [node](Stack& stack) {
            TORCH_CHECK(
                node->s(attr::name) == "CudaFusionGroup",
                "reshape_copy is only used by nvfuser to identify non-mutating ",
                "alias ops, should be restored after fusion pass!");
            IValue self, shape;
            pop(stack, self, shape);
            push(
                stack,
                at::native::reshape(self.toTensor(), shape.toIntVector()));
          };
        },
        aliasAnalysisFromSchema()),
});

// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
RegisterOperators reg_squeeze_copy({
    Operator(
        "prim::squeeze_copy(Tensor self) -> Tensor",
        [](const Node* node) -> Operation {
          return [node](Stack& stack) {
            TORCH_CHECK(
                node->s(attr::name) == "CudaFusionGroup",
                "squeeze_copy is only used by nvfuser to identify non-mutating ",
                "alias ops, should be restored after fusion pass!");
            IValue self;
            pop(stack, self);
            push(stack, at::squeeze(self.toTensor()));
          };
        },
        aliasAnalysisFromSchema()),
});

// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
RegisterOperators reg_squeeze_dim_copy({
    Operator(
        "prim::squeeze_copy.dim(Tensor self, int dim) -> Tensor",
        [](const Node* node) -> Operation {
          return [node](Stack& stack) {
            TORCH_CHECK(
                node->s(attr::name) == "CudaFusionGroup",
                "squeeze_dim_copy is only used by nvfuser to identify non-mutating ",
                "alias ops, should be restored after fusion pass!");
            IValue self, dim;
            pop(stack, self, dim);
            push(stack, at::squeeze(self.toTensor(), dim.toInt()));
          };
        },
        aliasAnalysisFromSchema()),
});

// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
RegisterOperators reg_unsqueeze_copy({
    Operator(
        "prim::unsqueeze_copy(Tensor self, int dim) -> Tensor",
        [](const Node* node) -> Operation {
          return [node](Stack& stack) {
            TORCH_CHECK(
                node->s(attr::name) == "CudaFusionGroup",
                "unsqueeze_copy is only used by nvfuser to identify non-mutating ",
                "alias ops, should be restored after fusion pass!");
            IValue self, dim;
            pop(stack, self, dim);
            push(stack, at::unsqueeze(self.toTensor(), dim.toInt()));
          };
        },
        aliasAnalysisFromSchema()),
});

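// e.g. prim::infer_unsqueeze_size([2, 3], -1) yields [2, 3, 1]: a negative
// dim is wrapped into [0, rank] before the size-1 axis is inserted.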
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
RegisterOperators reg_infer_unsqueeze_size({
    Operator(
        "prim::infer_unsqueeze_size(int[] a, int dim) -> int[]",
        [](const Node* node) -> Operation {
          return [](Stack& stack) {
            auto dim = pop(stack).toInt();
            auto size = pop(stack).toIntVector();
            if (dim < 0) {
              dim = dim + 1 + size.size();
            }
            auto it = size.begin() + dim;
            size.insert(it, 1);
            push(stack, IValue(size));
          };
        },
        aliasAnalysisFromSchema()),
});

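// e.g. prim::infer_squeeze_size.dim([2, 1, 3], 1) yields [2, 3]; if the
// requested dim is not size-1, the size list is returned unchanged.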
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
RegisterOperators reg_infer_squeeze_dim_size({
    Operator(
        "prim::infer_squeeze_size.dim(int[] a, int dim) -> int[]",
        [](const Node* node) -> Operation {
          return [](Stack& stack) {
            auto dim = pop(stack).toInt();
            auto size = pop(stack).toIntVector();
            if (dim < 0) {
              dim = dim + size.size();
            }
            auto it = size.begin() + dim;
            if (*it == 1) {
              size.erase(it);
            }
            push(stack, IValue(size));
          };
        },
        aliasAnalysisFromSchema()),
});

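// e.g. prim::infer_squeeze_size([1, 2, 1, 3]) yields [2, 3]: every size-1
// axis is removed.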
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
RegisterOperators reg_infer_squeeze_size({
    Operator(
        "prim::infer_squeeze_size(int[] a) -> int[]",
        [](const Node* node) -> Operation {
          return [](Stack& stack) {
            auto size = pop(stack).toIntVector();

            // erase all size-1 entries; advance only when nothing was
            // erased, so adjacent size-1 entries are handled and the
            // iterator never steps before begin()
            for (auto it = size.begin(); it != size.end();) {
              if (*it == 1) {
                it = size.erase(it);
              } else {
                ++it;
              }
            }
            push(stack, IValue(size));
          };
        },
        aliasAnalysisFromSchema()),
});

// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
RegisterOperators reg_expand_copy({
    Operator(
        "prim::expand_copy(Tensor self, int[] size, *, bool implicit=False) -> Tensor",
        [](const Node* node) -> Operation {
          return [node](Stack& stack) {
            TORCH_CHECK(
                node->s(attr::name) == "CudaFusionGroup",
                "expand_copy is only used by nvfuser to identify non-mutating ",
                "alias ops, should be restored after fusion pass!");
            IValue self, size, implicit;
            pop(stack, self, size, implicit);
            push(stack, self.toTensor().expand(size.toIntVector()));
          };
        },
        aliasAnalysisFromSchema()),
});

// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
RegisterOperators reg_expand_as_copy({
    Operator(
        "prim::expand_as_copy(Tensor self, Tensor other) -> Tensor",
        [](const Node* node) -> Operation {
          return [node](Stack& stack) {
            TORCH_CHECK(
                node->s(attr::name) == "CudaFusionGroup",
                "expand_as_copy is only used by nvfuser to identify non-mutating ",
                "alias ops, should be restored after fusion pass!");
            IValue self, other;
            pop(stack, self, other);
            push(
                stack,
                at::native::expand_as(self.toTensor(), other.toTensor()));
          };
        },
        aliasAnalysisFromSchema()),
});

} // namespace

} // namespace jit
} // namespace torch
746 | |