1 | #if defined(USE_CUDA) |
2 | #include <gmock/gmock-matchers.h> |
3 | #include <gtest/gtest.h> |
4 | |
5 | #include <arith.h> |
6 | #include <codegen.h> |
7 | #include <disjoint_set.h> |
8 | #include <executor.h> |
9 | #include <executor_launch_params.h> |
10 | #include <expr_evaluator.h> |
11 | #include <fusion.h> |
12 | #include <fusion_segmenter.h> |
13 | #include <inlining.h> |
14 | #include <ir_all_nodes.h> |
15 | #include <ir_builder.h> |
16 | #include <ir_graphviz.h> |
17 | #include <ir_iostream.h> |
18 | #include <ir_utils.h> |
19 | #include <iter_visitor.h> |
20 | #include <kernel_cache.h> |
21 | #include <kernel_expr_evaluator.h> |
22 | #include <kernel_ir.h> |
23 | #include <kernel_ir_dispatch.h> |
24 | #include <lower2device.h> |
25 | #include <lower_divisible_split.h> |
26 | #include <mutator.h> |
27 | #include <ops/all_ops.h> |
28 | #include <register_interface.h> |
29 | #include <root_domain_map.h> |
30 | #include <scheduler/all_schedulers.h> |
31 | #include <scheduler/reduction_utils.h> |
32 | #include <scheduler/utils.h> |
33 | #include <test/test_gpu_validator.h> |
34 | #include <test/test_utils.h> |
35 | #include <transform_replay.h> |
36 | #include <transform_rfactor.h> |
37 | |
38 | // fuser and IR parser |
39 | #include <parser.h> |
40 | #include <torch/csrc/jit/ir/irparser.h> |
41 | |
42 | #include <ATen/cuda/CUDAContext.h> |
43 | #include <ATen/cuda/Exceptions.h> |
44 | #include <c10/cuda/CUDAStream.h> |
45 | |
46 | #include <algorithm> |
47 | #include <iostream> |
48 | |
49 | // Tests go in torch::jit |
50 | namespace torch { |
51 | namespace jit { |
52 | |
53 | using namespace torch::jit::fuser::cuda; |
54 | using namespace at::indexing; |
55 | |
56 | TEST_F(NVFuserTest, FusionViewDtypeSameSizeOutput_CUDA) { |
57 | Fusion fusion; |
58 | FusionGuard fg(&fusion); |
59 | |
60 | std::vector<int64_t> input_shape{2, 10, 40}; |
61 | |
62 | TensorView* x = makeSymbolicTensor(input_shape.size(), DataType::Float); |
63 | TensorView* bias = makeSymbolicTensor(input_shape.size()); |
64 | fusion.addInput(x); |
65 | fusion.addInput(bias); |
66 | |
67 | auto x_add_bias = add(x, bias); |
68 | auto x_view = view(x_add_bias, DataType::Int32); |
69 | fusion.addOutput(x_view); |
70 | |
71 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
72 | at::Tensor at_x = at::randn(input_shape, options); |
73 | at::Tensor at_bias = at::randn(input_shape, options); |
74 | std::vector<IValue> aten_inputs = {at_x, at_bias}; |
75 | |
76 | auto lparams = schedulePointwise(&fusion, aten_inputs); |
77 | |
78 | FusionExecutor fe; |
79 | fe.compileFusion(&fusion, aten_inputs, lparams); |
80 | auto outputs = fe.runFusion(aten_inputs, lparams); |
81 | |
82 | auto at_x_add_bias = at_x + at_bias; |
83 | auto at_x_view = at_x_add_bias.view(at::ScalarType::Int); |
84 | |
85 | testValidate(&fusion, outputs, aten_inputs, {at_x_view}, __LINE__, __FILE__); |
86 | } |
87 | |
88 | TEST_F(NVFuserTest, FusionViewDtypeFailMismatchSize_CUDA) { |
89 | Fusion fusion; |
90 | FusionGuard fg(&fusion); |
91 | |
92 | std::vector<int64_t> input_shape{2, 10, 40}; |
93 | |
94 | TensorView* x = makeSymbolicTensor(input_shape.size(), DataType::Float); |
95 | TensorView* bias = makeSymbolicTensor(input_shape.size()); |
96 | fusion.addInput(x); |
97 | fusion.addInput(bias); |
98 | |
99 | auto x_add_bias = add(x, bias); |
100 | // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) |
101 | ASSERT_ANY_THROW(view(x_add_bias, DataType::Int)); |
102 | // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) |
103 | ASSERT_ANY_THROW(view(x_add_bias, DataType::Half)); |
104 | } |
105 | |
// view_as_real: a complex input is reinterpreted as a real tensor with an
// extra innermost axis of extent 2, then combined with a real input of
// that output shape. Scheduling is done manually (see TODO below).
TEST_F(NVFuserTest, FusionViewAsRealOutput_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // TODO: We should modify our schedulers to correctly handle
  // view_as_real. And test these schedulers.
  std::vector<int64_t> input_shape{512};
  std::vector<int64_t> output_shape{512, 2};

  TensorView* x =
      makeSymbolicTensor(input_shape.size(), DataType::ComplexFloat);
  TensorView* bias =
      makeSymbolicTensor(input_shape.size(), DataType::ComplexFloat);
  fusion.addInput(x);
  fusion.addInput(bias);

  // Real-valued input whose shape matches the view_as_real output.
  TensorView* y = makeSymbolicTensor(output_shape.size());
  fusion.addInput(y);

  auto y_plus_1 = add(y, IrBuilder::create<Double>(1));

  auto x_add_bias = add(x, bias);
  auto x_view = view_as_real(x_add_bias);
  auto out = add(y_plus_1, x_view);
  fusion.addOutput(out);

  // Manual schedule: parallelize the outer axis and inline the producers
  // into the output.
  out->axis(0)->parallelize(ParallelType::TIDx);
  x_add_bias->computeAt(out, -1);
  y->computeAt(out, -1);

  auto in_options =
      at::TensorOptions().dtype(at::kComplexFloat).device(at::kCUDA, 0);
  auto out_options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor at_x = at::randn(input_shape, in_options);
  at::Tensor at_bias = at::randn(input_shape, in_options);
  at::Tensor at_y = at::randn(output_shape, out_options);
  std::vector<IValue> aten_inputs = {at_x, at_bias, at_y};

  FusionExecutor fe;
  fe.compileFusion(&fusion, aten_inputs);
  auto outputs = fe.runFusion(aten_inputs);

  // ATen reference computation.
  auto at_x_add_bias = at_x + at_bias;
  auto at_x_view = at::view_as_real(at_x_add_bias);
  auto at_y_plus_1 = at_y + 1.0;
  auto at_out = at_y_plus_1 + at_x_view;

  testValidate(&fusion, outputs, aten_inputs, {at_out}, __LINE__, __FILE__);
}
155 | |
156 | TEST_F(NVFuserTest, FusionViewRfactorExtentReplacement_CUDA) { |
157 | auto fusion = std::make_unique<Fusion>(); |
158 | FusionGuard fg(fusion.get()); |
159 | |
160 | auto tv0 = makeSymbolicTensor(2); |
161 | fusion->addInput(tv0); |
162 | auto tv1 = makeContigTensor(2); |
163 | fusion->addInput(tv1); |
164 | |
165 | auto tv2 = view(tv0, {12, 8}, {4, 3, 8}); |
166 | auto tv3 = sum(tv2, {-1}); |
167 | auto tv4 = add(tv3, IrBuilder::create<Double>(1)); |
168 | auto tv5 = add(tv1, tv4); |
169 | fusion->addOutput(tv5); |
170 | |
171 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
172 | at::manual_seed(0); |
173 | auto t0 = at::randn({12, 8}, options); |
174 | auto t1 = at::randn({4, 3}, options); |
175 | |
176 | FusionExecutorCache executor_cache(std::move(fusion)); |
177 | auto cg_outputs = executor_cache.runFusionWithInputs({t0, t1}); |
178 | |
179 | auto ref = at::native::view(t0, {4, 3, 8}).sum({-1}) + 1 + t1; |
180 | |
181 | testValidate( |
182 | executor_cache.fusion(), cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__); |
183 | } |
184 | |
185 | TEST_F(NVFuserTest, FusionViewOutput_CUDA) { |
186 | Fusion fusion; |
187 | FusionGuard fg(&fusion); |
188 | |
189 | std::vector<int64_t> input_shape{2, 10, 40}; |
190 | std::vector<int64_t> output_shape{2, 10, 4, 10}; |
191 | |
192 | TensorView* x = makeSymbolicTensor(input_shape.size()); |
193 | TensorView* bias = makeSymbolicTensor(input_shape.size()); |
194 | fusion.addInput(x); |
195 | fusion.addInput(bias); |
196 | |
197 | auto x_add_bias = add(x, bias); |
198 | auto x_view = view(x_add_bias, input_shape, output_shape); |
199 | fusion.addOutput(x_view); |
200 | |
201 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
202 | at::Tensor at_x = at::randn(input_shape, options); |
203 | at::Tensor at_bias = at::randn(input_shape, options); |
204 | std::vector<IValue> aten_inputs = {at_x, at_bias}; |
205 | |
206 | auto lparams = schedulePointwise(&fusion, aten_inputs); |
207 | |
208 | FusionExecutor fe; |
209 | fe.compileFusion(&fusion, aten_inputs, lparams); |
210 | auto outputs = fe.runFusion(aten_inputs, lparams); |
211 | |
212 | auto at_x_add_bias = at_x + at_bias; |
213 | auto at_x_view = at::native::view(at_x_add_bias, output_shape); |
214 | |
215 | testValidate(&fusion, outputs, aten_inputs, {at_x_view}, __LINE__, __FILE__); |
216 | } |
217 | |
218 | TEST_F(NVFuserTest, FusionViewFailMismatchSize_CUDA) { |
219 | Fusion fusion; |
220 | FusionGuard fg(&fusion); |
221 | |
222 | // The number of elements in input and output shapes do not match, |
223 | // so this view transformation is invalid. |
224 | // 2 * 10 * 40 != 2 * 50 * 4 * 10 |
225 | |
226 | std::vector<int64_t> input_shape{2, 10, 40}; |
227 | std::vector<int64_t> output_shape{2, 50, 4, 10}; |
228 | |
229 | TensorView* x = makeSymbolicTensor(input_shape.size()); |
230 | TensorView* bias = makeSymbolicTensor(input_shape.size()); |
231 | fusion.addInput(x); |
232 | fusion.addInput(bias); |
233 | |
234 | auto x_add_bias = add(x, bias); |
235 | // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) |
236 | ASSERT_ANY_THROW(view(x_add_bias, input_shape, output_shape)); |
237 | } |
238 | |
239 | TEST_F(NVFuserTest, FusionViewFailMulitDimInference_CUDA) { |
240 | Fusion fusion; |
241 | FusionGuard fg(&fusion); |
242 | |
243 | // Only one dimension can be inferred in the output shape. |
244 | // Otherwise, the size of the dimensions is ambiguous. |
245 | std::vector<int64_t> input_shape{2, 10, 40}; |
246 | std::vector<int64_t> output_shape{2, -1, 4, -1}; |
247 | |
248 | TensorView* x = makeSymbolicTensor(input_shape.size()); |
249 | TensorView* bias = makeSymbolicTensor(input_shape.size()); |
250 | fusion.addInput(x); |
251 | fusion.addInput(bias); |
252 | |
253 | auto x_add_bias = add(x, bias); |
254 | // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) |
255 | ASSERT_ANY_THROW(view(x_add_bias, input_shape, output_shape)); |
256 | } |
257 | |
258 | void reductionViewAddFusion( |
259 | std::vector<int64_t>& input_shape, |
260 | std::vector<int64_t>& output_shape, |
261 | bool view_before_reduction) { |
262 | constexpr int kReductionAxis = -1; |
263 | |
264 | // Drop size for reduction axis from view_shape |
265 | std::vector<int64_t> view_shape; |
266 | { |
267 | const auto kAxis = (kReductionAxis < 0) |
268 | ? (kReductionAxis + input_shape.size()) |
269 | : kReductionAxis; |
270 | for (auto i : c10::irange(input_shape.size())) { |
271 | if (view_before_reduction || i != kAxis) { |
272 | view_shape.push_back(input_shape[i]); |
273 | } |
274 | } |
275 | } |
276 | |
277 | auto bias_shape = (view_before_reduction) ? input_shape : output_shape; |
278 | for (auto has_implicit_broadcast : {false, true}) { |
279 | std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>(); |
280 | Fusion& fusion = *fusion_ptr.get(); |
281 | FusionGuard fg(&fusion); |
282 | |
283 | TensorView* x = (has_implicit_broadcast) |
284 | ? makeConcreteTensor(input_shape) |
285 | : makeSymbolicTensor(input_shape.size()); |
286 | TensorView* bias = (has_implicit_broadcast) |
287 | ? makeConcreteTensor(bias_shape) |
288 | : makeSymbolicTensor(bias_shape.size()); |
289 | fusion.addInput(x); |
290 | fusion.addInput(bias); |
291 | |
292 | auto tv1 = |
293 | (view_before_reduction) ? add(x, bias) : sum(x, {kReductionAxis}); |
294 | auto x_view = view(tv1, view_shape, output_shape); |
295 | auto y = (view_before_reduction) ? sum(x_view, {kReductionAxis}) |
296 | : add(x_view, bias); |
297 | fusion.addOutput(y); |
298 | |
299 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
300 | at::Tensor at_x = at::randn(input_shape, options); |
301 | at::Tensor at_bias = at::randn(bias_shape, options); |
302 | std::vector<IValue> aten_inputs = {at_x, at_bias}; |
303 | |
304 | FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); |
305 | auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); |
306 | |
307 | auto at_tv1 = (view_before_reduction) ? (at_x + at_bias) |
308 | : at::sum(at_x, kReductionAxis); |
309 | auto at_x_view = at::native::view(at_tv1, output_shape); |
310 | auto at_y = (view_before_reduction) ? at::sum(at_x_view, kReductionAxis) |
311 | : at::add(at_x_view, at_bias); |
312 | |
313 | testValidate(&fusion, outputs, aten_inputs, {at_y}, __LINE__, __FILE__); |
314 | } |
315 | } |
316 | |
// Shorthand used by the view "shmoo" tests: a shape is a list of
// dimension sizes, and a view_example pairs an input shape with the
// output shape it is viewed as. `using` replaces the legacy typedefs
// (clang-tidy modernize-use-using).
using shape = std::vector<int64_t>;
using view_example = std::pair<shape, shape>;
319 | |
// TODO: View examples with just 333 elements are failing validation in
// normalization. This might just be because our tolerances aren't tuned well
// for small sizes and the parallelization could be limited which could be
// detected as a validation issue, though it might not actually be a correctness
// issue. Using 3333 instead of 333 in those cases but should validate what's
// going on in the 333 case.
//
// Shared {input shape, output shape} pairs used by the view "shmoo" tests
// below. A -1 entry marks a dimension whose extent must be inferred; many
// pairs appear twice, once with -1 and once fully specified.
std::vector<view_example> all_view_examples = {
    {{1, 19, 1, 3 * 4, 7, 1, 99}, {1, 19, -1, 3, 4 * 7 * 99}},
    {{1, 19, 1, 3 * 4, 7, 1, 99}, {1, 19, 1, 3, 4 * 7 * 99}},
    {{19, 3 * 4, 7, 99}, {19, 3, 4 * 7 * 99}},

    {{3, 17, 2 * 4 * 10, 1}, {3 * 17, 1, 2, 4, -1}},
    {{3, 17, 2 * 4 * 10, 1}, {3 * 17, 1, 2, 4, 10}},
    {{3, 17, 2 * 4 * 10, 1}, {3 * 17, 2, 4, 1, 10}},

    {{3, 17, 2 * 4 * 10, 1, 9}, {-1, 1, 2, 4, 10, 9}},
    {{3, 17, 2 * 4 * 10, 1, 9}, {3 * 17, 1, 2, 4, 10, 9}},
    {{3, 17, 2 * 4 * 10, 1, 9}, {3 * 17, 2, 4, 1, 10, 9}},

    {{2, 3, 2 * 2, 5}, {1, 2 * 3, 1, -1, 2, 5, 1}},

    {{22, 11 * 2, 2}, {22, -1, 1, 1, 2 * 2}},
    {{22, 1, 22, 1}, {-1}},
    {{22, 11 * 2, 2}, {22, 11, 1, 1, 2 * 2}},
    {{22, 1, 22, 1}, {22 * 22}},

    {{37, 9, 7, 3 * 2, 5 * 2}, {37 * 9, 2, -1, 3, 7 * 5}},
    {{37, 9, 7, 3 * 2, 5 * 2}, {37 * 9, 2, 2, 3, 7 * 5}},

    // 3333-element cases (see TODO above re: 333).
    {{1, 1, 3333, 1}, {1, 1, -1, 1}},
    // Disabled for now due to non-deterministic nan issue (#1920)
    // {{1, 1111 * 3}, {1, 1, 1, -1, 1, 3}},
    {{1, 3333, 1}, {-1}},
    {{1, 1, 3333, 1}, {1, 1, 3333, 1}},
    {{1, 303 * 11, 1}, {1, 303, -1, 1}},
    {{1, 3333, 1}, {1, 303, 11, 1}},
    // Disabled for now due to non-deterministic nan issue (#1920)
    // {{1, 3333}, {1, 1, 1, 1111, 1, 3}},
    {{1, 3333, 1}, {3333}},

    {{1, 3922 * 7, 1, 2}, {1, 3922 * 2, 1, -1}},
    {{1, 3922 * 2, 1, 7}, {1, -1, 2}},
    {{1, 3922 * 7, 2}, {1, 3922 * 2, 7}},
    {{1, 3922 * 2, 1, 7}, {1, 3922 * 7, 2}},
    {{1, 3922 * 7, 1, 2}, {1, 3922 * 2, 1, 7}},

    {{8, 1, 1, 2 * 4, 1, 8}, {8, 2, 4, 1, -1}},
    {{8, 1, 1, 8, 1, 8}, {8, 2, 4, 1, 8}},

    {{2, 3, 2 * 2, 5}, {1, 6, 1, 2, 2, 5, 1}},
};
371 | |
372 | TEST_F(NVFuserTest, FusionViewReductionShmoo_CUDA) { |
373 | for (auto e : all_view_examples) { |
374 | reductionViewAddFusion(e.first, e.second, true /* view_before_reduction */); |
375 | } |
376 | std::vector<view_example> view_after_reduce_examples = { |
377 | {{19, 12, 7, 99}, {19, 3, 28}}, |
378 | {{1, 19, 1, 12, 7, 1, 99}, {1, 19, 1, 3, 28}}, |
379 | {{3, 17, 80, 1}, {51, 1, 2, 4, 10}}, |
380 | {{3, 17, 80, 1, 9}, {51, 1, 2, 4, 10}}, |
381 | {{2, 3, 4, 5}, {1, 6, 1, 2, 2, 1}}, |
382 | {{22, 22, 2}, {22, 11, 1, 1, 2}}, |
383 | {{37, 9, 7, 6, 10}, {333, 2, 21}}, |
384 | {{1, 1, 333, 1}, {1, 1, 333, 1}}, |
385 | {{8, 1, 1, 8, 1, 8}, {8, 2, 4, 1}}, |
386 | {{1, 333, 1}, {1, 37, 9, 1}}, |
387 | {{22, 1, 22, 1}, {484}}, |
388 | {{1, 333, 1}, {333}}, |
389 | {{1, 27454, 1, 2}, {1, 3922, 1, 7}}, |
390 | {{1, 7844, 1, 7}, {1, 1961, 4}}}; |
391 | |
392 | for (auto e : view_after_reduce_examples) { |
393 | reductionViewAddFusion( |
394 | e.first, e.second, false /* view_before_reduction */); |
395 | } |
396 | } |
397 | |
// Builds and runs a fusion combining a pointwise add, a view, and a
// softmax (a persistent normalization kernel), validating against the
// equivalent ATen computation.
//
// If view_before_persistent is true the graph is add -> view -> softmax;
// otherwise softmax -> view -> add. Shapes may contain a single -1,
// resolved by inferViewShapes. Each configuration is run twice: with
// symbolic inputs and with concrete inputs (implicit broadcast).
void persistentViewAddFusion(
    std::vector<int64_t>& input_shape,
    std::vector<int64_t>& output_shape,
    bool view_before_persistent) {
  constexpr int kAxis = -1;

  // Support -1 sizes in the inputs
  auto inferred_shapes = inferViewShapes(input_shape, output_shape);
  auto inferred_input = inferred_shapes.first;
  auto inferred_output = inferred_shapes.second;

  // The bias must match the tensor it is added to: pre-view shape when
  // adding first, post-view shape otherwise.
  auto bias_shape = view_before_persistent ? inferred_input : inferred_output;
  for (auto has_implicit_broadcast : {false, true}) {
    std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
    Fusion& fusion = *fusion_ptr.get();
    FusionGuard fg(&fusion);

    TensorView* x = (has_implicit_broadcast)
        ? makeConcreteTensor(inferred_input)
        : makeSymbolicTensor(inferred_input.size());
    TensorView* bias = (has_implicit_broadcast)
        ? makeConcreteTensor(bias_shape)
        : makeSymbolicTensor(bias_shape.size());
    fusion.addInput(x);
    fusion.addInput(bias);

    auto tv1 = (view_before_persistent) ? add(x, bias) : softmax(x, kAxis);
    auto x_view = view(tv1, inferred_input, inferred_output);
    auto y =
        (view_before_persistent) ? softmax(x_view, kAxis) : add(x_view, bias);
    fusion.addOutput(y);

    auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
    at::Tensor at_x = at::randn(inferred_input, options);
    at::Tensor at_bias = at::randn(bias_shape, options);
    std::vector<IValue> aten_inputs = {at_x, at_bias};

    FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr));
    auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs);

    // ATen reference mirroring the fusion graph.
    auto at_tv1 = (view_before_persistent)
        ? (at_x + at_bias)
        : at::_softmax(at_x, kAxis, false /* half_to_float */);
    auto at_x_view = at::native::view(at_tv1, inferred_output);
    auto at_y = (view_before_persistent)
        ? at::_softmax(at_x_view, kAxis, false /* half_to_float */)
        : at::add(at_x_view, at_bias);

    testValidate(&fusion, outputs, aten_inputs, {at_y}, __LINE__, __FILE__);
  }
}
449 | |
450 | TEST_F(NVFuserTest, FusionViewPersistentShmoo_CUDA) { |
451 | for (auto e : all_view_examples) { |
452 | persistentViewAddFusion( |
453 | e.first, e.second, true /* view_before_persistent */); |
454 | } |
455 | |
456 | for (auto e : all_view_examples) { |
457 | persistentViewAddFusion( |
458 | e.first, e.second, false /* view_before_persistent */); |
459 | } |
460 | } |
461 | |
462 | void addViewGeluFusion( |
463 | std::vector<int64_t>& input_shape, |
464 | std::vector<int64_t>& output_shape) { |
465 | for (auto has_implicit_broadcast : {false, true}) { |
466 | Fusion fusion; |
467 | FusionGuard fg(&fusion); |
468 | |
469 | TensorView* x = (has_implicit_broadcast) |
470 | ? makeConcreteTensor(input_shape) |
471 | : makeSymbolicTensor(input_shape.size()); |
472 | TensorView* bias = (has_implicit_broadcast) |
473 | ? makeConcreteTensor(input_shape) |
474 | : makeSymbolicTensor(input_shape.size()); |
475 | fusion.addInput(x); |
476 | fusion.addInput(bias); |
477 | |
478 | auto x_add_bias = add(x, bias); |
479 | auto x_view = view(x_add_bias, input_shape, output_shape); |
480 | auto y = gelu(x_view); |
481 | fusion.addOutput(y); |
482 | |
483 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
484 | at::Tensor at_x = at::randn(input_shape, options); |
485 | at::Tensor at_bias = at::randn(input_shape, options); |
486 | std::vector<IValue> aten_inputs = {at_x, at_bias}; |
487 | |
488 | auto lparams = schedulePointwise(&fusion, aten_inputs); |
489 | |
490 | FusionExecutor fe; |
491 | fe.compileFusion(&fusion, aten_inputs, lparams); |
492 | auto outputs = fe.runFusion(aten_inputs, lparams); |
493 | |
494 | auto at_x_add_bias = at_x + at_bias; |
495 | auto at_x_view = at::native::view(at_x_add_bias, output_shape); |
496 | auto at_y = at::gelu(at_x_view); |
497 | |
498 | testValidate(&fusion, outputs, aten_inputs, {at_y}, __LINE__, __FILE__); |
499 | } |
500 | } |
501 | |
502 | TEST_F(NVFuserTest, FusionViewSplit_CUDA) { |
503 | std::vector<int64_t> input_shape{80}; |
504 | std::vector<int64_t> output_shape{2, 4, 10}; |
505 | addViewGeluFusion(input_shape, output_shape); |
506 | } |
507 | |
508 | TEST_F(NVFuserTest, FusionViewBroadcast_CUDA) { |
509 | std::vector<int64_t> input_shape{80}; |
510 | std::vector<int64_t> output_shape{1, 80}; |
511 | addViewGeluFusion(input_shape, output_shape); |
512 | } |
513 | |
514 | TEST_F(NVFuserTest, FusionViewMerge_CUDA) { |
515 | std::vector<int64_t> input_shape{2, 40, 7}; |
516 | std::vector<int64_t> output_shape{560}; |
517 | addViewGeluFusion(input_shape, output_shape); |
518 | } |
519 | |
520 | TEST_F(NVFuserTest, FusionViewAllShmoo_CUDA) { |
521 | for (auto e : all_view_examples) { |
522 | addViewGeluFusion(e.first, e.second); |
523 | } |
524 | } |
525 | |
526 | void geluViewAddFusion( |
527 | std::vector<int64_t> input_shape, |
528 | std::vector<int64_t> output_shape) { |
529 | // Support -1 sizes in the inputs |
530 | auto inferred_shapes = inferViewShapes(input_shape, output_shape); |
531 | auto inferred_input = inferred_shapes.first; |
532 | auto inferred_output = inferred_shapes.second; |
533 | |
534 | for (auto hasImplicitBroadcast : {false, true}) { |
535 | Fusion fusion; |
536 | FusionGuard fg(&fusion); |
537 | |
538 | TensorView* x = (hasImplicitBroadcast) |
539 | ? makeConcreteTensor(inferred_input) |
540 | : makeSymbolicTensor(inferred_input.size()); |
541 | TensorView* bias = (hasImplicitBroadcast) |
542 | ? makeConcreteTensor(inferred_output) |
543 | : makeSymbolicTensor(inferred_output.size()); |
544 | fusion.addInput(x); |
545 | fusion.addInput(bias); |
546 | |
547 | auto x_gelu = gelu(x); |
548 | auto x_view = view(x_gelu, inferred_input, inferred_output); |
549 | auto y = add(x_view, bias); |
550 | fusion.addOutput(y); |
551 | |
552 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
553 | at::Tensor at_x = at::randn(inferred_input, options); |
554 | at::Tensor at_bias = at::randn(inferred_output, options); |
555 | std::vector<IValue> aten_inputs = {at_x, at_bias}; |
556 | |
557 | auto lparams = schedulePointwise(&fusion, aten_inputs); |
558 | |
559 | FusionExecutor fe; |
560 | fe.compileFusion(&fusion, aten_inputs, lparams); |
561 | auto outputs = fe.runFusion(aten_inputs, lparams); |
562 | |
563 | auto at_x_gelu = at::gelu(at_x); |
564 | auto at_x_view = at::native::view(at_x_gelu, inferred_output); |
565 | auto at_y = at_x_view + at_bias; |
566 | |
567 | testValidate(&fusion, outputs, aten_inputs, {at_y}, __LINE__, __FILE__); |
568 | } |
569 | } |
570 | |
571 | TEST_F(NVFuserTest, FusionViewStride_CUDA) { |
572 | for (const auto& e : all_view_examples) { |
573 | geluViewAddFusion(e.first, e.second); |
574 | } |
575 | } |
576 | |
577 | void geluViewBinaryAddFusion( |
578 | std::vector<int64_t> input_shape1, |
579 | std::vector<int64_t> input_shape2, |
580 | std::vector<int64_t> output_shape) { |
581 | for (auto hasImplicitBroadcast : {false, true}) { |
582 | Fusion fusion; |
583 | FusionGuard fg(&fusion); |
584 | |
585 | TensorView* x = (hasImplicitBroadcast) |
586 | ? makeConcreteTensor(input_shape1) |
587 | : makeSymbolicTensor(input_shape1.size()); |
588 | TensorView* bias = (hasImplicitBroadcast) |
589 | ? makeConcreteTensor(input_shape2) |
590 | : makeSymbolicTensor(input_shape2.size()); |
591 | fusion.addInput(x); |
592 | fusion.addInput(bias); |
593 | |
594 | auto x_gelu = gelu(x); |
595 | auto x_view = view(x_gelu, input_shape1, output_shape); |
596 | auto bias_view = view(bias, input_shape2, output_shape); |
597 | auto y = add(x_view, bias_view); |
598 | fusion.addOutput(y); |
599 | |
600 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
601 | at::Tensor at_x = at::randn(input_shape1, options); |
602 | at::Tensor at_bias = at::randn(input_shape2, options); |
603 | std::vector<IValue> aten_inputs = {at_x, at_bias}; |
604 | |
605 | auto lparams = schedulePointwise(&fusion, aten_inputs); |
606 | |
607 | FusionExecutor fe; |
608 | fe.compileFusion(&fusion, aten_inputs, lparams); |
609 | auto outputs = fe.runFusion(aten_inputs, lparams); |
610 | |
611 | auto at_x_gelu = at::gelu(at_x); |
612 | auto at_x_view = at::native::view(at_x_gelu, output_shape); |
613 | auto at_bias_view = at::native::view(at_bias, output_shape); |
614 | auto at_y = at_x_view + at_bias_view; |
615 | |
616 | testValidate(&fusion, outputs, aten_inputs, {at_y}, __LINE__, __FILE__); |
617 | } |
618 | } |
619 | |
620 | TEST_F(NVFuserTest, FusionViewBinary_CUDA) { |
621 | geluViewBinaryAddFusion({27454, 2}, {54908}, {7844, 7}); |
622 | } |
623 | |
// Repro of issue #1493
// A view ({2, 3} -> {6}) feeding a broadcast and an add, manually
// scheduled by merging the output to 1D and fully inlining the inputs;
// exercises concrete-domain resolution when a view meets a broadcast.
TEST_F(NVFuserTest, FusionViewConcreteDomain_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = makeContigTensor(2);
  fusion.addInput(tv1);

  auto tv2 = view(tv0, {2, 3}, {6});
  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
  auto tv4 = broadcast(tv3, {true, false});
  auto tv5 = add(tv4, tv1);

  fusion.addOutput(tv5);

  // Manual schedule: flatten the output and inline both inputs into it.
  tv5->merge(0);
  tv0->computeAt(tv5, -1);
  tv1->computeAt(tv5, -1);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::manual_seed(0);
  auto t0 = at::randn({2, 3}, options);
  auto t1 = at::randn({1, 6}, options);

  FusionExecutor fe;
  fe.compileFusion(&fusion, {t0, t1});
  auto cg_outputs = fe.runFusion({t0, t1});

  // ATen reference mirroring the fusion graph.
  auto ref = (at::native::view(t0, {6}) + 1).unsqueeze(0) + t1;

  testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
}
658 | |
659 | TEST_F(NVFuserTest, FusionViewConcreteDomain2_CUDA) { |
660 | constexpr int kAxis = -1; |
661 | std::vector<int64_t> input_shape = {19, 12, 7, 99}; |
662 | std::vector<int64_t> output_shape = {19, 3, 2772}; |
663 | |
664 | std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>(); |
665 | Fusion& fusion = *fusion_ptr.get(); |
666 | FusionGuard fg(&fusion); |
667 | |
668 | TensorView* x = makeSymbolicTensor(input_shape.size()); |
669 | TensorView* bias = makeSymbolicTensor(output_shape.size()); |
670 | fusion.addInput(x); |
671 | fusion.addInput(bias); |
672 | |
673 | auto tv1 = softmax(x, kAxis); |
674 | auto x_view = view(tv1, input_shape, output_shape); |
675 | auto y = add(x_view, bias); |
676 | fusion.addOutput(y); |
677 | |
678 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
679 | at::Tensor at_x = at::randn(input_shape, options); |
680 | at::Tensor at_bias = at::randn(output_shape, options); |
681 | std::vector<IValue> aten_inputs = {at_x, at_bias}; |
682 | |
683 | FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); |
684 | auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); |
685 | |
686 | auto at_tv1 = at::_softmax(at_x, kAxis, false /* half_to_float */); |
687 | auto at_x_view = at::native::view(at_tv1, output_shape); |
688 | auto at_y = at::add(at_x_view, at_bias); |
689 | |
690 | testValidate(&fusion, outputs, aten_inputs, {at_y}, __LINE__, __FILE__); |
691 | } |
692 | |
// Repro of issue #1608
// Two views produce the same output shape from different input shapes
// ({14, 12, 8, 100} and {14, 100, 96} -> {14, 3, 3200}); the first view's
// producer includes a broadcast add. Their outputs are added, so the view
// domains must be mapped consistently.
TEST_F(NVFuserTest, FusionViewConcreteDomain3_CUDA) {
  std::vector<int64_t> input_shape = {14, 12, 8, 100};
  std::vector<int64_t> bcast_shape = {14, 12, 8, 1};
  std::vector<int64_t> other_shape = {14, 100, 96};
  std::vector<int64_t> output_shape = {14, 3, 3200};

  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
  Fusion& fusion = *fusion_ptr.get();
  FusionGuard fg(&fusion);

  TensorView* x = makeSymbolicTensor(input_shape.size());
  TensorView* y = makeConcreteTensor(bcast_shape);
  TensorView* z = makeSymbolicTensor(other_shape.size());
  fusion.addInput(x);
  fusion.addInput(y);
  fusion.addInput(z);

  // x + y broadcasts over the trailing size-1 axis of y.
  auto tv1 = add(x, y);
  auto tv2 = view(tv1, input_shape, output_shape);
  auto tv3 = view(z, other_shape, output_shape);
  auto output = add(tv2, tv3);
  fusion.addOutput(output);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor at_x = at::randn(input_shape, options);
  at::Tensor at_y = at::randn(bcast_shape, options);
  at::Tensor at_z = at::randn(other_shape, options);
  std::vector<IValue> aten_inputs = {at_x, at_y, at_z};

  FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr));
  auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs);

  // ATen reference mirroring the fusion graph.
  auto at_tv1 = at::add(at_x, at_y);
  auto at_tv2 = at::native::view(at_tv1, output_shape);
  auto at_tv3 = at::native::view(at_z, output_shape);
  auto at_output = at::add(at_tv2, at_tv3);

  testValidate(&fusion, outputs, aten_inputs, {at_output}, __LINE__, __FILE__);
}
733 | |
// A broadcast add viewed to 1D and fully inlined; checks that the concrete
// ID chosen for the 1D output maps exactly with both the view output and
// its consumer (see comment before the ComputeAtMap checks).
TEST_F(NVFuserTest, FusionViewConcreteDomain4_CUDA) {
  std::vector<int64_t> shape1 = {3, 4, 5};
  std::vector<int64_t> shape2 = {3 * 4 * 5};

  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
  Fusion& fusion = *fusion_ptr.get();
  FusionGuard fg(&fusion);

  // 2D input broadcast up to 3D before the add.
  auto tv0 = makeSymbolicTensor(shape1.size() - 1);
  fusion.addInput(tv0);

  auto tv1 = makeSymbolicTensor(shape1.size());
  fusion.addInput(tv1);

  auto tv2 = broadcast(tv0, {true, false, false});
  auto tv3 = add(tv1, tv2);
  auto tv4 = view(tv3, shape1, shape2);
  auto tv5 = set(tv4);
  fusion.addOutput(tv5);

  tv0->computeAt(tv5, -1);
  tv1->computeAt(tv5, -1);

  TORCH_CHECK(tv5->nDims() == 1);

  // The concrete domain of tv5, which is 1D, with permissive or loop mapping
  // needs to be either the domain of tv4 or tv5, both of which have the three
  // concrete root domains of tv1. In other words, it must map with tv4 and tv5
  // with the exact mapping.
  ComputeAtMap map(&fusion);
  auto concrete_id =
      map.getConcreteMappedID(tv5->axis(0), IdMappingMode::PERMISSIVE);
  TORCH_CHECK(
      map.areMapped(concrete_id, tv5->axis(0), IdMappingMode::EXACT),
      "Invalid concrete ID: " ,
      concrete_id->toString());
  TORCH_CHECK(
      map.areMapped(concrete_id, tv4->axis(0), IdMappingMode::EXACT),
      "Invalid concrete ID: " ,
      concrete_id->toString());
}
775 | |
// Two consumer paths of one cached input — a 1D->2D view and a broadcast
// + add + 2D->3D view — are built in both orders; the ComputeAtMap must
// pick a valid loop concrete ID either way (regression for #1544).
TEST_F(NVFuserTest, FusionViewConcreteDomain5_CUDA) {
  const std::vector<int64_t> shape1 = {12};
  const std::vector<int64_t> shape2 = {4, 3};
  const std::vector<int64_t> shape3 = {12, 5};
  const std::vector<int64_t> shape4 = {4, 3, 5};

  // `order` selects which path is added to the fusion first.
  for (auto order : {true, false}) {
    std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
    Fusion& fusion = *fusion_ptr.get();
    FusionGuard fg(&fusion);

    auto tv0 = makeSymbolicTensor(1);
    fusion.addInput(tv0);

    auto tv1 = makeSymbolicTensor(2);
    fusion.addInput(tv1);

    auto tv0_cache = set(tv0);

    // Path 1: view the cached input as 2D ({12} -> {4, 3}).
    auto path1 = [&]() {
      auto view_2d = view(tv0_cache, shape1, shape2);
      auto view_2d_copy = set(view_2d);
      fusion.addOutput(view_2d_copy);
      return view_2d_copy;
    };

    // Path 2: broadcast the cached input, add tv1, then view as 3D
    // ({12, 5} -> {4, 3, 5}).
    auto path2 = [&]() {
      auto tv0_bc = broadcast(tv0_cache, {false, true});
      auto tv0_bc_plus_tv1 = add(tv0_bc, tv1);
      auto view_3d = view(tv0_bc_plus_tv1, shape3, shape4);
      auto view_3d_copy = set(view_3d);
      fusion.addOutput(view_3d_copy);
      return view_3d_copy;
    };

    TensorView* path1_out = nullptr;
    TensorView* path2_out = nullptr;

    if (order) {
      // Fails before #1544. Concrete ID is picked from path1_out, which
      // doesn't have the second root domain of tv1
      path2_out = path2();
      path1_out = path1();
    } else {
      // Works fine
      path1_out = path1();
      path2_out = path2();
    }

    // Flatten path2's 3D output to 1D and inline both inputs into it.
    path2_out->merge(-2, -1);
    path2_out->merge(-2, -1);

    tv0->computeAt(path2_out, -1);
    tv1->computeAt(path2_out, -1);

    TORCH_CHECK(path1_out->nDims() == 1);
    TORCH_CHECK(path2_out->nDims() == 1);

    ComputeAtMap map(&fusion);

    // Make sure the two output tensors are mapped. Note both are 1D.
    TORCH_CHECK(map.areMapped(
        path1_out->axis(0), path2_out->axis(0), IdMappingMode::LOOP));

    auto concrete_id =
        map.getConcreteMappedID(path2_out->axis(0), IdMappingMode::LOOP);
    TORCH_CHECK(
        path2_out->axis(0) == concrete_id,
        "Incorrect concrete ID: " ,
        concrete_id->toString());
  }
}
848 | |
// Flatten applied right after unsqueeze: the trailing size-1 dim added by
// unsqueeze is collapsed back, so the output matches the 1D input shape.
// Also exercises a manual split + computeAt schedule on the flattened output.
TEST_F(NVFuserTest, FusionFlattenAfterUnsqueezeOutput_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  std::vector<int64_t> input_shape{512};

  TensorView* x = makeSymbolicTensor(input_shape.size(), DataType::Double);
  TensorView* bias = makeSymbolicTensor(input_shape.size(), DataType::Double);
  fusion.addInput(x);
  fusion.addInput(bias);

  auto x_add_bias = add(x, bias);
  // {512} -> {512, 1} -> {512}
  auto x_unsqueeze = unsqueeze(x_add_bias, -1);
  auto x_view = flatten(x_unsqueeze);
  fusion.addOutput(x_view);

  auto options = at::TensorOptions().dtype(at::kDouble).device(at::kCUDA, 0);
  at::Tensor at_x = at::randn(input_shape, options);
  at::Tensor at_bias = at::randn(input_shape, options);
  std::vector<IValue> aten_inputs = {at_x, at_bias};

  // Manual schedule: split the flattened axis, inline the add at position 1,
  // and parallelize the outer axis over threads.
  x_view->split(0, 4);
  x_add_bias->computeAt(x_view, 1);
  x_view->axis(0)->parallelize(ParallelType::TIDx);

  FusionExecutor fe;
  fe.compileFusion(&fusion, aten_inputs);
  auto outputs = fe.runFusion(aten_inputs);

  auto at_x_add_bias = at_x + at_bias;
  auto at_x_view = at_x_add_bias.unsqueeze(-1).flatten();

  testValidate(&fusion, outputs, aten_inputs, {at_x_view}, __LINE__, __FILE__);
}
883 | |
// When a tensor feeds both a reduction+broadcast path and a view path,
// ComputeAtRootDomainMap must report the reduced/viewed axis as unmappable.
TEST_F(NVFuserTest, FusionComputeAtRootDomainMapWithView_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  const std::vector<int64_t> input_shape1{10, 12};
  const std::vector<int64_t> input_shape2{10, 3, 4};

  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);

  auto tv1 = add(tv0, IrBuilder::create<Double>(1));

  // reduction followed by broadcast
  auto tv2 = sum(tv1, {1});
  auto tv3 = broadcast(tv2, {false, true, true});

  // Path with a view
  auto tv4 = view(tv1, input_shape1, input_shape2);

  // Join the reduction+broadcast and view paths together
  auto tv5 = add(tv3, tv4);
  fusion.addOutput(tv5);

  ComputeAtRootDomainMap map;
  map.build();

  // It's not possible to compute tv1 at the -1 position of
  // tv2. ComputeAtRootDomainMap should tell that by not mapping the
  // second axis.
  auto tv1_tv2_mappable_dims =
      map.getMappableDims(tv1->domain(), tv2->domain());
  TORCH_CHECK(
      tv1_tv2_mappable_dims.find(tv1->axis(1)) == tv1_tv2_mappable_dims.end(),
      "Invalid ComputeAtRootDomainMap. Domain should not be mappable: " ,
      tv1->axis(1)->toString());
}
920 | |
// Repro for expand_as: output allocation depends on tv1's runtime extents.
// Runs the same fusion twice to also cover the cached-output-allocation path.
TEST_F(NVFuserTest, FusionExpandRepro_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  const std::vector<int64_t> input_shape1{4, 1, 1};
  const std::vector<int64_t> input_shape2{4, 3, 2};

  // Trailing dims are concrete broadcasts (extent 1) to be expanded.
  auto tv0 = makeConcreteTensor({-1, 1, 1});
  fusion.addInput(tv0);
  auto tv1 = makeSymbolicTensor(3);
  fusion.addInput(tv1);

  auto tv2 = expand_as(tv0, tv1);
  fusion.addOutput(tv2);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor at_x = at::randn(input_shape1, options);
  at::Tensor at_y = at::randn(input_shape2, options);
  std::vector<IValue> aten_inputs = {at_x, at_y};

  FusionExecutor fe;
  // Note: compiled without runtime inputs, so extents stay symbolic until run.
  fe.compileFusion(&fusion);
  LaunchParams l_params;
  auto outputs = fe.runFusion(aten_inputs, {}, l_params, 0);

  auto out = at_x.expand_as(at_y);

  testValidate(&fusion, outputs, aten_inputs, {out}, __LINE__, __FILE__);

  // second run to verify cached output allocation
  outputs = fe.runFusion(aten_inputs, {}, l_params, 0);
  testValidate(&fusion, outputs, aten_inputs, {out}, __LINE__, __FILE__);
}
954 | |
955 | TEST_F(NVFuserTest, FusionExpandView1_CUDA) { |
956 | auto fusion = std::make_unique<Fusion>(); |
957 | FusionGuard fg(fusion.get()); |
958 | |
959 | auto tv0 = makeConcreteTensor({4, 1, 8}); |
960 | fusion->addInput(tv0); |
961 | |
962 | auto tv1 = makeConcreteTensor({12, 8}); |
963 | fusion->addInput(tv1); |
964 | |
965 | auto tv2 = expand( |
966 | tv0, |
967 | {IrBuilder::create<Int>(4), |
968 | IrBuilder::create<Int>(3), |
969 | IrBuilder::create<Int>(8)}); |
970 | |
971 | auto tv3 = view(tv2, {4, 3, 8}, {12, 8}); |
972 | auto tv4 = add(tv3, tv1); |
973 | fusion->addOutput(tv4); |
974 | |
975 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
976 | at::manual_seed(0); |
977 | auto t0 = at::randn({4, 1, 8}, options); |
978 | auto t1 = at::randn({12, 8}, options); |
979 | |
980 | FusionExecutorCache executor_cache(std::move(fusion)); |
981 | auto cg_outputs = executor_cache.runFusionWithInputs({t0, t1}); |
982 | |
983 | auto ref = at::reshape(t0.expand({4, 3, 8}), {12, 8}) + t1; |
984 | |
985 | testValidate( |
986 | executor_cache.fusion(), cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__); |
987 | } |
988 | |
989 | TEST_F(NVFuserTest, FusionExpandView2_CUDA) { |
990 | auto fusion = std::make_unique<Fusion>(); |
991 | FusionGuard fg(fusion.get()); |
992 | |
993 | auto tv0 = makeConcreteTensor({1, 8}); |
994 | fusion->addInput(tv0); |
995 | |
996 | auto tv1 = makeConcreteTensor({3, 4, 8}); |
997 | fusion->addInput(tv1); |
998 | |
999 | auto tv2 = |
1000 | expand(tv0, {IrBuilder::create<Int>(12), IrBuilder::create<Int>(8)}); |
1001 | |
1002 | auto tv3 = view(tv2, {12, 8}, {3, 4, 8}); |
1003 | auto tv4 = add(tv3, tv1); |
1004 | fusion->addOutput(tv4); |
1005 | |
1006 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
1007 | at::manual_seed(0); |
1008 | auto t0 = at::randn({1, 8}, options); |
1009 | auto t1 = at::randn({3, 4, 8}, options); |
1010 | |
1011 | FusionExecutorCache executor_cache(std::move(fusion)); |
1012 | auto cg_outputs = executor_cache.runFusionWithInputs({t0, t1}); |
1013 | |
1014 | auto ref = at::reshape(t0.expand({12, 8}), {3, 4, 8}) + t1; |
1015 | |
1016 | testValidate( |
1017 | executor_cache.fusion(), cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__); |
1018 | } |
1019 | |
// Checks which pairs of view problems produce the same analyzeViewConstraint
// result, i.e. which views can share a cached transform sequence.
TEST_F(NVFuserTest, FusionViewTransformCache_CUDA) {
  // Asserts that two (input-shape, output-shape) view problems analyze to the
  // same constraint (and hence could share a cached view).
  auto assert_matches = [](view_example example_0, view_example example_1) {
    TORCH_INTERNAL_ASSERT(
        analyzeViewConstraint(example_0.first, example_0.second) ==
            analyzeViewConstraint(example_1.first, example_1.second),
        "View: " ,
        example_0.first,
        " -> " ,
        example_0.second,
        " Does not match:" ,
        example_1.first,
        " -> " ,
        example_1.second);
  };

  // Asserts the opposite: the two view problems must NOT share a constraint.
  auto assert_does_not_match = [](view_example example_0,
                                  view_example example_1) {
    TORCH_INTERNAL_ASSERT(
        !(analyzeViewConstraint(example_0.first, example_0.second) ==
          analyzeViewConstraint(example_1.first, example_1.second)),
        "View: " ,
        example_0.first,
        " -> " ,
        example_0.second,
        " Should not match:" ,
        example_1.first,
        " -> " ,
        example_1.second);
  };

  // Splits are done as splitting out left hand side, so left hand side
  // split changes can't reuse view, but right hand side split changes can.
  // Merges, since they don't bury hard values in the transforms, can always
  // be reshared. Need to make sure trivial reduction, and broadcast changes
  // don't try to reuse view. What matches and what doesn't is very specific
  // to the implementation of how the splits/merges are generated. This could
  // be changed over time as there isn't a single set of transformations to
  // potentially make a view. For example we could always merge all dimensions,
  // then split out all dimensions. This would always be valid but would not be
  // efficient for indexing.

  // "Same"
  assert_matches(
      {{1, 1, 3333, 1}, {1, 1, 3333, 1}}, {{1, 1, 3333, 1}, {1, 1, -1, 1}});
  assert_matches(
      {{8, 1, 1, 2 * 4, 1, 8}, {8, 2, 4, 1, 8}},
      {{8, 1, 1, 2 * 4, 1, 8}, {8, 2, 4, 1, -1}});

  // Trivial reduce matching
  assert_matches({{1, 3333, 1}, {-1}}, {{1, 24, 1}, {-1}});

  // Trivial reduce not matching
  assert_does_not_match({{1, 3333, 1}, {-1}}, {{1, 3333}, {-1}});

  // Broadcast matching
  assert_matches({{3333}, {1, -1, 1}}, {{24}, {1, -1, 1}});

  // Broadcast not matching
  assert_does_not_match({{3333}, {1, -1, 1}}, {{24}, {1, -1}});

  // RHS split
  assert_matches(
      {{3, 17, 2 * 4 * 10, 1}, {3 * 17, 1, 2, 4, -1}},
      {{3, 17, 2 * 4 * 10 * 7, 1}, {3 * 17, 1, 2, 4, -1}});
  assert_matches(
      {{1, 303 * 11, 1}, {1, 303, -1, 1}},
      {{1, 303 * 11 * 4, 1}, {1, 303, -1, 1}});
  assert_matches(
      {{2, 3, 2 * 2 * 3, 5}, {1, 2 * 3, 1, 2, -1, 5, 1}},
      {{2, 3, 2 * 2 * 4, 5}, {1, 2 * 3, 1, 2, -1, 5, 1}});
  assert_matches(
      {{22, 11 * 2, 2}, {22, 11, 1, 1, -1}},
      {{22, 11 * 2 * 4, 2 * 3}, {22, 11, 1, 1, -1}});
  assert_matches(
      {{1, 1111 * 3}, {1, 1, 1, 1111, 1, -1}},
      {{1, 1111 * 3 * 7}, {1, 1, 1, 1111, 1, -1}});
  assert_matches(
      {{1, 303 * 11 * 2, 1}, {1, 303, -1, 1}},
      {{1, 303 * 11 * 3, 1}, {1, 303, -1, 1}});
  assert_matches(
      {{8, 1, 1, 2 * 4, 1, 8}, {8, 2, -1, 1, 8}},
      {{8, 1, 1, 2 * 4 * 6, 1, 8}, {8, 2, -1, 1, 8}});

  // LHS split not matching
  assert_does_not_match(
      {{3, 17, 2 * 4 * 10, 1}, {3 * 17, 1, 2, -1, 10}},
      {{3, 17, 2 * 4 * 3 * 10, 1}, {3 * 17, 1, 2, -1, 10}});
  assert_does_not_match(
      {{1, 303 * 11, 1}, {1, -1, 11, 1}},
      {{1, 303 * 11 * 2, 1}, {1, -1, 11, 1}});
  assert_does_not_match(
      {{2, 3, 2 * 2, 5}, {1, 2 * 3, 1, -1, 2, 5, 1}},
      {{2, 3, 3 * 2, 5}, {1, 2 * 3, 1, -1, 2, 5, 1}});
  assert_does_not_match(
      {{22, (11 + 1) * 2, 2}, {22, -1, 1, 1, 2 * 2}},
      {{22, 11 * 2, 2}, {22, -1, 1, 1, 2 * 2}});
  assert_does_not_match(
      {{1, 1111 * 3}, {1, 1, 1, -1, 1, 3}},
      {{1, 1111 * 2 * 3}, {1, 1, 1, -1, 1, 3}});
  assert_does_not_match(
      {{1, 303 * 11, 1}, {1, -1, 11, 1}},
      {{1, (303 + 1) * 11, 1}, {1, -1, 11, 1}});
  assert_does_not_match(
      {{8, 1, 1, 2 * 4, 1, 8}, {8, -1, 4, 1, 8}},
      {{8, 1, 1, 3 * 4, 1, 8}, {8, -1, 4, 1, 8}});

  // Merge matching
  assert_matches(
      {{3, 17, 2 * 4 * 10, 1, 9}, {-1, 1, 2, 4, 10, 9}},
      {{4, 18, 2 * 4 * 10, 1, 9}, {-1, 1, 2, 4, 10, 9}});
  assert_matches({{22, 1, 23, 1}, {-1, 1}}, {{23, 1, 22, 1}, {-1, 1}});

  // Merge not matching
  assert_does_not_match({{2, 3, 4}, {-1, 4}}, {{2, 3, 4}, {2, -1}});
  assert_does_not_match(
      {{22, 1, 23, 1, 24}, {-1, 24}}, {{22, 1, 23, 1, 24}, {22, -1}});

  // Split->Merge matching
  assert_matches(
      {{22, 11 * 2, 3}, {22, 11, 1, 1, -1}},
      {{22, 11 * 3, 2}, {22, 11, 1, 1, -1}});
  assert_matches(
      {{1, 3922 * 3 * 7, 1, 2 * 2}, {1, 3922 * 2, 1, -1}},
      {{1, 3922 * 7, 1, 2}, {1, 3922 * 2, 1, -1}});

  // Split->Merge not matching
  assert_does_not_match(
      {{22, 11 * 2, 2}, {22, -1, 1, 1, 4}},
      {{22, 11 * 2 * 3, 2}, {22, -1, 1, 1, 4}});
  assert_does_not_match(
      {{1, 3922 * 7, 1, 2}, {1, -1, 1, 7}},
      {{1, 3922 * 7 * 2, 1, 2}, {1, -1, 1, 7}});

  // Merge->Split matching
  assert_matches(
      {{1, 3922 * 2, 1, 7}, {1, 3922 * 7, -1}},
      {{1, 3922 * 2 * 3, 1, 7}, {1, 3922 * 7, -1}});
  assert_matches(
      {{19, 3 * 4, 7, 99}, {19, 3, -1}}, {{19, 3 * 3, 8, 10}, {19, 3, -1}});

  // Merge->Split not matching
  assert_does_not_match(
      {{1, 3922 * 2, 1, 7}, {1, -1, 2}}, {{1, 3922, 1, 7}, {1, -1, 2}});
  assert_does_not_match(
      {{19, 3 * 4, 7, 99}, {19, -1, 3}}, {{19, 3 * 5, 7, 99}, {19, -1, 3}});
}
1166 | |
1167 | TEST_F(NVFuserTest, FusionViewIdGraph_CUDA) { |
1168 | Fusion fusion; |
1169 | FusionGuard fg(&fusion); |
1170 | |
1171 | int w = 2, x = 3, y = 4, z = 5; |
1172 | |
1173 | auto tv0 = makeConcreteTensor({w, x, y, z}); |
1174 | fusion.addInput(tv0); |
1175 | |
1176 | auto tv1 = sin(tv0); |
1177 | |
1178 | auto tv2 = view(tv1, {w, x, y, z}, {w, y, x * z}); |
1179 | fusion.addOutput(tv2); |
1180 | |
1181 | auto tv3 = makeConcreteTensor({w, x, y, z}); |
1182 | fusion.addInput(tv3); |
1183 | |
1184 | auto tv4 = view(tv3, {w, x, y, z}, {w, y, x * z}); |
1185 | fusion.addOutput(tv4); |
1186 | |
1187 | // Link 0 and 3 together for view analysis done based on before the views |
1188 | // actually happened. |
1189 | auto tv5 = add(tv0, tv3); |
1190 | fusion.addOutput(tv5); |
1191 | |
1192 | auto tv6 = makeConcreteTensor({w, x, x, y, z}); |
1193 | |
1194 | auto tv7 = sum(tv6, {2}); |
1195 | auto tv8 = broadcast(tv7, {false, true, false, true, false, false}); |
1196 | |
1197 | auto tv9 = makeConcreteTensor({w, 6, x, 7, y, z}); |
1198 | fusion.addInput(tv9); |
1199 | auto tv10 = add(tv8, tv9); |
1200 | fusion.addOutput(tv10); |
1201 | |
1202 | auto tv12 = view(tv8, {w, 1, x, 1, y, z}, {w, y, x * z}); |
1203 | fusion.addOutput(tv12); |
1204 | |
1205 | // Link the views after the views happen |
1206 | auto t13 = add(tv12, tv4); |
1207 | fusion.addOutput(t13); |
1208 | |
1209 | // Grab the trivial reduced tensor from t12's view. |
1210 | auto tv11 = ir_utils::producerTvsOf(tv12)[0]; |
1211 | |
1212 | // Start from the exact iter domain graph of the fusion |
1213 | IterDomainGraph id_graph(&fusion); |
1214 | auto disjoint_view_ids = id_graph.exactNodes(); |
1215 | |
1216 | TORCH_CHECK( |
1217 | id_graph.exactNodes().strictAreMapped(tv2->axis(1), tv4->axis(1))); |
1218 | TORCH_CHECK( |
1219 | id_graph.exactNodes().strictAreMapped(tv2->axis(2), tv4->axis(2))); |
1220 | |
1221 | TORCH_CHECK(id_graph.exactNodes().strictAreMapped( |
1222 | tv2->getRootDomain()[1], tv12->getRootDomain()[1])); |
1223 | TORCH_CHECK(id_graph.exactNodes().strictAreMapped( |
1224 | tv2->getRootDomain()[2], tv12->getRootDomain()[2])); |
1225 | TORCH_CHECK(id_graph.exactNodes().strictAreMapped( |
1226 | tv2->getRootDomain()[3], tv12->getRootDomain()[3])); |
1227 | } |
1228 | |
1229 | TEST_F(NVFuserTest, FusionViewVectorize_CUDA) { |
1230 | Fusion fusion; |
1231 | FusionGuard fg(&fusion); |
1232 | |
1233 | auto tv0 = makeContigTensor(3); |
1234 | fusion.addInput(tv0); |
1235 | auto tv1 = flatten(tv0, 1, 2); |
1236 | auto tv2 = flatten(tv0, 1, 2); |
1237 | auto tv3 = sin(tv1); |
1238 | auto tv4 = sin(tv2); |
1239 | fusion.addOutput(tv3); |
1240 | fusion.addOutput(tv4); |
1241 | |
1242 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
1243 | at::Tensor input = at::randn({256, 1024, 1024}, options); |
1244 | |
1245 | auto lparams = schedulePointwise(&fusion, {input}); |
1246 | |
1247 | auto hasVectorization = [](TensorView* tv) -> bool { |
1248 | for (auto i : tv->domain()->domain()) { |
1249 | if (i->getParallelType() == ParallelType::Vectorize) { |
1250 | return true; |
1251 | } |
1252 | } |
1253 | return false; |
1254 | }; |
1255 | |
1256 | for (auto o : fusion.outputs()) { |
1257 | TORCH_CHECK(hasVectorization(o->as<TensorView>())); |
1258 | } |
1259 | for (auto i : fusion.inputs()) { |
1260 | for (auto c : ir_utils::consumerTvsOf(i->as<TensorView>())) { |
1261 | TORCH_CHECK(hasVectorization(c)); |
1262 | } |
1263 | } |
1264 | |
1265 | FusionExecutor fe; |
1266 | fe.compileFusion(&fusion, {input}, lparams); |
1267 | auto outputs = fe.runFusion({input}, lparams); |
1268 | |
1269 | auto tv_ref = input.flatten(1, 2).sin(); |
1270 | |
1271 | testValidate(&fusion, outputs, {input}, {tv_ref, tv_ref}, __LINE__, __FILE__); |
1272 | } |
1273 | |
// Expand a trailing broadcast (extent 1 -> 8), flatten it into the middle
// dimension, and reduce over the flattened axis.
TEST_F(NVFuserTest, FusionExpandFlatten_CUDA) {
#ifdef FBCODE_CAFFE2
  GTEST_SKIP() << "Fails accuracy on V100 32gb" ;
#endif
  auto fusion = std::make_unique<Fusion>();
  FusionGuard fg(fusion.get());

  // Last dimension is a concrete broadcast (extent 1).
  auto tv0 = makeConcreteTensor({-1, -1, 1});
  fusion->addInput(tv0);
  auto tv1 = expand(
      tv0,
      {tv0->axis(0)->extent(),
       tv0->axis(1)->extent(),
       IrBuilder::create<Int>(8)});
  auto tv2 = flatten(tv1, 1, 2);
  auto tv3 = sum(tv2, {1});
  fusion->addOutput(tv3);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor input = at::randn({256, 1024, 1}, options);

  FusionExecutorCache executor_cache(std::move(fusion));
  auto cg_outputs = executor_cache.runFusionWithInputs({input});

  auto aten_out = input.expand({256, 1024, 8}).flatten(1, 2).sum(1);

  testValidate(
      executor_cache.fusion(),
      cg_outputs,
      {input},
      {aten_out},
      __LINE__,
      __FILE__);
}
1308 | |
// flatten must reject an end_dim that falls on a reduced axis: after
// sum(tv0, {1}) the second axis is a reduction domain, so flatten(tv1, 0, 1)
// is expected to throw with "Invalid end_dim".
TEST_F(NVFuserTest, FusionIllegalReductionFlatten_CUDA) {
  EXPECT_THAT(
      []() {
        auto fusion = std::make_unique<Fusion>();
        FusionGuard fg(fusion.get());

        auto tv0 = makeConcreteTensor({2, 3});
        fusion->addInput(tv0);

        auto tv1 = sum(tv0, {1});
        auto tv2 = flatten(tv1, 0, 1);
        fusion->addOutput(tv2);
      },
      testing::ThrowsMessage<c10::Error>(
          testing::HasSubstr("Invalid end_dim" )));
}
1325 | |
1326 | TEST_F(NVFuserTest, FusionReductionFlatten1_CUDA) { |
1327 | auto fusion = std::make_unique<Fusion>(); |
1328 | FusionGuard fg(fusion.get()); |
1329 | |
1330 | auto tv0 = makeConcreteTensor({2, 3, 5}); |
1331 | fusion->addInput(tv0); |
1332 | |
1333 | auto tv1 = sum(tv0, {1}); |
1334 | auto tv2 = flatten(tv1, 0, 1); |
1335 | fusion->addOutput(tv2); |
1336 | |
1337 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
1338 | at::manual_seed(0); |
1339 | auto t0 = at::randn({2, 3, 5}, options); |
1340 | auto ref = t0.sum({1}).flatten(0, 1); |
1341 | |
1342 | FusionExecutorCache executor_cache(std::move(fusion)); |
1343 | auto cg_outputs = executor_cache.runFusionWithInputs({t0}); |
1344 | |
1345 | testValidate( |
1346 | executor_cache.fusion(), cg_outputs, {t0}, {ref}, __LINE__, __FILE__); |
1347 | } |
1348 | |
// Manually schedule a fusion containing two matching views plus a non-view
// path: propagate the view transforms, flatten the reference to 1D, split,
// parallelize, propagate again, and inline.
TEST_F(NVFuserTest, FusionPwiseViewSchedule_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  int x = 31, y = 65, z = 103;

  auto tv0 = makeConcreteTensor({x, y, z});
  fusion.addInput(tv0);

  auto tv1 = sin(tv0);

  auto tv2 = view(tv1, {x, y, z}, {x, y * z});
  fusion.addOutput(tv2);

  auto tv3 = makeConcreteTensor({x, y, z});
  fusion.addInput(tv3);

  auto tv4 = view(tv3, {x, y, z}, {x, y * z});
  fusion.addOutput(tv4);

  // Link 0 and 3 together for view analysis done based on before the views
  // actually happened.
  auto tv5 = add(tv0, tv3);
  fusion.addOutput(tv5);

  TORCH_INTERNAL_ASSERT(scheduler_utils::allMatchingViews(&fusion));
  // Replay tv4's view transforms onto the rest of the fusion.
  {
    TransformPropagator propagator(tv4);
    MaxRootDomainInfoSpanningTree(tv4).traverse(&propagator);
  }

  // Flatten the reference tv5 to 1D (loop index intentionally unused).
  for (auto i : c10::irange(tv5->nDims() - 1)) {
    tv5->merge(0);
  }
  tv5->split(0, 32);
  tv5->split(0, 4);
  tv5->axis(0)->parallelize(ParallelType::BIDx);
  tv5->axis(1)->parallelize(ParallelType::Unroll);
  tv5->axis(2)->parallelize(ParallelType::TIDx);

  // Propagate the schedule from tv5 everywhere, then inline.
  {
    TransformPropagator propagator(tv5);
    MaxRootDomainInfoSpanningTree spanning_tree(tv5);
    spanning_tree.traverse(&propagator);
    scheduler_utils::parallelizeAllLike(tv5);

    // Inline the schedule
    inlineMost();
  }

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);

  at::Tensor t0 = at::randn({x, y, z}, options);
  at::Tensor t3 = at::randn({x, y, z}, options);
  auto t1 = sin(t0);
  auto t2 = at::native::view(t1, {x, y * z});
  auto t4 = at::native::view(t3, {x, y * z});
  auto t5 = t0 + t3;

  FusionExecutor fe;
  fe.compileFusion(&fusion, {t0, t3});
  auto cg_outputs = fe.runFusion({t0, t3});

  testValidate(&fusion, cg_outputs, {t0, t3}, {t2, t4, t5}, __LINE__, __FILE__);
}
1414 | |
// Like FusionPwiseViewSchedule, but the second view feeds a reduction that is
// scheduled with an rFactor before propagation and inlining.
TEST_F(NVFuserTest, FusionSumViewSchedule_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  int x = 31, y = 65, z = 103;

  auto tv0 = makeConcreteTensor({x, y, z});
  fusion.addInput(tv0);

  auto tv1 = sin(tv0);

  auto tv2 = view(tv1, {x, y, z}, {x, y * z});
  fusion.addOutput(tv2);

  auto tv3 = makeConcreteTensor({x, y, z});
  fusion.addInput(tv3);

  auto tv4 = view(tv3, {x, y, z}, {x, y * z});
  auto tv5 = sum(tv4, {1});
  fusion.addOutput(tv5);

  // Link 0 and 3 together for view analysis done based on before the views
  // actually happened.
  auto tv6 = add(tv0, tv3);
  fusion.addOutput(tv6);

  TORCH_INTERNAL_ASSERT(scheduler_utils::allMatchingViews(&fusion));
  // Replay tv4's view transforms onto the rest of the fusion.
  {
    TransformPropagator propagator(tv4);
    MaxRootDomainInfoSpanningTree(tv4).traverse(&propagator);
  }

  // Split the reduction axis, rFactor the inner splits, and parallelize.
  tv5->split(1, 128);
  tv5->split(1, 4);

  auto tv5_rf = tv5->rFactor({1, 2});
  tv5_rf->axis(0)->parallelize(ParallelType::BIDx);
  tv5_rf->axis(2)->parallelize(ParallelType::Unroll);
  tv5_rf->axis(3)->parallelize(ParallelType::TIDx);

  // Propagate the schedule from the rFactor tensor, then inline.
  {
    TransformPropagator propagator(tv5_rf);
    MaxRootDomainInfoSpanningTree spanning_tree(tv5_rf);
    spanning_tree.traverse(&propagator);
    scheduler_utils::parallelizeAllLike(tv5_rf);

    // Inline the schedule
    inlineMost();
  }

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);

  at::Tensor t0 = at::randn({x, y, z}, options);
  at::Tensor t3 = at::randn({x, y, z}, options);
  auto t1 = sin(t0);
  auto t2 = at::native::view(t1, {x, y * z});
  auto t4 = at::native::view(t3, {x, y * z});
  auto t5 = t4.sum({1});
  auto t6 = t0 + t3;

  FusionExecutor fe;
  fe.compileFusion(&fusion, {t0, t3});
  auto cg_outputs = fe.runFusion({t0, t3});

  testValidate(&fusion, cg_outputs, {t0, t3}, {t2, t5, t6}, __LINE__, __FILE__);
}
1481 | |
// Make sure matching views are segmented into the same kernel (the runtime
// must report an unsegmented fusion).
TEST_F(NVFuserTest, FusionViewMagicSchedule1_CUDA) {
  auto fusion_ptr = std::make_unique<Fusion>();
  Fusion& fusion = *fusion_ptr.get();
  FusionGuard fg(&fusion);

  int x = 31, y = 65, z = 103;

  auto tv0 = makeConcreteTensor({x, y, z});
  fusion.addInput(tv0);

  auto tv1 = sin(tv0);

  auto tv2 = view(tv1, {x, y, z}, {x, y * z});
  fusion.addOutput(tv2);

  auto tv3 = makeConcreteTensor({x, y, z});
  fusion.addInput(tv3);

  auto tv4 = view(tv3, {x, y, z}, {x, y * z});
  fusion.addOutput(tv4);

  // Link 0 and 3 together for view analysis done based on before the views
  // actually happened.
  auto tv5 = add(tv0, tv3);
  fusion.addOutput(tv5);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);

  at::Tensor t0 = at::randn({x, y, z}, options);
  at::Tensor t3 = at::randn({x, y, z}, options);
  auto t1 = sin(t0);
  auto t2 = at::native::view(t1, {x, y * z});
  auto t4 = at::native::view(t3, {x, y * z});
  auto t5 = t0 + t3;

  FusionExecutorCache executor_cache(std::move(fusion_ptr));
  auto cg_outputs = executor_cache.runFusionWithInputs({t0, t3});
  // Both views must land in a single kernel.
  TORCH_CHECK(!executor_cache.getMostRecentKernelRuntime()->isSegmented());

  testValidate(&fusion, cg_outputs, {t0, t3}, {t2, t4, t5}, __LINE__, __FILE__);
}
1524 | |
// Make sure views of views are correct
TEST_F(NVFuserTest, FusionViewMagicSchedule2_CUDA) {
  auto fusion_ptr = std::make_unique<Fusion>();
  Fusion& fusion = *fusion_ptr.get();
  FusionGuard fg(&fusion);

  int x = 31, y = 65, z = 103;

  auto tv0 = makeConcreteTensor({x, y, z});
  fusion.addInput(tv0);

  auto tv1 = sin(tv0);

  // Chain of views that ends back at the original 3D shape, so the
  // reference result is just sin(t0).
  auto tv2 = view(tv1, {x, y, z}, {x, y * z});
  auto tv3 = view(tv2, {x, y * z}, {x * y, z});
  auto tv4 = view(tv3, {x * y, z}, {y, x * z});
  auto tv5 = view(tv4, {y, x * z}, {x, y, z});
  fusion.addOutput(tv5);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);

  at::Tensor t0 = at::randn({x, y, z}, options);
  auto aten_out = sin(t0);

  // For now pointwise scheduler only accepts a single view at a time, so this
  // will be broken up into multiple kernels. This is due to the reference check
  // looking for all mappings to all input IDs.
  // TODO: Fix the reference check for this case
  FusionExecutorCache executor_cache(std::move(fusion_ptr));
  auto cg_outputs = executor_cache.runFusionWithInputs({t0});

  testValidate(&fusion, cg_outputs, {t0}, {aten_out}, __LINE__, __FILE__);
}
1558 | |
// Make sure broadcasts not on the view path that don't interfere with view are
// segmented in one kernel and correctly trigger 2D pointwise scheduling
TEST_F(NVFuserTest, FusionViewMagicSchedule3_CUDA) {
  auto fusion_ptr = std::make_unique<Fusion>();
  Fusion& fusion = *fusion_ptr.get();
  FusionGuard fg(&fusion);

  int w = 15, x = 31, y = 49, z = 65;

  auto tv0 = makeConcreteTensor({x, y, z});
  fusion.addInput(tv0);

  auto tv1 = sin(tv0);

  auto tv2 = view(tv1, {x, y, z}, {x, y * z});
  fusion.addOutput(tv2);

  auto tv3 = makeConcreteTensor({x, y, z});
  fusion.addInput(tv3);

  auto tv4 = view(tv3, {x, y, z}, {x, y * z});
  fusion.addOutput(tv4);

  // Link 0 and 3 together for view analysis done based on before the views
  // actually happened.
  auto tv5 = add(tv0, tv3);
  fusion.addOutput(tv5);

  // Broadcast on another branch to drive the pointwise reference to not be on
  // the view paths.

  auto tv6 = makeConcreteTensor({w, x, y, z});
  fusion.addInput(tv6);
  auto tv7 = broadcast(tv0, {true, false, false, false});
  auto tv8 = add(tv6, tv7);
  // tv8 should be the reference for the pointwise fusion. This broadcast
  // pattern doesn't interfere with the views, so this should also be scheduled
  // as 2D.
  fusion.addOutput(tv8);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);

  at::Tensor t0 = at::randn({x, y, z}, options);
  at::Tensor t3 = at::randn({x, y, z}, options);
  auto t1 = sin(t0);
  auto t2 = at::native::view(t1, {x, y * z});
  auto t4 = at::native::view(t3, {x, y * z});
  auto t5 = t0 + t3;
  at::Tensor t6 = at::randn({w, x, y, z}, options);
  auto t8 = t6.add(t0);

  FusionExecutorCache executor_cache(std::move(fusion_ptr));
  // Collect the heuristic params
  executor_cache.profile(true);
  auto cg_outputs = executor_cache.runFusionWithInputs({t0, t3, t6});

  // One kernel, pointwise-scheduled with a 2D break point.
  TORCH_CHECK(!executor_cache.getMostRecentKernelRuntime()->isSegmented());
  TORCH_CHECK(executor_cache.getMostRecentExecutorInfo()
                  .params->isA<PointwiseParams>());
  auto pparams =
      executor_cache.getMostRecentExecutorInfo().params->as<PointwiseParams>();
  TORCH_CHECK(pparams->break_point == 1);

  testValidate(
      &fusion, cg_outputs, {t0, t3, t6}, {t2, t4, t5, t8}, __LINE__, __FILE__);
}
1625 | |
1626 | // Make sure broadcasts through views when not conflicting with view are |
1627 | // segmented into one kernel and trigger 2D pointwise scheduler. |
1628 | TEST_F(NVFuserTest, FusionViewMagicSchedule4_CUDA) { |
1629 | auto fusion_ptr = std::make_unique<Fusion>(); |
1630 | Fusion& fusion = *fusion_ptr.get(); |
1631 | FusionGuard fg(&fusion); |
1632 | |
1633 | int w = 15, x = 31, y = 49, z = 65; |
1634 | |
1635 | auto tv0 = makeConcreteTensor({x, y, z}); |
1636 | fusion.addInput(tv0); |
1637 | |
1638 | auto tv1 = sin(tv0); |
1639 | |
1640 | auto tv2 = view(tv1, {x, y, z}, {x, y * z}); |
1641 | fusion.addOutput(tv2); |
1642 | |
1643 | auto tv3 = makeConcreteTensor({x, y, z}); |
1644 | fusion.addInput(tv3); |
1645 | |
1646 | auto tv4 = makeConcreteTensor({x, 1, 1}); |
1647 | fusion.addInput(tv4); |
1648 | |
1649 | auto tv5 = add(tv4, tv3); |
1650 | |
1651 | auto tv6 = view(tv5, {x, y, z}, {x, y * z}); |
1652 | fusion.addOutput(tv6); |
1653 | |
1654 | // Link 0 and 3 together for view analysis done based on before the views |
1655 | // actually happened. |
1656 | auto tv7 = add(tv0, tv3); |
1657 | fusion.addOutput(tv7); |
1658 | |
1659 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
1660 | |
1661 | at::Tensor t0 = at::randn({x, y, z}, options); |
1662 | at::Tensor t3 = at::randn({x, y, z}, options); |
1663 | at::Tensor t4 = at::randn({x, 1, 1}, options); |
1664 | auto t1 = sin(t0); |
1665 | auto t2 = at::native::view(t1, {x, y * z}); |
1666 | auto t5 = t4 + t3; |
1667 | auto t6 = at::native::view(t5, {x, y * z}); |
1668 | auto t7 = t0 + t3; |
1669 | |
1670 | FusionExecutorCache executor_cache(std::move(fusion_ptr)); |
1671 | // Collect the heuristic params |
1672 | executor_cache.profile(true); |
1673 | auto cg_outputs = executor_cache.runFusionWithInputs({t0, t3, t4}); |
1674 | |
1675 | TORCH_CHECK(!executor_cache.getMostRecentKernelRuntime()->isSegmented()); |
1676 | TORCH_CHECK(executor_cache.getMostRecentExecutorInfo() |
1677 | .params->isA<PointwiseParams>()); |
1678 | auto pparams = |
1679 | executor_cache.getMostRecentExecutorInfo().params->as<PointwiseParams>(); |
1680 | TORCH_CHECK(pparams->break_point == 1); |
1681 | |
1682 | testValidate( |
1683 | &fusion, cg_outputs, {t0, t3, t4}, {t2, t6, t7}, __LINE__, __FILE__); |
1684 | } |
1685 | |
1686 | // Make sure different views that are consumed by the reference are segmented |
1687 | // into a single kernel. |
1688 | TEST_F(NVFuserTest, FusionViewMagicSchedule5_CUDA) { |
1689 | auto fusion_ptr = std::make_unique<Fusion>(); |
1690 | Fusion& fusion = *fusion_ptr.get(); |
1691 | FusionGuard fg(&fusion); |
1692 | |
1693 | int w = 15, x = 31, y = 49, z = 65; |
1694 | |
1695 | auto tv0 = makeConcreteTensor({w, x, y * z}); |
1696 | fusion.addInput(tv0); |
1697 | auto tv1 = sin(tv0); |
1698 | auto tv2 = view(tv1, {w, x, y * z}, {z, y, x, w}); |
1699 | |
1700 | auto tv3 = makeConcreteTensor({w, x * y, z}); |
1701 | fusion.addInput(tv3); |
1702 | auto tv4 = cos(tv3); |
1703 | auto tv5 = view(tv4, {w, x * y, z}, {z, y, x, w}); |
1704 | |
1705 | auto tv6 = add(tv2, tv5); |
1706 | fusion.addOutput(tv6); |
1707 | |
1708 | auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); |
1709 | |
1710 | at::Tensor t0 = at::randn({w, x, y * z}, options); |
1711 | auto t1 = sin(t0); |
1712 | auto t2 = at::native::view(t1, {z, y, x, w}); |
1713 | at::Tensor t3 = at::randn({w, x * y, z}, options); |
1714 | auto t4 = cos(t3); |
1715 | auto t5 = at::native::view(t4, {z, y, x, w}); |
1716 | auto t6 = add(t2, t5); |
1717 | |
1718 | FusionExecutorCache executor_cache(std::move(fusion_ptr)); |
1719 | // Collect the heuristic params |
1720 | executor_cache.profile(true); |
1721 | auto cg_outputs = executor_cache.runFusionWithInputs({t0, t3}); |
1722 | |
1723 | TORCH_CHECK(!executor_cache.getMostRecentKernelRuntime()->isSegmented()); |
1724 | TORCH_CHECK(executor_cache.getMostRecentExecutorInfo() |
1725 | .params->isA<PointwiseParams>()); |
1726 | |
1727 | testValidate(&fusion, cg_outputs, {t0, t3}, {t6}, __LINE__, __FILE__); |
1728 | } |
1729 | |
// Check that a manually written schedule (merges/splits + parallelization)
// propagated from the output maps correctly across two differently shaped
// view ops. Unlike the MagicSchedule tests above, this compiles the fusion
// directly with FusionExecutor — no scheduler heuristics or segmentation.
TEST_F(NVFuserTest, FusionViewMapping_CUDA) {
  auto fusion_ptr = std::make_unique<Fusion>();
  Fusion& fusion = *fusion_ptr.get();
  FusionGuard fg(&fusion);

  int w = 15, x = 31, y = 49, z = 65;

  // First branch: sin followed by a 3D -> 4D view.
  auto tv0 = makeConcreteTensor({w, x, y * z});
  fusion.addInput(tv0);
  auto tv1 = sin(tv0);
  auto tv2 = view(tv1, {w, x, y * z}, {z, y, x, w});

  // Second branch: cos followed by a differently shaped 3D -> 4D view.
  auto tv3 = makeConcreteTensor({w, x * y, z});
  fusion.addInput(tv3);
  auto tv4 = cos(tv3);
  auto tv5 = view(tv4, {w, x * y, z}, {z, y, x, w});

  // Sum of the two viewed branches is the output and the scheduling reference.
  auto tv6 = add(tv2, tv5);
  fusion.addOutput(tv6);

  // Manual schedule on the output: flatten all 4 dims, then tile into
  // [BIDx, Unroll(4), TIDx(128)].
  tv6->merge(0);
  tv6->merge(0);
  tv6->merge(0);
  tv6->split(0, 128);
  tv6->split(0, 4);
  tv6->axis(0)->parallelize(ParallelType::BIDx);
  tv6->axis(1)->parallelize(ParallelType::Unroll);
  tv6->axis(2)->parallelize(ParallelType::TIDx);

  // Propagate the transforms from tv6 to every other tensor, then replicate
  // its parallelization — this must succeed through both view paths.
  TransformPropagator propagator(tv6);
  MaxRootDomainInfoSpanningTree spanning_tree(tv6);
  spanning_tree.traverse(&propagator);
  scheduler_utils::parallelizeAllLike(tv6);

  // Inline the schedule
  inlineMost();

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);

  // ATen reference computation mirroring the fusion above.
  at::Tensor t0 = at::randn({w, x, y * z}, options);
  auto t1 = sin(t0);
  auto t2 = at::native::view(t1, {z, y, x, w});
  at::Tensor t3 = at::randn({w, x * y, z}, options);
  auto t4 = cos(t3);
  auto t5 = at::native::view(t4, {z, y, x, w});
  auto t6 = add(t2, t5);

  FusionExecutor fe;
  fe.compileFusion(&fusion, {t0, t3});
  auto cg_outputs = fe.runFusion({t0, t3});

  testValidate(&fusion, cg_outputs, {t0, t3}, {t6}, __LINE__, __FILE__);
}
1785 | |
// Verify that getAllDivisibleSplits() identifies the splits known to be
// divisible by construction: the three view-replay splits (by z, y, x) are
// divisible, while the scheduling splits (by 4 and by 8) need not be.
TEST_F(NVFuserTest, FusionLowerDivisibleSplits_CUDA) {
  auto fusion_ptr = std::make_unique<Fusion>();
  Fusion& fusion = *fusion_ptr.get();
  FusionGuard fg(&fusion);

  int w = 15, x = 31, y = 49, z = 65;

  auto tv0 = makeContigTensor(4);
  fusion.addInput(tv0);
  auto tv1 = sin(tv0);
  // The view's replay performs exact (divisible) splits by z, y, and x.
  auto tv2 = view(tv1, {w, x, y, z}, {z, y, x, w});

  fusion.addOutput(tv2);

  // Scheduling transforms on top of the view: flatten, then two splits that
  // are not necessarily divisible.
  tv2->merge(0)->merge(0)->merge(0)->split(0, 4)->split(0, 8, false);

  TransformPropagator propagator(tv2);
  MaxRootDomainInfoSpanningTree spanning_tree(tv2);
  spanning_tree.traverse(&propagator);
  scheduler_utils::parallelizeAllLike(tv2);

  // Inline the schedule
  inlineMost();

  auto divisible_splits = getAllDivisibleSplits(&fusion);

  // Operations on all tensors are basically:
  // [10] merge(0) [9]->outer->definition
  // [9] merge(0) [8]->outer->definition
  // [8] merge(0) [7]->in->definition
  // [7] split(0, z, false) [6]->in->definition
  // [6] split(1, y, false) [5]->in->definition
  // [5] split(2, x, false) [3]->inner->definition
  // RFactor of tv2
  // [4] merge(0) [3]->outer->definition
  // [3] merge(0) [2]->outer->definition
  // [2] merge(0) [1]->in->definition
  // [1] split(0, 4) [0]->in->definition
  // [0] split(0, 8, false) tv->axis(0)->definition

  // Walk the definition chain backwards from axis(0) of each tensor,
  // following the numbering in the table above, and check that the three
  // view-originated splits ([5], [6], [7]) are all reported as divisible.
  for (auto tv : std::vector<TensorView*>({tv2, tv1, tv0})) {
    auto transform_0 = tv->axis(0)->definition()->as<Split>();
    auto transform_1 = transform_0->in()->definition()->as<Split>();
    auto transform_2 = transform_1->in()->definition()->as<Merge>();
    auto transform_3 = transform_2->outer()->definition()->as<Merge>();

    // Cross from the scheduling transforms into the view-replay transforms.
    auto transform_5 = transform_3->inner()->definition()->as<Split>();
    auto transform_6 = transform_5->in()->definition()->as<Split>();
    auto transform_7 = transform_6->in()->definition()->as<Split>();

    TORCH_CHECK(
        divisible_splits.find(transform_5) != divisible_splits.end(),
        "Expecting: " ,
        transform_5->toString(),
        "\nFrom TV: " ,
        tv,
        "\nTo be a divisible split." );
    TORCH_CHECK(
        divisible_splits.find(transform_6) != divisible_splits.end(),
        "Expecting: " ,
        transform_6->toString(),
        "\nFrom TV: " ,
        tv,
        "\nTo be a divisible split." );
    TORCH_CHECK(
        divisible_splits.find(transform_7) != divisible_splits.end(),
        "Expecting: " ,
        transform_7->toString(),
        "\nFrom TV: " ,
        tv,
        "\nTo be a divisible split." );
  }
}
1859 | |
1860 | } // namespace jit |
1861 | } // namespace torch |
1862 | #endif // #if defined(USE_CUDA) |
1863 | |