test_gpu3.cpp source code [pytorch/third_party/nvfuser/test/test_gpu3.cpp]

1	#if defined(USE_CUDA)
2	#include <gmock/gmock-matchers.h>
3	#include <gtest/gtest.h>
4
5	#include <arith.h>
6	#include <codegen.h>
7	#include <disjoint_set.h>
8	#include <executor.h>
9	#include <executor_launch_params.h>
10	#include <expr_evaluator.h>
11	#include <fusion.h>
12	#include <fusion_segmenter.h>
13	#include <grouped_reduction.h>
14	#include <inlining.h>
15	#include <ir_all_nodes.h>
16	#include <ir_builder.h>
17	#include <ir_graphviz.h>
18	#include <ir_iostream.h>
19	#include <ir_utils.h>
20	#include <iter_visitor.h>
21	#include <kernel_cache.h>
22	#include <kernel_expr_evaluator.h>
23	#include <kernel_ir.h>
24	#include <kernel_ir_dispatch.h>
25	#include <lower2device.h>
26	#include <lower_magic_zero.h>
27	#include <mutator.h>
28	#include <ops/all_ops.h>
29	#include <parser.h>
30	#include <register_interface.h>
31	#include <root_domain_map.h>
32	#include <scheduler/all_schedulers.h>
33	#include <scheduler/reduction_utils.h>
34	#include <scheduler/utils.h>
35	#include <test/test_gpu_validator.h>
36	#include <test/test_utils.h>
37	#include <transform_replay.h>
38	#include <transform_rfactor.h>
39
40	#include <test/cpp/jit/test_utils.h>
41	#include <torch/csrc/jit/api/function_impl.h>
42	#include <torch/csrc/jit/codegen/cuda/interface.h>
43	#include <torch/csrc/jit/ir/irparser.h>
44	#include <torch/torch.h>
45
46	#include <ATen/cuda/CUDAContext.h>
47	#include <ATen/cuda/Exceptions.h>
48	#include <c10/cuda/CUDAStream.h>
49
50	#include <algorithm>
51	#include <iostream>
52	#include <sstream>
53	#include <thread>
54
55	// Tests go in torch::jit
56	namespace torch {
57	namespace jit {
58
59	using namespace torch::jit::fuser::cuda;
60	using namespace at::indexing;
61
62	TEST_F(NVFuserTest, FusionNonDivisibleSplit1_CUDA) {
63	Fusion fusion;
64	FusionGuard fg(&fusion);
65
66	auto tv0 = makeSymbolicTensor(`1`);
67	fusion.addInput(tv0);
68
69	auto tv1 = sum(tv0, {`0`});
70	fusion.addOutput(tv1);
71
72	// [I]
73	tv1->split(`0`, `5`);
74	// [ceilDiv(I, 5), 5]
75
76	// This second split is non-divisible. The split domain must be predicated.
77	tv1->split(`1`, `3`);
78	// [ceilDiv(I, 5), 2, 3]
79
80	auto tv2 = sum(tv0, {`0`});
81	fusion.addOutput(tv2);
82
83	// tv2 shouldn't need to have another predicate
84	tv2->split(`0`, `4`);
85	tv2->split(`1`, `2`);
86
87	GpuLower gpulw(&fusion);
88	TORCH_CHECK(
89	gpulw.nonDivisibleSplitInfo().splitsToValidate().empty(),
90	"There must be no split to validate");
91	TORCH_CHECK(
92	gpulw.nonDivisibleSplitInfo().splitsToPredicate().size() == `1`,
93	"Only tv1 should have a non-divisible predicate.");
94	for (auto tv : {loweredTv(tv1, gpulw)}) {
95	auto it = gpulw.nonDivisibleSplitInfo().splitsToPredicate().find(tv);
96	TORCH_CHECK(
97	it != gpulw.nonDivisibleSplitInfo().splitsToPredicate().end(),
98	"No info found for ",
99	tv);
100	const auto& splits_to_predicate = it ->second;
101	TORCH_CHECK(
102	splits_to_predicate.size() == `1`,
103	"There must be one split to predicate");
104	}
105
106	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
107	at::manual_seed(`0`);
108	at::Tensor t0 = at::randn({`24`}, options);
109
110	FusionExecutor fe;
111	fe.compileFusion(&fusion, {t0});
112	auto cg_outputs = fe.runFusion({t0});
113
114	auto ref = t0.sum();
115
116	testValidate(&fusion, cg_outputs, {t0}, {ref, ref}, __LINE__, __FILE__);
117	}
118
119	// Repro of issue #1074
120	TEST_F(NVFuserTest, FusionNonDivisibleSplit2_CUDA) {
121	Fusion fusion;
122	FusionGuard fg(&fusion);
123
124	auto tv0 = makeSymbolicTensor(`2`);
125	fusion.addInput(tv0);
126	auto tv1 = add(tv0, IrBuilder::create<Double>(`1`));
127	auto tv2 = add(tv1, IrBuilder::create<Double>(`1`));
128	fusion.addOutput(tv2);
129
130	tv2->split(`0`, `2`);
131	tv2->split(-`1`, `4`);
132	tv2->reorder({{`1`, `2`}, {`2`, `1`}});
133	tv0->computeAt(tv2, `2`);
134
135	tv2->split(-`1`, `3`);
136
137	// To make the sanitizer catch the invalid accesses. Not necessary
138	// to expose the bug.
139	tv1->setMemoryType(MemoryType::Shared);
140
141	GpuLower gpulw(&fusion);
142	TORCH_CHECK(
143	gpulw.nonDivisibleSplitInfo().splitsToValidate().empty(),
144	"There must be no split to validate");
145	TORCH_CHECK(
146	gpulw.nonDivisibleSplitInfo().splitsToPredicate().size() == `1`,
147	"Only tv2 should have a non-divisible predicate.");
148	for (auto tv : {loweredTv(tv2, gpulw)}) {
149	auto it = gpulw.nonDivisibleSplitInfo().splitsToPredicate().find(tv);
150	TORCH_CHECK(
151	it != gpulw.nonDivisibleSplitInfo().splitsToPredicate().end(),
152	"No info found for ",
153	tv);
154	const auto& splits_to_predicate = it ->second;
155	TORCH_CHECK(
156	splits_to_predicate.size() == `1`,
157	"There must be one split to predicate");
158	}
159
160	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
161	at::manual_seed(`0`);
162	at::Tensor t0 = at::randn({`13`, `17`}, options);
163
164	FusionExecutor fe;
165	fe.compileFusion(&fusion, {t0});
166	auto cg_outputs = fe.runFusion({t0});
167
168	auto ref = t0 + `2`;
169
170	testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
171	}
172
173	// Similar to FusionNonDivisibleSplit1 but with unswitch
174	TEST_F(NVFuserTest, FusionNonDivisibleSplit3_CUDA) {
175	Fusion fusion;
176	FusionGuard fg(&fusion);
177
178	auto tv0 = makeSymbolicTensor(`1`);
179	fusion.addInput(tv0);
180
181	auto tv1 = add(tv0, IrBuilder::create<Double>(`1`));
182	auto tv2 = sum(tv1, {`0`});
183	fusion.addOutput(tv2);
184
185	tv2->split(`0`, `5`);
186	tv2->split(`1`, `3`);
187
188	tv0->computeAt(tv2, -`1`);
189
190	tv2->axis(`0`)->parallelize(ParallelType::Unswitch);
191
192	GpuLower gpulw(&fusion);
193	TORCH_CHECK(
194	gpulw.nonDivisibleSplitInfo().splitsToValidate().empty(),
195	"There must be no split to validate");
196	TORCH_CHECK(
197	gpulw.nonDivisibleSplitInfo().splitsToPredicate().size() == `2`,
198	"Both tv1 and tv2 should have a non-divisible predicate.");
199	for (auto tv : {loweredTv(tv1, gpulw), loweredTv(tv2, gpulw)}) {
200	auto it = gpulw.nonDivisibleSplitInfo().splitsToPredicate().find(tv);
201	TORCH_CHECK(
202	it != gpulw.nonDivisibleSplitInfo().splitsToPredicate().end(),
203	"No info found for ",
204	tv);
205	const auto& splits_to_predicate = it ->second;
206	TORCH_CHECK(
207	splits_to_predicate.size() == `1`,
208	"There must be one split to predicate");
209	}
210
211	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
212	at::manual_seed(`0`);
213	at::Tensor t0 = at::randn({`24`}, options);
214
215	FusionExecutor fe;
216	fe.compileFusion(&fusion, {t0});
217	auto cg_outputs = fe.runFusion({t0});
218
219	auto ref = (t0 + `1`).sum();
220
221	testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
222	}
223
224	// Non-divisible split through merge
225	TEST_F(NVFuserTest, FusionNonDivisibleSplit4_CUDA) {
226	Fusion fusion;
227	FusionGuard fg(&fusion);
228
229	auto tv0 = makeSymbolicTensor(`2`);
230	fusion.addInput(tv0);
231
232	auto tv1 = add(tv0, IrBuilder::create<Double>(`1`));
233	auto tv2 = sum(tv1, {`0`, `1`});
234	fusion.addOutput(tv2);
235
236	tv2->split(`0`, `5`);
237	tv2->merge(`1`, `2`);
238	tv2->split(`1`, `3`);
239
240	tv0->computeAt(tv2, -`1`);
241
242	GpuLower gpulw(&fusion);
243	TORCH_CHECK(
244	gpulw.nonDivisibleSplitInfo().splitsToValidate().empty(),
245	"There must be no split to validate");
246	TORCH_CHECK(
247	gpulw.nonDivisibleSplitInfo().splitsToPredicate().size() == `2`,
248	"Both tv1 and tv2 should have a non-divisible predicate.");
249	for (auto tv : {loweredTv(tv1, gpulw), loweredTv(tv2, gpulw)}) {
250	auto it = gpulw.nonDivisibleSplitInfo().splitsToPredicate().find(tv);
251	TORCH_CHECK(
252	it != gpulw.nonDivisibleSplitInfo().splitsToPredicate().end(),
253	"No info found for ",
254	tv);
255	const auto& splits_to_predicate = it ->second;
256	TORCH_CHECK(
257	splits_to_predicate.size() == `1`,
258	"There must be one split to predicate");
259	}
260
261	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
262	at::manual_seed(`0`);
263	at::Tensor t0 = at::randn({`24`, `2`}, options);
264
265	FusionExecutor fe;
266	fe.compileFusion(&fusion, {t0});
267	auto cg_outputs = fe.runFusion({t0});
268
269	auto ref = (t0 + `1`).sum();
270
271	testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
272	}
273
274	// Nested splits
275	TEST_F(NVFuserTest, FusionNonDivisibleSplit5_CUDA) {
276	Fusion fusion;
277	FusionGuard fg(&fusion);
278
279	auto tv0 = makeSymbolicTensor(`1`);
280	fusion.addInput(tv0);
281
282	auto tv1 = add(tv0, IrBuilder::create<Double>(`1`));
283	auto tv2 = sum(tv1, {`0`});
284	fusion.addOutput(tv2);
285
286	// [I]
287	tv2->split(`0`, `8`);
288	// [I/8, 8]
289	tv2->split(`1`, `2`);
290	// [I/8, 4, 2]
291	tv2->split(`1`, `3`); // non-divisible split of outer output
292	// [I/8, 2, 3, 2]
293
294	tv0->computeAt(tv2, -`1`);
295
296	GpuLower gpulw(&fusion);
297	TORCH_CHECK(
298	gpulw.nonDivisibleSplitInfo().splitsToValidate().empty(),
299	"There must be no split to validate");
300	TORCH_CHECK(
301	gpulw.nonDivisibleSplitInfo().splitsToPredicate().size() == `2`,
302	"Both tv1 and tv2 should have a non-divisible predicate.");
303	for (auto tv : {loweredTv(tv1, gpulw), loweredTv(tv2, gpulw)}) {
304	auto it = gpulw.nonDivisibleSplitInfo().splitsToPredicate().find(tv);
305	TORCH_CHECK(
306	it != gpulw.nonDivisibleSplitInfo().splitsToPredicate().end(),
307	"No info found for ",
308	tv);
309	const auto& splits_to_predicate = it ->second;
310	TORCH_CHECK(
311	splits_to_predicate.size() == `1`,
312	"There must be one split to predicate");
313	}
314
315	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
316	at::manual_seed(`0`);
317	at::Tensor t0 = at::randn({`24`}, options);
318
319	FusionExecutor fe;
320	fe.compileFusion(&fusion, {t0});
321	auto cg_outputs = fe.runFusion({t0});
322
323	auto ref = (t0 + `1`).sum();
324
325	testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
326	}
327
328	// Vectorized non-divisible split. Must be validated at run time
329	TEST_F(NVFuserTest, FusionNonDivisibleSplitVectorize1_CUDA) {
330	Fusion fusion;
331	FusionGuard fg(&fusion);
332
333	auto tv0 = makeContigTensor(`1`);
334	fusion.addInput(tv0);
335
336	auto tv1 = set(tv0);
337	fusion.addOutput(tv1);
338
339	tv1->split(`0`, `8`, false);
340	tv1->split(`1`, `4`);
341
342	tv1->axis(-`1`)->parallelize(ParallelType::Vectorize);
343
344	GpuLower gpulw(&fusion);
345	TORCH_CHECK(
346	gpulw.nonDivisibleSplitInfo().splitsToValidate().size() == `1`,
347	"There should be one split to validate");
348	for (const auto& kv : gpulw.nonDivisibleSplitInfo().splitsToPredicate()) {
349	const auto& splits_to_predicate = kv.second;
350	TORCH_CHECK(
351	splits_to_predicate.empty(),
352	"There must be no split to predicate, but tensor t",
353	kv.first->name(),
354	" has:",
355	splits_to_predicate);
356	}
357
358	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
359	at::manual_seed(`0`);
360	auto t0 = at::randn({`32`}, options);
361
362	FusionExecutor fe;
363	fe.compileFusion(&fusion, {t0});
364	auto cg_outputs = fe.runFusion({t0});
365
366	auto ref = t0;
367
368	testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
369
370	auto t0_non_divisible = at::randn({`8`}, options);
371	// Since ceilDiv(8, 8) is not divisible by 4, the vectorization is
372	// illegal. The run-time validation of vectorization should throw an error.
373	// NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
374	ASSERT_ANY_THROW(fe.runFusion({t0_non_divisible}));
375	}
376
377	// If a split is validated at run time, it's not necessary to predicate.
378	TEST_F(NVFuserTest, FusionNonDivisibleSplitVectorize2_CUDA) {
379	Fusion fusion;
380	FusionGuard fg(&fusion);
381
382	auto tv0 = makeContigTensor(`1`);
383	fusion.addInput(tv0);
384
385	auto tv1 = set(tv0);
386	auto tv2 = add(tv1, IrBuilder::create<Double>(`1`));
387	auto tv3 = sum(tv2, {`0`});
388	fusion.addOutput(tv3);
389
390	tv3->split(`0`, `8`, false);
391	tv3->split(`1`, `4`);
392	TransformPropagatorWithCheck propagator(tv3);
393	MaxRootDomainInfoSpanningTree (tv3).traverse(&propagator);
394
395	tv3->axis(`1`)->parallelize(ParallelType::TIDx);
396	scheduler_utils::parallelizeAllLike(tv3, {tv1, tv2});
397
398	tv1->axis(`2`)->parallelize(ParallelType::Vectorize);
399
400	GpuLower gpulw(&fusion);
401	TORCH_CHECK(
402	gpulw.nonDivisibleSplitInfo().splitsToValidate().size() == `1`,
403	"There should be one split to validate");
404	for (const auto& kv : gpulw.nonDivisibleSplitInfo().splitsToPredicate()) {
405	const auto& splits_to_predicate = kv.second;
406	TORCH_CHECK(
407	splits_to_predicate.empty(),
408	"There must be no split to predicate, but tensor t",
409	kv.first->name(),
410	" has:",
411	splits_to_predicate);
412	}
413
414	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
415	at::manual_seed(`0`);
416
417	auto t0 = at::randn({`1024`}, options);
418
419	FusionExecutor fe;
420	fe.compileFusion(&fusion, {t0});
421	auto cg_outputs = fe.runFusion({t0});
422
423	auto ref = (t0 + `1`).sum();
424
425	testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
426	}
427
428	TEST_F(NVFuserTest, FusionIssue1284Repro_CUDA) {
429	std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
430	Fusion& fusion = *fusion_ptr.get();
431	FusionGuard fg(&fusion);
432
433	std::vector<int64_t> input_shape_0 = {`10`, `20`};
434	std::vector<int64_t> input_shape_1 = {`15`};
435
436	TensorView* in_0 = makeSymbolicTensor(input_shape_0.size());
437	TensorView* in_1 = makeSymbolicTensor(input_shape_1.size());
438	fusion.addInput(in_0);
439	fusion.addInput(in_1);
440
441	TensorView* out_0 = add(in_0, IrBuilder::create<Double>(`0.f`));
442	TensorView* out_1 = add(in_1, IrBuilder::create<Double>(`2.f`));
443
444	fusion.addOutput(out_0);
445	fusion.addOutput(out_1);
446
447	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
448	at::Tensor at_in_0 = at::randn(input_shape_0, options);
449	at::Tensor at_in_1 = at::randn(input_shape_1, options);
450	std::vector<IValue> aten_inputs = {at_in_0, at_in_1};
451
452	FusionExecutorCache fec(std::move(fusion_ptr));
453	auto outputs = fec.runFusionWithInputs(aten_inputs);
454
455	auto t1 = at_in_1 + `2`;
456
457	auto runtime = fec.getMostRecentKernelRuntime();
458	TORCH_INTERNAL_ASSERT(runtime->isSegmented());
459	TORCH_INTERNAL_ASSERT(runtime->fusionSegments()->groups().size() == `2`);
460
461	testValidate(
462	&fusion, outputs, {at_in_0, at_in_1}, {at_in_0, t1}, __LINE__, __FILE__);
463	}
464
465	TEST_F(NVFuserTest, FusionIssue1284Repro2_CUDA) {
466	std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
467	Fusion& fusion = *fusion_ptr.get();
468	FusionGuard fg(&fusion);
469
470	std::vector<int64_t> input_shape_0 = {`4`, `4`};
471	std::vector<int64_t> input_shape_1 = {`3`, `4`, `4`};
472	std::vector<int64_t> input_shape_2 = {`2`, `8`, `4`, `4`};
473
474	TensorView* in_0 = makeSymbolicTensor(input_shape_0.size());
475	TensorView* in_1 = makeSymbolicTensor(input_shape_1.size());
476	TensorView* in_2 = makeSymbolicTensor(input_shape_2.size());
477
478	fusion.addInput(in_0);
479	fusion.addInput(in_1);
480	fusion.addInput(in_2);
481
482	TensorView* out_0 = add(in_0, in_1);
483	TensorView* out_1 = add(in_0, in_2);
484
485	fusion.addOutput(out_0);
486	fusion.addOutput(out_1);
487
488	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
489	at::Tensor at_in_0 = at::randn(input_shape_0, options);
490	at::Tensor at_in_1 = at::randn(input_shape_1, options);
491	at::Tensor at_in_2 = at::randn(input_shape_2, options);
492
493	std::vector<IValue> aten_inputs = {at_in_0, at_in_1, at_in_2};
494
495	FusionExecutorCache fec(std::move(fusion_ptr));
496	auto outputs = fec.runFusionWithInputs(aten_inputs);
497
498	auto t0 = at_in_0 + at_in_1;
499	auto t1 = at_in_0 + at_in_2;
500
501	auto runtime = fec.getMostRecentKernelRuntime();
502	TORCH_INTERNAL_ASSERT(runtime->isSegmented());
503	TORCH_INTERNAL_ASSERT(runtime->fusionSegments()->groups().size() == `2`);
504
505	testValidate(
506	&fusion,
507	outputs,
508	{at_in_0, at_in_1, at_in_2},
509	{t0, t1},
510	__LINE__,
511	__FILE__);
512	}
513
514	TEST_F(NVFuserTest, FusionIssue1305Repro_CUDA) {
515	std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
516	Fusion& fusion = *fusion_ptr.get();
517	FusionGuard fg(&fusion);
518
519	auto t0 = makeContigTensor(`1`);
520	auto t1 = makeContigTensor(`2`);
521
522	fusion.addInput(t0);
523	fusion.addInput(t1);
524
525	auto t2 = broadcast(t0, {true, false});
526	auto t3 = add(t1, t2);
527	auto t4 = add(t3, t2);
528	auto t5 = sum(t4, {`1`});
529	auto t6 = broadcast(t5, {false, true});
530	auto t7 = add(t3, t6);
531
532	fusion.addOutput(t7);
533
534	t3->computeAt(t7, -`1`, ComputeAtMode::MostInlined);
535
536	TORCH_INTERNAL_ASSERT(t3->getComputeAtPosition() == `1`);
537	}
538
539	TEST_F(NVFuserTest, FusionDoubleBuffering1_CUDA) {
540	Fusion fusion;
541	FusionGuard fg(&fusion);
542
543	auto tv0 = makeContigTensor(`1`);
544	fusion.addInput(tv0);
545
546	auto tv1 = set(tv0);
547	auto tv2 = add(tv1, IrBuilder::create<Double>(`1.0`));
548	auto tv3 = set(tv2);
549	fusion.addOutput(tv3);
550
551	tv1->setMemoryType(MemoryType::Shared);
552
553	tv3->split(-`1`, `128`);
554	tv3->split(-`1`, `32`);
555	TransformPropagatorWithCheck propagator(tv3);
556	MaxRootDomainInfoSpanningTree (tv3).traverse(&propagator);
557
558	tv0->computeAt(tv3, `1`);
559
560	tv3->axis(-`2`)->parallelize(ParallelType::BIDx);
561	tv3->axis(-`1`)->parallelize(ParallelType::TIDx);
562	scheduler_utils::parallelizeAllLike(tv3);
563
564	tv1->doubleBuffer();
565
566	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
567	at::manual_seed(`0`);
568	auto t0 = at::randn({`1000`}, options);
569
570	FusionExecutor fe;
571	fe.compileFusion(&fusion, {t0});
572	auto cg_outputs = fe.runFusion({t0});
573
574	auto ref = t0 + `1`;
575
576	testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
577	}
578
579	TEST_F(NVFuserTest, FusionDoubleBuffering2_CUDA) {
580	Fusion fusion;
581	FusionGuard fg(&fusion);
582
583	auto tv0 = makeContigTensor(`1`);
584	fusion.addInput(tv0);
585
586	auto tv1 = set(tv0);
587	auto tv2 = add(tv1, IrBuilder::create<Double>(`1.0`));
588	auto tv3 = set(tv2);
589	fusion.addOutput(tv3);
590
591	tv3->split(-`1`, `128`);
592	tv3->split(-`1`, `32`);
593	TransformPropagatorWithCheck propagator(tv3);
594	MaxRootDomainInfoSpanningTree (tv3).traverse(&propagator);
595
596	tv0->computeAt(tv3, -`1`);
597
598	tv3->axis(-`2`)->parallelize(ParallelType::BIDx);
599	tv3->axis(-`1`)->parallelize(ParallelType::TIDx);
600	scheduler_utils::parallelizeAllLike(tv3);
601
602	tv1->doubleBuffer();
603
604	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
605	at::manual_seed(`0`);
606	auto t0 = at::randn({`1000`}, options);
607
608	FusionExecutor fe;
609	fe.compileFusion(&fusion, {t0});
610	auto cg_outputs = fe.runFusion({t0});
611
612	auto ref = t0 + `1`;
613
614	testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
615	}
616
617	TEST_F(NVFuserTest, FusionDoubleBuffering3_CUDA) {
618	Fusion fusion;
619	FusionGuard fg(&fusion);
620
621	auto tv0 = makeContigTensor(`1`);
622	fusion.addInput(tv0);
623
624	auto tv1 = add(tv0, IrBuilder::create<Double>(`1.0`));
625	auto tv2 = set(tv1);
626	auto tv3 = add(tv2, IrBuilder::create<Double>(`1.0`));
627	fusion.addOutput(tv3);
628
629	tv1->setMemoryType(MemoryType::Shared);
630
631	tv3->split(-`1`, `128`);
632	tv3->split(-`1`, `32`);
633	TransformPropagatorWithCheck propagator(tv3);
634	MaxRootDomainInfoSpanningTree (tv3).traverse(&propagator);
635
636	tv0->computeAt(tv3, `1`);
637
638	// tv2 is invalid to double-buffer as its producer, tv1, is
639	// computed inside the double-buffering loop.
640	// NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
641	ASSERT_ANY_THROW(tv2->doubleBuffer());
642
643	// Moving tv2 inner makes tv1 large enough to double-buffer tv2
644	tv2->computeAt(tv3, `2`);
645
646	tv2->doubleBuffer();
647
648	tv3->axis(-`1`)->parallelize(ParallelType::TIDx);
649	scheduler_utils::parallelizeAllLike(tv3);
650
651	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
652	at::manual_seed(`0`);
653	auto t0 = at::randn({`1000`}, options);
654
655	FusionExecutor fe;
656	fe.compileFusion(&fusion, {t0});
657	auto cg_outputs = fe.runFusion({t0});
658
659	auto ref = t0 + `2`;
660
661	testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
662	}
663
664	// Double buffering smem to local and unswitch
665	TEST_F(NVFuserTest, FusionDoubleBuffering4_CUDA) {
666	Fusion fusion;
667	FusionGuard fg(&fusion);
668
669	auto tv0 = makeContigTensor(`1`);
670	fusion.addInput(tv0);
671
672	auto tv1 = add(tv0, IrBuilder::create<Double>(`1.0`));
673	auto tv2 = set(tv1);
674	auto tv3 = add(tv2, IrBuilder::create<Double>(`1.0`));
675	fusion.addOutput(tv3);
676
677	tv1->setMemoryType(MemoryType::Shared);
678
679	tv3->split(-`1`, `128`);
680	tv3->split(-`1`, `32`);
681	tv3->split(-`1`, `8`);
682	TransformPropagatorWithCheck propagator(tv3);
683	MaxRootDomainInfoSpanningTree (tv3).traverse(&propagator);
684
685	tv0->computeAt(tv3, `2`);
686	tv2->computeAt(tv3, -`1`);
687
688	tv3->axis(-`1`)->parallelize(ParallelType::TIDx);
689	tv3->axis(`1`)->parallelize(ParallelType::Unswitch);
690	scheduler_utils::parallelizeAllLike(tv3);
691
692	tv2->doubleBuffer();
693
694	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
695	at::manual_seed(`0`);
696	auto t0 = at::randn({`1000`}, options);
697
698	FusionExecutor fe;
699	fe.compileFusion(&fusion, {t0});
700	auto cg_outputs = fe.runFusion({t0});
701
702	auto ref = t0 + `2`;
703
704	testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
705	}
706
707	// Double buffering gmem to shared and unswitch
708	TEST_F(NVFuserTest, FusionDoubleBuffering5_CUDA) {
709	Fusion fusion;
710	FusionGuard fg(&fusion);
711
712	auto tv0 = makeContigTensor(`1`);
713	fusion.addInput(tv0);
714
715	auto tv1 = set(tv0);
716	auto tv2 = add(tv1, IrBuilder::create<Double>(`1.0`));
717	fusion.addOutput(tv2);
718
719	tv1->setMemoryType(MemoryType::Shared);
720
721	tv2->split(-`1`, `128`);
722	tv2->split(-`1`, `32`);
723	tv2->split(-`1`, `8`);
724	TransformPropagatorWithCheck propagator(tv2);
725	MaxRootDomainInfoSpanningTree (tv2).traverse(&propagator);
726
727	tv0->computeAt(tv2, `2`);
728	tv1->computeAt(tv2, -`1`);
729
730	tv2->axis(-`1`)->parallelize(ParallelType::TIDx);
731	tv2->axis(`1`)->parallelize(ParallelType::Unswitch);
732	scheduler_utils::parallelizeAllLike(tv2);
733
734	tv1->doubleBuffer();
735
736	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
737	at::manual_seed(`0`);
738	auto t0 = at::randn({`1000`}, options);
739
740	FusionExecutor fe;
741	fe.compileFusion(&fusion, {t0});
742	auto cg_outputs = fe.runFusion({t0});
743
744	auto ref = t0 + `1`;
745
746	testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
747	}
748
749	// Double buffering smem to local and unroll
750	TEST_F(NVFuserTest, FusionDoubleBuffering6_CUDA) {
751	Fusion fusion;
752	FusionGuard fg(&fusion);
753
754	auto tv0 = makeContigTensor(`1`);
755	fusion.addInput(tv0);
756
757	auto tv1 = add(tv0, IrBuilder::create<Double>(`1.0`));
758	auto tv2 = set(tv1);
759	auto tv3 = add(tv2, IrBuilder::create<Double>(`1.0`));
760	fusion.addOutput(tv3);
761
762	tv1->setMemoryType(MemoryType::Shared);
763
764	tv3->split(-`1`, `128`);
765	tv3->split(-`1`, `16`);
766	tv3->split(-`2`, `4`);
767	tv3->split(-`2`, `2`);
768	TransformPropagatorWithCheck propagator(tv3);
769	MaxRootDomainInfoSpanningTree (tv3).traverse(&propagator);
770
771	tv0->computeAt(tv3, `1`);
772	tv2->computeAt(tv3, -`1`);
773
774	tv3->axis(`2`)->parallelize(ParallelType::Unroll);
775	tv3->axis(`4`)->parallelize(ParallelType::TIDx);
776
777	tv2->doubleBuffer();
778
779	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
780	at::manual_seed(`0`);
781	auto t0 = at::randn({`199`}, options);
782
783	FusionExecutor fe;
784	fe.compileFusion(&fusion, {t0});
785	auto cg_outputs = fe.runFusion({t0});
786
787	auto ref = t0 + `2`;
788
789	testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
790	}
791
792	// Double buffering and vectorize
793	TEST_F(NVFuserTest, FusionDoubleBuffering7_CUDA) {
794	Fusion fusion;
795	FusionGuard fg(&fusion);
796
797	auto tv0 = makeContigTensor(`1`);
798	fusion.addInput(tv0);
799
800	auto tv1 = set(tv0);
801	auto tv2 = add(tv1, IrBuilder::create<Double>(`1.0`));
802	fusion.addOutput(tv2);
803
804	tv2->split(-`1`, `128`);
805	tv2->split(-`1`, `4`);
806	TransformPropagatorWithCheck propagator(tv2);
807	MaxRootDomainInfoSpanningTree (tv2).traverse(&propagator);
808
809	tv1->computeAt(tv2, `2`);
810
811	tv2->axis(-`2`)->parallelize(ParallelType::TIDx);
812
813	tv1->axis(-`1`)->parallelize(ParallelType::Vectorize);
814
815	tv1->doubleBuffer();
816
817	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
818	at::manual_seed(`0`);
819	auto t0 = at::randn({`200`}, options);
820
821	FusionExecutor fe;
822	fe.compileFusion(&fusion, {t0});
823	auto cg_outputs = fe.runFusion({t0});
824
825	auto ref = t0 + `1`;
826
827	testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
828	}
829
830	// Multiple tensors to double-buffer
831	TEST_F(NVFuserTest, FusionDoubleBuffering8_CUDA) {
832	Fusion fusion;
833	FusionGuard fg(&fusion);
834
835	auto tv0 = makeContigTensor(`1`);
836	fusion.addInput(tv0);
837	auto tv1 = makeContigTensor(`1`);
838	fusion.addInput(tv1);
839
840	auto tv2 = set(tv0);
841	auto tv3 = set(tv1);
842	auto tv4 = add(tv2, tv3);
843	fusion.addOutput(tv4);
844
845	tv4->split(`0`, `32`);
846	tv4->split(`0`, `4`);
847	TransformPropagatorWithCheck propagator(tv4);
848	MaxRootDomainInfoSpanningTree (tv4).traverse(&propagator);
849
850	tv0->computeAt(tv4, `1`);
851	tv1->computeAt(tv4, `1`);
852
853	tv4->axis(-`1`)->parallelize(ParallelType::TIDx);
854	scheduler_utils::parallelizeAllLike(tv4);
855
856	tv2->doubleBuffer();
857	tv3->doubleBuffer();
858
859	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
860	at::manual_seed(`0`);
861	auto t0 = at::randn({`100`}, options);
862	auto t1 = at::randn({`100`}, options);
863
864	FusionExecutor fe;
865	fe.compileFusion(&fusion, {t0, t1});
866	auto cg_outputs = fe.runFusion({t0, t1});
867
868	auto ref = t0 + t1;
869
870	testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
871	}
872
873	// Nested double buffering from gmem to smem and smem to register
874	TEST_F(NVFuserTest, FusionDoubleBuffering9_CUDA) {
875	Fusion fusion;
876	FusionGuard fg(&fusion);
877
878	auto tv0 = makeContigTensor(`1`);
879	fusion.addInput(tv0);
880	auto tv1 = add(tv0, IrBuilder::create<Double>(`1`));
881	auto out = tv1;
882	fusion.addOutput(out);
883
884	auto tv2 = tv0->cacheAfter();
885	auto tv3 = tv2->cacheAfter();
886
887	out->split(`0`, `32`);
888	out->split(`0`, `4`);
889	TransformPropagatorWithCheck propagator(out);
890	MaxRootDomainInfoSpanningTree (out).traverse(&propagator);
891
892	tv2->setMemoryType(MemoryType::Shared);
893
894	tv2->computeAt(out, `1`);
895	tv3->computeAt(out, -`1`);
896
897	out->axis(-`1`)->parallelize(ParallelType::TIDx);
898	scheduler_utils::parallelizeAllLike(out);
899
900	tv2->doubleBuffer();
901	tv3->doubleBuffer();
902
903	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
904	at::manual_seed(`0`);
905	auto t0 = at::randn({`1001`}, options);
906
907	FusionExecutor fe;
908	fe.compileFusion(&fusion, {t0});
909	auto cg_outputs = fe.runFusion({t0});
910
911	auto ref = t0 + `1`;
912
913	testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
914	}
915
916	// FusionSmemBlockGemmCache + double buffering at both smem and local
917	TEST_F(NVFuserTest, FusionSmemBlockGemmCacheDoubleBuffer_CUDA) {
918	Fusion fusion;
919	FusionGuard fg(&fusion);
920
921	// Algorithm
922	TensorView* tv0 = makeSymbolicTensor(`2`); // (M, K)
923	TensorView* tv1 = makeSymbolicTensor(`2`); // (K, N)
924	TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B)
925	TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N)
926	TensorView* tv4 = mul(tv2, tv3); // M, K, N
927	TensorView* tv5 = sum(tv4, {`1`}); // M, R, N
928	fusion.addInput(tv0);
929	fusion.addInput(tv1);
930	fusion.addOutput(tv5);
931
932	TensorView* tv6 = tv5->cacheBefore();
933
934	// For smem double buffering
935	auto tv0_cache_local = tv0->cacheAfter();
936	auto tv1_cache_local = tv1->cacheAfter();
937
938	// For register double buffering
939	auto tv0_cache_smem = tv0->cacheAfter();
940	auto tv1_cache_smem = tv1->cacheAfter();
941
942	const int BSX = `32`;
943	const int TSX = `8`;
944
945	// [M, K, N]
946	tv6->split(-`1`, BSX);
947	tv6->split(-`1`, TSX);
948	tv6->split(`1`, BSX);
949	tv6->split(`0`, BSX);
950	tv6->split(`1`, TSX);
951	// [M/BSX, BSX/TSX, TSX, K/BSX, BSX, N/BSX, BSX/TSX, TSX]
952	tv6->reorder(
953	{{`4`, `7`}, {`7`, `6`}, {`6`, `5`}, {`2`, `4`}, {`1`, `3`}, {`3`, `2`}, {`5`, `1`}, {`0`, `0`}});
954	// [M/BSX, N/BSX, K/BSX, BSX/TSX, BSX/TSX, TSX, TSX, BSX]
955
956	auto tv6_rf = tv6->rFactor({-`1`});
957
958	TransformPropagatorWithCheck propagator(tv6_rf);
959	MaxRootDomainInfoSpanningTree (tv6_rf).traverse(&propagator);
960
961	tv0->computeAt(tv6, `3`);
962	tv1->computeAt(tv6, `3`);
963
964	tv6_rf->computeAt(tv6, -`1`);
965	tv0_cache_local->computeAt(tv6_rf, -`1`);
966	tv1_cache_local->computeAt(tv6_rf, -`1`);
967
968	tv0_cache_smem->setMemoryType(MemoryType::Shared);
969	tv1_cache_smem->setMemoryType(MemoryType::Shared);
970
971	tv5->axis(`0`)->parallelize(ParallelType::BIDx);
972	tv5->axis(`1`)->parallelize(ParallelType::BIDy);
973	tv5->axis(-`3`)->parallelize(ParallelType::TIDy);
974	tv5->axis(-`1`)->parallelize(ParallelType::TIDx);
975
976	scheduler_utils::parallelizeAllLike(tv5);
977
978	tv0_cache_local->doubleBuffer();
979	tv1_cache_local->doubleBuffer();
980
981	tv0_cache_smem->doubleBuffer();
982	tv1_cache_smem->doubleBuffer();
983
984	constexpr int M = `154`, K = `45`, N = `1524`;
985
986	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
987	at::Tensor t0 = at::randn({M, K}, options);
988	at::Tensor t1 = at::randn({K, N}, options);
989	at::Tensor aten_output = matmul(t0.to(at::kDouble), t1.to(at::kDouble));
990
991	std::vector<IValue> aten_inputs = {t0, t1};
992
993	FusionExecutor fe;
994	fe.compileFusion(&fusion, aten_inputs);
995	auto cg_outputs = fe.runFusion(aten_inputs);
996
997	testValidate(
998	&fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
999	// The smem cache write in this test case is redundant predicated,
1000	// and also double buffered. Currently we are relying on WAR sync
1001	// insertion to ensure ordering of double buffered tensor access.
1002	// The check below makes sure that the sync is inserted so that the
1003	// test isn't running on a race condition.
1004	TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count > `0`);
1005	}
1006
1007	TEST_F(NVFuserTest, FusionIntermediateTensorVectorize_CUDA) {
1008	std::vector<MemoryType> mem_types = {MemoryType::Shared, MemoryType::Local};
1009
1010	for (auto mem_type : mem_types) {
1011	Fusion fusion;
1012	FusionGuard fg(&fusion);
1013
1014	auto tv0 = makeContigTensor(`1`);
1015	fusion.addInput(tv0);
1016
1017	auto tv1 = set(tv0);
1018	auto tv2 = set(tv1);
1019	auto tv3 = set(tv2);
1020	fusion.addOutput(tv3);
1021
1022	tv1->setMemoryType(mem_type);
1023
1024	tv3->split(-`1`, `4`);
1025	TransformPropagatorWithCheck propagator(tv3);
1026	MaxRootDomainInfoSpanningTree (tv3).traverse(&propagator);
1027
1028	tv1->computeAt(tv3, -`2`);
1029
1030	tv2->axis(-`1`)->parallelize(ParallelType::Vectorize);
1031
1032	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
1033	at::manual_seed(`0`);
1034	auto t0 = at::randn({`15`}, options);
1035	FusionExecutor fe;
1036	fe.compileFusion(&fusion);
1037
1038	// This should throw an exception as the extent of t0 is not
1039	// divisible by the vector width
1040	// NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
1041	ASSERT_ANY_THROW(fe.runFusion({t0}));
1042
1043	auto t1 = at::randn({`16`}, options);
1044	auto cg_outputs = fe.runFusion({t1});
1045
1046	auto ref = t1;
1047
1048	testValidate(&fusion, cg_outputs, {t1}, {ref}, __LINE__, __FILE__);
1049	}
1050	}
1051
1052	TEST_F(NVFuserTest, FusionBroadcastConcretization1_CUDA) {
1053	Fusion fusion;
1054	FusionGuard fg(&fusion);
1055
1056	auto tv0 = makeConcreteTensor({`10`, `1`});
1057	fusion.addInput(tv0);
1058	auto tv1 = makeConcreteTensor({`10`, `20`});
1059	fusion.addInput(tv1);
1060	auto tv2 = makeConcreteTensor({`10`, `10`});
1061	fusion.addInput(tv2);
1062
1063	// Not concretized
1064	auto tv3 = sum(tv2, {`1`});
1065	auto tv4 = broadcast(tv3, {false, true});
1066	auto tv5 = add(tv0, tv4);
1067	fusion.addOutput(tv5);
1068
1069	// Concretized
1070	auto tv6 = sum(tv2, {`1`});
1071	auto tv7 = broadcast(tv6, {false, true});
1072	auto tv8 = add(tv1, tv7);
1073	fusion.addOutput(tv8);
1074
1075	for (auto tv : {tv3, tv4, tv5, tv6, tv7, tv8}) {
1076	tv->axis(`1`)->parallelize(ParallelType::TIDx);
1077	}
1078
1079	GpuLower gpulw(&fusion);
1080	TORCH_CHECK(!gpulw.concretizedBroadcastDomains()->isConcretized(
1081	loweredTv(tv4, gpulw)->axis(`1`)));
1082	TORCH_CHECK(gpulw.concretizedBroadcastDomains()->isConcretized(
1083	loweredTv(tv7, gpulw)->axis(`1`)));
1084
1085	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
1086	at::manual_seed(`0`);
1087	auto t0 = at::randn({`10`, `1`}, options);
1088	auto t1 = at::randn({`10`, `20`}, options);
1089	auto t2 = at::randn({`10`, `10`}, options);
1090	std::vector<IValue> aten_inputs = {t0, t1, t2};
1091
1092	FusionExecutor fe;
1093	fe.compileFusion(&fusion, aten_inputs);
1094	auto outputs = fe.runFusion(aten_inputs);
1095
1096	auto t5 = t0 + t2.sum({`1`}).unsqueeze(-`1`);
1097	auto t8 = t1 + t2.sum({`1`}).unsqueeze(-`1`);
1098
1099	testValidate(&fusion, outputs, aten_inputs, {t5, t8}, __LINE__, __FILE__);
1100	}
1101
1102	TEST_F(NVFuserTest, FusionBroadcastConcretization2_CUDA) {
1103	Fusion fusion;
1104	FusionGuard fg(&fusion);
1105
1106	auto tv0 = makeSymbolicTensor(`2`);
1107	fusion.addInput(tv0);
1108
1109	auto tv1 = sum(tv0, {`0`, `1`});
1110	auto tv2 = broadcast(tv1, {true});
1111	auto tv3 = broadcast(tv2, {false, true});
1112	fusion.addOutput(tv3);
1113
1114	// tv1 is thread-predicated with TIDx and TIDy
1115	tv1->axis(`0`)->parallelize(ParallelType::TIDx);
1116	tv1->axis(`1`)->parallelize(ParallelType::TIDy);
1117	// tv2 broadcasts along TIDx
1118	tv2->axis(`0`)->parallelize(ParallelType::TIDx);
1119	// tv3 broadcasts along TIDy
1120	tv3->axis(`0`)->parallelize(ParallelType::TIDx);
1121	tv3->axis(`1`)->parallelize(ParallelType::TIDy);
1122
1123	// Both tv2 and tv3 broadcast along predicated TID dimensions, but
1124	// since the broadcast domains are not concretized, there should be
1125	// no actual parallel broadcast
1126
1127	GpuLower gpulw(&fusion);
1128	TORCH_CHECK(
1129	!gpulw.kernel()->summary().has_block_broadcasts &&
1130	!gpulw.kernel()->summary().has_grid_broadcasts,
1131	"There must be no parallel broadcast in this fusion");
1132
1133	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
1134	at::manual_seed(`0`);
1135	auto t0 = at::randn({`10`, `11`}, options);
1136	std::vector<IValue> aten_inputs = {t0};
1137
1138	FusionExecutor fe;
1139	fe.compileFusion(&fusion, aten_inputs);
1140	auto outputs = fe.runFusion(aten_inputs);
1141
1142	auto t3 = t0.sum().unsqueeze(-`1`).unsqueeze(-`1`);
1143
1144	testValidate(&fusion, outputs, aten_inputs, {t3}, __LINE__, __FILE__);
1145	}
1146
1147	TEST_F(NVFuserTest, FusionBroadcastConcretization3_CUDA) {
1148	Fusion fusion;
1149	FusionGuard fg(&fusion);
1150
1151	std::vector<int64_t> input_shape({`10`, `4`, `8`});
1152	std::vector<int64_t> output_shape({`8`, `4`, `1`});
1153
1154	auto tv0 = makeConcreteTensor(input_shape);
1155	fusion.addInput(tv0);
1156
1157	auto tv2 = sum(tv0, {`0`});
1158	auto tv3 = set(tv2);
1159	auto tv4 =
1160	view(tv3, {input_shape.begin() + `1`, input_shape.end()}, output_shape);
1161	auto tv5 = add(tv4, IrBuilder::create<Double>(`1`));
1162	fusion.addOutput(tv5);
1163
1164	tv2->axis(`0`)->parallelize(ParallelType::TIDx);
1165	tv4->axis(-`1`)->parallelize(ParallelType::TIDx);
1166	tv5->axis(-`1`)->parallelize(ParallelType::TIDx);
1167
1168	// The view op adds a broadcast domain in tv4, which is
1169	// parallelized. Howver, it is never materialized, so there should
1170	// be no parallel broadcast.
1171
1172	GpuLower gpulw(&fusion);
1173	TORCH_CHECK(
1174	!gpulw.kernel()->summary().has_block_broadcasts &&
1175	!gpulw.kernel()->summary().has_grid_broadcasts,
1176	"There must be no parallel broadcast in this fusion");
1177
1178	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
1179	at::manual_seed(`0`);
1180	auto t0 = at::randn(input_shape, options);
1181	std::vector<IValue> aten_inputs = {t0};
1182
1183	FusionExecutor fe;
1184	fe.compileFusion(&fusion, aten_inputs);
1185	auto outputs = fe.runFusion(aten_inputs);
1186
1187	auto t5 = at::native::view(t0.sum(`0`), output_shape) + `1`;
1188
1189	testValidate(&fusion, outputs, aten_inputs, {t5}, __LINE__, __FILE__);
1190	}
1191
1192	// Merging non-broadcast and broadcast domains
1193	// TODO: Fix use case see issue https://github.com/csarofeen/pytorch/issues/1418
1194	// validateParallelize does not pass. Even if it's skipped,
1195	// generated code is invalid as blockBroadcast is not used.
1196	#if 0
1197	TEST_F(NVFuserTest, FusionBroadcastConcretization4_CUDA) {
1198	Fusion fusion;
1199	FusionGuard fg(&fusion);
1200
1201	auto tv0 = makeSymbolicTensor(`2`);
1202	fusion.addInput(tv0);
1203
1204	auto tv1 = sum(tv0, {`1`});
1205	auto tv2 = broadcast(tv1, {false, true});
1206	auto tv3 = add(tv2, tv0);
1207	fusion.addOutput(tv3);
1208
1209	tv1->axis(`1`)->parallelize(ParallelType::TIDx);
1210
1211	tv2->merge(`0`, `1`);
1212	tv2->axis(`0`)->parallelize(ParallelType::TIDx);
1213	// TODO: When set to shared memory, this kernel should be correct, but fails
1214	// validation and when skipped produces incorrect code
1215	tv2->setMemoryType(MemoryType::Shared);
1216
1217	tv3->merge(`0`, `1`);
1218	tv3->axis(`0`)->parallelize(ParallelType::TIDx);
1219
1220	fusion.printMath();
1221	fusion.printKernel();
1222	}
1223	#endif
1224
1225	TEST_F(NVFuserTest, FusionBroadcastConcretization5_CUDA) {
1226	Fusion fusion;
1227	FusionGuard fg(&fusion);
1228
1229	auto tv0 = makeSymbolicTensor(`1`);
1230	fusion.addInput(tv0);
1231	auto tv1 = makeSymbolicTensor(`1`);
1232	fusion.addInput(tv1);
1233	auto tv2 = makeSymbolicTensor(`1`);
1234	fusion.addInput(tv2);
1235	auto tv3 = makeSymbolicTensor(`1`);
1236	fusion.addInput(tv3);
1237
1238	// Assert tv2 and tv3 have the same shape
1239	auto tv4 = add(tv2, tv3);
1240	fusion.addOutput(tv4);
1241
1242	// Concretize a broadcast domain to multiple non-concrete domains
1243	// through a multi-output expression. It should be considered to be
1244	// non-uniquely concretized.
1245	auto tv5 = broadcast(tv0, {false, true});
1246	// Reduce only the non-broadcast domain.
1247	auto tvs = Welford(tv5, {`0`});
1248	auto tv9 = add(tvs.avg, tv1);
1249	auto tv10 = add(tvs.var_sum, tv2);
1250	fusion.addOutput(tv9);
1251	fusion.addOutput(tv10);
1252
1253	// Same pattern as the above, but concretize the broadcast domain
1254	// with tv2 and tv3, which have the exactly same shape, so the
1255	// broadcast should be considered uniquely concretized.
1256	auto tv11 = broadcast(tv0, {false, true});
1257	// Reduce only the non-broadcast domain.
1258	auto tvs2 = Welford(tv11, {`0`});
1259	auto tv15 = add(tvs2.avg, tv2);
1260	auto tv16 = add(tvs2.var_sum, tv3);
1261	fusion.addOutput(tv15);
1262	fusion.addOutput(tv16);
1263
1264	// Reduce only the broadcast domain. Since it's reduced, it should
1265	// not be considered to be concretized.
1266	auto tv17 = broadcast(tv0, {false, true});
1267	auto tvs3 = Welford(tv17, {`1`});
1268	fusion.addOutput(tvs3.avg);
1269
1270	ConcretizedBroadcastDomains bcast_concretization_info(&fusion);
1271
1272	TORCH_CHECK(
1273	bcast_concretization_info.maybeNonUniquelyConcretized(tv5->axis(`1`)),
1274	"Failed to detect non-unique concretization of ",
1275	tv5->toString());
1276
1277	TORCH_CHECK(
1278	bcast_concretization_info.isUniquelyConcretized(tv11->axis(`1`)),
1279	"Failed to detect unique concretization of ",
1280	tv11->toString());
1281
1282	TORCH_CHECK(
1283	!bcast_concretization_info.isConcretized(tv17->axis(`1`)),
1284	"Failed to detect non-concretization of ",
1285	tv17->toString());
1286	}
1287
1288	TEST_F(NVFuserTest, FusionIssue1430_CUDA) {
1289	// Derived from an expression sorting issue when using loop map, now expr
1290	// sorting uses parallel map.
1291	std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
1292	Fusion& fusion = *fusion_ptr.get();
1293	FusionGuard fg(&fusion);
1294
1295	int V = `2`, W = `3`, X = `4`, Y = `5`, Z = `6`;
1296
1297	// setup fusion
1298	auto tv0 = TensorViewBuilder ()
1299	.ndims(`5`)
1300	.dtype(DataType::Half)
1301	.contiguity(std::vector<bool>(`5`, true))
1302	.shape({V, W, X, Y, Z})
1303	.build();
1304
1305	fusion.addInput(tv0);
1306	auto tv1 = set(tv0);
1307	auto tv2 = castOp(DataType::Float, tv1);
1308
1309	auto tvs = Welford(tv2, {`1`, `2`, `3`, `4`});
1310	auto tv3 = tvs.avg;
1311	auto tv4 = tvs.var_sum;
1312	auto tv5 = tvs.n;
1313
1314	// avg
1315	auto tv6 = broadcast(tvs.avg, {false, true, true, true, true});
1316
1317	// var
1318	auto tv7 = mul(tv4, IrBuilder::create<Double>(`1.` / (W * X * Y * Z)));
1319	auto tv8 = add(tv7, IrBuilder::create<Double>(`1.e-6`));
1320	auto tv9 = broadcast(tv8, {false, true, true, true, true});
1321	auto tv10 = rsqrt(tv9);
1322
1323	auto tv11 = castOp(DataType::Float, tv1);
1324	auto tv12 = sub(tv11, tv6);
1325	auto tv13 = mul(tv12, tv10);
1326
1327	auto tv14 = set(tv13);
1328	fusion.addOutput(tv14);
1329
1330	tv3->axis(`0`)->parallelize(ParallelType::BIDy);
1331	tv3->axis(`2`)->parallelize(ParallelType::BIDx);
1332	tv3->axis(`3`)->parallelize(ParallelType::TIDx);
1333	tv3->axis(`4`)->parallelize(ParallelType::Vectorize);
1334
1335	// tv3->reorder({{1, -2}});
1336
1337	auto rfactor = ir_utils::rfactorHelper(tv3, {`1`, `4`});
1338
1339	scheduler_utils::parallelizeAllLike(rfactor);
1340
1341	for (auto tv : ir_utils::allTvs(&fusion)) {
1342	if (tv != tv1 \|\| tv != tv3) {
1343	for (auto i : c10::irange(tv->nDims())) {
1344	if (isParallelTypeVectorize(tv->axis(i)->getParallelType())) {
1345	tv->axis(i)->parallelize(ParallelType::Serial);
1346	}
1347	}
1348	}
1349	}
1350
1351	tv0->computeAt(tv14, `1`);
1352	tv13->computeAt(tv14, -`2`);
1353	tv2->computeAt(tv14, -`1`, ComputeAtMode::MostInlined);
1354	tv11->computeAt(tv14, -`1`, ComputeAtMode::MostInlined);
1355
1356	auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, `0`);
1357	at::Tensor t0 = at::randn({V, W, X, Y, Z}, options);
1358
1359	FusionExecutor fe;
1360	fe.compileFusion(&fusion);
1361	auto cg_outputs = fe.runFusion({t0}, LaunchParams (X, V, -`1`, Y, -`1`, -`1`));
1362
1363	auto t0_double = t0.to(at::kDouble);
1364
1365	auto at_mu = at::mean(t0_double, {`1`, `2`, `3`, `4`})
1366	.unsqueeze(-`1`)
1367	.unsqueeze(-`1`)
1368	.unsqueeze(-`1`)
1369	.unsqueeze(-`1`);
1370	auto at_var = at::var(t0_double, {`1`, `2`, `3`, `4`}, false)
1371	.unsqueeze(-`1`)
1372	.unsqueeze(-`1`)
1373	.unsqueeze(-`1`)
1374	.unsqueeze(-`1`);
1375
1376	auto at_out = t0_double.sub(at_mu).div(at_var.add(`1.e-6`).sqrt());
1377
1378	testValidate(
1379	&fusion,
1380	cg_outputs,
1381	{t0},
1382	{at_out},
1383	__LINE__,
1384	__FILE__,
1385	"",
1386	LaunchParams (X, V, -`1`, Y, -`1`, -`1`));
1387	}
1388
1389	// Test code generation of allocated scalars
1390	TEST_F(NVFuserTest, FusionCodegenAllocatedScalars_CUDA) {
1391	Fusion fusion;
1392	FusionGuard fg(&fusion);
1393
1394	// Fusion is just a dummy container in this test, just used for
1395	// getting a Kernel container
1396	auto tv0 = makeSymbolicTensor(`0`);
1397	fusion.addInput(tv0);
1398	auto tv1 = set(tv0);
1399	fusion.addOutput(tv1);
1400
1401	GpuLower gpulw(&fusion);
1402	auto kernel = gpulw.kernel();
1403
1404	// Set the kernel as the current fusion
1405	FusionGuard kg(kernel);
1406
1407	// Create alocated scalars
1408	auto ks0 = add(kernel->zeroVal(), kernel->oneVal());
1409	auto ks0_alloc = IrBuilder::create<kir::Allocate>(
1410	ks0, MemoryType::Local, kernel->oneVal());
1411
1412	auto ks1 = add(ks0, kernel->oneVal());
1413	auto ks1_alloc = IrBuilder::create<kir::Allocate>(
1414	ks1, MemoryType::Local, kernel->oneVal());
1415
1416	auto tk0 = kernel->inputs()[`0`]->as<TensorView>();
1417	auto tki0 = IrBuilder::create<kir::TensorIndex>(tk0, std::vector<Val*>{ks0});
1418	auto tki1 = IrBuilder::create<kir::TensorIndex>(tk0, std::vector<Val*>{ks1});
1419	auto tk0_expr = IrBuilder::create<UnaryOp>(UnaryOpType::Set, tki0, tki1);
1420
1421	// Insert the scalar expression and the allocation of the
1422	// output directly to the kernel
1423	auto proxy = kir::KernelInternalProxy (kernel);
1424
1425	const auto indent = " ";
1426	const auto ks0_name = "i" + std::to_string(ks0->name());
1427	const auto ks1_name = "i" + std::to_string(ks1->name());
1428	const auto tk0_name = "T" + std::to_string(tk0->name());
1429
1430	auto& exprs = proxy.topLevelExprs();
1431	exprs.push_back(tk0_expr);
1432
1433	// Invalid code gen
1434	const auto no_alloc_code = codegen::generateCudaKernel(kernel);
1435
1436	// Without alloc, Int vals are just inlined, resulting in:
1437	// t0[(0 + 1)] = t0[((0 + 1) + 1)]
1438	std::stringstream no_alloc_ref;
1439	no_alloc_ref << "\n"
1440	<< indent << tk0_name << "[(0 + 1)]\n"
1441	<< indent << indent << " = " << tk0_name << "[((0 + 1) + 1)];\n";
1442
1443	TORCH_CHECK(
1444	no_alloc_code.find(no_alloc_ref.str()) != std::string::npos,
1445	"Invalid code generation. Expected:",
1446	no_alloc_ref.str(),
1447	"Actual:\n",
1448	no_alloc_code);
1449
1450	// Insert proper allocations and definitions
1451	exprs.insert(std::find(exprs.begin(), exprs.end(), tk0_expr), ks0_alloc);
1452	exprs.insert(
1453	std::find(exprs.begin(), exprs.end(), tk0_expr), ks0->definition());
1454	exprs.insert(std::find(exprs.begin(), exprs.end(), tk0_expr), ks1_alloc);
1455	exprs.insert(
1456	std::find(exprs.begin(), exprs.end(), tk0_expr), ks1->definition());
1457
1458	const auto valid_code = codegen::generateCudaKernel(kernel);
1459
1460	std::stringstream valid_ref;
1461	valid_ref << "\n"
1462	<< indent << tk0_name << "[" << ks0_name << "]\n"
1463	<< indent << indent << " = " << tk0_name << "[" << ks1_name
1464	<< "];\n";
1465
1466	TORCH_CHECK(
1467	valid_code.find(valid_ref.str()) != std::string::npos,
1468	"Invalid code generation. Expected:",
1469	valid_ref.str(),
1470	"Actual:\n",
1471	valid_code);
1472	}
1473
1474	TEST_F(NVFuserTest, FusionIndexHoist1_CUDA) {
1475	if (isOptionDisabled(DisableOption::IndexHoist)) {
1476	GTEST_SKIP() << "Index hoisting disabled";
1477	}
1478
1479	Fusion fusion;
1480	FusionGuard fg(&fusion);
1481
1482	auto tv0 = makeSymbolicTensor(`2`);
1483	fusion.addInput(tv0);
1484
1485	auto tv1 = set(tv0);
1486	auto tv2 = set(tv1);
1487	auto tv3 = set(tv2);
1488	auto tv4 = set(tv3);
1489	auto tv5 = set(tv4);
1490	fusion.addOutput(tv5);
1491
1492	tv1->split(-`1`, `4`);
1493	tv2->split(-`1`, `4`);
1494	tv3->merge(`0`, `1`);
1495	tv3->split(`0`, `8`);
1496	tv5->merge(`0`, `1`);
1497	tv5->split(`0`, `8`);
1498	tv4->computeAt(tv5, -`1`);
1499
1500	tv1->setMemoryType(MemoryType::Global);
1501	tv2->setMemoryType(MemoryType::Global);
1502	tv3->setMemoryType(MemoryType::Global);
1503
1504	// Use Int32 as the index type to verify Int32 is used as the type
1505	// of hoisted indices
1506	GpuLower gpulw(&fusion, DataType::Int32);
1507	auto kernel = gpulw.kernel();
1508
1509	auto is_index_times_ns = [](Val* val, Val* index, std::string name) -> bool {
1510	auto def = dynamic_cast<BinaryOp*>(val->definition());
1511	if (def == nullptr) {
1512	return false;
1513	}
1514	return def->getBinaryOpType() == BinaryOpType::Mul &&
1515	def->rhs()->isA<NamedScalar>() &&
1516	def->rhs()->as<NamedScalar>()->name() == name && def->lhs() == index;
1517	};
1518
1519	// Validate indices in the kernel are hoisted as
1520	// intended. Validation could be also done by just string comparison
1521	// as the parser test, but updating such tests would be tedious.
1522	for (auto top_level_loop :
1523	ir_utils::filterByType<kir::ForLoop>(kernel->topLevelExprs())) {
1524	auto innermost_loop = top_level_loop;
1525	while (auto first_expr_loop = dynamic_cast<kir::ForLoop*>(
1526	innermost_loop->body().exprs().at(`0`))) {
1527	innermost_loop = first_expr_loop;
1528	}
1529	const auto& exprs = innermost_loop->body().exprs();
1530	TORCH_CHECK(!exprs.empty(), "No expression found");
1531	TORCH_CHECK(
1532	exprs.at(`0`)->isA<kir::Allocate>(),
1533	"Invalid expression: ",
1534	exprs.at(`0`)->toString());
1535	auto hoisted_index = exprs.at(`0`)->as<kir::Allocate>()->buffer();
1536	TORCH_CHECK(
1537	hoisted_index->dtype() == DataType::Int32,
1538	"Invalid data type of hoisted indices. Should be Int32 but: ",
1539	hoisted_index->dtype());
1540	kir::Predicate* pred = nullptr;
1541	for (auto expr : exprs) {
1542	if (expr->isA<kir::IfThenElse>()) {
1543	pred = expr->as<kir::IfThenElse>()->predicate();
1544	auto arith_expr = expr->as<kir::IfThenElse>()->thenBody().exprs().at(`0`);
1545	auto out_ti = arith_expr->outputs()[`0`]->as<kir::TensorIndex>();
1546	if (out_ti->view()->name() == `1`) {
1547	// Ref: T1[, hoisted_index] = T0[, hoisted_index T0.stride];*
1548	auto t1_index = out_ti->index(`1`);
1549	TORCH_CHECK(
1550	t1_index == hoisted_index,
1551	"Invalid index: ",
1552	t1_index->toInlineString());
1553	// Pred: hoisted_index < T0.size[1]
1554	TORCH_CHECK(
1555	pred->value()->definition()->as<BinaryOp>()->lhs() ==
1556	hoisted_index,
1557	"Invalid predicate: ",
1558	pred->value()->toInlineString(),
1559	", ",
1560	expr->toString());
1561	TORCH_CHECK(arith_expr->inputs().size() == `1`);
1562	auto in0 = arith_expr->inputs().front()->as<kir::TensorIndex>();
1563	TORCH_CHECK(in0->view()->name() == `0`);
1564	// hoisted_index T0.stride[1]*
1565	auto t0_index = in0->index(`1`);
1566	TORCH_CHECK(
1567	is_index_times_ns(t0_index, hoisted_index, "T0.stride[1]"),
1568	"Invalid index: ",
1569	t0_index->toInlineString(),
1570	", ",
1571	expr->toString());
1572	} else if (out_ti->view()->name() == `2`) {
1573	// Ref: T3[, hoisted_index] = T2[, hoisted_index];
1574	auto out_index = out_ti->index(`1`);
1575	TORCH_CHECK(
1576	out_index == hoisted_index,
1577	"Invalid index: ",
1578	out_index->toInlineString(),
1579	", ",
1580	expr->toString());
1581	TORCH_CHECK(
1582	pred->value()->definition()->as<BinaryOp>()->lhs() ==
1583	hoisted_index,
1584	"Invalid predicate: ",
1585	pred->value()->toInlineString(),
1586	", ",
1587	expr->toString());
1588	TORCH_CHECK(arith_expr->inputs().size() == `1`);
1589	auto in0 = arith_expr->inputs().front()->as<kir::TensorIndex>();
1590	TORCH_CHECK(in0->view()->name() == `1`);
1591	auto in0_index = in0->index(`1`);
1592	TORCH_CHECK(
1593	in0_index == hoisted_index,
1594	"Invalid index: ",
1595	in0_index->toInlineString(),
1596	", ",
1597	expr->toString());
1598	} else if (out_ti->view()->name() == `3`) {
1599	// Ref: T3[hoisted_index] = T2[hoisted_index];
1600	auto out_index = out_ti->index(`0`);
1601	TORCH_CHECK(
1602	out_index == hoisted_index,
1603	"Invalid index: ",
1604	out_index->toInlineString(),
1605	", ",
1606	expr->toString());
1607	TORCH_CHECK(
1608	pred->value()->definition()->as<BinaryOp>()->lhs() ==
1609	hoisted_index,
1610	"Invalid predicate: ",
1611	pred->value()->toInlineString(),
1612	", ",
1613	expr->toString());
1614	TORCH_CHECK(arith_expr->inputs().size() == `1`);
1615	auto in0 = arith_expr->inputs().front()->as<kir::TensorIndex>();
1616	TORCH_CHECK(in0->view()->name() == `2`);
1617	auto in0_index = in0->index(`0`);
1618	TORCH_CHECK(
1619	in0_index == hoisted_index,
1620	"Invalid index: ",
1621	in0_index->toInlineString(),
1622	", ",
1623	expr->toString());
1624	} else if (out_ti->view()->name() == `4`) {
1625	// Ref: T4[0] = T3[hoisted_index];
1626	TORCH_CHECK(
1627	pred->value()->definition()->as<BinaryOp>()->lhs() ==
1628	hoisted_index,
1629	"Invalid predicate: ",
1630	pred->value()->toInlineString(),
1631	", ",
1632	expr->toString());
1633	TORCH_CHECK(arith_expr->inputs().size() == `1`);
1634	auto in0 = arith_expr->inputs().front()->as<kir::TensorIndex>();
1635	TORCH_CHECK(in0->view()->name() == `3`);
1636	auto in0_index = in0->index(`0`);
1637	TORCH_CHECK(
1638	in0_index == hoisted_index,
1639	"Invalid index: ",
1640	in0_index->toInlineString(),
1641	", ",
1642	expr->toString());
1643	} else if (out_ti->view()->name() == `5`) {
1644	// Ref: T5[hoisted_index] = T4[0]
1645	auto out_index = out_ti->index(`0`);
1646	TORCH_CHECK(
1647	out_index == hoisted_index,
1648	"Invalid index: ",
1649	out_index->toInlineString(),
1650	", ",
1651	expr->toString());
1652	TORCH_CHECK(
1653	pred->value()->definition()->as<BinaryOp>()->lhs() ==
1654	hoisted_index,
1655	"Invalid predicate: ",
1656	pred->value()->toInlineString(),
1657	", ",
1658	expr->toString());
1659	}
1660	}
1661	}
1662	}
1663
1664	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
1665	at::manual_seed(`0`);
1666	auto t0 = at::randn({`15`, `17`}, options);
1667
1668	FusionExecutor fe;
1669	fe.compileFusion(&fusion, {t0});
1670	auto cg_outputs = fe.runFusion({t0});
1671
1672	auto ref = t0;
1673
1674	testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
1675	}
1676
1677	// Hoist indices for vectorized tensors
1678	TEST_F(NVFuserTest, FusionIndexHoist2_CUDA) {
1679	if (isOptionDisabled(DisableOption::IndexHoist)) {
1680	GTEST_SKIP() << "Index hoisting disabled";
1681	}
1682
1683	Fusion fusion;
1684	FusionGuard fg(&fusion);
1685
1686	auto tv0 = makeContigTensor(`1`);
1687	fusion.addInput(tv0);
1688	auto tv1 = makeContigTensor(`1`);
1689	fusion.addInput(tv1);
1690
1691	auto tv2 = set(tv0);
1692	auto tv3 = set(tv1);
1693	auto tv4 = add(tv2, tv3);
1694	auto tv5 = set(tv4);
1695	fusion.addOutput(tv5);
1696
1697	tv5->split(-`1`, `4`);
1698	TransformPropagatorWithCheck propagator(tv5);
1699	MaxRootDomainInfoSpanningTree (tv5).traverse(&propagator);
1700
1701	tv4->split(-`1`, `3`);
1702
1703	tv0->computeAt(tv5, `1`);
1704	tv1->computeAt(tv5, `1`);
1705
1706	tv2->axis(-`1`)->parallelize(ParallelType::Vectorize);
1707	tv3->axis(-`1`)->parallelize(ParallelType::Vectorize);
1708	tv5->axis(-`1`)->parallelize(ParallelType::Vectorize);
1709
1710	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
1711	at::manual_seed(`0`);
1712	auto t0 = at::randn({`16`}, options);
1713	auto t1 = at::randn({`16`}, options);
1714
1715	FusionExecutor fe;
1716	fe.compileFusion(&fusion, {t0, t1});
1717	auto cg_outputs = fe.runFusion({t0, t1});
1718
1719	auto ref = t0 + t1;
1720
1721	testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
1722	}
1723
1724	TEST_F(NVFuserTest, FusionTestGridComm_CUDA) {
1725	Fusion fusion;
1726	FusionGuard fg(&fusion);
1727	int X = `3`, Y = `4`, Z = `2`;
1728	auto tv0 = makeConcreteTensor({X, Y, Z});
1729	fusion.addInput(tv0);
1730	auto tv1 = makeConcreteTensor({X, Y, Z});
1731	fusion.addInput(tv1);
1732
1733	auto tv2 = set(tv0);
1734	auto tv3 = add(tv2, tv1);
1735	auto tv4 = set(tv3);
1736	auto tv5 = set(tv4);
1737	fusion.addOutput(tv5);
1738
1739	tv2->setMemoryType(MemoryType::Global);
1740	tv3->setMemoryType(MemoryType::Global);
1741	tv4->setMemoryType(MemoryType::Global);
1742
1743	tv2->axis(`0`)->parallelize(ParallelType::BIDy);
1744	tv2->axis(`1`)->parallelize(ParallelType::BIDx);
1745	tv2->axis(`2`)->parallelize(ParallelType::Vectorize);
1746
1747	tv3->axis(`0`)->parallelize(ParallelType::BIDx);
1748	tv3->axis(`1`)->parallelize(ParallelType::BIDy);
1749
1750	tv4->axis(`0`)->parallelize(ParallelType::BIDy);
1751	tv4->axis(`1`)->parallelize(ParallelType::BIDx);
1752
1753	tv5->axis(`0`)->parallelize(ParallelType::BIDy);
1754	tv5->axis(`1`)->parallelize(ParallelType::BIDx);
1755	tv5->axis(`2`)->parallelize(ParallelType::Vectorize);
1756
1757	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
1758	at::manual_seed(`0`);
1759	auto t0 = at::randn({X, Y, Z}, options);
1760	auto t1 = at::randn({X, Y, Z}, options);
1761
1762	FusionExecutor fe;
1763	fe.compileFusion(&fusion, {t0, t1});
1764	auto cg_outputs = fe.runFusion({t0, t1});
1765
1766	auto ref = t0 + t1;
1767
1768	testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
1769	}
1770
1771	// See issue https://github.com/csarofeen/pytorch/issues/1497
1772	TEST_F(NVFuserTest, FusionTestGridComm2_CUDA) {
1773	Fusion fusion;
1774	FusionGuard fg(&fusion);
1775
1776	int64_t W = `3`, X = `4`;
1777
1778	auto tv0 = makeConcreteTensor({X});
1779	auto tv1 = makeConcreteTensor({W, X});
1780	fusion.addInput(tv0);
1781	fusion.addInput(tv1);
1782
1783	auto tv2 = add(tv0, IrBuilder::create<Double>(`1`));
1784	auto tv3 = broadcast(tv2, {true, false});
1785	auto tv4 = add(tv3, tv1);
1786	fusion.addOutput(tv4);
1787
1788	tv4->merge(`0`);
1789	tv4->split(`0`, `2`);
1790
1791	TransformPropagatorWithCheck propagator(tv4);
1792	MaxRootDomainInfoSpanningTree (tv4).traverse(&propagator);
1793
1794	tv3->computeAt(tv4, `1`);
1795
1796	tv4->axis(`0`)->parallelize(ParallelType::BIDx);
1797	tv4->axis(-`1`)->parallelize(ParallelType::TIDx);
1798	tv2->axis(`0`)->parallelize(ParallelType::BIDx);
1799	tv2->axis(-`1`)->parallelize(ParallelType::TIDx);
1800	tv3->axis(-`1`)->parallelize(ParallelType::TIDx);
1801
1802	tv2->setMemoryType(MemoryType::Global);
1803
1804	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
1805	at::manual_seed(`0`);
1806	auto t0 = at::randn({X}, options);
1807	auto t1 = at::randn({W, X}, options);
1808
1809	FusionExecutor fe;
1810	fe.compileFusion(&fusion, {t0, t1});
1811	auto cg_outputs = fe.runFusion({t0, t1});
1812
1813	auto ref = t0 + t1 + `1`;
1814
1815	testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
1816	}
1817
1818	// Vectorized reset test for double buffered registers
1819	TEST_F(NVFuserTest, FusionDoubleBufferVector_CUDA) {
1820	Fusion fusion;
1821	FusionGuard fg(&fusion);
1822
1823	auto tv0 = makeContigTensor(`1`);
1824	fusion.addInput(tv0);
1825
1826	auto tv1 = add(tv0, IrBuilder::create<Double>(`1.0`));
1827	auto tv2 = sum(tv1, {`0`});
1828	auto tv2c = tv2->cacheBefore();
1829
1830	fusion.addOutput(tv2);
1831
1832	auto tv1cw = tv1->cacheAfter();
1833	auto tv1cr = tv1cw->cacheAfter();
1834
1835	tv1cw->split(-`1`, `32`);
1836	tv1cr->split(-`1`, `32`);
1837	tv1cr->split(-`1`, `4`);
1838	tv1cr->axis(-`1`)->parallelize(ParallelType::Vectorize);
1839
1840	tv1cw->computeAt(tv1cr, `1`);
1841	tv0->computeAt(tv1cw, -`1`);
1842	tv2c->split(-`1`, `32`);
1843	tv2c->split(-`1`, `4`);
1844	tv1cr->computeAt(tv2c, `2`);
1845
1846	tv1cw->setMemoryType(MemoryType::Shared);
1847	tv1cr->doubleBuffer();
1848
1849	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
1850
1851	at::manual_seed(`0`);
1852	auto t0 = at::randn({`200`}, options);
1853	FusionExecutor fe;
1854	fe.compileFusion(&fusion, {t0});
1855	auto cg_outputs = fe.runFusion({t0});
1856	auto ref = (t0 + `1`).sum({`0`});
1857
1858	testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
1859	}
1860
1861	// Request 48KB of data in shared mem,
1862	// should be large enough not to fit in
1863	// static allocations, but small enough
1864	// to fit in supported devices (sm70+).
1865	TEST_F(NVFuserTest, FusionLargeSmem_CUDA) {
1866	Fusion fusion;
1867	FusionGuard fg(&fusion);
1868
1869	auto tv0 = makeContigTensor(`1`);
1870	fusion.addInput(tv0);
1871	auto tv1 = add(tv0, IrBuilder::create<Double>(`1.0`));
1872	auto tv2 = add(tv1, IrBuilder::create<Double>(`2.0`));
1873	fusion.addOutput(tv2);
1874
1875	tv2->split(`0`, `12288`);
1876	tv2->split(`1`, `128`);
1877	tv1->computeAt(tv2, `1`);
1878	tv1->split(`1`, `128`);
1879	tv0->computeAt(tv1, -`1`);
1880	tv1->setMemoryType(MemoryType::Shared);
1881	tv1->axis(-`1`)->parallelize(ParallelType::TIDx);
1882	tv2->axis(-`1`)->parallelize(ParallelType::TIDx);
1883
1884	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
1885
1886	at::manual_seed(`0`);
1887	auto t0 = at::randn({`12288` * `4`}, options);
1888	FusionExecutor fe;
1889	fe.compileFusion(&fusion, {t0});
1890	auto cg_outputs = fe.runFusion({t0});
1891	auto ref = t0 + `1` + `2`;
1892
1893	testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
1894	}
1895
1896	// Request a smem allocation that is equal to the device limit
1897	TEST_F(NVFuserTest, FusionTooLargeSmem_CUDA) {
1898	Fusion fusion;
1899	FusionGuard fg(&fusion);
1900
1901	auto properties = at::cuda::getDeviceProperties(
1902	c10::Device (c10::DeviceType::CUDA, `0`).index());
1903	int device_limit = properties->sharedMemPerBlockOptin;
1904
1905	auto tv0 = makeContigTensor(`1`);
1906	fusion.addInput(tv0);
1907	auto tv1 = add(tv0, IrBuilder::create<Double>(`1.0`));
1908	auto tv2 = add(tv1, IrBuilder::create<Double>(`2.0`));
1909	fusion.addOutput(tv2);
1910
1911	// 4 byte per float
1912	tv2->split(`0`, device_limit / `4`);
1913	tv2->split(`1`, `128`);
1914	tv1->computeAt(tv2, `1`);
1915	tv1->split(`1`, `128`);
1916	tv0->computeAt(tv1, -`1`);
1917	tv1->setMemoryType(MemoryType::Shared);
1918	tv1->axis(-`1`)->parallelize(ParallelType::TIDx);
1919	tv2->axis(-`1`)->parallelize(ParallelType::TIDx);
1920
1921	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
1922
1923	at::manual_seed(`0`);
1924	auto t0 = at::randn({`12288` * `4`}, options);
1925	FusionExecutor fe;
1926
1927	// First compile gets a compiled kernel
1928	fe.compileFusion(&fusion, {t0});
1929
1930	// Should be throwing because the kernel
1931	// requested absolute device limit
1932	// NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
1933	ASSERT_ANY_THROW(fe.runFusion({t0}));
1934	}
1935
1936	// Try to test alignment when multiple tensors are
1937	// in shared mem.
1938	TEST_F(NVFuserTest, FusionSmemAlignment_CUDA) {
1939	Fusion fusion;
1940	FusionGuard fg(&fusion);
1941
1942	auto tv0 = makeConcreteTensor({`3`, `4`, `7`, `2`, `5`});
1943	fusion.addInput(tv0);
1944	auto tv1 = sum(tv0, {`4`});
1945	auto tv2 = sum(tv1, {`3`});
1946	auto tv3 = sum(tv2, {`2`});
1947	auto tv4 = sum(tv3, {`1`});
1948	fusion.addOutput(tv4);
1949
1950	auto tv0c = tv0->cacheAfter();
1951	auto tv1bc = tv1->cacheBefore();
1952	auto tv2bc = tv2->cacheBefore();
1953	auto tv3bc = tv3->cacheBefore();
1954	auto tv4bc = tv4->cacheBefore();
1955
1956	tv0c->setMemoryType(MemoryType::Shared);
1957	tv1bc->setMemoryType(MemoryType::Shared);
1958	tv2bc->setMemoryType(MemoryType::Shared);
1959	tv3bc->setMemoryType(MemoryType::Shared);
1960	tv4bc->setMemoryType(MemoryType::Shared);
1961
1962	tv1->axis(-`1`)->parallelize(ParallelType::Vectorize);
1963	tv3->axis(-`1`)->parallelize(ParallelType::Vectorize);
1964	tv0->computeAt(tv4, `0`);
1965	tv0->computeAt(tv2, `2`);
1966
1967	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
1968
1969	at::manual_seed(`0`);
1970	auto t0 = at::randn({`3`, `4`, `7`, `2`, `5`}, options);
1971	FusionExecutor fe;
1972
1973	fe.compileFusion(&fusion, {t0});
1974	auto cg_outputs = fe.runFusion({t0});
1975	auto tref = t0.sum({`1`, `2`, `3`, `4`});
1976
1977	testValidate(&fusion, cg_outputs, {t0}, {tref}, __LINE__, __FILE__);
1978	}
1979
1980	// Repro of #1521
1981	TEST_F(NVFuserTest, FusionImmediateValueAsInput_CUDA) {
1982	Fusion fusion;
1983	FusionGuard fg(&fusion);
1984
1985	auto tv0 = makeSymbolicTensor(`1`);
1986	fusion.addInput(tv0);
1987
1988	auto immediate_scalr = IrBuilder::create<Double>(`0.1`);
1989	// Adding an immediate scalar value as an input is not allowed
1990	// NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
1991	ASSERT_ANY_THROW(fusion.addInput(immediate_scalr));
1992
1993	// Instead, use a symbolic value
1994	auto symbolic_scalar = IrBuilder::create<Double>();
1995	fusion.addInput(symbolic_scalar);
1996
1997	auto tv1 = add(tv0, symbolic_scalar);
1998	fusion.addOutput(tv1);
1999
2000	// Make sure the kernel is compiled.
2001	FusionExecutor fe;
2002	fe.compileFusion(&fusion);
2003	}
2004
2005	// Repro of #1506
2006	TEST_F(NVFuserTest, FusionVectorizeContigIndex_CUDA) {
2007	std::vector<int64_t> shape{`14`, `14`};
2008
2009	Fusion fusion;
2010	FusionGuard fg(&fusion);
2011
2012	auto tv0 = makeContigTensor(`2`);
2013	fusion.addInput(tv0);
2014	auto tv1 = set(tv0);
2015	auto tv2 = set(tv1);
2016	fusion.addOutput(tv2);
2017
2018	tv2->merge(`0`);
2019
2020	// Vectorize by 4 should be allowed
2021	tv2->split(`0`, `4`);
2022
2023	tv2->axis(`0`)->parallelize(ParallelType::TIDx);
2024	tv0->computeAt(tv2, `1`);
2025
2026	tv1->axis(`1`)->parallelize(ParallelType::Vectorize);
2027	tv2->axis(`1`)->parallelize(ParallelType::Vectorize);
2028
2029	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
2030	auto t0 = at::randn(shape, options);
2031
2032	FusionExecutor fe;
2033	fe.compileFusion(&fusion, {t0});
2034	auto cg_outputs = fe.runFusion({t0});
2035
2036	TORCH_CHECK(t0.equal(cg_outputs[`0`]));
2037	}
2038
2039	// Make sure the same fusion as FusionVectorizeContigIndex fails if
2040	// not contig.
2041	TEST_F(NVFuserTest, FusionVectorizeContigIndexFail_CUDA) {
2042	std::vector<int64_t> shape{`14`, `14`};
2043
2044	Fusion fusion;
2045	FusionGuard fg(&fusion);
2046
2047	auto tv0 = makeSymbolicTensor(`2`);
2048	fusion.addInput(tv0);
2049	auto tv1 = set(tv0);
2050	auto tv2 = set(tv1);
2051	fusion.addOutput(tv2);
2052
2053	tv2->merge(`0`);
2054
2055	tv2->split(`0`, `4`);
2056
2057	tv2->axis(`0`)->parallelize(ParallelType::TIDx);
2058	tv0->computeAt(tv2, `1`);
2059
2060	tv1->axis(`1`)->parallelize(ParallelType::Vectorize);
2061	tv2->axis(`1`)->parallelize(ParallelType::Vectorize);
2062
2063	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
2064	auto t0 = at::randn(shape, options);
2065
2066	FusionExecutor fe;
2067	fe.compileFusion(&fusion, {t0});
2068
2069	// This should fail at the launch time as 14 is not divisible by the
2070	// vector word size. The two domains are merged, but they are not
2071	// contiguous, so contig indexing is not involved in this case.
2072	// NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
2073	ASSERT_ANY_THROW(fe.runFusion({t0}));
2074	}
2075
2076	TEST_F(NVFuserTest, FusionVectorizeInputToOutput_CUDA) {
2077	Fusion fusion;
2078	FusionGuard fg(&fusion);
2079
2080	auto tv0 = makeSymbolicTensor(`1`);
2081	fusion.addInput(tv0);
2082	auto tv1 = set(tv0);
2083	fusion.addOutput(tv1);
2084
2085	tv1->split(`0`, `4`);
2086
2087	tv1->axis(-`1`)->parallelize(ParallelType::Vectorize);
2088
2089	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
2090	at::manual_seed(`0`);
2091
2092	const int n = `12`;
2093	auto t0 = at::randn({n}, options);
2094	// Shift by one to make it non-aligned
2095	auto t0_misaligned = at::randn({n + `1`}, options).index({Slice (`1`)});
2096	auto t1_misaligned = at::empty({n + `1`}, options).index({Slice (`1`)});
2097
2098	FusionExecutor fe;
2099	fe.compileFusion(&fusion, {t0});
2100	auto cg_outputs = fe.runFusion({t0});
2101	TORCH_CHECK(t0.equal(cg_outputs[`0`]));
2102
2103	// Pass misaligned input. This must fail.
2104	// NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
2105	ASSERT_ANY_THROW(fe.runFusion({t0_misaligned}));
2106
2107	// Pass misaligned output. This must fail too.
2108	// NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
2109	ASSERT_ANY_THROW(fe.runFusion({t0}, {t1_misaligned}));
2110	}
2111
2112	// Repro of issue #1530
2113	TEST_F(NVFuserTest, FusionVectorizeContigIndexValidationFail_CUDA) {
2114	std::vector<int64_t> shape{`1`, `2`, `1`};
2115
2116	Fusion fusion;
2117	FusionGuard fg(&fusion);
2118
2119	auto tv0 = makeContigTensor(shape.size());
2120	fusion.addInput(tv0);
2121	auto tv1 = set(tv0);
2122	fusion.addOutput(tv1);
2123
2124	tv1->merge(`1`);
2125	tv1->merge(`0`);
2126
2127	auto invalid_vec_size = shape [`0`] * shape [`1`] * shape [`2`];
2128	invalid_vec_size *= invalid_vec_size;
2129
2130	tv1->split(`0`, invalid_vec_size);
2131
2132	tv1->axis(`1`)->parallelize(ParallelType::Vectorize);
2133
2134	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
2135	auto t0 = at::randn(shape, options);
2136
2137	FusionExecutor fe;
2138	fe.compileFusion(&fusion, {t0});
2139
2140	// NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
2141	ASSERT_ANY_THROW(fe.runFusion({t0}));
2142	}
2143
2144	TEST_F(NVFuserTest, FusionContigIndexingWithBroadcast_CUDA) {
2145	Fusion fusion;
2146	FusionGuard fg(&fusion);
2147
2148	auto tv0 = makeConcreteTensor({`4`});
2149	fusion.addInput(tv0);
2150	auto tv1 = makeConcreteTensor({`3`, `4`});
2151	fusion.addInput(tv1);
2152
2153	auto tv2 = broadcast(tv0, {true, false});
2154	auto tv3 = add(tv2, tv1);
2155	fusion.addOutput(tv3);
2156
2157	tv3->merge(`0`);
2158	TransformPropagatorWithCheck propagator(tv3);
2159	MaxRootDomainInfoSpanningTree (tv3).traverse(&propagator);
2160
2161	tv2->setMemoryType(MemoryType::Local);
2162
2163	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
2164	auto t0 = at::randn({`4`}, options);
2165	auto t1 = at::randn({`3`, `4`}, options);
2166
2167	auto t3 = t0.unsqueeze(`0`).add(t1);
2168	{
2169	FusionExecutor fe;
2170	fe.compileFusion(&fusion, {t0, t1});
2171	auto cg_outputs = fe.runFusion({t0, t1});
2172
2173	testValidate(&fusion, cg_outputs, {t0, t1}, {t3}, __LINE__, __FILE__);
2174	}
2175
2176	// Make sure tv2 indexing also works when it's stored in global memory
2177	tv2->setMemoryType(MemoryType::Global);
2178	{
2179	FusionExecutor fe;
2180	fe.compileFusion(&fusion, {t0, t1});
2181	auto cg_outputs = fe.runFusion({t0, t1});
2182
2183	testValidate(&fusion, cg_outputs, {t0, t1}, {t3}, __LINE__, __FILE__);
2184	}
2185	}
2186
2187	// Repro of #1534. Validation should detect invalid vectorization.
2188	TEST_F(NVFuserTest, FusionVectorizeContigIndexValidationFail2_CUDA) {
2189	std::vector<int64_t> shape1{`2`, `3`, `2`};
2190	std::vector<int64_t> shape2{`2`, `2`};
2191
2192	Fusion fusion;
2193	FusionGuard fg(&fusion);
2194
2195	auto tv0 = makeContigConcreteTensor(shape1);
2196	fusion.addInput(tv0);
2197	auto tv1 = makeContigConcreteTensor(shape2);
2198	fusion.addInput(tv1);
2199
2200	auto tv2 = set(tv1);
2201	auto tv3 = broadcast(tv2, {false, true, false});
2202	auto tv4 = add(tv0, tv3);
2203	fusion.addOutput(tv4);
2204
2205	tv4->merge(`1`, `2`);
2206	tv4->merge(`0`, `1`);
2207	tv4->split(`0`, `4`);
2208	TransformPropagatorWithCheck propagator(tv4);
2209	MaxRootDomainInfoSpanningTree (tv4).traverse(&propagator);
2210
2211	tv0->computeAt(tv4, -`2`);
2212	tv1->computeAt(tv4, -`2`);
2213
2214	tv2->axis(-`1`)->parallelize(ParallelType::Vectorize);
2215
2216	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
2217	auto t0 = at::randn(shape1, options);
2218	auto t1 = at::randn(shape2, options);
2219
2220	FusionExecutor fe;
2221	fe.compileFusion(&fusion, {t0, t1});
2222
2223	// Vectorization of tv2 should be detected as invalid.
2224	// NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
2225	ASSERT_ANY_THROW(fe.runFusion({t0, t1}));
2226	}
2227
2228	TEST_F(NVFuserTest, FusionVectorizeContigIndexWithBroadcast_CUDA) {
2229	std::vector<int64_t> shape1{`2`, `2`, `2`};
2230	std::vector<int64_t> shape2{`1`, `2`, `2`};
2231
2232	Fusion fusion;
2233	FusionGuard fg(&fusion);
2234
2235	// [I0, I1, I2]
2236	auto tv0 = makeContigTensor(shape1.size());
2237	fusion.addInput(tv0);
2238
2239	// [B3, I1, I2]
2240	auto tv1 = makeContigConcreteTensor(shape2);
2241	fusion.addInput(tv1);
2242
2243	auto tv2 = set(tv1);
2244	auto tv3 = add(tv0, tv2);
2245	fusion.addOutput(tv3);
2246
2247	tv3->merge(`1`, `2`);
2248	tv3->merge(`0`, `1`);
2249	tv3->split(`0`, `4`);
2250
2251	// Don't modify tv1 so that it's replayed as tv2 with actual
2252	// transformations. It would create temporary IterDomains, and the
2253	// validation should still be able to detect vectorization by 4 is valid.
2254	// TransformPropagatorWithCheck propagator(tv3);
2255	// MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator);
2256
2257	tv2->merge(`1`, `2`);
2258	tv2->merge(`0`, `1`);
2259	tv2->split(`0`, `4`);
2260
2261	tv2->computeAt(tv3, -`2`);
2262
2263	tv2->axis(-`1`)->parallelize(ParallelType::Vectorize);
2264
2265	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
2266	auto t0 = at::randn(shape1, options);
2267	auto t1 = at::randn(shape2, options);
2268
2269	FusionExecutor fe;
2270	fe.compileFusion(&fusion, {t0, t1});
2271	auto cg_outputs = fe.runFusion({t0, t1});
2272
2273	auto ref = t0 + t1;
2274
2275	testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
2276	}
2277
2278	TEST_F(NVFuserTest, FusionVectorizeContigIndexPointwiseSchedule_CUDA) {
2279	std::vector<int64_t> shape0{`100`, `14`, `2`, `14`};
2280	std::vector<int64_t> shape1{`100`, `2`, `14`};
2281
2282	Fusion fusion;
2283	FusionGuard fg(&fusion);
2284
2285	auto tv0 = makeContigTensor(shape0.size());
2286	fusion.addInput(tv0);
2287	auto tv1 = makeContigTensor(shape1.size());
2288	fusion.addInput(tv1);
2289
2290	auto tv2 = broadcast(tv1, {false, true, false, false});
2291	auto tv3 = add(tv0, tv2);
2292	fusion.addOutput(tv3);
2293
2294	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
2295	auto t0 = at::randn(shape0, options);
2296	auto t1 = at::randn(shape1, options);
2297
2298	auto lparams = schedulePointwise(&fusion, {t0, t1});
2299
2300	GpuLower gpulw(&fusion);
2301	auto kernel = gpulw.kernel();
2302
2303	// The innermost two dimensions are merged and contiguous, so
2304	// vectorization can be done against 214=28 rather than 14, so*
2305	// vector word size should be 4. Broadcasting of tv1 should not
2306	// matter.
2307	for (const auto& vec_info : kernel->summary().vectorized_set_info) {
2308	TORCH_CHECK(
2309	vec_info.word_size == `4`,
2310	"Invalid vector word size: ",
2311	vec_info.word_size);
2312	}
2313
2314	FusionExecutor fe;
2315	fe.compileFusion(&fusion, {t0, t1}, lparams);
2316	auto cg_outputs = fe.runFusion({t0, t1});
2317
2318	auto ref = t0 + t1.unsqueeze(-`3`);
2319
2320	testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
2321	}
2322
2323	// Repro of issue #1539.
2324	TEST_F(NVFuserTest, FusionTrivialReductionForwarding1_CUDA) {
2325	Fusion fusion;
2326	FusionGuard fg(&fusion);
2327
2328	auto tv0 = makeSymbolicTensor(`1`);
2329	fusion.addInput(tv0);
2330
2331	auto tv1 = broadcast(tv0, {true, false});
2332	auto tv2 = sum(tv1, {`0`});
2333	auto tv3 = set(tv2);
2334	fusion.addOutput(tv3);
2335
2336	tv2->merge(`0`);
2337	tv2->split(`0`, `4`);
2338
2339	TransformPropagatorWithCheck propagator(tv2);
2340	MaxRootDomainInfoSpanningTree (tv2).traverse(&propagator);
2341
2342	// All tensors must be transformed to a 2D tensor with each axis
2343	// mapped with each other in the LOOP map.
2344	ComputeAtMap ca_map(&fusion);
2345	for (auto tv : ir_utils::allTvs(&fusion)) {
2346	TORCH_CHECK(
2347	tv->nDims() == `2`, "Expected to be a 2D tensor but: ", tv->toString());
2348	for (const auto i : c10::irange(`2`)) {
2349	TORCH_CHECK(ca_map.areMapped(
2350	tv->axis(i), tv3->axis(i), IdMappingMode::PERMISSIVE));
2351	}
2352	}
2353	}
2354
2355	TEST_F(NVFuserTest, FusionTrivialReductionForwarding2_CUDA) {
2356	Fusion fusion;
2357	FusionGuard fg(&fusion);
2358
2359	auto tv0 = makeSymbolicTensor(`1`);
2360	fusion.addInput(tv0);
2361
2362	auto tv1 = broadcast(tv0, {true, false});
2363	auto tv2 = sum(tv1, {`0`});
2364	auto tv3 = add(tv2, IrBuilder::create<Double>(`1`));
2365
2366	fusion.addOutput(tv3);
2367
2368	// Merging a trivial reduction with a non-reduction domain
2369	tv2->merge(`0`, `1`);
2370	tv2->split(`0`, `4`);
2371
2372	tv3->split(`0`, `4`);
2373
2374	// tv2 and tv3 are different as tv3 lacks the trivial reduction, but
2375	// they are mapped with each other by BestEffortReplay as the merge
2376	// of trivial reduciton dim is forwarded.
2377
2378	PairwiseRootDomainMap root_map(tv2, tv3);
2379
2380	auto p2c = BestEffortReplay::replayCasP(tv3, tv2, `2`, root_map).getReplay();
2381	for (const auto i : c10::irange(tv2->nDims())) {
2382	auto tv2_id = tv2->axis(i);
2383	auto it = p2c.find(tv2_id);
2384	TORCH_CHECK(
2385	it != p2c.end(),
2386	"Expected mapped consumer ID but not found: ",
2387	tv2_id->toString());
2388	auto tv3_mapped_id = it ->second;
2389	TORCH_CHECK(
2390	tv3_mapped_id == tv3->axis(i),
2391	"Unexpected mapped consumer ID: ",
2392	tv3_mapped_id->toString());
2393	}
2394
2395	auto c2p = BestEffortReplay::replayPasC(tv2, tv3, `2`, root_map).getReplay();
2396	for (const auto i : c10::irange(tv3->nDims())) {
2397	auto tv3_id = tv3->axis(i);
2398	auto it = c2p.find(tv3_id);
2399	TORCH_CHECK(
2400	it != c2p.end(),
2401	"Expected mapped producer ID but not found: ",
2402	tv3_id->toString());
2403	auto tv2_mapped_id = it ->second;
2404	TORCH_CHECK(
2405	tv2_mapped_id == tv2->axis(i),
2406	"Unexpected mapped consumer ID: ",
2407	tv2_mapped_id->toString());
2408	}
2409	}
2410
2411	TEST_F(NVFuserTest, FusionTrivialReductionForwarding3_CUDA) {
2412	Fusion fusion;
2413	FusionGuard fg(&fusion);
2414
2415	auto tv0 = makeSymbolicTensor(`2`);
2416	fusion.addInput(tv0);
2417
2418	auto tv1 = sum(tv0, {`1`});
2419	auto tv2 = add(tv1, IrBuilder::create<Double>(`1`));
2420	fusion.addOutput(tv2);
2421
2422	// Similar pattern as FusionTrivialReductionForwarding2 but trivial
2423	// reduciton at non-root domain
2424
2425	// Create a trivial reduction by splitting with a factor of 1
2426	tv1->split(`1`, `1`, false);
2427	// Merging with a trivial reduction
2428	tv1->merge(`0`, `1`);
2429	auto tv1_merge_out_id = tv1->axis(`0`);
2430	tv1->split(`0`, `5`);
2431
2432	tv2->split(`0`, `5`);
2433
2434	// The merge of tv1 is done with a non-root trivial
2435	// reduciton. BestEffortReplay should forward the merge.
2436
2437	PairwiseRootDomainMap root_map(tv1, tv2);
2438	auto p2c = BestEffortReplay::replayCasP(tv2, tv1, `2`, root_map).getReplay();
2439
2440	// The two tensors should look like:
2441	// tv1: [I11//5, 5, I2//1]*
2442	// tv2: [I1//5, 5]
2443	//
2444	// BestEffortRepaly should forward the merge of (I1 1) and create*
2445	// mappings of:
2446	// I11//5 -> I1//5*
2447	// 5 -> 5
2448	// I11 -> I1*
2449
2450	TORCH_CHECK(p2c.size() == `3`, "Unexpected number of mappings");
2451	TORCH_CHECK(p2c.count(tv1->axis(`0`)) && p2c[tv1->axis(`0`)] == tv2->axis(`0`));
2452	TORCH_CHECK(p2c.count(tv1->axis(`1`)) && p2c[tv1->axis(`1`)] == tv2->axis(`1`));
2453	TORCH_CHECK(
2454	p2c.count(tv1_merge_out_id) &&
2455	p2c[tv1_merge_out_id] == tv2->getRootDomain()[`0`]);
2456	}
2457
2458	TEST_F(NVFuserTest, FusionTrivialReductionForwarding4_CUDA) {
2459	Fusion fusion;
2460	FusionGuard fg(&fusion);
2461
2462	auto tv0 = makeSymbolicTensor(`1`);
2463	fusion.addInput(tv0);
2464
2465	auto tv1 = makeSymbolicTensor(`2`);
2466	fusion.addInput(tv1);
2467
2468	auto tv2 = broadcast(tv0, {true, false});
2469	auto tv3 = add(tv1, tv2);
2470	fusion.addOutput(tv3);
2471
2472	// tv4 has a trivial reduction axis
2473	auto tv4 = sum(tv2, {`0`});
2474	auto tv5 = add(tv4, IrBuilder::create<Double>(`1`));
2475	fusion.addOutput(tv5);
2476
2477	tv3->merge(`0`, `1`);
2478	tv3->split(`0`, `32`);
2479
2480	// This causes the trivial reduction of tv4 to be merged with
2481	// another axis of tv4, and then forward computeAt is done from tv4
2482	// to tv5. The split of the merged id of tv4 should be done on tv5
2483	// by forwarding the merge of the trivial reduction.
2484	tv0->computeAt(tv3, -`1`);
2485
2486	tv3->axis(`0`)->parallelize(ParallelType::BIDx);
2487	tv3->axis(`1`)->parallelize(ParallelType::TIDx);
2488
2489	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
2490	auto t0 = at::randn({`111`}, options);
2491	auto t1 = at::randn({`123`, `111`}, options);
2492
2493	FusionExecutor fe;
2494	fe.compileFusion(&fusion, {t0, t1});
2495	auto cg_outputs = fe.runFusion({t0, t1});
2496
2497	auto t2 = t0.unsqueeze(`0`);
2498	auto t3 = t1 + t2;
2499	auto t5 = sum(t2, {`0`}) + `1`;
2500
2501	testValidate(&fusion, cg_outputs, {t0, t1}, {t3, t5}, __LINE__, __FILE__);
2502	}
2503
2504	// See issue #1598
2505	TEST_F(NVFuserTest, FusionRAWSyncInsertionPlace1_CUDA) {
2506	Fusion fusion;
2507	FusionGuard fg(&fusion);
2508
2509	auto tv0 = makeSymbolicTensor(`2`);
2510	auto tv1 = makeSymbolicTensor(`2`);
2511	fusion.addInput(tv0);
2512	fusion.addInput(tv1);
2513
2514	auto tv2 = set(tv0);
2515	auto tv3 = set(tv1);
2516	auto tv4 = add(tv2, tv3);
2517	fusion.addOutput(tv4);
2518
2519	// Place tv2 on shared memory
2520	tv2->split(`0`, `2`);
2521	tv2->split(-`1`, `4`);
2522	tv2->setMemoryType(MemoryType::Shared);
2523	tv2->axis(-`2`)->parallelize(ParallelType::TIDy);
2524	tv2->axis(-`1`)->parallelize(ParallelType::TIDx);
2525
2526	tv3->split(`0`, `2`);
2527	tv3->split(-`1`, `4`);
2528	// swap tidx and tidy
2529	tv3->axis(-`2`)->parallelize(ParallelType::TIDx);
2530	tv3->axis(-`1`)->parallelize(ParallelType::TIDy);
2531
2532	tv4->split(`0`, `2`);
2533	tv4->split(-`1`, `4`);
2534	tv4->axis(-`2`)->parallelize(ParallelType::TIDx);
2535	tv4->axis(-`1`)->parallelize(ParallelType::TIDy);
2536
2537	tv0->computeAt(tv4, `1`);
2538	tv3->computeAt(tv4, -`1`);
2539
2540	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
2541	auto t0 = at::randn({`10`, `64`}, options);
2542	auto t1 = at::randn({`10`, `64`}, options);
2543
2544	FusionExecutor fe;
2545	fe.compileFusion(&fusion, {t0, t1});
2546	auto cg_outputs = fe.runFusion({t0, t1});
2547
2548	auto ref = t0 + t1;
2549
2550	testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
2551	}
2552
2553	// See issue #1598
2554	TEST_F(NVFuserTest, FusionRAWSyncInsertionPlace2_CUDA) {
2555	Fusion fusion;
2556	FusionGuard fg(&fusion);
2557
2558	auto tv0 = makeSymbolicTensor(`2`);
2559	auto tv1 = makeSymbolicTensor(`2`);
2560	fusion.addInput(tv0);
2561	fusion.addInput(tv1);
2562
2563	auto tv2 = set(tv0);
2564	auto tv3 = set(tv1);
2565	auto tv4 = add(tv2, tv3);
2566	fusion.addOutput(tv4);
2567
2568	tv2->split(`0`, `2`);
2569	tv2->split(-`1`, `4`);
2570	tv2->setMemoryType(MemoryType::Shared);
2571
2572	tv2->axis(-`2`)->parallelize(ParallelType::TIDy);
2573	tv2->axis(-`1`)->parallelize(ParallelType::TIDx);
2574
2575	tv4->split(`0`, `2`);
2576	tv4->split(-`1`, `4`);
2577	// Also do unroll for tv3 and tv4
2578	tv4->split(-`2`, `8`, false);
2579	tv4->axis(-`3`)->parallelize(ParallelType::Unroll);
2580	// swap tidx and tidy
2581	tv4->axis(-`2`)->parallelize(ParallelType::TIDx);
2582	tv4->axis(-`1`)->parallelize(ParallelType::TIDy);
2583
2584	tv0->computeAt(tv4, `1`);
2585	tv3->computeAt(tv4, -`1`);
2586
2587	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
2588	auto t0 = at::randn({`10`, `64`}, options);
2589	auto t1 = at::randn({`10`, `64`}, options);
2590
2591	FusionExecutor fe;
2592	fe.compileFusion(&fusion, {t0, t1});
2593	auto cg_outputs = fe.runFusion({t0, t1});
2594
2595	auto ref = t0 + t1;
2596
2597	testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
2598	}
2599
2600	// See issue #1599
2601	TEST_F(NVFuserTest, FusionRAWSyncInsertionPlace3_CUDA) {
2602	Fusion fusion;
2603	FusionGuard fg(&fusion);
2604
2605	auto tv0 = makeSymbolicTensor(`2`);
2606	auto tv1 = makeSymbolicTensor(`2`);
2607	fusion.addInput(tv0);
2608	fusion.addInput(tv1);
2609
2610	auto tv2 = set(tv0);
2611	auto tv3 = set(tv1);
2612	auto tv4 = add(tv2, tv3);
2613	fusion.addOutput(tv4);
2614
2615	// Use unroll where a RAW-sync tensor is stored
2616
2617	tv4->split(`0`, `2`);
2618	tv4->split(`0`, `3`);
2619	tv4->split(-`1`, `4`);
2620	tv4->axis(`1`)->parallelize(ParallelType::Unroll);
2621	tv4->axis(-`2`)->parallelize(ParallelType::TIDx);
2622	tv4->axis(-`1`)->parallelize(ParallelType::TIDy);
2623
2624	tv0->computeAt(tv4, `3`);
2625	tv3->computeAt(tv4, -`1`);
2626
2627	tv2->split(-`1`, `4`);
2628	tv2->axis(-`2`)->parallelize(ParallelType::TIDy);
2629	tv2->axis(-`1`)->parallelize(ParallelType::TIDx);
2630	tv2->setMemoryType(MemoryType::Shared);
2631
2632	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
2633	auto t0 = at::randn({`50`, `64`}, options);
2634	auto t1 = at::randn({`50`, `64`}, options);
2635
2636	FusionExecutor fe;
2637	fe.compileFusion(&fusion, {t0, t1});
2638	auto cg_outputs = fe.runFusion({t0, t1});
2639
2640	auto ref = t0 + t1;
2641
2642	testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
2643	}
2644
2645	// See #1618
2646	TEST_F(NVFuserTest, FusionRAWSyncInsertionPlace4_CUDA) {
2647	Fusion fusion;
2648	FusionGuard fg(&fusion);
2649
2650	auto tv0 = makeConcreteTensor({`16`, `128`});
2651	auto tv1 = makeConcreteTensor({`16`, `128`});
2652	fusion.addInput(tv0);
2653	fusion.addInput(tv1);
2654
2655	auto tv2 = set(tv0);
2656	auto tv3 = set(tv1);
2657	auto tv4 = set(tv2);
2658	auto tv5 = set(tv3);
2659	auto tv6 = add(tv4, tv5);
2660	fusion.addOutput(tv6);
2661
2662	tv2->setMemoryType(MemoryType::Shared);
2663	tv3->setMemoryType(MemoryType::Shared);
2664
2665	tv2->computeAt(tv6, `0`);
2666	tv3->computeAt(tv6, `1`);
2667	tv4->computeAt(tv6, `1`);
2668	tv5->computeAt(tv6, -`1`);
2669	tv2->split(`1`, `64`);
2670	tv3->split(`1`, `64`);
2671	tv2->axis(-`1`)->parallelize(ParallelType::TIDx);
2672	tv3->axis(-`1`)->parallelize(ParallelType::TIDx);
2673	tv6->axis(-`1`)->parallelize(ParallelType::TIDx);
2674
2675	// Check the block sync is inserted at the correct location.
2676	// There is exactly one block sync needed in this test case
2677	// and the sync needs to be after the 2 expressions
2678	// that modify shared memory.
2679	class SyncInsertionPointChecker : public kir::IrVisitor {
2680	public:
2681	using kir::IrVisitor::handle;
2682
2683	private:
2684	void handle(UnaryOp* uop) final {
2685	// Record number of unary ops that modifies shared memory.
2686	if (uop->out()->isA<kir::TensorIndex>() &&
2687	uop->out()->as<kir::TensorIndex>()->view()->getMemoryType() ==
2688	MemoryType::Shared &&
2689	// Filter out initialization expressions
2690	uop->in()->isA<kir::TensorIndex>()) {
2691	number_of_writes_++;
2692	}
2693	}
2694	void handle(kir::BlockSync* bsync) final {
2695	// Make sure both shared memory modifying expressions
2696	// have been observed at the sync insertion point.
2697	TORCH_INTERNAL_ASSERT(
2698	number_of_writes_ == `2`,
2699	"FusionRAWSyncInsertionPlace4 test fail:",
2700	"only 1 sync after the 2 shared mem writes is needed in this test,"
2701	"either a redundant sync has been inserted or the block sync is not inserted at the right place");
2702	}
2703
2704	private:
2705	int number_of_writes_ = `0`;
2706	} sync_insertion_checker;
2707	GpuLower gpulw(&fusion);
2708	sync_insertion_checker.handle(gpulw.kernel()->topLevelExprs());
2709	}
2710
2711	// Test serial write and parallel read of shared mem: mapped case
2712	TEST_F(NVFuserTest, FusionSerialSmemWriteParallelRead1_CUDA) {
2713	Fusion fusion;
2714	FusionGuard fg(&fusion);
2715
2716	TensorView* tv0 = makeConcreteTensor({`128`, `6`});
2717	TensorView* tv1 = makeConcreteTensor({`128`, `6`});
2718	TensorView* tv2 = makeConcreteTensor({`128`, `6`});
2719	fusion.addInput(tv0);
2720	fusion.addInput(tv1);
2721	fusion.addInput(tv2);
2722
2723	TensorView* tv3 = add(tv0, tv1);
2724	TensorView* tv4 = add(tv3, tv2);
2725
2726	fusion.addOutput(tv4);
2727
2728	// Use shared memory
2729	tv3->setMemoryType(MemoryType::Shared);
2730
2731	// Parallelize t4, in this case dim 0 on tv3 will
2732	// not be parallelized but dim0 of t4 will be.
2733	// We will need to make sure a sync is inserted
2734	// even if these dimensions are mapped.
2735	tv4->axis(`0`)->parallelize(ParallelType::TIDx);
2736
2737	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
2738
2739	at::Tensor t0 = at::randn({`128`, `6`}, options);
2740	at::Tensor t1 = at::randn({`128`, `6`}, options);
2741	at::Tensor t2 = at::randn({`128`, `6`}, options);
2742
2743	FusionExecutor fe;
2744	fe.compileFusion(&fusion, {t0, t1, t2});
2745	auto cg_outputs = fe.runFusion({t0, t1, t2});
2746
2747	auto ref = t0 + t1 + t2;
2748
2749	testValidate(&fusion, cg_outputs, {t0, t1, t2}, {ref}, __LINE__, __FILE__);
2750	}
2751
2752	// Test serial write and parallel read of shared mem: un-mapped case
2753	TEST_F(NVFuserTest, FusionSerialSmemWriteParallelRead2_CUDA) {
2754	Fusion fusion;
2755	FusionGuard fg(&fusion);
2756
2757	TensorView* tv0 = makeConcreteTensor({`128`, `6`});
2758	TensorView* tv1 = makeConcreteTensor({`128`, `6`});
2759	TensorView* tv2 = makeConcreteTensor({`128`, `6`});
2760	fusion.addInput(tv0);
2761	fusion.addInput(tv1);
2762	fusion.addInput(tv2);
2763
2764	TensorView* tv3 = add(tv0, tv1);
2765	TensorView* tv4 = add(tv3, tv2);
2766
2767	fusion.addOutput(tv4);
2768
2769	// Use shared memory
2770	tv3->setMemoryType(MemoryType::Shared);
2771
2772	// Split and parallelize t4,
2773	// the parallelized dimension in t4 will not
2774	// map across to the shared mem tensor, t3. So
2775	// there will need to be a sync before use of t3.
2776	tv4->split(`0`, `2`);
2777	tv4->axis(`0`)->parallelize(ParallelType::TIDx);
2778
2779	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
2780
2781	at::Tensor t0 = at::randn({`128`, `6`}, options);
2782	at::Tensor t1 = at::randn({`128`, `6`}, options);
2783	at::Tensor t2 = at::randn({`128`, `6`}, options);
2784
2785	FusionExecutor fe;
2786	fe.compileFusion(&fusion, {t0, t1, t2});
2787	auto cg_outputs = fe.runFusion({t0, t1, t2});
2788
2789	auto ref = t0 + t1 + t2;
2790
2791	testValidate(&fusion, cg_outputs, {t0, t1, t2}, {ref}, __LINE__, __FILE__);
2792	}
2793
2794	// Simple test of async copy primitive
2795	TEST_F(NVFuserTest, FusionSimpleCpAsync_CUDA) {
2796	Fusion fusion;
2797	FusionGuard fg(&fusion);
2798
2799	int m = `33`, n = `31`;
2800
2801	TensorView* tv0 = makeConcreteTensor({m, n});
2802	TensorView* tv1 = makeConcreteTensor({m, n});
2803
2804	fusion.addInput(tv0);
2805	fusion.addInput(tv1);
2806
2807	TensorView* tv2 = add(tv0, tv1);
2808
2809	fusion.addOutput(tv2);
2810
2811	auto tv0_shared = tv0->cacheAfter(LoadStoreOpType::CpAsync);
2812	tv0_shared->setMemoryType(MemoryType::Shared);
2813
2814	tv0->computeAt(tv2, `1`);
2815	tv0_shared->axis(`1`)->parallelize(ParallelType::TIDx);
2816	tv2->axis(`1`)->parallelize(ParallelType::TIDx);
2817
2818	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
2819	at::Tensor t0 = at::randn({m, n}, options);
2820	at::Tensor t1 = at::randn({m, n}, options);
2821
2822	FusionExecutor fe;
2823
2824	// requires ampere+ GPU
2825	if (!deviceMajorMinorCheck(`8`)) {
2826	ASSERT_ANY_THROW(fe.compileFusion(&fusion, {t0, t1}));
2827	GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs";
2828	}
2829	fe.compileFusion(&fusion, {t0, t1});
2830	auto cg_outputs = fe.runFusion({t0, t1});
2831
2832	auto ref = t0 + t1;
2833
2834	testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
2835	}
2836
2837	// Simple test of async copy primitive: double buffered
2838	// Double buffer case 1, both block sync and async wait
2839	// are needed.
2840	TEST_F(NVFuserTest, FusionDoubleBufferCpAsync1_CUDA) {
2841	Fusion fusion;
2842	FusionGuard fg(&fusion);
2843
2844	// Using vectorization so need to keep n multiple of 4.
2845	int m = `33`, n = `48`;
2846
2847	TensorView* tv0 = makeConcreteTensor({m, n});
2848	TensorView* tv1 = makeConcreteTensor({m, n});
2849
2850	fusion.addInput(tv0);
2851	fusion.addInput(tv1);
2852
2853	TensorView* tv2 = add(tv0, tv1);
2854
2855	fusion.addOutput(tv2);
2856
2857	auto tv0_shared = tv0->cacheAfter(LoadStoreOpType::CpAsync);
2858	tv0_shared->setMemoryType(MemoryType::Shared);
2859	tv0->computeAt(tv2, `1`);
2860
2861	// Asynchronously load a tile in one schedule
2862	tv0_shared->split(`1`, `4`);
2863	tv0_shared->axis(-`1`)->parallelize(ParallelType::Vectorize);
2864	tv0_shared->axis(-`2`)->parallelize(ParallelType::TIDx);
2865
2866	// Consume the loaded tile in another schedule,
2867	// triggering the need for a sync.
2868	tv2->split(`1`, `12`);
2869	tv2->axis(-`1`)->parallelize(ParallelType::TIDx);
2870
2871	// Double buffer the shared mem tensor.
2872	tv0_shared->doubleBuffer();
2873
2874	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
2875	at::Tensor t0 = at::randn({m, n}, options);
2876	at::Tensor t1 = at::randn({m, n}, options);
2877
2878	FusionExecutor fe;
2879	// requires ampere+ GPU
2880	if (!deviceMajorMinorCheck(`8`)) {
2881	ASSERT_ANY_THROW(fe.compileFusion(&fusion, {t0, t1}));
2882	GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs";
2883	}
2884	fe.compileFusion(&fusion, {t0, t1});
2885	auto cg_outputs = fe.runFusion({t0, t1});
2886
2887	auto ref = t0 + t1;
2888
2889	testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
2890	}
2891
2892	// Simple test of async copy primitive: double buffered
2893	// Double buffer case 2, only async wait is needed
2894	TEST_F(NVFuserTest, FusionDoubleBufferCpAsync2_CUDA) {
2895	Fusion fusion;
2896	FusionGuard fg(&fusion);
2897
2898	// Using vectorization so need to keep n multiple of 4.
2899	int m = `33`, n = `48`;
2900
2901	TensorView* tv0 = makeConcreteTensor({m, n});
2902	TensorView* tv1 = makeConcreteTensor({m, n});
2903
2904	fusion.addInput(tv0);
2905	fusion.addInput(tv1);
2906
2907	TensorView* tv2 = add(tv0, tv1);
2908
2909	fusion.addOutput(tv2);
2910
2911	auto tv0_shared = tv0->cacheAfter(LoadStoreOpType::CpAsync);
2912	tv0_shared->setMemoryType(MemoryType::Shared);
2913	tv0->computeAt(tv2, `1`);
2914
2915	// Asynchronously load a tile in one schedule
2916	tv0_shared->split(`1`, `4`);
2917	tv0_shared->axis(-`2`)->parallelize(ParallelType::TIDx);
2918
2919	// Consume the loaded tile in another schedule,
2920	// triggering the need for a sync.
2921	tv2->split(`1`, `4`);
2922	tv2->axis(-`2`)->parallelize(ParallelType::TIDx);
2923
2924	// Double buffer the shared mem tensor.
2925	tv0_shared->doubleBuffer();
2926
2927	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
2928	at::Tensor t0 = at::randn({m, n}, options);
2929	at::Tensor t1 = at::randn({m, n}, options);
2930
2931	FusionExecutor fe;
2932	// requires ampere+ GPU
2933	if (!deviceMajorMinorCheck(`8`)) {
2934	ASSERT_ANY_THROW(fe.compileFusion(&fusion, {t0, t1}));
2935	GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs";
2936	}
2937	fe.compileFusion(&fusion, {t0, t1});
2938	auto cg_outputs = fe.runFusion({t0, t1});
2939
2940	auto ref = t0 + t1;
2941
2942	testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
2943	}
2944
2945	// Simple test for double buffer in shared mem,
2946	// where we should not insert redundant syncs when
2947	// they are not needed.
2948	TEST_F(NVFuserTest, FusionDoubleBufferNoSync_CUDA) {
2949	Fusion fusion;
2950	FusionGuard fg(&fusion);
2951
2952	// Using vectorization so need to keep n multiple of 4.
2953	int m = `33`, n = `48`;
2954
2955	TensorView* tv0 = makeConcreteTensor({m, n});
2956	TensorView* tv1 = makeConcreteTensor({m, n});
2957
2958	fusion.addInput(tv0);
2959	fusion.addInput(tv1);
2960
2961	TensorView* tv2 = add(tv0, tv1);
2962
2963	fusion.addOutput(tv2);
2964
2965	auto tv0_shared = tv0->cacheAfter();
2966	tv0_shared->setMemoryType(MemoryType::Shared);
2967	tv0->computeAt(tv2, `1`);
2968
2969	// Asynchronously load a tile in one schedule
2970	tv0_shared->split(`1`, `4`);
2971	tv0_shared->axis(-`2`)->parallelize(ParallelType::TIDx);
2972
2973	// Consume the loaded tile in another schedule,
2974	// triggering the need for a sync.
2975	tv2->split(`1`, `4`);
2976	tv2->axis(-`2`)->parallelize(ParallelType::TIDx);
2977
2978	// Double buffer the shared mem tensor.
2979	tv0_shared->doubleBuffer();
2980
2981	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
2982	at::Tensor t0 = at::randn({m, n}, options);
2983	at::Tensor t1 = at::randn({m, n}, options);
2984
2985	GpuLower gpulw(&fusion);
2986	auto flattened_exprs =
2987	ir_utils::flattenScopedExprs(gpulw.kernel()->topLevelExprs());
2988	bool sync_inserted = std::any_of(
2989	flattened_exprs.begin(), flattened_exprs.end(), [](Expr* expr) {
2990	return expr->isA<kir::BlockSync>();
2991	});
2992	TORCH_INTERNAL_ASSERT(!sync_inserted, "Un-expected block sync inserted");
2993
2994	FusionExecutor fe;
2995	fe.compileFusion(&fusion, {t0, t1});
2996	auto cg_outputs = fe.runFusion({t0, t1});
2997
2998	auto ref = t0 + t1;
2999
3000	testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
3001	}
3002
3003	// Test predicate inversion for cp.async
3004	TEST_F(NVFuserTest, FusionCpAsyncPredicate_CUDA) {
3005	// requires ampere+ GPU
3006
3007	Fusion fusion;
3008	FusionGuard fg(&fusion);
3009
3010	// Using vectorization so need to keep n multiple of 4.
3011	int m = `33`, n = `48`;
3012
3013	TensorView* tv0 = makeConcreteTensor({m, n});
3014
3015	fusion.addInput(tv0);
3016	auto tv1 = sum(tv0, {`1`});
3017	fusion.addOutput(tv1);
3018
3019	auto tv0_shared = tv0->cacheAfter(LoadStoreOpType::CpAsync);
3020	auto tv0_reg = tv0_shared->cacheAfter();
3021	tv0_shared->setMemoryType(MemoryType::Shared);
3022	tv0->computeAt(tv1, `1`);
3023
3024	tv0_shared->split(-`1`, `32`);
3025	tv0_shared->split(-`1`, `4`);
3026	tv0_shared->axis(-`1`)->parallelize(ParallelType::Vectorize);
3027
3028	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
3029	at::Tensor t0 = at::randn({m, n}, options);
3030
3031	FusionExecutor fe;
3032	if (!deviceMajorMinorCheck(`8`)) {
3033	ASSERT_ANY_THROW(fe.compileFusion(&fusion, {t0}));
3034	GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs";
3035	}
3036
3037	fe.compileFusion(&fusion, {t0});
3038	auto cg_outputs = fe.runFusion({t0});
3039
3040	auto ref = t0.sum({`1`});
3041
3042	testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
3043	}
3044
3045	// Test predicate removal on reg-to-reg expressions
3046	TEST_F(NVFuserTest, FusionPredRemovalCheck_CUDA) {
3047	Fusion fusion;
3048	FusionGuard fg(&fusion);
3049
3050	TensorView* tv0 = makeContigTensor(`2`);
3051	fusion.addInput(tv0);
3052
3053	TensorView* tv1 = set(tv0);
3054	TensorView* tv2 = set(tv1);
3055	TensorView* tv3 = set(tv2);
3056	TensorView* tv4 = set(tv3);
3057
3058	fusion.addOutput(tv4);
3059	tv4->split(`1`, `4`);
3060	tv0->computeAt(tv4, -`2`);
3061	tv3->axis(-`1`)->parallelize(ParallelType::Vectorize);
3062
3063	class PredicateRemovalChecker : public kir::IrVisitor {
3064	public:
3065	using kir::IrVisitor::handle;
3066
3067	private:
3068	void handle(UnaryOp* uop) final {
3069	assertOnLocalToLocal(uop);
3070	}
3071
3072	// Utility to assert any local-to-local expr is only trivially predicated.
3073	void assertOnLocalToLocal(Expr* expr) {
3074	bool is_local = true;
3075	for (auto in : ir_utils::filterByType<kir::TensorIndex>(expr->inputs())) {
3076	if (in->view()->getMemoryType() != MemoryType::Local) {
3077	is_local = false;
3078	}
3079	}
3080	for (auto in :
3081	ir_utils::filterByType<kir::TensorIndex>(expr->outputs())) {
3082	if (in->view()->getMemoryType() != MemoryType::Local) {
3083	is_local = false;
3084	}
3085	}
3086
3087	if (is_local) {
3088	if (auto ite = dynamic_cast<kir::IfThenElse*>(scope_exprs_.back())) {
3089	TORCH_INTERNAL_ASSERT(
3090	ite->predicate()->value()->isConst(),
3091	"redundant predicate on: ",
3092	expr);
3093	}
3094	}
3095	}
3096
3097	private:
3098	bool within_ite_ = false;
3099	} pred_checker;
3100
3101	GpuLower gpulw(&fusion);
3102	pred_checker.handle(gpulw.kernel()->topLevelExprs());
3103	}
3104
3105	TEST_F(NVFuserTest, FusionPropagateParallelTypesToSiblings_CUDA) {
3106	Fusion fusion;
3107	FusionGuard fg(&fusion);
3108
3109	auto tv0 = makeSymbolicTensor(`1`);
3110	fusion.addInput(tv0);
3111	auto tvs = Welford(tv0, {`0`});
3112	auto tv_avg = tvs.avg;
3113	fusion.addOutput(tv_avg);
3114
3115	tv_avg->split(`0`, `128`);
3116	TransformPropagatorWithCheck propagator(tv_avg);
3117	MaxRootDomainInfoSpanningTree (tv_avg).traverse(&propagator);
3118
3119	tv_avg->axis(`0`)->parallelize(ParallelType::BIDx);
3120	tv_avg->axis(`1`)->parallelize(ParallelType::TIDx);
3121
3122	// Make sure the parallelization of tv_avg is propagated to the var
3123	// and count tensors.
3124	GpuLower gpulw(&fusion);
3125	for (const auto expr : gpulw.kernel()->exprs()) {
3126	auto wop = dynamic_cast<WelfordOp*>(expr);
3127	if (wop == nullptr) {
3128	continue;
3129	}
3130	auto ref = wop->outAvg()->as<TensorView>();
3131	for (auto sibling : ir_utils::filterByType<TensorView>(wop->outputs())) {
3132	if (ref == sibling) {
3133	continue;
3134	}
3135	TORCH_CHECK(
3136	ref->nDims() == sibling->nDims(),
3137	"Invalid sibling: ",
3138	sibling->toString());
3139	for (const auto i : c10::irange(ref->nDims())) {
3140	TORCH_CHECK(
3141	ref->axis(i)->getParallelType() ==
3142	sibling->axis(i)->getParallelType(),
3143	"Mismatched parallel types between siblings. ",
3144	ref->toString(),
3145	", ",
3146	sibling->toString());
3147	}
3148	}
3149	}
3150
3151	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
3152	auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, `0`);
3153	at::manual_seed(`0`);
3154	at::Tensor t0 = at::randn({`9999`}, options);
3155
3156	FusionExecutor fe;
3157	fe.compileFusion(&fusion, {t0});
3158	auto outputs = fe.runFusion({t0});
3159
3160	testValidate(fe.kernel(), outputs, {t0}, {t0.mean({`0`})}, __LINE__, __FILE__);
3161	}
3162
3163	// Test ExactRootDomainMap
3164	TEST_F(NVFuserTest, FusionExactRootDomainMap_CUDA) {
3165	Fusion fusion;
3166	FusionGuard fg(&fusion);
3167
3168	auto tv0 = makeSymbolicTensor(`1`);
3169	fusion.addInput(tv0);
3170	auto tv1 = makeSymbolicTensor(`2`);
3171	fusion.addInput(tv1);
3172
3173	auto tv2 = broadcast(tv0, {false, true});
3174	auto tv3 = transpose(tv2);
3175	auto tv4 = add(tv2, tv1);
3176	auto tv5 = add(tv2, tv3);
3177	auto tv6 = add(tv3, tv1);
3178	fusion.addOutput(tv4);
3179	fusion.addOutput(tv5);
3180	fusion.addOutput(tv6);
3181
3182	const auto exact_map = ExactRootDomainMap (&fusion);
3183
3184	// In the exact mapping, the broadcast domain introduced at tv2 is
3185	// only mapped with the another one in tv3, which is just transposed
3186	// from tv2. Any other domain, including the second domain of tv4,
3187	// must not be mapped.
3188
3189	auto tv2_bc = tv2->axis(`1`);
3190	auto tv3_bc = tv3->axis(`0`);
3191
3192	TORCH_CHECK(
3193	exact_map.areMapped(tv2_bc, tv3_bc),
3194	"Invalid exact root domain map: ",
3195	exact_map.toString());
3196
3197	// They must not be mapped with anything else.
3198	for (auto tv : ir_utils::allTvs(&fusion)) {
3199	for (auto root_id : tv->getRootDomain()) {
3200	if (root_id == tv2_bc \|\| root_id == tv3_bc) {
3201	continue;
3202	}
3203	TORCH_CHECK(
3204	!exact_map.areMapped(root_id, tv2_bc),
3205	"Invalid exact root domain map: ",
3206	exact_map.toString());
3207	TORCH_CHECK(
3208	!exact_map.areMapped(root_id, tv3_bc),
3209	"Invalid exact root domain map: ",
3210	exact_map.toString());
3211	}
3212	}
3213	}
3214
3215	class NVFuserMultithreadedTest : public ::testing::Test {
3216	protected:
3217	bool was_enabled = false;
3218
3219	void SetUp() override {
3220	was_enabled = fuser::cuda::setEnabled(true);
3221	}
3222
3223	void TearDown() override {
3224	fuser::cuda::setEnabled(was_enabled);
3225	}
3226	};
3227
3228	TEST_F(NVFuserMultithreadedTest, SingleFunction_CUDA) {
3229	std::string ir = R"IR(
3230	graph(%x.1 : Tensor,
3231	%y.1 : Tensor):
3232	%12 : NoneType = prim::Constant()
3233	%11 : bool = prim::Constant[value=0]()
3234	%9 : int = prim::Constant[value=1]()
3235	%3 : Tensor = aten::exp(%x.1)
3236	%5 : Tensor = aten::relu(%y.1)
3237	%6 : Tensor = aten::sin(%5)
3238	%8 : Tensor = aten::add(%3, %6, %9)
3239	%10 : int[] = prim::ListConstruct(%9)
3240	%13 : Tensor = aten::sum(%8, %10, %11, %12)
3241	return (%13)
3242	)IR";
3243	auto g = std::make_shared<Graph>();
3244	torch::jit::parseIR(ir, g.get());
3245	GraphFunction fn("nvfuser_test", g, nullptr);
3246
3247	auto run_kernel = [&fn]() {
3248	auto x = torch::rand({`32`, `32`}, at::TensorOptions(at::kCUDA));
3249	auto y = torch::rand({`32`, `32`}, at::TensorOptions(at::kCUDA));
3250	std::vector<IValue> results;
3251	for (const auto& _ : c10::irange(`10`)) {
3252	auto stack = createStack({x.clone(), y.clone()});
3253	fn.run(stack);
3254	results.push_back(stack.back());
3255	}
3256	for (const auto& i : c10::irange(`1`, `10`)) {
3257	auto t0 = results [`0`].toTensor();
3258	auto ti = results [i].toTensor();
3259	ASSERT_TRUE(at::allclose(t0, ti));
3260	}
3261	};
3262
3263	constexpr size_t kNumThreads = `4`;
3264	std::vector<std::thread> threads;
3265	for (size_t id = `0`; id < kNumThreads; ++id) {
3266	threads.emplace_back(run_kernel);
3267	}
3268	for (auto& t : threads) {
3269	t.join();
3270	}
3271	}
3272
3273	TEST_F(NVFuserMultithreadedTest, MultipleFunctions_CUDA) {
3274	auto run_kernel = []() {
3275	const std::string ir = R"IR(
3276	graph(%x.1 : Tensor,
3277	%y.1 : Tensor):
3278	%12 : NoneType = prim::Constant()
3279	%11 : bool = prim::Constant[value=0]()
3280	%9 : int = prim::Constant[value=1]()
3281	%3 : Tensor = aten::exp(%x.1)
3282	%5 : Tensor = aten::relu(%y.1)
3283	%6 : Tensor = aten::sin(%5)
3284	%8 : Tensor = aten::add(%3, %6, %9)
3285	%10 : int[] = prim::ListConstruct(%9)
3286	%13 : Tensor = aten::sum(%8, %10, %11, %12)
3287	return (%13)
3288	)IR";
3289	auto g = std::make_shared<Graph>();
3290	torch::jit::parseIR(ir, g.get());
3291	GraphFunction fn("nvfuser_test", g, nullptr);
3292
3293	auto x = torch::rand({`32`, `32`}, at::TensorOptions(at::kCUDA));
3294	auto y = torch::rand({`32`, `32`}, at::TensorOptions(at::kCUDA));
3295	std::vector<IValue> results;
3296	constexpr size_t numRuns = `10`;
3297	for (const auto& _ : c10::irange(numRuns)) {
3298	auto stack = createStack({x.clone(), y.clone()});
3299	fn.run(stack);
3300	results.push_back(stack.back());
3301	}
3302	for (const auto& i : c10::irange(`1`, numRuns)) {
3303	auto t0 = results [`0`].toTensor();
3304	auto ti = results [i].toTensor();
3305	ASSERT_TRUE(at::allclose(t0, ti));
3306	}
3307	};
3308
3309	constexpr size_t kNumThreads = `4`;
3310	std::vector<std::thread> threads;
3311	for (size_t id = `0`; id < kNumThreads; ++id) {
3312	threads.emplace_back(run_kernel);
3313	}
3314	for (auto& t : threads) {
3315	t.join();
3316	}
3317	}
3318
3319	// Repro of issue #1655
3320	TEST_F(NVFuserTest, FusionIncompleteConcreteID_CUDA) {
3321	Fusion fusion;
3322	FusionGuard fg(&fusion);
3323
3324	auto tv0 = makeSymbolicTensor(`1`);
3325	fusion.addInput(tv0);
3326	auto tv1 = makeSymbolicTensor(`2`);
3327	fusion.addInput(tv1);
3328	auto tv2 = makeSymbolicTensor(`2`);
3329	fusion.addInput(tv2);
3330
3331	auto tv3 = broadcast(tv0, {true, true, false});
3332	auto tv4 = broadcast(tv1, {false, true, false});
3333	auto tv5 = broadcast(tv2, {true, false, false});
3334
3335	auto tv6 = add(tv3, tv4);
3336	auto tv7 = add(tv3, tv5);
3337
3338	fusion.addOutput(tv6);
3339	fusion.addOutput(tv7);
3340
3341	tv6->merge(`0`);
3342	tv6->merge(`0`);
3343
3344	TransformPropagatorWithCheck propagator(tv6);
3345	MaxRootDomainInfoSpanningTree (tv6).traverse(&propagator);
3346
3347	tv0->computeAt(tv6, -`1`, ComputeAtMode::MostInlined);
3348	tv1->computeAt(tv6, -`1`, ComputeAtMode::MostInlined);
3349	tv2->computeAt(tv7, -`1`, ComputeAtMode::MostInlined);
3350
3351	// NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
3352	ASSERT_ANY_THROW(fusion.printKernel());
3353	}
3354
3355	TEST_F(NVFuserTest, FusionTestReEntrantGridWelford_CUDA) {
3356	std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
3357	Fusion& fusion = *fusion_ptr.get();
3358	FusionGuard fg(&fusion);
3359
3360	int X = `256`, Y = `7`, Z = `2048`;
3361
3362	// setup fusion
3363	auto tv0 = makeContigTensor(`4`, DataType::Half);
3364	fusion.addInput(tv0);
3365	auto tv1 = castOp(DataType::Float, tv0);
3366
3367	auto tvs = Welford(tv1, {`0`, `1`, `2`});
3368	auto tv_avg = tvs.avg;
3369	auto tv_M2 = tvs.var_sum;
3370	auto tv_N = tvs.n;
3371	fusion.addOutput(tv_avg);
3372	fusion.addOutput(tv_M2);
3373
3374	auto cached_input = tv0->cacheAfter();
3375	auto cached_avg = tv_avg->cacheBefore();
3376	auto cached_M2 = tv_M2->cacheBefore();
3377
3378	auto reduction_tv = scheduler_utils::getReductionTvs(&fusion)[`0`];
3379
3380	reduction_tv->merge(`0`);
3381	reduction_tv->merge(`0`);
3382
3383	int TIDx = `16`;
3384	int vec = `4`;
3385
3386	int TIDy = `16`;
3387	int outer_tidy_fact = `16`;
3388
3389	reduction_tv->split(-`1`, TIDx * vec);
3390	reduction_tv->split(-`1`, vec);
3391	reduction_tv->axis(-`2`)->parallelize(ParallelType::TIDx);
3392	reduction_tv->axis(-`1`)->parallelize(ParallelType::Vectorize);
3393	reduction_tv->axis(-`3`)->parallelize(ParallelType::BIDx);
3394
3395	reduction_tv->split(`0`, TIDy);
3396	reduction_tv->axis(`1`)->parallelize(ParallelType::TIDy);
3397	reduction_tv->split(`0`, outer_tidy_fact);
3398	reduction_tv->axis(`0`)->parallelize(ParallelType::BIDy);
3399
3400	// T2_g[ rblockIdx.y, rS{16}, rthreadIdx.y, iblockIdx.x, ithreadIdx.x24,
3401	// iV25{4} ]
3402	reduction_tv->reorder({{`3`, `0`}, {`4`, `1`}, {`0`, `2`}, {`2`, `3`}, {`1`, `4`}, {`5`, `5`}});
3403	// T2_g[iblockIdx.x, ithreadIdx.x24, rblockIdx.y, rthreadIdx.y, rS{16},
3404	// iV25{4}]
3405
3406	TransformPropagatorWithCheck propagator(reduction_tv);
3407	MaxRootDomainInfoSpanningTree (reduction_tv).traverse(&propagator);
3408	auto rfactor_tv = ir_utils::rfactorHelper(reduction_tv, {`4`});
3409	scheduler_utils::parallelizeAllLike(rfactor_tv);
3410
3411	tv0->computeAt(tv_avg, `2`);
3412	tv0->computeAt(cached_input, -`2`);
3413
3414	cached_input->computeAt(rfactor_tv, `4`, ComputeAtMode::BestEffort);
3415
3416	for (auto tv : ir_utils::allTvs(&fusion)) {
3417	if (tv == cached_input \|\| tv == tv_avg \|\| tv == tv_M2) {
3418	continue;
3419	}
3420	tv->axis(-`1`)->parallelize(ParallelType::Serial);
3421	}
3422
3423	FusionExecutor fe;
3424	fe.compileFusion(&fusion, {}, LaunchParams ());
3425
3426	auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, `0`);
3427	at::Tensor t0 = at::randn({X, Y, Y, Z}, options);
3428
3429	auto cg_outputs = fe.runFusion({t0}, LaunchParams (-`1`, -`1`, -`1`, -`1`, -`1`, -`1`));
3430
3431	// by default Welford outputs sum of square diff so need to divide to get var
3432	cg_outputs [`1`] = cg_outputs [`1`].div((float)(X * Y * Y));
3433
3434	auto at_mu = at::mean(t0.to(at::kDouble), {`0`, `1`, `2`});
3435	auto at_var = at::var(t0.to(at::kDouble), {`0`, `1`, `2`}, false);
3436
3437	testValidate(
3438	&fusion,
3439	cg_outputs,
3440	{t0},
3441	{at_mu, at_var},
3442	__LINE__,
3443	__FILE__,
3444	"",
3445	LaunchParams (-`1`, -`1`, -`1`, -`1`, -`1`, -`1`));
3446	}
3447
3448	// Test sync insertion with redundant predicates
3449	TEST_F(NVFuserTest, FusionRedundantPredSync_CUDA) {
3450	Fusion fusion;
3451	FusionGuard fg(&fusion);
3452
3453	TensorView* tv0 = makeConcreteTensor({`32`});
3454	TensorView* tv1 = makeConcreteTensor({`32`, `32`});
3455	fusion.addInput(tv0);
3456	fusion.addInput(tv1);
3457
3458	auto tv2 = broadcast(tv0, {true, false});
3459	auto tv3 = add(tv2, tv1);
3460
3461	fusion.addOutput(tv3);
3462
3463	auto tv0c = tv0->cacheAfter();
3464
3465	// Make a redundant write through smem
3466	tv0c->setMemoryType(MemoryType::Shared);
3467
3468	tv0->computeAt(tv3, `0`);
3469	tv1->computeAt(tv3, `0`);
3470
3471	tv0c->axis(`0`)->parallelize(ParallelType::TIDx);
3472	tv2->axis(`0`)->parallelize(ParallelType::TIDy);
3473	tv2->axis(`1`)->parallelize(ParallelType::TIDx);
3474
3475	tv3->axis(`0`)->parallelize(ParallelType::TIDy);
3476	tv3->axis(`1`)->parallelize(ParallelType::TIDx);
3477
3478	GpuLower gpulw(&fusion);
3479	auto flattened_exprs =
3480	ir_utils::flattenScopedExprs(gpulw.kernel()->topLevelExprs());
3481	bool sync_inserted = std::any_of(
3482	flattened_exprs.begin(), flattened_exprs.end(), [](Expr* expr) {
3483	return expr->isA<kir::BlockSync>();
3484	});
3485	TORCH_INTERNAL_ASSERT(sync_inserted, "Expected block sync not inserted");
3486
3487	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
3488
3489	at::Tensor t0 = at::randn({`32`}, options);
3490	at::Tensor t1 = at::randn({`32`, `32`}, options);
3491
3492	FusionExecutor fe;
3493	fe.compileFusion(&fusion, {t0, t1});
3494	auto cg_outputs = fe.runFusion({t0, t1});
3495
3496	auto ref = t0 + t1;
3497
3498	testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
3499	}
3500
3501	// Test case for removing syncs on chain of redundant uses.
3502	TEST_F(NVFuserTest, FusionRedundantPredSync2_CUDA) {
3503	Fusion fusion;
3504	FusionGuard fg(&fusion);
3505
3506	TensorView* tv0 = makeConcreteTensor({`32`});
3507	TensorView* tv1 = makeConcreteTensor({`32`, `32`});
3508	fusion.addInput(tv0);
3509	fusion.addInput(tv1);
3510
3511	auto tv2 = broadcast(tv0, {true, false});
3512	auto tv3 = add(tv2, tv1);
3513
3514	fusion.addOutput(tv3);
3515
3516	auto tv0c = tv0->cacheAfter();
3517
3518	// Make a redundant write through smem
3519	tv0c->setMemoryType(MemoryType::Shared);
3520	tv2->setMemoryType(MemoryType::Shared);
3521
3522	tv0->computeAt(tv3, `0`);
3523	tv1->computeAt(tv3, `0`);
3524
3525	tv0c->axis(`0`)->parallelize(ParallelType::TIDx);
3526	tv2->axis(`0`)->parallelize(ParallelType::TIDy);
3527	tv2->axis(`1`)->parallelize(ParallelType::TIDx);
3528
3529	tv3->axis(`0`)->parallelize(ParallelType::TIDy);
3530	tv3->axis(`1`)->parallelize(ParallelType::TIDx);
3531
3532	// Utility class to make sure one block sync
3533	// is inserted by RAW pass.
3534	class SyncChecker : public kir::IrVisitor {
3535	public:
3536	using kir::IrVisitor::handle;
3537	int result() {
3538	return sync_seen_;
3539	}
3540
3541	private:
3542	void handle(kir::BlockSync*) final {
3543	sync_seen_++;
3544	}
3545
3546	private:
3547	int sync_seen_ = `0`;
3548	} checker;
3549
3550	GpuLower gpulw(&fusion);
3551	checker.handle(gpulw.kernel()->topLevelExprs());
3552	TORCH_INTERNAL_ASSERT(
3553	checker.result() < `2`, "More syncs were inserted than expected");
3554
3555	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
3556
3557	at::Tensor t0 = at::randn({`32`}, options);
3558	at::Tensor t1 = at::randn({`32`, `32`}, options);
3559
3560	FusionExecutor fe;
3561	fe.compileFusion(&fusion, {t0, t1});
3562	auto cg_outputs = fe.runFusion({t0, t1});
3563
3564	auto ref = t0 + t1;
3565
3566	testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
3567	}
3568
3569	// Test case for sync insertion after redundant predicated smem write
3570	// Check that syncs are removed only when all paths are redundant.
3571	TEST_F(NVFuserTest, FusionRedundantPredSync3_CUDA) {
3572	Fusion fusion;
3573	FusionGuard fg(&fusion);
3574
3575	TensorView* tv0 = makeConcreteTensor({`32`});
3576	TensorView* tv1 = makeConcreteTensor({`32`, `32`});
3577	fusion.addInput(tv0);
3578	fusion.addInput(tv1);
3579
3580	auto tv2 = broadcast(tv0, {true, false});
3581	auto tv3 = set(tv2);
3582	auto tv4 = add(tv3, tv1);
3583	auto tv5 = add(tv2, tv1);
3584
3585	fusion.addOutput(tv4);
3586	fusion.addOutput(tv5);
3587
3588	auto tv0c = tv0->cacheAfter();
3589
3590	// In this scheduling config,
3591	// tv0c -> tv2 -> tv3 is a redundant path for tidy
3592	// tv0c -> tv2 -> tv5 is not.
3593	// So we need a RAW sync in tv0c->tv2 to make sure
3594	// tv2 has the correct value to produce tv5.
3595	tv0c->setMemoryType(MemoryType::Shared);
3596	tv3->setMemoryType(MemoryType::Shared);
3597
3598	tv0c->axis(`0`)->parallelize(ParallelType::TIDx);
3599	tv2->axis(`0`)->parallelize(ParallelType::TIDy);
3600	tv2->axis(`1`)->parallelize(ParallelType::TIDx);
3601
3602	tv3->axis(`0`)->parallelize(ParallelType::TIDy);
3603	tv3->axis(`1`)->parallelize(ParallelType::TIDx);
3604
3605	tv5->axis(`0`)->parallelize(ParallelType::TIDy);
3606	tv5->axis(`1`)->parallelize(ParallelType::TIDx);
3607
3608	// Utility class to make sure one block sync
3609	// is inserted by RAW pass.
3610	class SyncChecker : public kir::IrVisitor {
3611	public:
3612	using kir::IrVisitor::handle;
3613	int result() {
3614	return sync_seen_;
3615	}
3616
3617	private:
3618	void handle(kir::BlockSync* sync) final {
3619	if (!sync->isWarHazardSync()) {
3620	sync_seen_++;
3621	}
3622	}
3623
3624	private:
3625	int sync_seen_ = `0`;
3626	} checker;
3627
3628	GpuLower gpulw(&fusion);
3629	checker.handle(gpulw.kernel()->topLevelExprs());
3630
3631	// This is implicit checking. There are exactly 2 places
3632	// where RAW hazards happen: one producing tv2 and the other
3633	// producing tv3. This test case expect syncs in both of
3634	// these places so we check that 2 RAW syncs are inserted.
3635	TORCH_INTERNAL_ASSERT(
3636	checker.result() == `2`,
3637	"Exactly 2 RAW sync expected for the two shared memory transfers");
3638
3639	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
3640
3641	at::Tensor t0 = at::randn({`32`}, options);
3642	at::Tensor t1 = at::randn({`32`, `32`}, options);
3643
3644	FusionExecutor fe;
3645	fe.compileFusion(&fusion, {t0, t1});
3646	auto cg_outputs = fe.runFusion({t0, t1});
3647
3648	auto ref = t0 + t1;
3649
3650	testValidate(&fusion, cg_outputs, {t0, t1}, {ref, ref}, __LINE__, __FILE__);
3651	}
3652
3653	// Unit test case for detecting thread redundant usage of shared tensors.
3654	TEST_F(NVFuserTest, FusionRedundantUseCheck_CUDA) {
3655	Fusion fusion;
3656	FusionGuard fg(&fusion);
3657
3658	TensorView* tv0 = makeConcreteTensor({`32`, `32`});
3659	fusion.addInput(tv0);
3660
3661	auto tv1 = set(tv0);
3662	auto tv2 = set(tv1);
3663	auto tv3 = set(tv2);
3664	auto tv4 = set(tv3);
3665
3666	auto tv5 = set(tv4);
3667
3668	auto tv6 = set(tv4);
3669	auto tv7 = set(tv6);
3670
3671	fusion.addOutput(tv5);
3672	fusion.addOutput(tv7);
3673
3674	tv2->setMemoryType(MemoryType::Shared);
3675	tv4->setMemoryType(MemoryType::Shared);
3676
3677	tv7->axis(-`1`)->parallelize(ParallelType::TIDx);
3678
3679	// Thread pred map cannot be built without an active lower
3680	// object. So would need to lower the whole fusion for
3681	// testing. However, lower also keeps an copy of the fusion
3682	// so the original pointers cannot be used to querry the
3683	// thread pred map. So have to traverse the new expr list
3684	// to find the pointers;
3685	GpuLower gpulw(&fusion);
3686
3687	TensorView lowered_tv2 = nullptr, lowered_tv4 = nullptr;
3688	auto used_vals = gpulw.kernel()->usedMathVals();
3689
3690	for (auto tv : ir_utils::filterByType<TensorView>(used_vals)) {
3691	if (tv->name() == `2`) {
3692	lowered_tv2 = tv;
3693	}
3694	if (tv->name() == `4`) {
3695	lowered_tv4 = tv;
3696	}
3697	}
3698
3699	TORCH_INTERNAL_ASSERT(
3700	lowered_tv2 != nullptr && lowered_tv4 != nullptr,
3701	"tv2 or tv4 not lowered or mangled");
3702
3703	auto tv2_info = gpulw.threadPredMap().getPredicateInfo(lowered_tv2);
3704	auto tv4_info = gpulw.threadPredMap().getPredicateInfo(lowered_tv4);
3705
3706	// tv2 -> tv3 -> tv4 (shared) is the only use chain for tv2,
3707	// and tv4 is redundantly written in tidx so tv2 is redundantly
3708	// consumed in tidx.
3709	TORCH_INTERNAL_ASSERT(
3710	tv2_info.redundant_use_types.get(ParallelType::TIDx),
3711	"TV2 is redundantly used but not detected.");
3712
3713	// tv4->tv5 (global) is a redundant use chain, but
3714	// tv4->tv6->tv7 is not, so tv4 should not be detected as
3715	// a redundant used tensor in tidx.
3716	TORCH_INTERNAL_ASSERT(
3717	!tv4_info.redundant_use_types.get(ParallelType::TIDx),
3718	"TV4 is not redundantly used but not detected.");
3719	}
3720
3721	// Test a basic swizzle pattern
3722	TEST_F(NVFuserTest, FusionSimpleSwizzle0_CUDA) {
3723	Fusion fusion;
3724	FusionGuard fg(&fusion);
3725
3726	auto tv0 = makeConcreteTensor({`2`, `32`});
3727	fusion.addInput(tv0);
3728
3729	auto tv1 = add(tv0, IrBuilder::create<Double>(`1`));
3730	auto tv2 = add(tv1, IrBuilder::create<Double>(`1`));
3731
3732	fusion.addOutput(tv2);
3733
3734	// Make a 2x8 Zshape tile
3735	tv1->split(-`1`, `16`);
3736	tv1->split(-`1`, `8`);
3737	// [O, 2, 8]
3738
3739	tv2->split(-`1`, `16`);
3740	tv2->split(-`1`, `4`);
3741	//[O, 4, 4]
3742
3743	tv1->computeAt(tv2, `1`);
3744	tv1->swizzle(Swizzle2DType::ZShape, -`2`, -`1`);
3745
3746	FusionExecutor fe;
3747	fe.compileFusion(&fusion);
3748
3749	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
3750	auto t0 = at::randn({`2`, `32`}, options);
3751	auto t2 = t0 + `2.0`;
3752	auto cg_outputs = fe.runFusion({t0});
3753
3754	testValidate(&fusion, cg_outputs, {t0}, {t2}, __LINE__, __FILE__);
3755	}
3756
3757	// Test swizzle inlining
3758	TEST_F(NVFuserTest, FusionSimpleSwizzle1_CUDA) {
3759	Fusion fusion;
3760	FusionGuard fg(&fusion);
3761
3762	auto tv0 = makeConcreteTensor({`2`, `32`});
3763	fusion.addInput(tv0);
3764
3765	auto tv1 = add(tv0, IrBuilder::create<Double>(`1`));
3766	auto tv2 = add(tv1, IrBuilder::create<Double>(`1`));
3767	auto tv3 = add(tv2, IrBuilder::create<Double>(`1`));
3768
3769	fusion.addOutput(tv3);
3770
3771	// Make a 2x8 Zshape tile
3772	tv2->split(-`1`, `16`);
3773	tv2->split(-`1`, `8`);
3774	// [O, 2, 8]
3775
3776	tv3->split(-`1`, `16`);
3777	tv3->split(-`1`, `4`);
3778	//[O, 4, 4]
3779
3780	tv2->computeAt(tv3, `1`);
3781	tv2->swizzle(Swizzle2DType::ZShape, -`2`, -`1`);
3782
3783	// Inlining a producer into a swizzled consumer is ok
3784	tv1->computeAt(tv2, -`1`);
3785
3786	FusionExecutor fe;
3787	fe.compileFusion(&fusion);
3788
3789	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
3790	auto t0 = at::randn({`2`, `32`}, options);
3791	auto t3 = t0 + `3.0`;
3792	auto cg_outputs = fe.runFusion({t0});
3793
3794	testValidate(&fusion, cg_outputs, {t0}, {t3}, __LINE__, __FILE__);
3795	}
3796
3797	// Test sync insertion and memory check in parallelized swizzles.
3798	// In this test, data is parallel written into smem in zcurve
3799	// pattern and then read out and output to global mem unswizzled.
3800	TEST_F(NVFuserTest, FusionSimpleSwizzle2_CUDA) {
3801	Fusion fusion;
3802	FusionGuard fg(&fusion);
3803
3804	auto tv0 = makeConcreteTensor({`32`, `32`});
3805	fusion.addInput(tv0);
3806
3807	auto tv1 = add(tv0, IrBuilder::create<Double>(`1`));
3808	auto tv2 = add(tv1, IrBuilder::create<Double>(`1`));
3809
3810	fusion.addOutput(tv2);
3811
3812	tv1->swizzle(Swizzle2DType::ZShape, -`2`, -`1`);
3813
3814	tv1->axis(`0`)->parallelize(ParallelType::TIDx);
3815	tv1->axis(`1`)->parallelize(ParallelType::TIDy);
3816
3817	tv2->axis(`0`)->parallelize(ParallelType::TIDx);
3818	tv2->axis(`1`)->parallelize(ParallelType::TIDy);
3819
3820	// Validation should fail since TV1 is not in shared
3821	// memory as required by sync info pass.
3822	ASSERT_ANY_THROW(GpuLower gpulw_throw(&fusion));
3823
3824	tv1->setMemoryType(MemoryType::Shared);
3825
3826	// Make sure that a sync is inserted:
3827	bool sync_found = false;
3828	GpuLower gpu_lw(&fusion);
3829	auto flattened_exps =
3830	ir_utils::flattenScopedExprs(gpu_lw.kernel()->topLevelExprs());
3831
3832	for (auto expr : flattened_exps) {
3833	if (expr->isA<kir::BlockSync>()) {
3834	sync_found = true;
3835	}
3836	// Will require a sync thread before any shared memory read.
3837	for (auto inp_tv : ir_utils::filterByType<TensorView>(expr->inputs())) {
3838	if (inp_tv->getMemoryType() == MemoryType::Shared) {
3839	TORCH_INTERNAL_ASSERT(
3840	sync_found, "Block sync required but not inserted");
3841	}
3842	}
3843	}
3844
3845	FusionExecutor fe;
3846	fe.compileFusion(&fusion);
3847
3848	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
3849	auto t0 = at::randn({`32`, `32`}, options);
3850	auto t2 = t0 + `2.0`;
3851	auto cg_outputs = fe.runFusion({t0});
3852
3853	testValidate(&fusion, cg_outputs, {t0}, {t2}, __LINE__, __FILE__);
3854	}
3855
3856	// Test BestEffortReplay behavior with swizzle op
3857	TEST_F(NVFuserTest, FusionSwizzleMapping_CUDA) {
3858	Fusion fusion;
3859	FusionGuard fg(&fusion);
3860
3861	auto tv0 = makeConcreteTensor({`2`, `32`});
3862	fusion.addInput(tv0);
3863
3864	auto tv1 = add(tv0, IrBuilder::create<Double>(`1`));
3865	auto tv2 = add(tv1, IrBuilder::create<Double>(`1`));
3866	auto tv3 = add(tv2, IrBuilder::create<Double>(`1`));
3867
3868	fusion.addOutput(tv3);
3869
3870	// Make a 2x8 Zshape tile
3871	tv2->split(-`1`, `16`);
3872	tv2->split(-`1`, `8`);
3873	// [O, 2, 8]
3874
3875	tv3->split(-`1`, `16`);
3876	tv3->split(-`1`, `4`);
3877	//[O, 4, 4]
3878
3879	tv2->computeAt(tv3, `1`);
3880	tv2->swizzle(Swizzle2DType::ZShape, -`2`, -`1`);
3881
3882	// Inlining a producer into a swizzled consumer is ok
3883	tv1->computeAt(tv2, -`1`);
3884
3885	// Check BestEffortReplay behavior with skip swizzles option on.
3886	PairwiseRootDomainMap root_map(tv1, tv2);
3887
3888	// Check producer to consumer map,
3889	// i.e. unswizzled tensor to swizzled tensor map
3890	//----------------------------------------------------------
3891	auto p2c = BestEffortReplay::replayCasP(tv2, tv1, -`1`, root_map).getReplay();
3892	auto swizzle_x_it0 = p2c.find(tv1->axis(-`2`));
3893	auto swizzle_y_it0 = p2c.find(tv1->axis(-`1`));
3894	// P2C map should exist and both the x and y map should
3895	// map to the output of the swizzle op.
3896	TORCH_INTERNAL_ASSERT(
3897	swizzle_x_it0 != p2c.end() && swizzle_y_it0 != p2c.end());
3898	TORCH_INTERNAL_ASSERT(
3899	swizzle_x_it0 ->second == tv2->axis(-`2`) &&
3900	swizzle_y_it0 ->second == tv2->axis(-`1`));
3901
3902	// Check consumer to producer map,
3903	// i.e. swizzled tensor to unswizzled tensor map
3904	//----------------------------------------------------------
3905	auto c2p = BestEffortReplay::replayPasC(tv1, tv2, -`1`, root_map).getReplay();
3906
3907	auto swizzle_op = tv2->axis(-`1`)->definition()->as<Swizzle2D>();
3908
3909	// Find mapping for swizzle inputs
3910	auto swizzle_x_it1 = c2p.find(swizzle_op->inX());
3911	auto swizzle_y_it1 = c2p.find(swizzle_op->inY());
3912
3913	// Find mapping for swizzle outputs
3914	auto swizzle_x_it2 = c2p.find(swizzle_op->outX());
3915	auto swizzle_y_it2 = c2p.find(swizzle_op->outY());
3916
3917	// Input of swizzle ops will not be mapped to any
3918	// by BestEffortReplay, as BestEffortReplay has to be
3919	// one to one. IdGraph will further map them together.
3920	TORCH_INTERNAL_ASSERT(
3921	swizzle_x_it1 == c2p.end() && swizzle_y_it1 == c2p.end());
3922
3923	// Mapping for swizzle outputs should be mapped and should
3924	// also map to the corresponding axes on the unswizzled tensor.
3925	TORCH_INTERNAL_ASSERT(
3926	swizzle_x_it2 != c2p.end() && swizzle_y_it2 != c2p.end());
3927	TORCH_INTERNAL_ASSERT(
3928	swizzle_x_it2 ->second == tv1->axis(-`2`) &&
3929	swizzle_y_it2 ->second == tv1->axis(-`1`));
3930
3931	// Check id graph behavior
3932	//----------------------------------------------------------
3933	ComputeAtMap ca_map(&fusion);
3934	// Corresponding inputs and outputs of swizzle ops are
3935	// map through by exact and permissive map.
3936	TORCH_INTERNAL_ASSERT(
3937	ca_map.areMapped(tv1->axis(-`2`), swizzle_op->inX(), IdMappingMode::EXACT));
3938	TORCH_INTERNAL_ASSERT(
3939	ca_map.areMapped(tv1->axis(-`1`), swizzle_op->inY(), IdMappingMode::EXACT));
3940	TORCH_INTERNAL_ASSERT(ca_map.areMapped(
3941	tv1->axis(-`2`), swizzle_op->outX(), IdMappingMode::EXACT));
3942	TORCH_INTERNAL_ASSERT(ca_map.areMapped(
3943	tv1->axis(-`1`), swizzle_op->outY(), IdMappingMode::EXACT));
3944
3945	TORCH_INTERNAL_ASSERT(ca_map.areMapped(
3946	tv1->axis(-`2`), swizzle_op->inX(), IdMappingMode::PERMISSIVE));
3947	TORCH_INTERNAL_ASSERT(ca_map.areMapped(
3948	tv1->axis(-`1`), swizzle_op->inY(), IdMappingMode::PERMISSIVE));
3949	TORCH_INTERNAL_ASSERT(ca_map.areMapped(
3950	tv1->axis(-`2`), swizzle_op->outX(), IdMappingMode::PERMISSIVE));
3951	TORCH_INTERNAL_ASSERT(ca_map.areMapped(
3952	tv1->axis(-`1`), swizzle_op->outY(), IdMappingMode::PERMISSIVE));
3953	}
3954
3955	// Test a basic loop swizzle pattern
3956	TEST_F(NVFuserTest, FusionLoopSwizzle0_CUDA) {
3957	Fusion fusion;
3958	FusionGuard fg(&fusion);
3959
3960	auto tv0 = makeConcreteTensor({`2`, `32`});
3961	fusion.addInput(tv0);
3962
3963	auto tv1 = add(tv0, IrBuilder::create<Double>(`1`));
3964	auto tv2 = add(tv1, IrBuilder::create<Double>(`1`));
3965
3966	fusion.addOutput(tv2);
3967
3968	tv2->split(-`1`, `16`);
3969	tv2->split(-`1`, `4`);
3970	//[O, 4, 4]
3971
3972	tv2->swizzle(Swizzle2DType::ZShape, -`2`, -`1`, SwizzleMode::Loop);
3973
3974	tv0->computeAt(tv2, -`1`);
3975
3976	FusionExecutor fe;
3977	fe.compileFusion(&fusion);
3978
3979	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
3980	auto t0 = at::randn({`2`, `32`}, options);
3981	auto t2 = t0 + `2.0`;
3982	auto cg_outputs = fe.runFusion({t0});
3983
3984	testValidate(&fusion, cg_outputs, {t0}, {t2}, __LINE__, __FILE__);
3985	}
3986
3987	// Outer block zshape pattern
3988	TEST_F(NVFuserTest, FusionLoopSwizzle1_CUDA) {
3989	Fusion fusion;
3990	FusionGuard fg(&fusion);
3991
3992	auto tv0 = makeContigTensor(`2`);
3993	fusion.addInput(tv0);
3994
3995	auto tv1 = add(tv0, IrBuilder::create<Double>(`1`));
3996	auto tv2 = add(tv1, IrBuilder::create<Double>(`1`));
3997
3998	fusion.addOutput(tv2);
3999
4000	tv2->split(-`2`, `8`);
4001	tv2->split(-`1`, `4`);
4002	//[I0o, I0i, I1o, I1i]
4003	tv2->reorder({{`1`, `2`}, {`2`, `1`}});
4004	//[I0o, I1o, I0i, I1i]
4005
4006	tv2->swizzle(Swizzle2DType::ZShape, `0`, `1`, SwizzleMode::Loop);
4007	tv0->computeAt(tv2, -`1`);
4008
4009	tv2->axis(`0`)->parallelize(ParallelType::BIDx);
4010	tv2->axis(`1`)->parallelize(ParallelType::BIDy);
4011
4012	FusionExecutor fe;
4013	fe.compileFusion(&fusion);
4014
4015	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
4016	auto t0 = at::randn({`45`, `77`}, options);
4017	auto t2 = t0 + `2.0`;
4018	auto cg_outputs = fe.runFusion({t0});
4019
4020	testValidate(&fusion, cg_outputs, {t0}, {t2}, __LINE__, __FILE__);
4021	}
4022
4023	// Test assertion in unsupported pattern: non-leaf loop swizzle.
4024	TEST_F(NVFuserTest, FusionLoopSwizzleCheck0_CUDA) {
4025	Fusion fusion;
4026	FusionGuard fg(&fusion);
4027
4028	auto tv0 = makeConcreteTensor({`2`, `32`});
4029	fusion.addInput(tv0);
4030
4031	auto tv1 = add(tv0, IrBuilder::create<Double>(`1`));
4032	auto tv2 = add(tv1, IrBuilder::create<Double>(`1`));
4033
4034	fusion.addOutput(tv2);
4035
4036	tv2->split(-`1`, `16`);
4037	tv2->split(-`1`, `4`);
4038	//[O, 4, 4]
4039
4040	// Swizzle the inner tile.
4041	tv2->swizzle(Swizzle2DType::ZShape, -`2`, -`1`, SwizzleMode::Loop);
4042
4043	// Make swizzle output not a leaf domain.
4044	tv2->merge(-`2`);
4045
4046	tv0->computeAt(tv2, -`1`);
4047
4048	FusionExecutor fe;
4049	ASSERT_ANY_THROW(fe.compileFusion(&fusion));
4050	}
4051
4052	// Test assertion in unsupported pattern: half-inlined loop swizzle.
4053	TEST_F(NVFuserTest, FusionLoopSwizzleCheck1_CUDA) {
4054	Fusion fusion;
4055	FusionGuard fg(&fusion);
4056
4057	auto tv0 = makeConcreteTensor({`2`, `32`});
4058	fusion.addInput(tv0);
4059
4060	auto tv1 = add(tv0, IrBuilder::create<Double>(`1`));
4061	auto tv2 = add(tv1, IrBuilder::create<Double>(`1`));
4062	auto tv3 = add(tv2, IrBuilder::create<Double>(`1`));
4063
4064	fusion.addOutput(tv3);
4065
4066	//[O, 4, 4]
4067	tv2->split(-`1`, `16`);
4068	tv2->split(-`1`, `4`);
4069
4070	//[O, 4, 4]
4071	tv3->split(-`1`, `16`);
4072	tv3->split(-`1`, `4`);
4073
4074	// Swizzle inner tile of tv2
4075	tv2->swizzle(Swizzle2DType::ZShape, -`2`, -`1`, SwizzleMode::Loop);
4076
4077	// Make tv2 swizzled and partially-inlined (unsupported).
4078	tv0->computeAt(tv3, -`2`);
4079
4080	FusionExecutor fe;
4081	ASSERT_ANY_THROW(fe.compileFusion(&fusion));
4082	}
4083
4084	TEST_F(NVFuserTest, FusionUnsqueeze1_CUDA) {
4085	Fusion fusion;
4086	FusionGuard fg(&fusion);
4087
4088	std::vector<int64_t> shape({`10`, `11`});
4089
4090	auto tv0 = makeConcreteTensor(shape);
4091	fusion.addInput(tv0);
4092
4093	// [I, R]
4094	auto tv1 = sum(tv0, {`1`});
4095	// [I, B]
4096	auto tv2 = unsqueeze(tv1, -`1`);
4097	fusion.addOutput(tv2);
4098
4099	TORCH_CHECK(
4100	tv2->nDims() == `2`, "Unexpected unsqueeze result: ", tv2->toString());
4101	TORCH_CHECK(
4102	tv2->axis(`1`)->isBroadcast(),
4103	"Unexpected unsqueeze result: ",
4104	tv2->toString());
4105
4106	// tv1 has only one non-reduction axis. An exception should be
4107	// thrown.
4108	// NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
4109	ASSERT_ANY_THROW(unsqueeze(tv1, `2`));
4110
4111	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
4112	at::Tensor t0 = at::randn({`10`, `11`}, options);
4113	std::vector<IValue> aten_inputs = {t0};
4114
4115	FusionExecutor fe;
4116	fe.compileFusion(&fusion, aten_inputs);
4117	auto cg_outputs = fe.runFusion(aten_inputs);
4118
4119	auto ref = t0.sum(`1`).unsqueeze(-`1`);
4120
4121	testValidate(&fusion, cg_outputs, aten_inputs, {ref}, __LINE__, __FILE__);
4122	}
4123
4124	TEST_F(NVFuserTest, FusionSqueeze1_CUDA) {
4125	Fusion fusion;
4126	FusionGuard fg(&fusion);
4127
4128	std::vector<int64_t> shape({`10`, `11`});
4129
4130	auto tv0 = makeConcreteTensor(shape);
4131	fusion.addInput(tv0);
4132
4133	// [I, B]
4134	auto tv1 = sum(tv0, {`1`}, true);
4135	// [I]
4136	auto tv2 = squeeze(tv1, {shape [`0`], `1`});
4137	fusion.addOutput(tv2);
4138
4139	TORCH_CHECK(
4140	tv2->nDims() == `2`, "Unexpected squeeze result: ", tv2->toString());
4141
4142	// [I, R]
4143	auto tv3 = sum(tv0, {`1`});
4144	// tv3 has only one non-reduction axis. The extent of the first axis
4145	// is not one, so squeeze should fail.
4146	// NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
4147	ASSERT_ANY_THROW(squeeze(tv3, {shape[`0`], `1`}));
4148
4149	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
4150	at::Tensor t0 = at::randn({`10`, `11`}, options);
4151	std::vector<IValue> aten_inputs = {t0};
4152
4153	FusionExecutor fe;
4154	fe.compileFusion(&fusion, aten_inputs);
4155	auto cg_outputs = fe.runFusion(aten_inputs);
4156
4157	auto ref = t0.sum(`1`, true).squeeze(-`1`);
4158
4159	testValidate(&fusion, cg_outputs, aten_inputs, {ref}, __LINE__, __FILE__);
4160	}
4161
4162	TEST_F(NVFuserTest, FusionContigPredicate_CUDA) {
4163	Fusion fusion;
4164	FusionGuard fg(&fusion);
4165
4166	auto tv0 = makeSymbolicTensor(`2`);
4167	fusion.addInput(tv0);
4168	auto tv1 = set(tv0);
4169	auto tv2 = broadcast(tv1, {false, true, false});
4170	fusion.addOutput(tv2);
4171
4172	tv2->merge(-`2`, -`1`);
4173	tv2->merge(-`2`, -`1`);
4174	tv2->split(-`1`, `100`);
4175	tv0->computeAt(tv2, -`1`);
4176
4177	GpuLower gpulw(&fusion);
4178	TORCH_CHECK(PredicatedChecker::isPredicated(tv1, gpulw));
4179
4180	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
4181	at::Tensor t0 = at::randn({`3`, `4`}, options);
4182
4183	FusionExecutor fe;
4184	fe.compileFusion(&fusion, {t0});
4185	auto cg_outputs = fe.runFusion({t0});
4186
4187	auto ref = t0.unsqueeze(`1`);
4188
4189	testValidate(fe.kernel(), cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
4190	}
4191
4192	// Repro of https://github.com/csarofeen/pytorch/issues/1777
4193	TEST_F(NVFuserTest, FusionDivScalarLhs_CUDA) {
4194	// tv1 = 2.0 / tv0
4195	Fusion fusion;
4196	FusionGuard fg(&fusion);
4197
4198	TensorView* tv0 = makeSymbolicTensor(`2`);
4199	fusion.addInput(tv0);
4200	TensorView* tv1 = div(IrBuilder::create<Double>(`2.0`), tv0);
4201	fusion.addOutput(tv1);
4202
4203	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
4204	auto t0 = at::randn({`3`, `3`}, options);
4205	// There's no overload div(Scalar, Tensor) in ATen
4206	auto aten_output = at::div(
4207	at::native::wrapped_scalar_tensor(at::Scalar(`2.0`), options.device()), t0);
4208
4209	FusionExecutor fe;
4210	fe.compileFusion(&fusion, {t0});
4211	auto cg_outputs = fe.runFusion({t0});
4212
4213	testValidate(&fusion, cg_outputs, {t0}, {aten_output}, __LINE__, __FILE__);
4214	}
4215
4216	// Repro of an issue of the reduction scheduler with a broadcast
4217	// domain concretized to multiple domains that are not proven to have
4218	// the same extent
4219	TEST_F(NVFuserTest, FusionRepro1713_CUDA) {
4220	auto fusion = std::make_unique<Fusion>();
4221	FusionGuard fg(fusion.get());
4222
4223	auto tv0 = makeSymbolicTensor(`2`);
4224	auto tv1 = makeSymbolicTensor(`2`);
4225	auto tv2 = makeSymbolicTensor(`1`);
4226	fusion ->addInput(tv0);
4227	fusion ->addInput(tv1);
4228	fusion ->addInput(tv2);
4229	auto tv3 = broadcast(tv2, {false, true});
4230
4231	auto tv4 = add(tv3, tv0);
4232
4233	auto tv5 = add(tv3, tv1);
4234	auto tv6 = sum(tv5, {`0`});
4235	fusion ->addOutput(tv4);
4236	fusion ->addOutput(tv6);
4237
4238	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
4239	at::Tensor t0 = at::randn({`1024`, `204800`}, options);
4240	// Original repro had the same shape as t0, but this should work
4241	// with a different extent at the second axis
4242	at::Tensor t1 = at::randn({`1024`, `123`}, options);
4243	at::Tensor t2 = at::randn({`1024`}, options);
4244	std::vector<IValue> aten_inputs({t0, t1, t2});
4245
4246	FusionExecutorCache executor_cache(std::move(fusion));
4247	auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
4248
4249	auto t3 = t2.unsqueeze(-`1`);
4250	auto t4 = t3 + t0;
4251	auto t5 = t3 + t1;
4252	auto t6 = sum(t5, {`0`});
4253
4254	testValidate(
4255	executor_cache.fusion(),
4256	cg_outputs,
4257	{t0, t1, t2},
4258	{t4, t6},
4259	__LINE__,
4260	__FILE__);
4261	}
4262
4263	TEST_F(NVFuserTest, FusionExpand_CUDA) {
4264	auto fusion = std::make_unique<Fusion>();
4265	FusionGuard fg(fusion.get());
4266
4267	auto w = `2`, x = `3`, y = `4`, z = `5`;
4268
4269	// Test
4270	// a simple expand
4271	// Expand that's propagated
4272	// expand_as
4273	// symbolic expand
4274
4275	// x
4276	auto tv0 = makeSymbolicTensor(`1`);
4277	fusion ->addInput(tv0);
4278
4279	auto tv1 = broadcast(tv0, {false, true});
4280	auto tv2 = expand(tv1, {tv0->axis(`0`)->extent(), IrBuilder::create<Int>(y)});
4281
4282	// x
4283	auto tv3 = makeSymbolicTensor(`1`);
4284	fusion ->addInput(tv3);
4285	auto tv4 = broadcast(tv3, {false, true});
4286	auto tv5 = add(tv4, tv2);
4287	// [x, e_y]
4288
4289	// [x, y, z]
4290	auto tv6 = makeSymbolicTensor(`3`);
4291	fusion ->addInput(tv6);
4292
4293	// Disjoint set op will cause a segmentation for just this op.
4294	auto tmp_7 = set(tv6);
4295	fusion ->addOutput(tmp_7);
4296
4297	auto tv7 = broadcast(tv5, {false, false, true});
4298
4299	auto tv8 = expand_as(tv7, tv6);
4300	// [x, e_y, e_z]
4301
4302	auto w_symbolic = IrBuilder::create<Int>();
4303	fusion ->addInput(w_symbolic);
4304
4305	auto tv9 = broadcast(tv8, {true, false, false, false});
4306	//[1, x, e_y, e_z]
4307
4308	auto tv10 = expand(
4309	tv9,
4310	{w_symbolic,
4311	tv9->axis(`1`)->extent(),
4312	tv9->axis(`2`)->expandedExtent(),
4313	tv9->axis(`3`)->expandedExtent()});
4314
4315	fusion ->addOutput(tv10);
4316
4317	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
4318	at::Tensor t0 = at::randn({x}, options);
4319	at::Tensor t3 = at::randn({x}, options);
4320	at::Tensor t6 = at::randn({x, y, z}, options);
4321
4322	FusionExecutorCache executor_cache(std::move(fusion));
4323
4324	auto cg_outputs = executor_cache.runFusionWithInputs({t0, t3, t6, w});
4325	auto cg_out = cg_outputs [`1`];
4326
4327	TORCH_INTERNAL_ASSERT(cg_out.size(`0`) == w);
4328	TORCH_INTERNAL_ASSERT(cg_out.size(`1`) == x);
4329	TORCH_INTERNAL_ASSERT(cg_out.size(`2`) == y);
4330	TORCH_INTERNAL_ASSERT(cg_out.size(`3`) == z);
4331	TORCH_INTERNAL_ASSERT(cg_out.stride(`0`) == `0`);
4332	TORCH_INTERNAL_ASSERT(cg_out.stride(`1`) == `1`);
4333	TORCH_INTERNAL_ASSERT(cg_out.stride(`2`) == `0`);
4334	TORCH_INTERNAL_ASSERT(cg_out.stride(`3`) == `0`);
4335
4336	auto t10 = t0.unsqueeze(-`1`)
4337	.expand({x, y})
4338	.add(t3.unsqueeze(-`1`))
4339	.unsqueeze(-`1`)
4340	.expand_as(t6)
4341	.unsqueeze(`0`)
4342	.expand({w, x, y, z});
4343
4344	testValidate(
4345	executor_cache.fusion(),
4346	cg_outputs,
4347	{t0, t3, t6, w},
4348	{t6, t10},
4349	__LINE__,
4350	__FILE__);
4351	}
4352
4353	TEST_F(NVFuserTest, FusionExpandIssue1751_CUDA) {
4354	auto fusion = std::make_unique<Fusion>();
4355	FusionGuard fg(fusion.get());
4356
4357	auto x = `3`, y = `4`, z = `5`;
4358
4359	// y, z
4360	auto tv0 = makeSymbolicTensor(`2`);
4361	fusion ->addInput(tv0);
4362
4363	auto tv1 = broadcast(tv0, {true, false, false});
4364
4365	// Two ways to propagate extents as is: use -1 or explicitly pass
4366	// the extent vals.
4367
4368	auto tv2 = expand(
4369	tv1,
4370	{IrBuilder::create<Int>(x),
4371	IrBuilder::create<Int>(-`1`),
4372	IrBuilder::create<Int>(-`1`)});
4373
4374	auto tv3 = expand(
4375	tv1,
4376	{IrBuilder::create<Int>(x),
4377	tv0->axis(`0`)->extent(),
4378	tv0->axis(`1`)->extent()});
4379
4380	fusion ->addOutput(tv2);
4381	fusion ->addOutput(tv3);
4382
4383	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
4384	at::Tensor t0 = at::randn({y, z}, options);
4385
4386	FusionExecutorCache executor_cache(std::move(fusion));
4387
4388	auto cg_outputs = executor_cache.runFusionWithInputs({t0});
4389
4390	for (const auto& cg_out : cg_outputs) {
4391	TORCH_INTERNAL_ASSERT(cg_out.size(`0`) == x);
4392	TORCH_INTERNAL_ASSERT(cg_out.size(`1`) == y);
4393	TORCH_INTERNAL_ASSERT(cg_out.size(`2`) == z);
4394	}
4395
4396	auto t2 = t0.expand({x, y, z});
4397
4398	testValidate(
4399	executor_cache.fusion(), cg_outputs, {t0}, {t2, t2}, __LINE__, __FILE__);
4400	}
4401
4402	// TODO: Make sure the kernel uses the expanded concrete size instead
4403	// of the symbolic size
4404	TEST_F(NVFuserTest, FusionExpandToConcrete_CUDA) {
4405	auto fusion = std::make_unique<Fusion>();
4406	FusionGuard fg(fusion.get());
4407
4408	auto x = `3`, y = `4`;
4409
4410	auto tv0 = makeSymbolicTensor(`1`);
4411	fusion ->addInput(tv0);
4412
4413	auto tv1 = broadcast(tv0, {true, false});
4414
4415	auto tv2 =
4416	expand(tv1, {IrBuilder::create<Int>(x), IrBuilder::create<Int>(y)});
4417
4418	fusion ->addOutput(tv2);
4419
4420	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
4421	at::Tensor t0 = at::randn({y}, options);
4422
4423	FusionExecutorCache executor_cache(std::move(fusion));
4424
4425	auto cg_outputs = executor_cache.runFusionWithInputs({t0});
4426
4427	for (const auto& cg_out : cg_outputs) {
4428	TORCH_INTERNAL_ASSERT(cg_out.size(`0`) == x);
4429	TORCH_INTERNAL_ASSERT(cg_out.size(`1`) == y);
4430	}
4431
4432	auto t2 = t0.expand({x, y});
4433
4434	testValidate(
4435	executor_cache.fusion(), cg_outputs, {t0}, {t2}, __LINE__, __FILE__);
4436	}
4437
4438	TEST_F(NVFuserTest, FusionReproNoncontigBroadcast_CUDA) {
4439	auto fusion = std::make_unique<Fusion>();
4440	FusionGuard fg(fusion.get());
4441
4442	auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, `0`);
4443	at::Tensor t0 = at::randn({`4`, `32`, `16`, `112`, `112`}, options).transpose(-`1`, -`2`);
4444	at::Tensor t1 = at::randn({`32`, `1`, `112`, `1`}, options).transpose(-`1`, -`2`);
4445
4446	auto tv0 = TensorViewBuilder ()
4447	.ndims(`5`)
4448	.contiguity({true, true, false, false, false}) // ttfff
4449	.shape({-`1`, -`1`, -`1`, -`1`, -`1`})
4450	.dtype(DataType::Half)
4451	.build();
4452	auto tv1 = TensorViewBuilder ()
4453	.ndims(`4`)
4454	.contiguity({true, false, false, true}) // tfft
4455	.shape({-`1`, `1`, `1`, -`1`})
4456	.dtype(DataType::Half)
4457	.build();
4458
4459	fusion ->addInput(tv0);
4460	fusion ->addInput(tv1);
4461
4462	auto tv2 = add(tv0, tv1);
4463
4464	fusion ->addOutput(tv2);
4465
4466	std::vector<IValue> aten_inputs({t0, t1});
4467
4468	FusionExecutorCache executor_cache(std::move(fusion));
4469	auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
4470
4471	auto t2 = t0 + t1;
4472
4473	testValidate(
4474	executor_cache.fusion(), cg_outputs, {t0, t1}, {t2}, __LINE__, __FILE__);
4475	}
4476
4477	namespace {
4478
4479	// check that the resulting sibling are identical
4480	void checkSiblingConsistency(TensorView* replay, TensorView* target) {
4481	auto replay_root = replay->getRootDomain();
4482	auto replay_dom = replay->domain()->domain();
4483	auto target_root = target->getRootDomain();
4484	auto target_dom = target->domain()->domain();
4485	std::unordered_map<IterDomain, IterDomain> target2replay_map;
4486	TORCH_CHECK(replay_root.size() == target_root.size());
4487	target2replay_map.reserve(replay_root.size());
4488	std::transform(
4489	target_root.begin(),
4490	target_root.end(),
4491	replay_root.begin(),
4492	std::inserter(target2replay_map, target2replay_map.begin()),
4493	[](auto a, auto b) { return std::make_pair(a, b); });
4494	BestEffortReplay replay_(replay_dom, target_dom, target2replay_map);
4495	auto r = replay_.getReplay();
4496	for (int64_t i = `0`; i < (int64_t)replay_dom.size(); i++) {
4497	auto target_id = target_dom [i];
4498	auto replay_it = r.find(target_id);
4499	TORCH_CHECK(replay_it != r.end());
4500	TORCH_CHECK(
4501	replay_it ->second == replay_dom[i],
4502	"IterDomain mismatch when checking ",
4503	replay,
4504	" and ",
4505	target,
4506	" at ",
4507	i,
4508	", got ",
4509	replay_it ->second,
4510	" and ",
4511	replay_dom[i]);
4512	}
4513	};
4514
4515	} // namespace
4516
4517	TEST_F(NVFuserTest, FusionTransformPropagateSibling_CUDA) {
4518	// https://github.com/csarofeen/pytorch/issues/1760
4519	Fusion fusion;
4520	FusionGuard fg(&fusion);
4521
4522	auto tv0 = makeSymbolicTensor(`2`);
4523	fusion.addInput(tv0);
4524
4525	auto tvs = Welford(tv0, {`1`});
4526	fusion.addOutput(tvs.var_sum);
4527
4528	tvs.avg->split(`1`, `1`);
4529	tvs.avg->split(`1`, `2`);
4530	tvs.avg->split(`1`, `3`);
4531	tvs.var_sum->split(`1`, `1`);
4532	tvs.var_sum->split(`1`, `2`);
4533	tvs.var_sum->split(`1`, `3`);
4534	tvs.n->split(`1`, `1`);
4535	tvs.n->split(`1`, `2`);
4536	tvs.n->split(`1`, `3`);
4537
4538	auto var_sum_rf = ir_utils::rfactorHelper(tvs.var_sum, {`1`, `4`});
4539
4540	TransformPropagatorWithCheck propagator(var_sum_rf);
4541	MaxRootDomainInfoSpanningTree (var_sum_rf).traverse(&propagator);
4542
4543	auto rf_tvs = ir_utils::producerTvsOf(tvs.var_sum);
4544
4545	std::vector<TensorView*> siblings[] = {{tvs.avg, tvs.var_sum, tvs.n}, rf_tvs};
4546	for (auto tensors : siblings) {
4547	for (auto t1 : tensors) {
4548	for (auto t2 : tensors) {
4549	TORCH_CHECK(TransformReplay::fullSelfMatching(t1, t2));
4550	}
4551	}
4552	}
4553	}
4554
4555	TEST_F(NVFuserTest, FusionTransformPropagateSelectorSibling_CUDA) {
4556	Fusion fusion;
4557	FusionGuard fg(&fusion);
4558
4559	auto tv0 = makeSymbolicTensor(`2`);
4560	fusion.addInput(tv0);
4561
4562	auto tvs = Welford(tv0, {`1`});
4563	fusion.addOutput(tvs.var_sum);
4564
4565	tvs.avg->split(`1`, `1`);
4566	tvs.avg->split(`1`, `2`);
4567	tvs.avg->split(`1`, `3`);
4568	tvs.var_sum->split(`1`, `1`);
4569	tvs.var_sum->split(`1`, `2`);
4570	tvs.var_sum->split(`1`, `3`);
4571	tvs.n->split(`1`, `1`);
4572	tvs.n->split(`1`, `2`);
4573	tvs.n->split(`1`, `3`);
4574
4575	auto var_sum_rf = ir_utils::rfactorHelper(tvs.var_sum, {`1`, `4`});
4576
4577	struct DisableTv0 : public MaxInfoSpanningTree::Selector {
4578	TensorView* tv0;
4579	virtual bool allowC2P(TensorView* from, TensorView* to) override {
4580	return from != tv0 && to != tv0;
4581	};
4582	virtual bool allowP2C(TensorView* from, TensorView* to) override {
4583	return from != tv0 && to != tv0;
4584	};
4585	virtual bool allowSibling(TensorView* from, TensorView* to) override {
4586	return true;
4587	}
4588	DisableTv0(TensorView* tv0) : tv0(tv0) {}
4589	} selector1(tv0);
4590
4591	struct DisableTv0AndSibling : public DisableTv0 {
4592	virtual bool allowSibling(TensorView* from, TensorView* to) override {
4593	return false;
4594	}
4595	using DisableTv0::DisableTv0;
4596	} selector2(tv0);
4597
4598	TransformPropagatorWithCheck propagator(var_sum_rf);
4599	MaxRootDomainInfoSpanningTree good_path(var_sum_rf, &selector1);
4600	MaxRootDomainInfoSpanningTree bad_path(var_sum_rf, &selector2);
4601
4602	auto rf_tvs = ir_utils::producerTvsOf(tvs.var_sum);
4603
4604	auto check = [&]() {
4605	std::vector<TensorView*> siblings[] = {
4606	{tvs.avg, tvs.var_sum, tvs.n}, rf_tvs};
4607	for (auto tensors : siblings) {
4608	for (auto t1 : tensors) {
4609	for (auto t2 : tensors) {
4610	TORCH_CHECK(TransformReplay::fullSelfMatching(t1, t2));
4611	}
4612	}
4613	}
4614	};
4615
4616	bad_path.traverse(&propagator);
4617	ASSERT_ANY_THROW(check());
4618	good_path.traverse(&propagator);
4619	check ();
4620	}
4621
4622	TEST_F(NVFuserTest, FusionTransformPropagatePosition_CUDA) {
4623	Fusion fusion;
4624	FusionGuard fg(&fusion);
4625
4626	auto tv0 = makeSymbolicTensor(`4`);
4627	auto tv1 = makeSymbolicTensor(`6`);
4628	fusion.addInput(tv0);
4629
4630	auto tv2 = broadcast(tv0, {false, false, true, false, false, true});
4631	auto tv3 = add(tv1, tv2);
4632	fusion.addOutput(tv3);
4633
4634	tv0->merge(`2`);
4635	tv0->merge(`0`);
4636	TransformPropagatorWithCheck propagator(tv0);
4637	MaxRootDomainInfoSpanningTree (tv0).traverse(&propagator);
4638
4639	TORCH_CHECK(tv1->nDims() == `4`);
4640	}
4641
4642	TEST_F(NVFuserTest, FusionIgnoreZeroDimReduction_CUDA) {
4643	auto fusion = std::make_unique<Fusion>();
4644	FusionGuard fg(fusion.get());
4645
4646	auto tv0 = makeSymbolicTensor(`1`);
4647	fusion ->addInput(tv0);
4648	auto tv1 = sum(tv0, {`0`});
4649	// tv1 is effectively a zero-dim tensor as it only has a reduction
4650	// axis.
4651	// Reducing it further is converted to just a set op.
4652	auto tv2 = sum(tv1, {`0`});
4653	fusion ->addOutput(tv2);
4654
4655	auto tv2_def = dynamic_cast<UnaryOp*>(tv2->definition());
4656	TORCH_CHECK(
4657	tv2_def != nullptr,
4658	"Expected UnaryOp but found ",
4659	tv2->definition()->toString());
4660
4661	TORCH_CHECK(
4662	tv2_def->getUnaryOpType() == UnaryOpType::Set,
4663	"Expected UnaryOpType::Set but found ",
4664	tv2_def->getUnaryOpType());
4665
4666	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
4667	auto t0 = at::randn({`12345`}, options);
4668	std::vector<IValue> aten_inputs({t0});
4669
4670	FusionExecutorCache executor_cache(std::move(fusion));
4671	auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
4672
4673	auto ref = sum(t0, {`0`});
4674
4675	testValidate(
4676	executor_cache.fusion(),
4677	cg_outputs,
4678	aten_inputs,
4679	{ref},
4680	__LINE__,
4681	__FILE__);
4682	}
4683
4684	// Repro of issue #1770
4685	TEST_F(NVFuserTest, FusionIssue1770Repro_CUDA) {
4686	auto fusion = std::make_unique<Fusion>();
4687	FusionGuard fg(fusion.get());
4688
4689	auto tv0 = makeSymbolicTensor(`1`);
4690	fusion ->addInput(tv0);
4691	auto tv1 = makeSymbolicTensor(`1`);
4692	fusion ->addInput(tv1);
4693
4694	auto tv2 = ge(tv0, tv1);
4695	auto tv3 =
4696	where(tv2, IrBuilder::create<Double>(`1`), IrBuilder::create<Double>(`2`));
4697	fusion ->addOutput(tv3);
4698
4699	std::vector<int64_t> shape({`999`});
4700	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
4701	at::Tensor t0 = at::randn(shape, options);
4702	at::Tensor t1 = at::randn(shape, options);
4703	std::vector<IValue> aten_inputs({t0, t1});
4704
4705	FusionExecutorCache executor_cache(std::move(fusion));
4706	auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
4707
4708	auto ref = where(t0 >= t1, `1.0`, `2.0`);
4709
4710	testValidate(
4711	executor_cache.fusion(),
4712	cg_outputs,
4713	aten_inputs,
4714	{ref},
4715	__LINE__,
4716	__FILE__);
4717	}
4718
4719	TEST_F(NVFuserTest, FusionTransformPropagatorSelector_CUDA) {
4720	auto fusion = std::make_unique<Fusion>();
4721	FusionGuard fg(fusion.get());
4722
4723	auto tv0 = makeSymbolicTensor(`1`);
4724	fusion ->addInput(tv0);
4725	auto tv1 = makeSymbolicTensor(`1`);
4726	fusion ->addInput(tv1);
4727
4728	auto tv2 = add(tv0, tv1);
4729
4730	auto tv3 = sin(tv2);
4731	auto tv4 = cos(tv2);
4732
4733	fusion ->addOutput(tv3);
4734	fusion ->addOutput(tv4);
4735
4736	tv2->split(`0`, `10`);
4737
4738	struct Selector : public MaxInfoSpanningTree::Selector {
4739	TensorView* tv0;
4740	TensorView* tv3;
4741	virtual bool allowC2P(TensorView* from, TensorView* to) override {
4742	return to == tv0;
4743	}
4744	virtual bool allowP2C(TensorView* from, TensorView* to) override {
4745	return to == tv3;
4746	}
4747	virtual bool allowSibling(TensorView* from, TensorView* to) override {
4748	return false;
4749	}
4750	Selector(TensorView* tv0, TensorView* tv3) : tv0(tv0), tv3(tv3) {}
4751	} selector(tv0, tv3);
4752
4753	TransformPropagatorWithCheck propagator(tv2);
4754	MaxRootDomainInfoSpanningTree (tv2, &selector).traverse(&propagator);
4755
4756	TORCH_CHECK(tv0->nDims() == `2`);
4757	TORCH_CHECK(tv1->nDims() == `1`);
4758	TORCH_CHECK(tv2->nDims() == `2`);
4759	TORCH_CHECK(tv3->nDims() == `2`);
4760	TORCH_CHECK(tv4->nDims() == `1`);
4761	}
4762
4763	TEST_F(NVFuserTest, FusionTransformPropagatorPos_CUDA) {
4764	auto fusion = std::make_unique<Fusion>();
4765	FusionGuard fg(fusion.get());
4766
4767	auto tv0 = makeConcreteTensor({`22`, `105`});
4768	fusion ->addInput(tv0);
4769
4770	auto tv1 = sin(tv0);
4771	fusion ->addOutput(tv1);
4772
4773	tv1->split(`0`, `2`);
4774	tv1->split(-`1`, `3`);
4775	tv1->split(-`1`, `5`);
4776
4777	TransformPropagatorWithCheck propagator(tv1, `2`);
4778	MaxRootDomainInfoSpanningTree (tv1, `2`).traverse(&propagator);
4779
4780	auto expect = makeConcreteTensor({`22`, `105`});
4781	expect->split(`0`, `2`);
4782	TORCH_CHECK(TransformReplay::fullSelfMatching(expect, tv0));
4783	}
4784
4785	TEST_F(NVFuserTest, FusionMaxRootDomainInfoSpanningTreePrintTwice_CUDA) {
4786	auto fusion = std::make_unique<Fusion>();
4787	FusionGuard fg(fusion.get());
4788
4789	auto tv0 = makeSymbolicTensor(`3`);
4790	fusion ->addInput(tv0);
4791
4792	auto tv1 = sum(tv0, {`0`});
4793	auto tv2 = neg(tv1);
4794
4795	fusion ->addOutput(tv2);
4796
4797	tv1->split(`0`, `10`);
4798
4799	struct Printer : public MaxInfoSpanningTree::Propagator {
4800	std::stringstream ss;
4801	virtual void propagateC2P(TensorView* from, TensorView* to) override {
4802	ss << "propagateC2P" << std::endl;
4803	ss << "from: " << from->name() << std::endl;
4804	ss << "to: " << to->name() << std::endl;
4805	}
4806	virtual void propagateP2C(TensorView* from, TensorView* to) override {
4807	ss << "propagateP2C" << std::endl;
4808	ss << "from: " << from->name() << std::endl;
4809	ss << "to: " << to->name() << std::endl;
4810	}
4811	virtual void propagateSibling(TensorView* from, TensorView* to) override {
4812	ss << "propagateSibling" << std::endl;
4813	ss << "from: " << from->name() << std::endl;
4814	ss << "to: " << to->name() << std::endl;
4815	}
4816	} printer1, printer2;
4817	printer1.ss << std::endl;
4818	printer2.ss << std::endl;
4819
4820	MaxRootDomainInfoSpanningTree path(tv1);
4821	path.traverse(&printer1);
4822	path.traverse(&printer2);
4823
4824	auto expect = R"ESCAPE(
4825	propagateC2P
4826	from: 1
4827	to: 0
4828	propagateP2C
4829	from: 1
4830	to: 2
4831	)ESCAPE";
4832	TORCH_CHECK(printer1.ss.str() == expect);
4833	TORCH_CHECK(printer2.ss.str() == expect);
4834	}
4835
4836	TEST_F(NVFuserTest, FusionTransformPropagatorNoOverwrite_CUDA) {
4837	auto fusion = std::make_unique<Fusion>();
4838	FusionGuard fg(fusion.get());
4839
4840	auto tv0 = makeSymbolicTensor(`1`);
4841	fusion ->addInput(tv0);
4842	auto tv1 = broadcast(tv0, {true, false, true});
4843	auto tv2 = sin(tv1);
4844	fusion ->addOutput(tv2);
4845
4846	tv0->split(`0`, `2`);
4847	tv2->split(`1`, `2`);
4848	tv2->split(`0`, `4`);
4849
4850	MaxRootDomainInfoSpanningTree path1(tv2);
4851	TransformPropagatorWithCheck propagator1(tv2);
4852	path1.traverse(&propagator1);
4853
4854	MaxRootDomainInfoSpanningTree path2(tv0);
4855	TransformPropagatorWithCheck propagator2(tv0);
4856	path2.traverse(&propagator2);
4857
4858	TORCH_CHECK(tv1->axis(`0`)->isBroadcast());
4859	TORCH_CHECK(tv1->axis(`1`)->isBroadcast());
4860	TORCH_CHECK(!tv1->axis(`2`)->isBroadcast());
4861	TORCH_CHECK(!tv1->axis(`3`)->isBroadcast());
4862	TORCH_CHECK(tv1->axis(`4`)->isBroadcast());
4863
4864	auto expect = makeSymbolicTensor(`3`);
4865	expect->split(`1`, `2`);
4866	expect->split(`0`, `4`);
4867	TORCH_CHECK(TransformReplay::fullSelfMatching(expect, tv1));
4868	}
4869
4870	TEST_F(NVFuserTest, FusionIssue1785Repro_CUDA) {
4871	Fusion fusion;
4872	FusionGuard fg(&fusion);
4873
4874	// Set up your input tensor views
4875	TensorView* tv0 = makeContigTensor(`1`);
4876	TensorView* tv1 = makeContigTensor(`2`);
4877
4878	// Register your inputs
4879	fusion.addInput(tv0);
4880	fusion.addInput(tv1);
4881
4882	auto tv2 = set(tv0);
4883	// [B, I]
4884	auto tv3 = broadcast(tv2, {true, false});
4885	auto tv4 = add(tv3, tv1);
4886	auto tv5 = set(tv4);
4887
4888	// Register your outputs
4889	fusion.addOutput(tv5);
4890
4891	tv5->split(`0`, `8`);
4892	tv5->split(-`1`, `8`);
4893
4894	// [Serial, TIDy, TIDX, Serial]
4895
4896	tv4->computeAt(tv5, -`2`);
4897	tv3->computeAt(tv4, -`1`);
4898	tv2->computeAt(tv3, `0`);
4899	tv2->split(`0`, `8`);
4900	tv2->axis(`0`)->parallelize(ParallelType::TIDx);
4901	tv1->computeAt(tv5, -`2`);
4902
4903	tv5->axis(`1`)->parallelize(ParallelType::TIDy);
4904	tv5->axis(`2`)->parallelize(ParallelType::TIDx);
4905
4906	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
4907
4908	at::Tensor in1 = at::randn({`16`}, options);
4909	at::Tensor in2 = at::randn({`12`, `16`}, options);
4910
4911	FusionExecutor fe;
4912	fe.compileFusion(&fusion, {in1, in2});
4913	auto cg_outputs = fe.runFusion({in1, in2});
4914
4915	auto tv_ref = in1 + in2;
4916
4917	testValidate(&fusion, cg_outputs, {in1, in2}, {tv_ref}, __LINE__, __FILE__);
4918	}
4919
4920	TEST_F(NVFuserTest, FusionSkipReplay_CUDA) {
4921	{
4922	Fusion fusion;
4923	FusionGuard fg(&fusion);
4924
4925	TensorView* tv0 = makeContigTensor(`1`);
4926	TensorView* tv1 = makeContigTensor(`2`);
4927	fusion.addInput(tv0);
4928	fusion.addInput(tv1);
4929
4930	auto tv2 = broadcast(tv0, {false, true});
4931	auto tv3 = add(tv2, tv1);
4932	fusion.addOutput(tv3);
4933
4934	tv3->split(`1`, `2`, false);
4935
4936	TransformPropagatorWithCheck propagator(tv3);
4937	MaxRootDomainInfoSpanningTree (tv3).traverse(&propagator);
4938	}
4939
4940	{
4941	Fusion fusion;
4942	FusionGuard fg(&fusion);
4943
4944	TensorView* tv0 = makeContigTensor(`3`);
4945	fusion.addInput(tv0);
4946
4947	auto tv1 = sum(tv0, {`0`, `2`});
4948	auto tv2 = sin(tv1);
4949	fusion.addOutput(tv2);
4950
4951	tv0->split(`1`, `2`, false);
4952
4953	TransformPropagatorWithCheck propagator(tv0);
4954	MaxRootDomainInfoSpanningTree (tv0).traverse(&propagator);
4955	}
4956	}
4957
4958	TEST_F(NVFuserTest, FusionInlineRepro1803_CUDA) {
4959	Fusion fusion;
4960	FusionGuard fg(&fusion);
4961
4962	TensorView* tv0 = makeContigTensor(`2`);
4963
4964	fusion.addInput(tv0);
4965	auto tv1 = set(tv0);
4966	auto tvs = Welford(tv1, {`1`});
4967	auto tvo = set(tvs.var_sum);
4968	fusion.addOutput(tvo);
4969
4970	tvo->split(`0`, `16`);
4971	tvo->axis(`1`)->parallelize(ParallelType::Unroll);
4972
4973	tv0->computeAt(tvo, -`1`, ComputeAtMode::BestEffort);
4974
4975	TORCH_CHECK(
4976	tvs.var_sum->getComputeAtPosition() == tvs.avg->getComputeAtPosition());
4977	TORCH_CHECK(
4978	tvs.var_sum->getComputeAtPosition() == tvs.n->getComputeAtPosition());
4979	TORCH_CHECK(tvs.var_sum->getComputeAtPosition() == `1`);
4980	}
4981
4982	// Unit test for the transform selection logic
4983	TEST_F(NVFuserTest, FusionBoundedDirectionSelection1_CUDA) {
4984	Fusion fusion;
4985	FusionGuard fg(&fusion);
4986
4987	TensorView* tv0 = makeContigTensor(`2`);
4988
4989	fusion.addInput(tv0);
4990	auto tv1 = set(tv0);
4991	auto tv2 = set(tv1);
4992	auto tv3 = add(tv2, tv1);
4993	fusion.addOutput(tv3);
4994
4995	tv3->split(-`1`, `5`);
4996	tv3->split(-`1`, `8`);
4997
4998	scheduler_utils::BoundedDirectionalTransformPropagator::backward(
4999	tv3, -`1`, {tv0, tv2});
5000
5001	// Check that the splits are replayed on tv2
5002	TORCH_INTERNAL_ASSERT(
5003	tv2->nDims() == tv3->nDims(),
5004	"Propagator didn't propagate to tv2: ",
5005	tv2->toString());
5006
5007	// Check that the splits are replayed on tv1 as well. Even though
5008	// one of its consumers, tv2, is part of the boundary, another
5009	// consumer is not a boundary, so tv1 should be transformed as well.
5010	TORCH_INTERNAL_ASSERT(
5011	tv1->nDims() == tv3->nDims(),
5012	"Propagator didn't propagate to tv1: ",
5013	tv1->toString());
5014	}
5015
5016	TEST_F(NVFuserTest, FusionIssueRepro1844_CUDA) {
5017	auto fusion = std::make_unique<Fusion>();
5018	FusionGuard fg(fusion.get());
5019
5020	std::vector<int64_t> shape = {`2`, `1`, `768`};
5021	std::vector<int64_t> sum_to_shape = {`768`};
5022	std::vector<int64_t> sum_to_axes = {`0`, `1`};
5023	double kProb = `0.5`;
5024
5025	std::vector<Int*> sum_to_symb;
5026	std::transform(
5027	sum_to_shape.begin(),
5028	sum_to_shape.end(),
5029	std::back_inserter(sum_to_symb),
5030	[](int s) -> Int* { return IrBuilder::create<Int>(s); });
5031
5032	TensorView* tv0 = makeContigConcreteTensor(shape);
5033	TensorView* tv1 = makeContigConcreteTensor(shape);
5034	TensorView* tv2 = makeContigConcreteTensor(shape, DataType::Bool);
5035
5036	fusion ->addInput(tv0);
5037	fusion ->addInput(tv1);
5038	fusion ->addInput(tv2);
5039
5040	Double* prob = IrBuilder::create<Double>(kProb);
5041	auto grad_input = dropout_backward(tv1, tv2, prob);
5042	auto grad_gelu = gelu_backward(grad_input, tv0);
5043	auto grad_bias = sum_to(grad_gelu, sum_to_symb);
5044
5045	fusion ->addOutput(grad_gelu);
5046	fusion ->addOutput(grad_bias);
5047
5048	const auto options =
5049	at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
5050	const auto mask_options =
5051	at::TensorOptions().dtype(at::kBool).device(at::kCUDA, `0`);
5052	at::manual_seed(`0`);
5053
5054	at::Tensor a = at::randn(shape, options);
5055	at::Tensor b = at::randn(shape, options);
5056	at::Tensor c = at::randn(shape, options);
5057	auto mask = at::gt(c, `0.0f`);
5058	std::vector<IValue> aten_inputs = {a, b, mask};
5059
5060	FusionExecutorCache executor_cache(std::move(fusion));
5061	auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
5062
5063	auto dinput = at::native_dropout_backward(b, mask, kProb);
5064	auto dgelu = at::gelu_backward(dinput, a, "none");
5065	auto dbias = dgelu.sum(sum_to_axes);
5066
5067	testValidate(
5068	executor_cache.fusion(),
5069	cg_outputs,
5070	aten_inputs,
5071	{dgelu, dbias},
5072	__LINE__,
5073	__FILE__);
5074	}
5075
5076	TEST_F(NVFuserTest, FusionInsertMagicZero1_CUDA) {
5077	Fusion fusion;
5078	FusionGuard fg(&fusion);
5079
5080	auto tv0 = makeSymbolicTensor(`2`);
5081	fusion.addInput(tv0);
5082
5083	auto tv1 = add(tv0, IrBuilder::create<Double>(`1`));
5084	auto tv2 = set(tv1);
5085	fusion.addOutput(tv2);
5086
5087	tv2->split(`0`, `32`);
5088	tv2->split(-`1`, `2`);
5089	tv2->reorder({{`1`, `2`}, {`2`, `1`}});
5090	tv2->merge(`0`);
5091
5092	TransformPropagatorWithCheck propagator(tv2);
5093	MaxRootDomainInfoSpanningTree (tv2).traverse(&propagator);
5094
5095	tv0->computeAt(tv2, `1`);
5096
5097	// The predicate of tv2 should be protected with magic zero
5098	GpuLower gpulw(&fusion);
5099	TORCH_CHECK(
5100	PredicateMagicZeroChecker::isProtected(tv2, gpulw),
5101	"Failed to protect the predicates of ",
5102	tv2->toString());
5103	}
5104
5105	TEST_F(NVFuserTest, FusionRepro1860_CUDA) {
5106	auto fusion_ptr = std::make_unique<Fusion>();
5107	Fusion& fusion = *fusion_ptr;
5108	FusionGuard fg(&fusion);
5109	std::vector<bool> contiguity{true, false, false};
5110
5111	std::vector<int64_t> shape{`1`, -`1`, -`1`};
5112	TensorView* tv0 = makeContigConcreteTensor(shape);
5113	fusion.addInput(tv0);
5114	TensorView* tv1 = makeContigConcreteTensor(shape);
5115	fusion.addInput(tv1);
5116	TensorView* tv2 = makeContigConcreteTensor(shape);
5117	fusion.addInput(tv2);
5118
5119	std::vector<IterDomain> domain1(`3`, nullptr*);
5120	for (const auto i : c10::irange(`3`)) {
5121	if (i == `0`) {
5122	domain1 [i] =
5123	IterDomainBuilder (
5124	FusionGuard::getCurFusion()->zeroVal(), IrBuilder::create<Int>(`1`))
5125	.iter_type(IterType::Broadcast)
5126	.build();
5127	} else {
5128	domain1 [i] =
5129	IterDomainBuilder (
5130	FusionGuard::getCurFusion()->zeroVal(), IrBuilder::create<Int>(`1`))
5131	.expanded_extent(IrBuilder::create<Int>(`1` + i))
5132	.iter_type(IterType::Broadcast)
5133	.build();
5134	}
5135	}
5136
5137	TensorView* tv22 = IrBuilder::create<TensorView>(
5138	IrBuilder::create<TensorDomain>(domain1, contiguity), DataType::Float);
5139
5140	fusion.addInput(tv22);
5141
5142	auto tv3 = add(tv0, tv1);
5143	auto tv4 = softmax(tv3, `0`);
5144	auto tv5 = add(tv4, tv22);
5145	fusion.addOutput(tv5);
5146
5147	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
5148
5149	at::Tensor input1 = at::randn({`1`, `2`, `3`}, options);
5150	at::Tensor input2 = at::randn({`1`, `2`, `3`}, options);
5151	at::Tensor input3 = at::randn({`1`, `2`, `3`}, options);
5152	at::Tensor input4 = at::randn({`1`, `1`, `1`}, options).expand({`1`, `2`, `3`});
5153	std::vector<IValue> aten_inputs = {input1, input2, input3, input4};
5154
5155	FusionExecutorCache executor_cache(std::move(fusion_ptr));
5156	auto outputs = executor_cache.runFusionWithInputs(aten_inputs);
5157	}
5158
5159	TEST_F(NVFuserTest, FusionExpandReduce_CUDA) {
5160	auto fusion = std::make_unique<Fusion>();
5161	FusionGuard fg(fusion.get());
5162
5163	auto tv0 = makeConcreteTensor({`1`, `8`});
5164	fusion ->addInput(tv0);
5165
5166	auto tv1 =
5167	expand(tv0, {IrBuilder::create<Int>(`12`), IrBuilder::create<Int>(`8`)});
5168
5169	auto tv2 = sum(tv1, {`0`});
5170	fusion ->addOutput(tv2);
5171
5172	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
5173	at::manual_seed(`0`);
5174	auto t0 = at::randn({`1`, `8`}, options);
5175
5176	FusionExecutorCache executor_cache(std::move(fusion));
5177	auto cg_outputs = executor_cache.runFusionWithInputs({t0});
5178
5179	auto ref = t0.expand({`12`, `8`}).sum({`0`});
5180
5181	testValidate(
5182	executor_cache.fusion(), cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
5183	}
5184
5185	// Predicate elimination issue repro:
5186	TEST_F(NVFuserTest, FusionExpandReduce2_CUDA) {
5187	auto fusion = std::make_unique<Fusion>();
5188	FusionGuard fg(fusion.get());
5189
5190	auto tv0 = makeConcreteTensor({`1`, `4`});
5191	fusion ->addInput(tv0);
5192
5193	auto tv1 =
5194	expand(tv0, {IrBuilder::create<Int>(`3`), IrBuilder::create<Int>(`4`)});
5195
5196	auto tv2 = sum(tv1, {`0`});
5197	fusion ->addOutput(tv2);
5198
5199	// tv2[r{3}, i{4}]
5200	tv2->split(`0`, NamedScalar::getParallelDim(ParallelType::TIDy));
5201	tv2->axis(`1`)->parallelize(ParallelType::TIDy);
5202	tv2->split(`0`, NamedScalar::getParallelDim(ParallelType::BIDy), false);
5203	tv2->axis(`0`)->parallelize(ParallelType::BIDy);
5204	tv2->split(-`1`, NamedScalar::getParallelDim(ParallelType::TIDx));
5205	tv2->axis(-`1`)->parallelize(ParallelType::TIDx);
5206	tv2->axis(-`2`)->parallelize(ParallelType::BIDx);
5207	// [rBIDy, rO, rTIDy, iBIDx, iTIDx]
5208	tv2->reorder({{-`2`, `0`}, {-`1`, `1`}, {`2`, `2`}});
5209	// [iBIDx, iTIDx, rTIDy, rBIDy, rO]
5210	auto tv3 = tv2->rFactor({-`1`});
5211
5212	TransformPropagatorWithCheck propagator(tv3);
5213	MaxRootDomainInfoSpanningTree (tv3).traverse(&propagator);
5214	scheduler_utils::parallelizeAllLike(tv3);
5215	tv0->computeAt(tv3, -`1`, ComputeAtMode::MostInlined);
5216
5217	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
5218	at::manual_seed(`0`);
5219	auto t0 = at::randn({`1`, `4`}, options);
5220
5221	FusionExecutor fe;
5222	fe.compileFusion(fusion.get(), {t0}, LaunchParams (-`1`, `2`, -`1`, `4`, `2`, `1`));
5223	auto cg_outputs = fe.runFusion({t0}, LaunchParams (-`1`, `2`, -`1`, `4`, `2`, `1`));
5224
5225	auto ref = t0.expand({`3`, `4`}).sum({`0`});
5226
5227	testValidate(
5228	fusion.get(),
5229	cg_outputs,
5230	{t0},
5231	{ref},
5232	__LINE__,
5233	__FILE__,
5234	"",
5235	LaunchParams (-`1`, `2`, -`1`, `4`, `2`, `1`));
5236	}
5237
5238	TEST_F(NVFuserTest, FusionExpandBadShapeTest_CUDA) {
5239	auto fusion_ptr = std::make_unique<Fusion>();
5240	Fusion& fusion = *fusion_ptr;
5241	FusionGuard fg(&fusion);
5242	std::vector<bool> contiguity{false, false};
5243
5244	auto tv0 = makeSymbolicTensor(`2`);
5245	fusion.addInput(tv0);
5246
5247	std::vector<IterDomain*> domains = {
5248	IterDomainBuilder (
5249	FusionGuard::getCurFusion()->zeroVal(), IrBuilder::create<Int>())
5250	.build(),
5251	IterDomainBuilder (
5252	FusionGuard::getCurFusion()->zeroVal(), IrBuilder::create<Int>(`1`))
5253	.expanded_extent(IrBuilder::create<Int>(`10`))
5254	.iter_type(IterType::Broadcast)
5255	.build()};
5256
5257	// expand to 10
5258	TensorView* tv22 = IrBuilder::create<TensorView>(
5259	IrBuilder::create<TensorDomain>(domains, contiguity), DataType::Float);
5260
5261	fusion.addInput(tv22);
5262
5263	auto tv3 = add(tv0, tv22);
5264	fusion.addOutput(tv3);
5265
5266	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
5267
5268	// Incompatible shapes
5269	at::Tensor input1 = at::randn({`2`, `3`}, options);
5270	// Passing expand size of 5, not 10. Should cause an error
5271	at::Tensor input4 = at::randn({`2`, `1`}, options).expand({`2`, `5`});
5272
5273	std::vector<IValue> aten_inputs = {input1, input4};
5274
5275	FusionExecutorCache executor_cache(std::move(fusion_ptr));
5276	ASSERT_ANY_THROW(executor_cache.runFusionWithInputs(aten_inputs));
5277	}
5278
5279	TEST_F(
5280	NVFuserTest,
5281	FusionPointwiseScheduleWithBroadcastAndTrivialReduction_CUDA) {
5282	Fusion fusion;
5283	FusionGuard fg(&fusion);
5284
5285	auto tv0 = makeContigTensor(`3`);
5286	auto tv1 = makeContigTensor(`2`);
5287	fusion.addInput(tv0);
5288	fusion.addInput(tv1);
5289	auto tv2 = broadcast(tv0, {false, true, false, true, false, true});
5290	auto tv3 = sin(tv2);
5291	auto tv4 = add(tv3, tv1);
5292	auto tv5 = sum(tv4, {`1`});
5293	fusion.addOutput(tv5);
5294
5295	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
5296	at::Tensor t0 = at::randn({`100`, `100`, `10`}, options);
5297	at::Tensor t1 = at::randn({`10`, `20`}, options);
5298
5299	auto aten_output = (t0.view({`100`, `1`, `100`, `1`, `10`, `1`}).sin() + t1).squeeze(`1`);
5300
5301	std::vector<IValue> aten_inputs = {t0, t1};
5302
5303	auto lparams = schedulePointwise(&fusion, aten_inputs);
5304
5305	FusionExecutor fe;
5306	fe.compileFusion(&fusion, aten_inputs, lparams);
5307	auto cg_outputs = fe.runFusion(aten_inputs, lparams);
5308
5309	testValidate(
5310	&fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
5311	}
5312
5313	TEST_F(NVFuserTest, FusionInliningMismatchedDims1_CUDA) {
5314	Fusion fusion;
5315	FusionGuard fg(&fusion);
5316
5317	auto tv0 = makeConcreteTensor({`2`, `3`, `4`});
5318	fusion.addInput(tv0);
5319	auto tv1 = sin(tv0);
5320	auto tv2 = cos(tv1);
5321	auto tv3 = transpose(tv2, `1`, `2`);
5322	auto tv4 = exp(tv3);
5323	auto tv5 = tan(tv4);
5324	fusion.addOutput(tv5);
5325
5326	inlineMost();
5327
5328	TORCH_CHECK(tv5->getComputeAtPosition() == `3`);
5329	TORCH_CHECK(tv4->getComputeAtPosition() == `3`);
5330	TORCH_CHECK(tv3->getComputeAtPosition() == `3`);
5331	TORCH_CHECK(tv2->getComputeAtPosition() == `1`);
5332	TORCH_CHECK(tv1->getComputeAtPosition() == `3`);
5333
5334	const auto options =
5335	at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
5336	at::Tensor input = at::randn({`2`, `3`, `4`}, options);
5337	auto output = input.sin().cos().transpose(`1`, `2`).exp().tan();
5338
5339	FusionExecutor fe;
5340	fe.compileFusion(&fusion, {input});
5341	auto cg_outputs = fe.runFusion({input});
5342
5343	testValidate(&fusion, cg_outputs, {input}, {output}, __LINE__, __FILE__);
5344	}
5345
5346	TEST_F(NVFuserTest, FusionInliningMismatchedDims2_CUDA) {
5347	Fusion fusion;
5348	FusionGuard fg(&fusion);
5349
5350	auto tv0 = makeConcreteTensor({`2`, `3`, `4`});
5351	fusion.addInput(tv0);
5352	auto tv1 = sin(tv0);
5353	auto tv2 = cos(tv1);
5354	auto tv3 = transpose(tv2, `1`, `2`);
5355	auto tv4 = exp(tv3);
5356	auto tv5 = tan(tv4);
5357	fusion.addOutput(tv5);
5358
5359	inlineAllAt(tv5, -`1`, true);
5360
5361	TORCH_CHECK(tv5->getComputeAtPosition() == `3`);
5362	TORCH_CHECK(tv4->getComputeAtPosition() == `3`);
5363	TORCH_CHECK(tv3->getComputeAtPosition() == `3`);
5364	TORCH_CHECK(tv2->getComputeAtPosition() == `1`);
5365	TORCH_CHECK(tv1->getComputeAtPosition() == `1`);
5366
5367	const auto options =
5368	at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
5369	at::Tensor input = at::randn({`2`, `3`, `4`}, options);
5370	auto output = input.sin().cos().transpose(`1`, `2`).exp().tan();
5371
5372	FusionExecutor fe;
5373	fe.compileFusion(&fusion, {input});
5374	auto cg_outputs = fe.runFusion({input});
5375
5376	testValidate(&fusion, cg_outputs, {input}, {output}, __LINE__, __FILE__);
5377	}
5378
5379	TEST_F(NVFuserTest, FusionInliningMismatchedDims3_CUDA) {
5380	Fusion fusion;
5381	FusionGuard fg(&fusion);
5382
5383	auto tv0 = makeConcreteTensor({`2`, `3`, `4`});
5384	fusion.addInput(tv0);
5385	auto tv1 = sin(tv0);
5386	// broadcasting
5387	auto tv2 = broadcast(tv1, {false, true, false, true, false, true});
5388	auto tv3 = relu(tv2);
5389	// trivial reduction
5390	auto tv4 = sum(tv3, {`1`, `3`, `5`});
5391	auto tv5 = cos(tv4);
5392	auto tv6 = transpose(tv5, `1`, `2`);
5393	auto tv7 = exp(tv6);
5394	auto tv8 = tan(tv7);
5395	fusion.addOutput(tv8);
5396
5397	for (auto tv : {tv2, tv3, tv4}) {
5398	tv->merge(`0`);
5399	tv->merge(`1`);
5400	tv->merge(`2`);
5401	}
5402
5403	inlineMost();
5404
5405	TORCH_CHECK(tv8->getComputeAtPosition() == `3`);
5406	TORCH_CHECK(tv7->getComputeAtPosition() == `3`);
5407	TORCH_CHECK(tv6->getComputeAtPosition() == `3`);
5408	TORCH_CHECK(tv5->getComputeAtPosition() == `1`);
5409	TORCH_CHECK(tv4->getComputeAtPosition() == `3`);
5410	TORCH_CHECK(tv3->getComputeAtPosition() == `3`);
5411	TORCH_CHECK(tv2->getComputeAtPosition() == `3`);
5412	TORCH_CHECK(tv1->getComputeAtPosition() == `3`);
5413
5414	const auto options =
5415	at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
5416	at::Tensor input = at::randn({`2`, `3`, `4`}, options);
5417	auto output = input.sin().relu().cos().transpose(`1`, `2`).exp().tan();
5418
5419	FusionExecutor fe;
5420	fe.compileFusion(&fusion, {input});
5421	auto cg_outputs = fe.runFusion({input});
5422
5423	testValidate(&fusion, cg_outputs, {input}, {output}, __LINE__, __FILE__);
5424	}
5425
5426	TEST_F(NVFuserTest, FusionInliningMismatchedDims4_CUDA) {
5427	Fusion fusion;
5428	FusionGuard fg(&fusion);
5429
5430	auto tv0 = makeConcreteTensor({`2`, `3`, `4`});
5431	fusion.addInput(tv0);
5432	auto tv1 = sin(tv0);
5433	auto tv2 = exp(tv1);
5434	auto tv3 = relu(tv2);
5435	auto tv4 = cos(tv3);
5436	auto tv5 = tan(tv4);
5437	fusion.addOutput(tv5);
5438
5439	tv3->merge(`1`);
5440	inlineMost();
5441
5442	TORCH_CHECK(tv5->getComputeAtPosition() == `3`);
5443	TORCH_CHECK(tv4->getComputeAtPosition() == `3`);
5444	TORCH_CHECK(tv3->getComputeAtPosition() == `1`);
5445	TORCH_CHECK(tv2->getComputeAtPosition() == `1`);
5446	TORCH_CHECK(tv1->getComputeAtPosition() == `3`);
5447
5448	const auto options =
5449	at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
5450	at::Tensor input = at::randn({`2`, `3`, `4`}, options);
5451	auto output = input.sin().exp().relu().cos().tan();
5452
5453	FusionExecutor fe;
5454	fe.compileFusion(&fusion, {input});
5455	auto cg_outputs = fe.runFusion({input});
5456
5457	testValidate(&fusion, cg_outputs, {input}, {output}, __LINE__, __FILE__);
5458	}
5459
5460	TEST_F(NVFuserTest, FusionInliningBroadcast_CUDA) {
5461	Fusion fusion;
5462	FusionGuard fg(&fusion);
5463
5464	auto tv0 = makeConcreteTensor({`2`, `3`, `4`});
5465	fusion.addInput(tv0);
5466	auto tv1 = sin(tv0);
5467	// broadcasting
5468	auto tv2 = broadcast(tv1, {false, true, false, true, false, true});
5469	auto tv3 = cos(tv2);
5470	auto tv4 = tan(tv3);
5471	fusion.addOutput(tv4);
5472
5473	for (auto tv : {tv2, tv3, tv4}) {
5474	tv->merge(`0`);
5475	tv->merge(`1`);
5476	tv->merge(`2`);
5477	}
5478
5479	inlineMost();
5480
5481	TORCH_CHECK(tv4->getComputeAtPosition() == `3`);
5482	TORCH_CHECK(tv3->getComputeAtPosition() == `3`);
5483	TORCH_CHECK(tv2->getComputeAtPosition() == `3`);
5484	TORCH_CHECK(tv1->getComputeAtPosition() == `3`);
5485
5486	const auto options =
5487	at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
5488	at::Tensor input = at::randn({`2`, `3`, `4`}, options);
5489	auto output = input.sin().view({`2`, `1`, `3`, `1`, `4`, `1`}).cos().tan();
5490
5491	FusionExecutor fe;
5492	fe.compileFusion(&fusion, {input});
5493	auto cg_outputs = fe.runFusion({input});
5494
5495	testValidate(&fusion, cg_outputs, {input}, {output}, __LINE__, __FILE__);
5496	}
5497
5498	TEST_F(NVFuserTest, FusionInliningBroadcastTrivialReduction_CUDA) {
5499	Fusion fusion;
5500	FusionGuard fg(&fusion);
5501
5502	auto tv0 = makeConcreteTensor({`2`, `3`, `4`});
5503	fusion.addInput(tv0);
5504	auto tv1 = sin(tv0);
5505	// broadcasting
5506	auto tv2 = broadcast(tv1, {false, true, false, true, false, true});
5507	auto tv3 = tan(tv2);
5508	// trivial reduction
5509	auto tv4 = sum(tv3, {`1`, `3`, `5`});
5510	auto tv5 = cos(tv4);
5511	auto tv6 = exp(tv5);
5512	fusion.addOutput(tv6);
5513
5514	for (auto tv : {tv2, tv3, tv4}) {
5515	tv->merge(`0`);
5516	tv->merge(`1`);
5517	tv->merge(`2`);
5518	}
5519
5520	inlineMost();
5521
5522	TORCH_CHECK(tv6->getComputeAtPosition() == `3`);
5523	TORCH_CHECK(tv5->getComputeAtPosition() == `3`);
5524	TORCH_CHECK(tv4->getComputeAtPosition() == `3`);
5525	TORCH_CHECK(tv3->getComputeAtPosition() == `3`);
5526	TORCH_CHECK(tv2->getComputeAtPosition() == `3`);
5527	TORCH_CHECK(tv1->getComputeAtPosition() == `3`);
5528
5529	const auto options =
5530	at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
5531	at::Tensor input = at::randn({`2`, `3`, `4`}, options);
5532	auto output = input.sin().tan().cos().exp();
5533
5534	FusionExecutor fe;
5535	fe.compileFusion(&fusion, {input});
5536	auto cg_outputs = fe.runFusion({input});
5537
5538	testValidate(&fusion, cg_outputs, {input}, {output}, __LINE__, __FILE__);
5539	}
5540
5541	TEST_F(NVFuserTest, FusionMatchedLeafPosWithoutReplayTrivialReduction_CUDA) {
5542	Fusion fusion;
5543	FusionGuard fg(&fusion);
5544
5545	auto tv0 = makeConcreteTensor({`2`, `1`, `3`, `1`, `4`, `1`});
5546	fusion.addInput(tv0);
5547	auto tv1 = sum(tv0, {`1`, `3`, `5`});
5548	auto tv2 = sin(tv1);
5549	fusion.addOutput(tv1);
5550
5551	for (auto tv : {tv0, tv1}) {
5552	tv->merge(`0`);
5553	tv->merge(`1`);
5554	tv->merge(`2`);
5555	}
5556
5557	TORCH_CHECK(
5558	TransformReplay::getMatchedLeafPosWithoutReplayPasC(tv0, tv1, `3`) == `3`);
5559	TORCH_CHECK(
5560	TransformReplay::getMatchedLeafPosWithoutReplayCasP(tv1, tv0, `3`) == `3`);
5561	TORCH_CHECK(
5562	TransformReplay::getMatchedLeafPosWithoutReplayPasC(tv1, tv2, `3`) == `3`);
5563	TORCH_CHECK(
5564	TransformReplay::getMatchedLeafPosWithoutReplayCasP(tv2, tv1, `3`) == `3`);
5565	}
5566
5567	TEST_F(NVFuserTest, FusionMatchedLeafPosWithoutReplayBroadcast_CUDA) {
5568	Fusion fusion;
5569	FusionGuard fg(&fusion);
5570
5571	auto tv0 = makeConcreteTensor({`2`, `3`, `4`});
5572	fusion.addInput(tv0);
5573	auto tv1 = broadcast(tv0, {false, true, false, true, false, true});
5574	auto tv2 = sin(tv1);
5575	fusion.addOutput(tv2);
5576
5577	for (auto tv : {tv1, tv2}) {
5578	tv->merge(`0`);
5579	tv->merge(`1`);
5580	tv->merge(`2`);
5581	}
5582
5583	TORCH_CHECK(
5584	TransformReplay::getMatchedLeafPosWithoutReplayPasC(tv0, tv1, `3`) == `3`);
5585	TORCH_CHECK(
5586	TransformReplay::getMatchedLeafPosWithoutReplayCasP(tv1, tv0, `3`) == `3`);
5587	TORCH_CHECK(
5588	TransformReplay::getMatchedLeafPosWithoutReplayPasC(tv1, tv2, `3`) == `3`);
5589	TORCH_CHECK(
5590	TransformReplay::getMatchedLeafPosWithoutReplayCasP(tv2, tv1, `3`) == `3`);
5591	}
5592
5593	TEST_F(NVFuserTest, FusionIdGraphTrivialReduction_CUDA) {
5594	Fusion fusion;
5595	FusionGuard fg(&fusion);
5596
5597	auto tv0 = makeConcreteTensor({`2`, `3`, `4`});
5598	fusion.addInput(tv0);
5599	auto tv1 = broadcast(tv0, {false, true, false, true, false, true});
5600	auto tv2 = sum(tv1, {`1`, `3`, `5`});
5601	auto tv3 = sin(tv2);
5602	fusion.addOutput(tv3);
5603
5604	for (auto tv : {tv1, tv2}) {
5605	tv->merge(`0`);
5606	tv->merge(`1`);
5607	tv->merge(`2`);
5608	}
5609
5610	inlineMost();
5611
5612	ComputeAtMap ca_map(&fusion);
5613
5614	auto all_tvs = ir_utils::allTvs(&fusion);
5615	for (auto tv1 : all_tvs) {
5616	for (auto tv2 : all_tvs) {
5617	if (tv1->isFusionInput() \|\| tv2->isFusionInput()) {
5618	continue;
5619	}
5620	for (int i : c10::irange(`3`)) {
5621	auto id1 = tv1->axis(i);
5622	auto id2 = tv2->axis(i);
5623	TORCH_CHECK(ca_map.areMapped(id1, id2, IdMappingMode::LOOP));
5624	TORCH_CHECK(ca_map.areMapped(id1, id2, IdMappingMode::PERMISSIVE));
5625	}
5626	}
5627	}
5628	}
5629
5630	TEST_F(NVFuserTest, FusionPrint_CUDA) {
5631	auto dtypes = {
5632	at::kFloat,
5633	at::kDouble,
5634	at::kHalf,
5635	at::kBFloat16,
5636	at::kInt,
5637	at::kLong,
5638	at::kBool};
5639	for (auto dtype : dtypes) {
5640	auto fusion = std::make_unique<Fusion>();
5641	FusionGuard fg(fusion.get());
5642
5643	auto tv0 = makeSymbolicTensor(`1`, aten_to_data_type(dtype));
5644	fusion ->addInput(tv0);
5645	auto tv1 = print(tv0);
5646	auto tv2 = sin(tv1);
5647	fusion ->addOutput(tv2);
5648
5649	// There is no way to check if anything is printed to the console, but we
5650	// can validate that when print exist, compilation and computation are not
5651	// broken.
5652	auto options = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, `0`);
5653	at::Tensor t0 = at::arange(`2`, options).to(dtype);
5654
5655	FusionExecutorCache executor_cache(std::move(fusion));
5656	auto cg_outputs = executor_cache.runFusionWithInputs({t0});
5657
5658	testValidate(
5659	executor_cache.fusion(),
5660	cg_outputs,
5661	{t0},
5662	{t0.sin()},
5663	__LINE__,
5664	__FILE__);
5665	}
5666	}
5667
5668	TEST_F(NVFuserTest, FusionCheckedSymbolicShape_CUDA) {
5669	const auto options =
5670	at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
5671
5672	at::Tensor a = at::randn({`123`, `456`}, options);
5673	at::Tensor b = at::randn({`123`, `456`}, options);
5674	at::Tensor c = at::randn({`321`, `654`}, options);
5675
5676	using return_t =
5677	std::pair<std::unique_ptr<FusionExecutorCache>, std::vector<at::Tensor>>;
5678	auto matched_add = [](at::Tensor a, at::Tensor b) -> return_t {
5679	auto fusion = std::make_unique<Fusion>();
5680	FusionGuard fg(fusion.get());
5681
5682	Val* s1 = IrBuilder::create<Int>();
5683	Val* s2 = IrBuilder::create<Int>();
5684	auto builder = TensorViewBuilder ().shape(std::vector<Val*>{s1, s2});
5685	TensorView* tv0 = builder.build();
5686	TensorView* tv1 = builder.build();
5687
5688	fusion ->addInput(tv0);
5689	fusion ->addInput(tv1);
5690
5691	auto tv2 = add(tv0, tv1);
5692
5693	fusion ->addOutput(tv2);
5694
5695	auto executor_cache =
5696	std::make_unique<FusionExecutorCache>(std::move(fusion));
5697	auto cg_outputs = executor_cache ->runFusionWithInputs({a, b});
5698	return {std::move(executor_cache), std::move(cg_outputs)};
5699	};
5700
5701	{
5702	auto ret1 = matched_add (a, b);
5703	testValidate(
5704	ret1.first ->fusion(), ret1.second, {a, b}, {a + b}, __LINE__, __FILE__);
5705	}
5706
5707	{
5708	EXPECT_THAT(
5709	[&]() { matched_add(a, c); },
5710	::testing::ThrowsMessage<c10::Error>(
5711	::testing::HasSubstr("Attempting to bind")));
5712	}
5713	}
5714
5715	TEST_F(NVFuserTest, FusionSizeDependentData_CUDA) {
5716	auto fusion = std::make_unique<Fusion>();
5717	FusionGuard fg(fusion.get());
5718
5719	Val* s1 = IrBuilder::create<Int>();
5720	auto builder = TensorViewBuilder ().shape(std::vector<Val*>{s1});
5721	TensorView* tv0 = builder.build();
5722
5723	fusion ->addInput(tv0);
5724
5725	auto tv1 = add(tv0, s1);
5726
5727	fusion ->addOutput(tv1);
5728
5729	const auto options =
5730	at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
5731
5732	at::Tensor a = at::zeros({`123`}, options);
5733
5734	FusionExecutorCache executor_cache(std::move(fusion));
5735	auto cg_outputs = executor_cache.runFusionWithInputs({a});
5736
5737	testValidate(
5738	executor_cache.fusion(), cg_outputs, {a}, {a + `123`}, __LINE__, __FILE__);
5739	}
5740
5741	TEST_F(NVFuserTest, FusionDependencyCheck_CUDA) {
5742	Fusion fusion;
5743	FusionGuard fg(&fusion);
5744
5745	TensorView* tv0 = makeSymbolicTensor(`1`);
5746	TensorView* tv1 = makeSymbolicTensor(`1`);
5747	TensorView* tv2 = makeSymbolicTensor(`1`);
5748	TensorView* tv3 = makeSymbolicTensor(`1`);
5749
5750	auto tv4 = add(tv0, tv1);
5751	auto tv5 = add(tv0, tv2);
5752	auto tv6 = add(tv0, tv3);
5753
5754	auto tv7 = add(tv1, tv2);
5755	auto tv8 = add(tv1, tv3);
5756
5757	auto tv9 = add(tv2, tv3);
5758
5759	{
5760	auto all_vals = DependencyCheck::getAllValsBetween(
5761	{tv0, tv1}, {tv4, tv5, tv6, tv7, tv8, tv9});
5762	std::unordered_set<Val*> all_vals_set(all_vals.begin(), all_vals.end());
5763	std::vector<Val*> results({tv0, tv1, tv4, tv5, tv6, tv7, tv8});
5764	for (auto result : results) {
5765	TORCH_CHECK(all_vals_set.count(result) > `0`);
5766	all_vals_set.erase(result);
5767	}
5768	TORCH_CHECK(all_vals_set.empty());
5769	}
5770
5771	auto tv10 = add(tv6, tv7);
5772	{
5773	auto all_vals = DependencyCheck::getAllValsBetween({tv0, tv1}, {tv10});
5774	std::unordered_set<Val*> all_vals_set(all_vals.begin(), all_vals.end());
5775	std::vector<Val*> results({tv0, tv1, tv6, tv7, tv10});
5776	for (auto result : results) {
5777	TORCH_CHECK(all_vals_set.count(result) > `0`);
5778	all_vals_set.erase(result);
5779	}
5780	TORCH_CHECK(all_vals_set.empty());
5781	}
5782	}
5783
5784	// Repro for issue #1925
5785	TEST_F(NVFuserTest, FusionScheduleTransposeRepro1_CUDA) {
5786	Fusion fusion;
5787	FusionGuard fg(&fusion);
5788
5789	auto tv0 = makeSymbolicTensor(`4`);
5790	auto tv1 = makeConcreteTensor({-`1`, -`1`, -`1`, `1`});
5791	fusion.addInput(tv0);
5792	fusion.addInput(tv1);
5793	auto tv2 = add(tv0, tv1);
5794	fusion.addOutput(tv2);
5795
5796	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
5797	at::Tensor input0 = at::randn({`1`, `1`, `333`, `1`}, options);
5798	at::Tensor input1 = at::randn({`1`, `1`, `333`, `1`}, options);
5799
5800	auto lparams = scheduleTranspose(&fusion, {input0, input1});
5801
5802	FusionExecutor fe;
5803	fe.compileFusion(&fusion, {input0, input1}, lparams);
5804	auto outputs = fe.runFusion({input0, input1}, lparams);
5805
5806	auto tv_ref = input0 + input1;
5807
5808	testValidate(
5809	&fusion, outputs, {input0, input1}, {tv_ref}, __LINE__, __FILE__);
5810	}
5811
5812	// Repro for issue #1873
5813	TEST_F(NVFuserTest, FusionInlineBroadcastIndexing0_CUDA) {
5814	Fusion fusion;
5815	FusionGuard fg(&fusion);
5816
5817	auto tv0 = makeContigTensor(`1`);
5818	auto tv1 = makeContigTensor(`2`);
5819	fusion.addInput(tv0);
5820	fusion.addInput(tv1);
5821	auto tv2 = set(tv0);
5822	auto tv3 = broadcast(tv2, {true, false});
5823	auto tv4 = add(tv3, tv1);
5824	fusion.addOutput(tv4);
5825
5826	tv4->merge(`0`);
5827	tv4->split(`0`, `32`);
5828
5829	tv0->computeAt(tv4, `1`);
5830
5831	tv2->split(-`1`, `8`);
5832
5833	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
5834	at::Tensor t0 = at::randn({`123`}, options);
5835	at::Tensor t1 = at::randn({`3`, `123`}, options);
5836
5837	FusionExecutor fe;
5838	fe.compileFusion(&fusion, {t0, t1});
5839
5840	auto outputs = fe.runFusion({t0, t1});
5841
5842	auto tv_ref = t0 + t1;
5843
5844	testValidate(&fusion, outputs, {t0, t1}, {tv_ref}, __LINE__, __FILE__);
5845	}
5846
5847	TEST_F(NVFuserTest, FusionPredicateUnshare_CUDA) {
5848	// https://github.com/csarofeen/pytorch/issues/1926
5849	std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
5850	auto fusion = fusion_ptr.get();
5851	FusionGuard fg(fusion);
5852
5853	TensorView* tv0 = makeSymbolicTensor(`2`);
5854	fusion->addInput(tv0);
5855	auto tv1 = set(tv0);
5856	auto tv2 = set(tv1);
5857	fusion->addOutput(tv2);
5858
5859	tv1->setMemoryType(MemoryType::Shared);
5860	for (auto tv : {tv1, tv2}) {
5861	tv->split(`0`, `4`);
5862	tv->reorder({{`1`, -`1`}});
5863	tv->split(`1`, `8`);
5864	tv->merge(`0`);
5865	tv->split(`0`, `1`);
5866	tv->axis(`0`)->parallelize(ParallelType::BIDx);
5867	tv->axis(`1`)->parallelize(ParallelType::Unswitch);
5868	}
5869	tv1->merge(`2`);
5870	tv2->reorder({{`2`, `3`}});
5871	tv2->merge(`2`);
5872	for (auto tv : {tv1, tv2}) {
5873	tv->axis(-`1`)->parallelize(ParallelType::TIDx);
5874	}
5875
5876	inlineMost();
5877
5878	auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, `0`);
5879	at::Tensor t0 = at::randn({`5`, `5`}, options);
5880
5881	FusionExecutor fe;
5882	fe.compileFusion(fusion, {t0});
5883	auto cg_outputs = fe.runFusion({t0});
5884	auto out = cg_outputs [`0`];
5885
5886	testValidate(fusion, {out}, {t0}, {t0}, __LINE__, __FILE__);
5887	}
5888
5889	TEST_F(NVFuserTest, AsyncCompilation_CUDA) {
5890	auto fusion = std::make_unique<Fusion>();
5891	FusionGuard fg(fusion.get());
5892
5893	TensorView* tv0 = makeSymbolicTensor(`2`);
5894	TensorView* tv1 = makeSymbolicTensor(`1`);
5895	TensorView* tv2 = makeSymbolicTensor(`2`);
5896
5897	fusion ->addInput(tv0);
5898	fusion ->addInput(tv1);
5899	fusion ->addInput(tv2);
5900
5901	TensorView* tv3 = add(tv0, IrBuilder::create<Double>(`1`)); // Group 0
5902	TensorView* tv4 =
5903	max(tv3, {`0`}); // Group 0 (use max instead to avoid numerical issues)
5904	TensorView* tv5 = add(tv4, tv1); // Group 0 (Non Broadcast after reduce,
5905	// keeps normalization scheduler away)
5906	TensorView* tv6 = add(tv5, tv2); // Group 1 (Broadcast after reduce)
5907
5908	fusion ->addOutput(tv6);
5909
5910	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
5911
5912	at::Tensor t0 = at::randn({`8`, `5`}, options);
5913	at::Tensor t1 = at::randn({`5`}, options);
5914	at::Tensor t2 = at::randn({`8`, `5`}, options);
5915
5916	auto t3 = t0.add(`1.0`);
5917	auto t4 = std::get<`0`>(at::max(t3, `0`));
5918	auto t5 = t4.add(t1);
5919	auto t6 = t5.add(t2);
5920
5921	FusionExecutorCache executor_cache(std::move(fusion));
5922
5923	std::vector<IValue> aten_inputs = {t0, t1, t2};
5924
5925	executor_cache.compileFusionAsync(aten_inputs);
5926
5927	while (!executor_cache.isCompiled(aten_inputs)) {
5928	std::this_thread::sleep_for(std::chrono::milliseconds (`20`));
5929	printf(".");
5930	}
5931
5932	auto outputs = executor_cache.runFusionWithInputs(aten_inputs);
5933
5934	TORCH_CHECK(
5935	executor_cache.getMostRecentKernelRuntime()->isSegmented(),
5936	"segmentation didn't happen");
5937	TORCH_CHECK(
5938	executor_cache.getMostRecentKernelRuntime()
5939	->fusionSegments()
5940	->groups()
5941	.size() == `2`,
5942	"segmentation didn't happen as expected");
5943
5944	testValidate(
5945	executor_cache.fusion(), outputs, aten_inputs, {t6}, __LINE__, __FILE__);
5946	}
5947
5948	TEST_F(NVFuserTest, FusionMergeBroadcastingTrivialReduction1_CUDA) {
5949	std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
5950	auto fusion = fusion_ptr.get();
5951	FusionGuard fg(fusion);
5952
5953	TensorView* tv0 = makeConcreteTensor({`1`, `1`});
5954	TensorView* tv1 = makeConcreteTensor({-`1`});
5955	fusion->addInput(tv0);
5956	fusion->addInput(tv1);
5957	auto tv2 = sum(tv0, {`1`});
5958	auto tv3 = add(tv2, tv1);
5959	fusion->addOutput(tv3);
5960
5961	tv0->merge(`0`);
5962
5963	MaxRootDomainInfoSpanningTree tree(tv0);
5964	TransformPropagatorWithCheck tp(tv0);
5965	tree.traverse(&tp);
5966
5967	inlineMost();
5968
5969	auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, `0`);
5970	at::Tensor t0 = at::randn({`1`, `1`}, options);
5971	at::Tensor t1 = at::randn({`10`}, options);
5972
5973	FusionExecutor fe;
5974	fe.compileFusion(fusion, {t0, t1});
5975	auto cg_outputs = fe.runFusion({t0, t1});
5976	auto out = cg_outputs [`0`];
5977
5978	testValidate(
5979	fusion, {out}, {t0, t1}, {t1 + t0.flatten()}, __LINE__, __FILE__);
5980	}
5981
5982	TEST_F(NVFuserTest, FusionMergeBroadcastingTrivialReduction2_CUDA) {
5983	std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
5984	auto fusion = fusion_ptr.get();
5985	FusionGuard fg(fusion);
5986
5987	TensorView* tv0 = makeConcreteTensor({-`1`, `1`, `1`});
5988	TensorView* tv1 = makeConcreteTensor({-`1`, -`1`});
5989	fusion->addInput(tv0);
5990	fusion->addInput(tv1);
5991	auto tv2 = sum(tv0, {`1`});
5992	auto tv3 = add(tv2, tv1);
5993	fusion->addOutput(tv3);
5994
5995	tv2->merge(`1`);
5996	tv2->merge(`0`);
5997
5998	MaxRootDomainInfoSpanningTree tree(tv0);
5999	TransformPropagatorWithCheck tp(tv0);
6000	tree.traverse(&tp);
6001
6002	inlineMost();
6003
6004	auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, `0`);
6005	at::Tensor t0 = at::randn({`10`, `1`, `1`}, options);
6006	at::Tensor t1 = at::randn({`10`, `10`}, options);
6007
6008	FusionExecutor fe;
6009	fe.compileFusion(fusion, {t0, t1});
6010	auto cg_outputs = fe.runFusion({t0, t1});
6011	auto out = cg_outputs [`0`];
6012
6013	testValidate(
6014	fusion, {out}, {t0, t1}, {t1 + t0.squeeze(-`1`)}, __LINE__, __FILE__);
6015	}
6016
6017	// Simple test case exercising the null scheduler path.
6018	TEST_F(NVFuserTest, FusionNullScheduler_CUDA) {
6019	auto fusion = std::make_unique<Fusion>();
6020	FusionGuard fg(fusion.get());
6021
6022	auto tv0 = makeConcreteTensor({`1`, `1`, `1`});
6023	fusion ->addInput(tv0);
6024
6025	auto tv1 = sum(tv0, {`0`, `1`, `2`});
6026
6027	fusion ->addOutput(tv1);
6028
6029	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
6030	at::Tensor t0 = at::randn({`1`, `1`, `1`}, options);
6031
6032	std::vector<IValue> aten_inputs({t0});
6033
6034	FusionExecutorCache executor_cache(std::move(fusion));
6035	auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
6036
6037	auto t1 = t0.sum({`0`, `1`, `2`});
6038
6039	testValidate(
6040	executor_cache.fusion(), cg_outputs, {t0}, {t1}, __LINE__, __FILE__);
6041
6042	auto groups =
6043	executor_cache.getMostRecentKernelRuntime()->fusionSegments()->groups();
6044
6045	// Check that all groups on the resulting runtime are null.
6046	for (auto group : groups) {
6047	TORCH_INTERNAL_ASSERT(group->heuristic() == ScheduleHeuristic::NoOp);
6048	}
6049	}
6050
6051	// Simple test case exercising the null scheduler path.
6052	TEST_F(NVFuserTest, FusionNullScheduler2_CUDA) {
6053	auto fusion = std::make_unique<Fusion>();
6054	FusionGuard fg(fusion.get());
6055
6056	auto tv0 = makeConcreteTensor({`0`, `1`, `9223372036854775807L`});
6057	fusion ->addInput(tv0);
6058
6059	auto tv1 = sum(tv0, {`0`, `1`, `2`});
6060
6061	fusion ->addOutput(tv1);
6062
6063	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
6064	at::Tensor t0 = at::randn({`0`, `1`, `9223372036854775807L`}, options);
6065
6066	std::vector<IValue> aten_inputs({t0});
6067
6068	FusionExecutorCache executor_cache(std::move(fusion));
6069	auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
6070
6071	auto t1 = t0.sum({`0`, `1`, `2`});
6072
6073	testValidate(
6074	executor_cache.fusion(), cg_outputs, {t0}, {t1}, __LINE__, __FILE__);
6075
6076	auto groups =
6077	executor_cache.getMostRecentKernelRuntime()->fusionSegments()->groups();
6078
6079	// Check that all groups on the resulting runtime are null.
6080	for (auto group : groups) {
6081	TORCH_INTERNAL_ASSERT(group->heuristic() == ScheduleHeuristic::NoOp);
6082	}
6083	}
6084
6085	// Simple test case exercising the null scheduler path.
6086	TEST_F(NVFuserTest, FusionNullScheduler3_CUDA) {
6087	auto fusion = std::make_unique<Fusion>();
6088	FusionGuard fg(fusion.get());
6089
6090	auto tv0 = TensorViewBuilder ().ndims(`0`).build();
6091	auto tv1 = TensorViewBuilder ().ndims(`0`).build();
6092	fusion ->addInput(tv0);
6093	fusion ->addInput(tv1);
6094	auto tv2 = add(tv0, tv1);
6095	fusion ->addOutput(tv2);
6096
6097	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
6098	at::Tensor t0 = at::randn({}, options);
6099	at::Tensor t1 = at::randn({}, options);
6100
6101	std::vector<IValue> aten_inputs({t0, t1});
6102
6103	FusionExecutorCache executor_cache(std::move(fusion));
6104	auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
6105
6106	testValidate(
6107	executor_cache.fusion(),
6108	cg_outputs,
6109	{t0, t1},
6110	{t0 + t1},
6111	__LINE__,
6112	__FILE__);
6113
6114	auto groups =
6115	executor_cache.getMostRecentKernelRuntime()->fusionSegments()->groups();
6116
6117	// Check that all groups on the resulting runtime are null.
6118	for (auto group : groups) {
6119	TORCH_INTERNAL_ASSERT(group->heuristic() == ScheduleHeuristic::NoOp);
6120	}
6121	}
6122
6123	TEST_F(NVFuserTest, FusionEmpty_CUDA) {
6124	auto fusion = std::make_unique<Fusion>();
6125	FusionGuard fg(fusion.get());
6126
6127	auto tv0 = makeConcreteTensor({`10`, `10`, `10`});
6128	auto tv1 = makeConcreteTensor({`10`, `10`, `10`});
6129	fusion ->addInput(tv0);
6130	fusion ->addInput(tv1);
6131	fusion ->addOutput(tv0);
6132	fusion ->addOutput(tv1);
6133
6134	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
6135	at::Tensor t0 = at::randn({`10`, `10`, `10`}, options);
6136	at::Tensor t1 = at::randn({`10`, `10`, `10`}, options);
6137
6138	std::vector<IValue> aten_inputs({t0, t1});
6139
6140	FusionExecutorCache executor_cache(std::move(fusion));
6141	auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
6142
6143	testValidate(
6144	executor_cache.fusion(),
6145	cg_outputs,
6146	{t0, t1},
6147	{t0, t1},
6148	__LINE__,
6149	__FILE__);
6150
6151	auto groups =
6152	executor_cache.getMostRecentKernelRuntime()->fusionSegments()->groups();
6153
6154	// Check that all groups on the resulting runtime are null.
6155	for (auto group : groups) {
6156	TORCH_INTERNAL_ASSERT(group->heuristic() == ScheduleHeuristic::NoOp);
6157	}
6158	}
6159
6160	TEST_F(NVFuserTest, FusionMappingRelation_CUDA) {
6161	std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
6162	auto fusion = fusion_ptr.get();
6163	FusionGuard fg(fusion);
6164
6165	TensorView* tv0 = makeConcreteTensor({`1`, `1`});
6166	TensorView* tv1 = makeConcreteTensor({-`1`, `1`, `1`});
6167	fusion->addInput(tv0);
6168	fusion->addInput(tv1);
6169	auto tv2 = set(tv0);
6170	auto tv3 = broadcast(tv2, {true, false, false});
6171	auto tv4 = add(tv3, tv1);
6172
6173	fusion->addOutput(tv4);
6174
6175	tv4->merge(-`2`);
6176	tv4->merge(-`1`);
6177
6178	tv0->computeAt(tv4, -`1`);
6179	tv1->computeAt(tv4, -`1`);
6180
6181	ComputeAtMap ca_map(fusion);
6182
6183	// FIXME: This is the concerning part that would motivate some
6184	// more formalization on concrete/permissive mapping:
6185	// exact mapping should ideally imply permissive mapping.
6186	auto tv4_inner_node = tv4->axis(`0`)->definition()->input(`1`)->as<IterDomain>();
6187	TORCH_CHECK(
6188	ca_map.areMapped(tv2->axis(`0`), tv4_inner_node, IdMappingMode::EXACT));
6189	TORCH_CHECK(!ca_map.areMapped(
6190	tv2->axis(`0`), tv4_inner_node, IdMappingMode::PERMISSIVE));
6191
6192	auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, `0`);
6193	at::Tensor t0 = at::randn({`1`, `1`}, options);
6194	at::Tensor t1 = at::randn({`2`, `1`, `1`}, options);
6195
6196	FusionExecutor fe;
6197	fe.compileFusion(fusion, {t0, t1});
6198	auto cg_outputs = fe.runFusion({t0, t1});
6199	auto out = cg_outputs [`0`];
6200
6201	testValidate(
6202	fusion, {out}, {t0, t1}, {t1 + t0.squeeze(`0`)}, __LINE__, __FILE__);
6203	}
6204
6205	TEST_F(NVFuserTest, FusionInlineAt_CUDA) {
6206	std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
6207	auto fusion = fusion_ptr.get();
6208	FusionGuard fg(fusion);
6209
6210	TensorView* tv0 = makeSymbolicTensor(`2`);
6211	fusion->addInput(tv0);
6212	auto tv1 = sin(tv0);
6213	auto tv2 = cos(tv1);
6214	fusion->addOutput(tv2);
6215
6216	tv1->inlineAt(-`1`);
6217
6218	auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, `0`);
6219	at::Tensor t0 = at::randn({`100`, `2`}, options);
6220
6221	FusionExecutor fe;
6222	fe.compileFusion(fusion, {t0});
6223	auto cg_outputs = fe.runFusion({t0});
6224	auto out = cg_outputs [`0`];
6225
6226	testValidate(fusion, {out}, {t0}, {t0.sin().cos()}, __LINE__, __FILE__);
6227	}
6228
6229	TEST_F(NVFuserTest, FusionTrivialInputForwarding_CUDA) {
6230	std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
6231	auto fusion = fusion_ptr.get();
6232	FusionGuard fg(fusion);
6233
6234	TensorView* tv0 = makeConcreteTensor({-`1`, -`1`});
6235	TensorView* tv1 = makeConcreteTensor({-`1`, -`1`});
6236	fusion->addInput(tv0);
6237	fusion->addInput(tv1);
6238	// Note: tv2 is not needed. Kept it here since previously there was an
6239	// assertion from sorting in codegen.
6240	auto tv2 = add(tv1, IrBuilder::create<Double>(`3.141`));
6241	fusion->addOutput(tv0);
6242
6243	auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, `0`);
6244	at::Tensor t0 = at::randn({`10`, `4`}, options);
6245	at::Tensor t1 = at::randn({`10`, `4`}, options);
6246
6247	FusionExecutorCache fec(std::move(fusion_ptr));
6248	auto cg_outputs = fec.runFusionWithInputs({t0, t1});
6249
6250	testValidate(fusion, cg_outputs, {t0, t1}, {t0}, __LINE__, __FILE__);
6251
6252	// Second run to ensure cache hit handles trivial forwarding properly
6253	TORCH_CHECK(fec.isCompiled({t0, t1}));
6254	auto cg_outputs2 = fec.runFusionWithInputs({t0, t1});
6255	testValidate(fusion, cg_outputs2, {t0, t1}, {t0}, __LINE__, __FILE__);
6256	}
6257
6258	TEST_F(NVFuserTest, FusionTrivialInputForwarding2_CUDA) {
6259	std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
6260	auto fusion = fusion_ptr.get();
6261	FusionGuard fg(fusion);
6262
6263	TensorView* tv0 = makeSymbolicTensor(`0`);
6264	fusion->addInput(tv0);
6265	fusion->addOutput(tv0);
6266
6267	auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, `0`);
6268	at::Tensor t0 = at::randn({}, options);
6269
6270	FusionExecutorCache fec(std::move(fusion_ptr));
6271	auto cg_outputs = fec.runFusionWithInputs({t0});
6272
6273	testValidate(fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
6274
6275	// Second run to ensure cache hit handles trivial forwarding properly
6276	TORCH_CHECK(fec.isCompiled({t0}));
6277	auto cg_outputs2 = fec.runFusionWithInputs({t0});
6278	testValidate(fusion, cg_outputs2, {t0}, {t0}, __LINE__, __FILE__);
6279	}
6280
6281	// Simplified repro of issue #2008
6282	TEST_F(NVFuserTest, FusionReplayTrivialReductionAndBroadcast2_CUDA) {
6283	auto fusion_ptr = std::make_unique<Fusion>();
6284	Fusion& fusion = *fusion_ptr;
6285	FusionGuard fg(fusion_ptr.get());
6286
6287	std::vector<int64_t> shape({`10`, `1`, `1`});
6288
6289	auto tv0 = makeConcreteTensor(shape);
6290	fusion.addInput(tv0);
6291
6292	auto tv1 = add(tv0, IrBuilder::create<Double>(`1`));
6293	auto tv2 = sum(tv1, {`1`, `2`});
6294	auto tv3 = broadcast(tv2, {false, true, true});
6295	fusion.addOutput(tv3);
6296
6297	tv0->merge(-`2`, -`1`)->merge(-`2`, -`1`)->split(`0`, `4`);
6298
6299	MaxRootDomainInfoSpanningTree tree(tv0);
6300	TransformPropagator tp(tv0);
6301	tree.traverse(&tp);
6302
6303	auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, `0`);
6304	at::Tensor t0 = at::randn(shape, options);
6305	std::vector<IValue> aten_inputs({t0});
6306
6307	FusionExecutor fe;
6308	fe.compileFusion(fusion_ptr.get(), aten_inputs);
6309	auto outputs = fe.runFusion(aten_inputs);
6310
6311	testValidate(&fusion, outputs, aten_inputs, {t0 + `1`}, __LINE__, __FILE__);
6312	}
6313
6314	namespace {
6315
6316	size_t getVecSizeForPointwise(FusionExecutorCache& fec) {
6317	auto most_recent_params =
6318	fec.getMostRecentKernelRuntime()->getMostRecentExecutorLog().params;
6319	auto params = dynamic_cast<PointwiseParams*>(most_recent_params.get());
6320	if (params->vectorize) {
6321	return params->unroll_factor;
6322	}
6323	return `1`;
6324	}
6325
6326	} // namespace
6327
6328	TEST_F(NVFuserTest, FusionVectorizeStrideContiguity2D_CUDA) {
6329	std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
6330	auto fusion = fusion_ptr.get();
6331	FusionGuard fg(fusion);
6332
6333	TensorView* tv0 =
6334	TensorViewBuilder ().ndims(`2`).contiguity({false, true}).build();
6335	fusion->addInput(tv0);
6336	auto tv1 = set(tv0);
6337	fusion->addOutput(tv1);
6338
6339	FusionExecutorCache fec(std::move(fusion_ptr));
6340	fec.profile(true);
6341
6342	std::vector<std::pair<int, int>> size_and_vec{{`17`, `1`}, {`18`, `2`}, {`32`, `4`}};
6343
6344	for (auto pair : size_and_vec) {
6345	auto size = pair.first;
6346	auto vec = pair.second;
6347	auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, `0`);
6348	at::Tensor t0 = at::randn({`1000000`, size}, options).narrow(`1`, `0`, `16`);
6349	auto cg_outputs = fec.runFusionWithInputs({t0});
6350
6351	TORCH_CHECK(getVecSizeForPointwise(fec) == (size_t)vec);
6352
6353	testValidate(fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
6354	}
6355	}
6356
6357	TEST_F(NVFuserTest, FusionVectorizeStrideContiguity3D_CUDA) {
6358	std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
6359	auto fusion = fusion_ptr.get();
6360	FusionGuard fg(fusion);
6361
6362	TensorView* tv0 =
6363	TensorViewBuilder ().ndims(`3`).contiguity({false, true, true}).build();
6364	fusion->addInput(tv0);
6365	auto tv1 = set(tv0);
6366	fusion->addOutput(tv1);
6367
6368	FusionExecutorCache fec(std::move(fusion_ptr));
6369	fec.profile(true);
6370
6371	std::vector<std::pair<int, int>> size_and_vec{{`17`, `1`}, {`10`, `2`}, {`16`, `4`}};
6372
6373	for (auto pair : size_and_vec) {
6374	auto size = pair.first;
6375	auto vec = pair.second;
6376	auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, `0`);
6377	at::Tensor t0 = at::randn({`1000000`, size, `3`}, options).narrow(`1`, `0`, `8`);
6378	auto cg_outputs = fec.runFusionWithInputs({t0});
6379
6380	TORCH_CHECK(getVecSizeForPointwise(fec) == (size_t)vec);
6381
6382	testValidate(fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
6383	}
6384	}
6385
6386	TEST_F(NVFuserTest, FusionVectorizeStrideContiguity5D_CUDA) {
6387	std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
6388	auto fusion = fusion_ptr.get();
6389	FusionGuard fg(fusion);
6390
6391	TensorView* tv0 = TensorViewBuilder ()
6392	.ndims(`5`)
6393	.contiguity({false, true, false, true, true})
6394	.build();
6395	fusion->addInput(tv0);
6396	auto tv1 = set(tv0);
6397	fusion->addOutput(tv1);
6398
6399	FusionExecutorCache fec(std::move(fusion_ptr));
6400	fec.profile(true);
6401
6402	auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, `0`);
6403
6404	std::vector<std::tuple<int, int, int>> sizes_and_vec{
6405	{`9`, `17`, `1`}, {`9`, `10`, `2`}, {`9`, `16`, `4`}};
6406
6407	for (auto tup : sizes_and_vec) {
6408	auto size1 = std::get<`0`>(tup);
6409	auto size2 = std::get<`1`>(tup);
6410	auto vec = std::get<`2`>(tup);
6411	at::Tensor t0 = at::randn({`4`, size1, `12345`, size2, `3`}, options)
6412	.narrow(`1`, `0`, `8`)
6413	.narrow(`3`, `0`, `4`);
6414	auto cg_outputs = fec.runFusionWithInputs({t0});
6415
6416	TORCH_CHECK(getVecSizeForPointwise(fec) == (size_t)vec);
6417
6418	testValidate(fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
6419	}
6420	}
6421
6422	TEST_F(NVFuserTest, FusionVectorizeStrideContiguitySelfOverlapping_CUDA) {
6423	std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
6424	auto fusion = fusion_ptr.get();
6425	FusionGuard fg(fusion);
6426
6427	TensorView* tv0 = TensorViewBuilder ()
6428	.ndims(`5`)
6429	.contiguity({false, true, false, true, true})
6430	.build();
6431	fusion->addInput(tv0);
6432	auto tv1 = set(tv0);
6433	fusion->addOutput(tv1);
6434
6435	FusionExecutorCache fec(std::move(fusion_ptr));
6436	fec.profile(true);
6437
6438	auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, `0`);
6439
6440	std::vector<std::tuple<int, int, int, int>> sizes_strides_and_vec{
6441	{`4`, `4`, `4`, `4`},
6442	{`4`, `4`, `2`, `2`},
6443	{`4`, `2`, `4`, `2`},
6444	{`2`, `4`, `4`, `2`},
6445	{`4`, `4`, `1`, `1`},
6446	{`4`, `1`, `4`, `1`},
6447	{`1`, `4`, `4`, `1`},
6448	{`2`, `2`, `2`, `2`},
6449	{`2`, `2`, `1`, `1`},
6450	{`2`, `1`, `2`, `1`},
6451	{`1`, `2`, `2`, `1`}};
6452
6453	for (auto tup : sizes_strides_and_vec) {
6454	auto size = std::get<`0`>(tup);
6455	auto stride1 = std::get<`1`>(tup);
6456	auto stride2 = std::get<`2`>(tup);
6457	auto vec = std::get<`3`>(tup);
6458	std::vector<int64_t> shape = {`4`, `4`, `12345`, size, `3`};
6459	std::vector<int64_t> stride = {stride1, stride2 * `12345`, stride2, `3`, `1`};
6460	at::Tensor t0 = at::empty_strided(shape, stride, options);
6461	t0.random_();
6462	auto cg_outputs = fec.runFusionWithInputs({t0});
6463	TORCH_CHECK(getVecSizeForPointwise(fec) == (size_t)vec);
6464	testValidate(fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
6465	}
6466	}
6467
6468	TEST_F(NVFuserTest, FusionSimpleAmperePipeline_CUDA) {
6469	Fusion fusion;
6470	FusionGuard fg(&fusion);
6471
6472	// requires ampere+ GPU
6473	if (!deviceMajorMinorCheck(`8`)) {
6474	GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs";
6475	return;
6476	}
6477
6478	auto tv0 = makeContigTensor(`1`);
6479
6480	fusion.addInput(tv0);
6481
6482	auto tv1 = set(tv0);
6483
6484	fusion.addOutput(tv1);
6485
6486	auto tv_cache = tv0->cacheAfter(LoadStoreOpType::CpAsync);
6487	tv_cache->setMemoryType(MemoryType::Shared);
6488
6489	tv1->split(`0`, `16`);
6490	tv0->computeAt(tv1, `1`);
6491
6492	tv_cache->circularBuffer(`10`);
6493
6494	auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, `0`);
6495	at::Tensor input1 = at::randn({`255`}, options);
6496
6497	// Add check that the cp async op has an inlined predicate.
6498	class InlinedCpAsyncPredChecker : public kir::IrVisitor {
6499	public:
6500	using kir::IrVisitor::handle;
6501
6502	private:
6503	void handle(kir::IfThenElse* ite) final {
6504	auto prev_within_ite = within_ite_;
6505	within_ite_ = true;
6506	kir::IrVisitor::handle(ite);
6507	within_ite_ = prev_within_ite;
6508	}
6509
6510	void handle(LoadStoreOp* ldst) final {
6511	if (ldst->opType() == LoadStoreOpType::CpAsync) {
6512	TORCH_INTERNAL_ASSERT(!within_ite_, "CPASYNC predicate not inlined");
6513	TORCH_INTERNAL_ASSERT(
6514	ldst->predicate()->hasValue() &&
6515	!ldst->predicate()->value()->isConst(),
6516	"CPASYNC predicate is not generated");
6517	}
6518	}
6519
6520	private:
6521	bool within_ite_ = false;
6522	} pred_checker;
6523
6524	// Check that cp async is inlined:
6525	GpuLower gpulw(&fusion);
6526	pred_checker.handle(gpulw.kernel()->topLevelExprs());
6527
6528	FusionExecutor fe;
6529	fe.compileFusion(&fusion, {input1});
6530	auto cg_outputs = fe.runFusion({input1});
6531
6532	testValidate(&fusion, cg_outputs, {input1}, {input1}, __LINE__, __FILE__);
6533	}
6534
6535	// Test file size should be up to 10K LoC. Create a new file for more tests.
6536
6537	} // namespace jit
6538	} // namespace torch
6539	#endif // #if defined(USE_CUDA)
6540

Browse the source code of pytorch/third_party/nvfuser/test/test_gpu3.cpp