#if defined(USE_CUDA)
#include <gmock/gmock-matchers.h>
#include <gtest/gtest.h>

#include <arith.h>
#include <codegen.h>
#include <disjoint_set.h>
#include <executor.h>
#include <executor_launch_params.h>
#include <expr_evaluator.h>
#include <fusion.h>
#include <fusion_segmenter.h>
#include <grouped_reduction.h>
#include <inlining.h>
#include <ir_all_nodes.h>
#include <ir_builder.h>
#include <ir_graphviz.h>
#include <ir_iostream.h>
#include <ir_utils.h>
#include <iter_visitor.h>
#include <kernel_cache.h>
#include <kernel_expr_evaluator.h>
#include <kernel_ir.h>
#include <kernel_ir_dispatch.h>
#include <lower2device.h>
#include <lower_magic_zero.h>
#include <mutator.h>
#include <ops/all_ops.h>
#include <register_interface.h>
#include <root_domain_map.h>
#include <scheduler/all_schedulers.h>
#include <scheduler/reduction_utils.h>
#include <scheduler/utils.h>
#include <test/test_gpu_validator.h>
#include <test/test_utils.h>
#include <transform_replay.h>
#include <transform_rfactor.h>

#include <test/cpp/jit/test_utils.h>
#include <torch/csrc/jit/api/function_impl.h>
#include <parser.h>
#include <torch/csrc/jit/ir/irparser.h>
#include <torch/torch.h>

#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
#include <c10/cuda/CUDAStream.h>

#include <algorithm>
#include <iostream>
#include <sstream>
#include <thread>

// Tests go in torch::jit
namespace torch {
namespace jit {

using namespace torch::jit::fuser::cuda;
using namespace at::indexing;

TEST_F(NVFuserTest, FusionGlobalIntermediate_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Set up your input tensor views
  TensorView* tv0 = makeSymbolicTensor(2);
  TensorView* tv1 =
      reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv0);
  fusion.addInput(tv0);
  fusion.addOutput(tv1);
  // tv1[I0, R1] = tv0[I0, I1]

  // Interface should just be a direct split with a Parallel type. We can
  // include the parallelize call if we do this.
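  // Splitting by NamedScalar::getParallelDim(ParallelType::TIDx) makes the
  // inner extent equal to blockDim.x, so the launch-time thread count
  // determines the split factor.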
  tv1->split(1, NamedScalar::getParallelDim(ParallelType::TIDx));
  // tv1[I0, R1o, R1i{TIDx}] = tv0[I0, I1]

  TensorView* tv2 = tv1->rFactor({2});
  tv2->setMemoryType(MemoryType::Global);
  // tv2[I0, R1oo, Ir1i{TIDx}] = tv0[I0, I1]
  // tv1[I0, R1i{TIDx}] = tv2[I0, R1oo, Ir1i{TIDx}]

  tv0->computeAt(tv1, 1);

  tv2->axis(-1)->parallelize(ParallelType::TIDx);
  tv1->axis(0)->parallelize(ParallelType::BIDx);

  constexpr int numel_x = 65000, numel_y = 1024;

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor input = at::randn({numel_x, numel_y}, options);

  // How many threads to use for the block reduction
  constexpr int runtime_threadIdx_dim = 128;

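  // LaunchParams binds launch dimensions up front; the fourth argument fixes
  // the TIDx (blockDim.x) extent to 128, and -1 leaves a dimension to be
  // inferred at launch time.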
  auto lparams = LaunchParams(-1, -1, -1, runtime_threadIdx_dim, -1, -1);

  FusionExecutor fe;
  fe.compileFusion(&fusion, {input}, lparams);
  auto cg_outputs = fe.runFusion({input}, lparams);

  auto aten_output = input.to(at::kDouble).sum({1});
  testValidate(
      &fusion,
      cg_outputs,
      {input},
      {aten_output},
      __LINE__,
      __FILE__,
      "",
      lparams);
}

TEST_F(NVFuserTest, FusionGlobalIntermediateDefaultSchedule_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeSymbolicTensor(2);
  TensorView* tv1 = makeSymbolicTensor(2);
  TensorView* tv2 = makeSymbolicTensor(2);
  TensorView* tv3 = makeSymbolicTensor(2);
  TensorView* tv4 = sub(tv2, tv3);
  TensorView* tv5 = add(tv1, tv4);
  TensorView* tv6 = sub(tv5, tv0);
  fusion.addInput(tv0);
  fusion.addInput(tv1);
  fusion.addInput(tv2);
  fusion.addInput(tv3);
  fusion.addOutput(tv6);
  // t6 = ((t1 + (t2 - t3)) - t0)

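  // No scheduling is applied; keeping every intermediate in global memory
  // exercises the default path, where each expression is materialized to its
  // own gmem buffer.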
  tv4->setMemoryType(MemoryType::Global);
  tv5->setMemoryType(MemoryType::Global);
  tv6->setMemoryType(MemoryType::Global);

  constexpr int M = 32, N = 810;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({M, N}, options);
  at::Tensor t1 = at::randn({M, N}, options);
  at::Tensor t2 = at::randn({M, N}, options);
  at::Tensor t3 = at::randn({M, N}, options);

  at::Tensor aten_output = (t1 + (t2 - t3)) - t0;

  std::vector<IValue> aten_inputs = {t0, t1, t2, t3};

  FusionExecutor fe;
  fe.compileFusion(&fusion, {t0, t1, t2, t3});
  auto cg_outputs = fe.runFusion({t0, t1, t2, t3});

  testValidate(
      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionConstCheck_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto one = IrBuilder::create<Int>(1);
  TORCH_CHECK(one->isConstScalar());

  auto one_x2 = mul(one, one);
  TORCH_CHECK(one_x2->isConstScalar());

  auto one_x3 = mul(one_x2, one);
  TORCH_CHECK(one_x3->isConstScalar());

  auto one_x4 = mul(one_x3, one);
  TORCH_CHECK(one_x4->isConstScalar());
}
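
// A minimal complementary sketch (an addition, not part of the original
// suite): an Int created without a value is a symbolic scalar, as used by the
// runtime-tiling tests further below, and should not be reported as constant.
TEST_F(NVFuserTest, FusionConstCheckSymbolic_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto symbolic = IrBuilder::create<Int>();
  TORCH_CHECK(!symbolic->isConstScalar());

  // Combining a constant with a symbolic value should also be non-constant
  auto prod = mul(symbolic, IrBuilder::create<Int>(1));
  TORCH_CHECK(!prod->isConstScalar());
}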

TEST_F(NVFuserTest, FusionUnrollWithAlloc_CUDA) {
  const std::vector<int64_t> tensor_dims_in = {128, 128};
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Set up your input tensor views
  TensorView* tv0 = makeSymbolicTensor(tensor_dims_in.size());
  fusion.addInput(tv0);

  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(0));
  TensorView* tv2 =
      reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv1);
  fusion.addOutput(tv2);

  const auto options =
      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor input = at::randn(tensor_dims_in, options);
  at::Tensor cg_output = at::empty({tensor_dims_in[0]}, options);

  // Schedule
  tv2->split(1, 32);
  tv2->split(1, 4); // unroll

  auto tv2_rf = tv2->rFactor({-3, -2});

  tv2->axis(0)->parallelize(ParallelType::BIDx);
  tv2->axis(-1)->parallelize(ParallelType::TIDx);

  tv2_rf->axis(0)->parallelize(ParallelType::BIDx);
  tv2_rf->axis(-1)->parallelize(ParallelType::TIDx);
  tv2_rf->axis(-2)->parallelize(ParallelType::Unroll);

  tv1->computeAt(tv2_rf, -1);

  FusionExecutor fe;
  fe.compileFusion(&fusion, {input});
  auto cg_outputs = fe.runFusion({input});

  auto aten_output = (input + 0).to(at::kDouble).sum(1);

  testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__);
}

// Test isZeroInt
TEST_F(NVFuserTest, FusionIsZeroInt_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  Int* x = IrBuilder::create<Int>(0);
  Int* y = IrBuilder::create<Int>(1);
  Val* z = mul(x, y);
  TORCH_CHECK(x->isZeroInt());
  TORCH_CHECK(!y->isZeroInt());
  TORCH_CHECK(!z->isZeroInt());
}

// Test isOneInt
TEST_F(NVFuserTest, FusionIsOneInt_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  Int* x = IrBuilder::create<Int>(1);
  Int* y = IrBuilder::create<Int>(1);
  Val* z = mul(x, y);
  TORCH_CHECK(x->isOneInt());
  TORCH_CHECK(y->isOneInt());
  TORCH_CHECK(!z->isOneInt());
}

// This is to verify no cycle of computeAt is created. A more complex
// variation of this pattern appears in one of the Python tests
// (test_random_topo).
TEST_F(NVFuserTest, FusionComputeAtNonterminatingOutput_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);

  // Common intermediate tensor
  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
  // tv1 -> tv2
  auto tv2 = add(tv1, IrBuilder::create<Double>(2));
  // tv1 -> tv3 -> tv4
  auto tv3 = add(tv1, IrBuilder::create<Double>(3));
  auto tv4 = add(tv3, IrBuilder::create<Double>(4));

  // NOTE: This should no longer occur as of PR #201.
  // The order of adding outputs matters. If tv3 is added before tv4,
  // it should be fine. However, if tv4 is added before tv3, there
  // will be a cycle of tv3->tv4 and tv4->tv3. tv3->tv4 is created
  // first, and then tv4->tv3 is created at the final phase of
  // computeAt (ComputeAt::setupOutputs).
  fusion.addOutput(tv2);
  fusion.addOutput(tv4);
  fusion.addOutput(tv3);

  tv0->computeAt(tv2, -1);

  TORCH_CHECK(tv3->hasComputeAt());
  TORCH_CHECK(!tv4->hasComputeAt());

  const auto options =
      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn(100, options);

  auto t1 = aten_input + 1;
  auto t2 = t1 + 2;
  auto t3 = t1 + 3;
  auto t4 = t3 + 4;

  FusionExecutor fe;
  fe.compileFusion(&fusion, {aten_input});
  auto cg_outputs = fe.runFusion({aten_input});

  std::vector<at::Tensor> aten_outputs = {t2, t4, t3};
  testValidate(
      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionTraversalOrder1_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Set up your input tensor views
  TensorView* tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);

  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1));
  TensorView* tv2 = add(tv0, IrBuilder::create<Double>(2));
  TensorView* tv3 = add(tv1, IrBuilder::create<Double>(3));
  TensorView* tv4 = add(tv1, IrBuilder::create<Double>(4));

  fusion.addOutput(tv2);
  fusion.addOutput(tv3);
  fusion.addOutput(tv4);

  tv1->computeAt(tv3, -1);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({10, 10}, options);

  auto t1 = aten_input + 1;
  auto t2 = aten_input + 2;
  auto t3 = t1 + 3;
  auto t4 = t1 + 4;

  std::vector<at::Tensor> aten_outputs = {t2, t3, t4};

  std::vector<at::Tensor> cg_outputs = {
      at::empty_like(aten_input, options),
      at::empty_like(aten_input, options),
      at::empty_like(aten_input, options)};

  FusionExecutor fe;
  fe.compileFusion(&fusion, {aten_input});
  fe.runFusion({aten_input}, cg_outputs);
  testValidate(
      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionTraversalOrder2_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Set up your input tensor views
  TensorView* tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);

  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1));
  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2));

  TensorView* tv3 = add(tv0, IrBuilder::create<Double>(3));
  TensorView* tv4 = add(tv3, IrBuilder::create<Double>(4));

  TensorView* tv5 = add(tv1, tv3);

  fusion.addOutput(tv2);
  fusion.addOutput(tv4);
  fusion.addOutput(tv5);

  tv1->computeAt(tv5, -1);
  tv3->computeAt(tv5, -1);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({10, 10}, options);

  auto t1 = aten_input + 1;
  auto t2 = t1 + 2;
  auto t3 = aten_input + 3;
  auto t4 = t3 + 4;
  auto t5 = t1 + t3;

  std::vector<at::Tensor> aten_outputs = {t2, t4, t5};

  std::vector<at::Tensor> cg_outputs = {
      at::empty_like(aten_input, options),
      at::empty_like(aten_input, options),
      at::empty_like(aten_input, options)};

  FusionExecutor fe;
  fe.compileFusion(&fusion, {aten_input});
  fe.runFusion({aten_input}, cg_outputs);

  testValidate(
      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionTraversalOrder3_CUDA) {
  for (const auto i : c10::irange(2)) {
    Fusion fusion;
    FusionGuard fg(&fusion);

    TensorView* tv0 = makeSymbolicTensor(1);
    fusion.addInput(tv0);

    TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1));
    TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2));

    TensorView* tv3 = add(tv0, IrBuilder::create<Double>(3));
    TensorView* tv4 = add(tv3, IrBuilder::create<Double>(4));

    TensorView* tv5 = add(tv1, tv3);

    fusion.addOutput(tv2);
    fusion.addOutput(tv4);
    fusion.addOutput(tv5);

    const int tile = 32;

    tv1->split(-1, tile);
    tv2->split(-1, tile);
    tv3->split(-1, tile);
    tv4->split(-1, tile);
    tv5->split(-1, tile);

    auto compute_at_outer = tv1;
    auto compute_at_inner = tv3;
    if (i == 1) {
      std::swap(compute_at_inner, compute_at_outer);
    }

    compute_at_outer->computeAt(tv5, -2);
    compute_at_inner->computeAt(tv5, -1);

    auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
    at::Tensor aten_input = at::randn({100}, options);
    auto t1 = aten_input + 1;
    auto t2 = t1 + 2;
    auto t3 = aten_input + 3;
    auto t4 = t3 + 4;
    auto t5 = t1 + t3;

    std::vector<at::Tensor> aten_outputs = {t2, t4, t5};

    std::vector<at::Tensor> cg_outputs = {
        at::empty_like(aten_input, options),
        at::empty_like(aten_input, options),
        at::empty_like(aten_input, options)};

    FusionExecutor fe;
    fe.compileFusion(&fusion, {aten_input});
    fe.runFusion({aten_input}, cg_outputs);

    testValidate(
        &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
  }
}

TEST_F(NVFuserTest, FusionTraversalOrder4_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // First tree
  TensorView* tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);
  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1));
  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2));
  TensorView* tv3 = add(tv1, IrBuilder::create<Double>(3));
  fusion.addOutput(tv2);
  fusion.addOutput(tv3);

  // Second tree
  TensorView* tv4 = makeSymbolicTensor(1);
  fusion.addInput(tv4);
  TensorView* tv5 = add(tv4, IrBuilder::create<Double>(5));
  TensorView* tv6 = add(tv5, IrBuilder::create<Double>(6));
  TensorView* tv7 = add(tv5, IrBuilder::create<Double>(7));
  fusion.addOutput(tv6);
  fusion.addOutput(tv7);

  tv1->computeAt(tv2, -1);
  tv5->computeAt(tv6, -1);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({100}, options);
  at::Tensor t4 = at::rand_like(t0, options);

  auto t1 = t0 + 1;
  auto t2 = t1 + 2;
  auto t3 = t1 + 3;
  auto t5 = t4 + 5;
  auto t6 = t5 + 6;
  auto t7 = t5 + 7;

  std::vector<at::Tensor> aten_outputs = {t2, t3, t6, t7};
  std::vector<IValue> aten_inputs = {t0, t4};
  std::vector<at::Tensor> cg_outputs = {
      at::empty_like(t0, options),
      at::empty_like(t0, options),
      at::empty_like(t0, options),
      at::empty_like(t0, options)};

  FusionExecutor fe;
  fe.compileFusion(&fusion, aten_inputs);
  fe.runFusion(aten_inputs, cg_outputs);

  testValidate(
      &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionTraversalOrder5_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);
  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1));
  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2));
  TensorView* tv3 = add(tv0, IrBuilder::create<Double>(3));
  TensorView* tv4 = add(tv3, IrBuilder::create<Double>(4));
  TensorView* tv5 = add(tv2, tv4);

  fusion.addOutput(tv1);
  fusion.addOutput(tv3);
  fusion.addOutput(tv5);

  tv2->computeAt(tv5, -1);
  tv4->computeAt(tv5, -1);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({100}, options);
  std::vector<at::Tensor> cg_outputs = {
      at::empty_like(aten_input, options),
      at::empty_like(aten_input, options),
      at::empty_like(aten_input, options)};

  FusionExecutor fe;
  fe.compileFusion(&fusion, {aten_input});
  fe.runFusion({aten_input}, cg_outputs);

  auto t1 = aten_input + 1;
  auto t2 = t1 + 2;
  auto t3 = aten_input + 3;
  auto t4 = t3 + 4;
  auto t5 = t2 + t4;

  std::vector<at::Tensor> aten_outputs = {t1, t3, t5};

  testValidate(
      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionTraversalOrder6_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);
  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1));
  TensorView* tv2 = add(tv0, IrBuilder::create<Double>(2));
  TensorView* tv3 = add(tv1, tv2);
  TensorView* tv4 = add(tv3, IrBuilder::create<Double>(4));

  fusion.addOutput(tv4);

  tv1->split(0, 32);
  tv2->split(0, 32);
  tv3->split(0, 32);
  tv4->split(0, 32);

  tv3->computeAt(tv4, -2);
  tv1->computeAt(tv3, -1);
  tv2->computeAt(tv3, -2);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({100}, options);

  auto t1 = aten_input + 1;
  auto t2 = aten_input + 2;
  auto t3 = t1 + t2;
  auto aten_output = t3 + 4;

  at::Tensor cg_output = at::empty_like(aten_input, options);

  FusionExecutor fe;
  fe.compileFusion(&fusion, {aten_input});
  fe.runFusion({aten_input}, {cg_output});

  testValidate(
      &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionTraversalOrder7_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);
  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1));
  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2));
  TensorView* tv3 = add(tv0, IrBuilder::create<Double>(3));
  TensorView* tv4 = add(tv3, IrBuilder::create<Double>(4));
  TensorView* tv5 = add(tv2, tv4);

  fusion.addOutput(tv5);

  TensorView* tvs[] = {tv1, tv2, tv3, tv4, tv5};
  for (auto tv : tvs) {
    tv->split(0, 2);
    tv->split(0, 4);
    tv->split(0, 8);
  }

  // computeAt into inner loop nests
  tv1->computeAt(tv2, -1);
  tv3->computeAt(tv4, -2);

  tv2->computeAt(tv5, -4);
  tv4->computeAt(tv5, -3);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({100}, options);

  auto t1 = aten_input + 1;
  auto t2 = t1 + 2;
  auto t3 = aten_input + 3;
  auto t4 = t3 + 4;
  auto aten_output = t2 + t4;

  at::Tensor cg_output = at::empty_like(aten_input, options);

  FusionExecutor fe;
  fe.compileFusion(&fusion, {aten_input});
  fe.runFusion({aten_input}, {cg_output});

  testValidate(
      &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__);
}

// Test predication of grid reduction
TEST_F(NVFuserTest, FusionThreadPredicate_CUDA) {
  const int gdimx = 4;
  const int bdimx = 128;

  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);

  TensorView* tv1 =
      reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv0);
  TensorView* tv2 = unaryOp(UnaryOpType::Neg, tv1);
  TensorView* tv3 = add(tv0, IrBuilder::create<Double>(2));

  fusion.addOutput(tv3);
  fusion.addOutput(tv2);

  tv1->split(1, bdimx);
  tv1->split(1, gdimx);
  tv3->split(1, bdimx);
  tv3->split(1, gdimx);

  TensorView* tv1_rf = tv1->rFactor({1});

  tv1->computeAt(tv2, -1);

  tv1->axis(0)->parallelize(ParallelType::BIDy);
  tv1_rf->axis(0)->parallelize(ParallelType::BIDy);
  tv2->axis(0)->parallelize(ParallelType::BIDy);
  tv1->axis(-2)->parallelize(ParallelType::BIDx);
  tv1_rf->axis(-2)->parallelize(ParallelType::BIDx);
  tv1->axis(-1)->parallelize(ParallelType::TIDx);
  tv1_rf->axis(-1)->parallelize(ParallelType::TIDx);

  tv3->axis(3)->parallelize(ParallelType::TIDx);
  tv3->axis(2)->parallelize(ParallelType::BIDx);
  tv3->axis(0)->parallelize(ParallelType::BIDy);

  int numel_x = 100;
  int numel_y = 1000;

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({numel_x, numel_y}, options);

  auto t2 = -aten_input.to(at::kDouble).sum({1});
  auto t3 = aten_input + 2.0;

  std::vector<at::Tensor> aten_outputs = {t3, t2};

  std::vector<at::Tensor> cg_outputs = {
      at::empty_like(aten_input, options), at::empty({numel_x}, options)};

  FusionExecutor fe;
  fe.compileFusion(&fusion, {aten_input});
  fe.runFusion({aten_input}, cg_outputs);

  testValidate(
      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionLSTMCell_CUDA) {
  const int hidden_features = 512;
  const int batch_size = 64;

  Fusion fusion;
  FusionGuard fg(&fusion);

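  // The 16 tensor inputs are the four addends of each of the four gate
  // pre-activations. Standard LSTM cell math:
  //   i = sigmoid(.), f = sigmoid(.), g = tanh(.), o = sigmoid(.)
  //   cy = f * cx + i * g
  //   hy = o * tanh(cy)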
  TensorView* tvs[16];
  for (const auto i : c10::irange(16)) {
    tvs[i] = makeSymbolicTensor(2);
    fusion.addInput(tvs[i]);
  }

  auto ingate = unaryOp(
      UnaryOpType::Sigmoid, add(add(add(tvs[0], tvs[1]), tvs[2]), tvs[3]));

  auto forgetgate = unaryOp(
      UnaryOpType::Sigmoid, add(add(add(tvs[4], tvs[5]), tvs[6]), tvs[7]));

  auto cellgate = unaryOp(
      UnaryOpType::Tanh, add(add(add(tvs[8], tvs[9]), tvs[10]), tvs[11]));

  auto outgate = unaryOp(
      UnaryOpType::Sigmoid, add(add(add(tvs[12], tvs[13]), tvs[14]), tvs[15]));

  auto cx = makeContigTensor(2);
  fusion.addInput(cx);

  auto cy = add(mul(forgetgate, cx), mul(ingate, cellgate));

  auto hy = mul(outgate, unaryOp(UnaryOpType::Tanh, cy));

  fusion.addOutput(cy);
  fusion.addOutput(hy);

  std::vector<c10::IValue> aten_inputs;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor large_tensor0 =
      at::randn({batch_size, hidden_features * 4}, options);
  at::Tensor large_tensor1 =
      at::randn({batch_size, hidden_features * 4}, options);
  at::Tensor large_tensor2 =
      at::randn({batch_size, hidden_features * 4}, options);
  at::Tensor large_tensor3 =
      at::randn({batch_size, hidden_features * 4}, options);

  auto chunked0 = large_tensor0.chunk(4, 1);
  auto chunked1 = large_tensor1.chunk(4, 1);
  auto chunked2 = large_tensor2.chunk(4, 1);
  auto chunked3 = large_tensor3.chunk(4, 1);

  aten_inputs.insert(aten_inputs.end(), chunked0.begin(), chunked0.end());
  aten_inputs.insert(aten_inputs.end(), chunked1.begin(), chunked1.end());
  aten_inputs.insert(aten_inputs.end(), chunked2.begin(), chunked2.end());
  aten_inputs.insert(aten_inputs.end(), chunked3.begin(), chunked3.end());

  auto at_ingate =
      chunked0[0].add(chunked0[1]).add(chunked0[2]).add(chunked0[3]).sigmoid();
  auto at_forgetgate =
      chunked1[0].add(chunked1[1]).add(chunked1[2]).add(chunked1[3]).sigmoid();
  auto at_cellgate =
      chunked2[0].add(chunked2[1]).add(chunked2[2]).add(chunked2[3]).tanh();
  auto at_outgate =
      chunked3[0].add(chunked3[1]).add(chunked3[2]).add(chunked3[3]).sigmoid();

  auto at_cx = at::randn({batch_size, hidden_features}, options);
  aten_inputs.push_back(at_cx);
  auto at_cy = at_forgetgate.mul(at_cx).add(at_ingate.mul(at_cellgate));
  auto at_hy = at_outgate.mul(at_cy.tanh());

  auto lparams = schedulePointwise(&fusion, aten_inputs);

  FusionExecutor fe;
  fe.compileFusion(&fusion, aten_inputs, lparams);
  auto cg_outputs = fe.runFusion(aten_inputs, lparams);

  testValidate(
      &fusion, cg_outputs, aten_inputs, {at_cy, at_hy}, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionReductionHalf_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Set up your input tensor views
  TensorView* tv0 = makeSymbolicTensor(3, DataType::Half);
  fusion.addInput(tv0);

  auto tv1 = castOp(DataType::Float, tv0);
  auto tv2 = add(tv1, IrBuilder::create<Double>(1.0));
  auto tv3 = sum(tv2, {2});
  auto tv4 = castOp(DataType::Half, tv3);

  fusion.addOutput(tv4);

  const auto options =
      at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({8, 8, 16}, options);

  auto reduction_tv = tv3;

  auto reduction_params = getReductionHeuristics(&fusion, {aten_input});
  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
  scheduleReduction(&fusion, *reduction_params);

  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");

  auto lparams = reduction_params->lparams;

  FusionExecutor fe;
  fe.compileFusion(&fusion, {aten_input}, lparams);
  // no broadcasting needed, omitting the last optional argument;
  auto cg_outputs = fe.runFusion({aten_input}, lparams);

  auto aten_output = aten_input.add(1.0).to(at::kDouble).sum({2});

  testValidate(
      &fusion,
      cg_outputs,
      {aten_input},
      {aten_output},
      __LINE__,
      __FILE__,
      "",
      lparams);
}

TEST_F(NVFuserTest, FusionReduceSingle_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Set up your input tensor views
  TensorView* tv0 = makeConcreteTensor({100, 1});
  fusion.addInput(tv0);
  auto tv1 = sum(tv0, {1});
  fusion.addOutput(tv1);

  const auto options =
      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({100, 1}, options);

  FusionExecutor fe;
  fe.compileFusion(&fusion, {aten_input});
  // no broadcasting needed, omitting the last optional argument;
  auto cg_outputs = fe.runFusion({aten_input});

  auto aten_output = aten_input.to(at::kDouble).sum({1});
  testValidate(
      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionReduceImplicitBroadcast_CUDA) {
  constexpr int bid_x = 80;
  constexpr int tid_x = 4096;
  constexpr int red_dim = 1;

  Fusion fusion;
  FusionGuard fg(&fusion);

  // Set up your input tensor views
  TensorView* tv0 = makeConcreteTensor({bid_x, tid_x, 1});
  fusion.addInput(tv0);

  TensorView* tv1 = reductionOp(
      BinaryOpType::Add, {red_dim, 2}, IrBuilder::create<Double>(0), tv0);
  fusion.addOutput(tv1);

  const auto options =
      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({bid_x, tid_x, 1}, options);

  // Apply reduction heuristic
  auto reduction_params = getReductionHeuristics(&fusion, {aten_input});
  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
  scheduleReduction(&fusion, *reduction_params);
  auto lparams = reduction_params->lparams;

  FusionExecutor fe;
  fe.compileFusion(&fusion, {aten_input}, lparams);
  // no broadcasting needed, omitting the last optional argument;
  auto cg_outputs = fe.runFusion({aten_input}, lparams);
  auto aten_output = aten_input.to(at::kDouble).sum({red_dim, 2});

  testValidate(
      &fusion,
      cg_outputs,
      {aten_input},
      {aten_output},
      __LINE__,
      __FILE__,
      "",
      lparams);
}

TEST_F(NVFuserTest, FusionReduceImplicitBroadcast2_CUDA) {
  constexpr int bid_x = 80;
  constexpr int tid_x = 4096;
  constexpr int red_dim = 1;

  Fusion fusion;
  FusionGuard fg(&fusion);

  // Set up your input tensor views
  TensorView* tv0 = makeConcreteTensor({bid_x, tid_x, 1});
  fusion.addInput(tv0);

  TensorView* tv1 =
      reductionOp(BinaryOpType::Add, {2}, IrBuilder::create<Double>(0), tv0);

  TensorView* tv2 = reductionOp(
      BinaryOpType::Add, {red_dim}, IrBuilder::create<Double>(0), tv1);
  fusion.addOutput(tv2);

  const auto options =
      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({bid_x, tid_x, 1}, options);

  // Apply reduction heuristic
  auto reduction_params = getReductionHeuristics(&fusion, {aten_input});
  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");

  scheduleReduction(&fusion, *reduction_params);
  auto lparams = reduction_params->lparams;

  FusionExecutor fe;
  fe.compileFusion(&fusion, {aten_input}, lparams);
  // no broadcasting needed, omitting the last optional argument;
  auto cg_outputs = fe.runFusion({aten_input}, lparams);
  auto aten_output = aten_input.to(at::kDouble).sum({1, 2});

  testValidate(
      &fusion,
      cg_outputs,
      {aten_input},
      {aten_output},
      __LINE__,
      __FILE__,
      "",
      lparams);
}

TEST_F(NVFuserTest, FusionReduceImplicitBroadcast3_CUDA) {
  constexpr int bid_x = 80;
  constexpr int tid_x = 4096;
  constexpr int red_dim = 1;

  Fusion fusion;
  FusionGuard fg(&fusion);

  // Set up your input tensor views
  TensorView* tv0 = makeConcreteTensor({bid_x, tid_x, 1});
  fusion.addInput(tv0);

  TensorView* tv1 = reductionOp(
      BinaryOpType::Add, {red_dim}, IrBuilder::create<Double>(0), tv0);

  TensorView* tv2 =
      reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv1);
  fusion.addOutput(tv2);

  const auto options =
      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({bid_x, tid_x, 1}, options);

  // Apply reduction heuristic
  auto reduction_params = getReductionHeuristics(&fusion, {aten_input});
  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
  scheduleReduction(&fusion, *reduction_params);
  auto lparams = reduction_params->lparams;

  FusionExecutor fe;
  fe.compileFusion(&fusion, {aten_input}, lparams);
  // no broadcasting needed, omitting the last optional argument;
  auto cg_outputs = fe.runFusion({aten_input}, lparams);
  auto aten_output = aten_input.to(at::kDouble).sum({2, 1});

  testValidate(
      &fusion,
      cg_outputs,
      {aten_input},
      {aten_output},
      __LINE__,
      __FILE__,
      "",
      lparams);
}

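// A "trivial" reduction reduces over an axis of extent 1 (e.g. a broadcast
// dimension), so lowering can replace the ReductionOp with a plain set (copy)
// op instead of an actual reduction.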
TEST_F(NVFuserTest, FusionTrivialReduction_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Set up your input tensor views
  TensorView* tv0 = makeConcreteTensor({10, 20, 1});
  fusion.addInput(tv0);
  TensorView* tv1 =
      reductionOp(BinaryOpType::Add, {2}, IrBuilder::create<Double>(0), tv0);
  fusion.addOutput(tv1);

  TORCH_CHECK(
      ir_utils::getReductionOps(&fusion, true /* ignore_trivial */).empty(),
      "Trivial reduction picked up by fusion");

  const auto options =
      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({10, 20, 1}, options);

  FusionExecutor fe;
  fe.compileFusion(&fusion, {aten_input});
  auto cg_outputs = fe.runFusion({aten_input});
  auto aten_output = aten_input.to(at::kDouble).sum({2});

  testValidate(
      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionTrivialReduction2_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  int w = 1, x = 1, y = 7, z = 8;

  auto tv0 = makeSymbolicTensor(2);
  auto tv1 = makeConcreteTensor({w, x, y, z});
  fusion.addInput(tv0);
  fusion.addInput(tv1);

  auto tv2 = sum(tv1, {0});
  auto tv3 = sum(tv2, {0});
  auto tv4 = add(tv3, tv0);

  fusion.addOutput(tv4);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({y, z}, options);
  at::Tensor t1 = at::randn({w, x, y, z}, options);
  auto aten_output = t1.to(at::kDouble).sum({0}).sum({0}).add(t0);

  std::vector<IValue> aten_inputs = {t0, t1};

  auto lparams = schedulePointwise(&fusion, aten_inputs);

  FusionExecutor fe;
  fe.compileFusion(&fusion, aten_inputs, lparams);
  auto cg_outputs = fe.runFusion(aten_inputs, lparams);

  testValidate(
      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionTrivialReduction3_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  int v = 1, w = 1, x = 1, y = 7, z = 8;

  auto tv0 = makeSymbolicTensor(2);
  auto tv1 = makeConcreteTensor({v, w, x, y, z});
  fusion.addInput(tv0);
  fusion.addInput(tv1);

  auto tv2 = sum(tv1, {0, 1, 2});
  auto tv3 = add(tv2, tv0);

  fusion.addOutput(tv3);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({y, z}, options);
  at::Tensor t1 = at::randn({v, w, x, y, z}, options);
  auto aten_output = t1.sum({0, 1, 2}).add(t0);

  std::vector<IValue> aten_inputs = {t0, t1};

  auto lparams = schedulePointwise(&fusion, aten_inputs);

  FusionExecutor fe;
  fe.compileFusion(&fusion, aten_inputs, lparams);
  auto cg_outputs = fe.runFusion(aten_inputs, lparams);

  testValidate(
      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
}

// Make sure trivial reductions are correctly detected even with
// scheduling applied.
TEST_F(NVFuserTest, FusionDetectTrivialReduction1_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);

  auto tv1 = broadcast(tv0, {false, true});
  auto tv2 = sum(tv1, {1});
  fusion.addOutput(tv2);

  tv2->split(1, 4);
  tv2->split(1, 8);
  auto tv3 = tv2->rFactor({-1});
  auto tv4 = tv2->rFactor({-1});

  auto tv5 = broadcast(tv0, {true, false});
  auto tv6 = add(tv5, IrBuilder::create<Double>(1));
  auto tv7 = sub(tv6, IrBuilder::create<Double>(1));
  auto tv8 = sum(tv7, {0});
  fusion.addOutput(tv8);

  auto tv9 = broadcast(tv0, {false, true, true});
  auto tv10 = sum(tv9, {1});
  auto tv11 = sum(tv10, {1});
  fusion.addOutput(tv11);

  tv8->split(0, 3);
  tv10->split(1, 4);
  tv11->split(1, 5);

  tv0->computeAt(tv2, -1);
  tv0->computeAt(tv8, -1);
  tv0->computeAt(tv11, 1);

  // Test indexing to gmem-backed tensors
  tv3->setMemoryType(MemoryType::Global);
  tv8->setMemoryType(MemoryType::Global);

  GpuLower gpulw(&fusion);

  // No ReductionOp should be generated as all the reduction
  // exprs should be replaced with a unary set op.
  for (const auto expr : gpulw.kernel()->as<Fusion>()->exprs()) {
    TORCH_CHECK(!expr->isA<ReductionOp>());
  }

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({100}, options);
  std::vector<IValue> aten_inputs = {t0};

  FusionExecutor fe;
  fe.compileFusion(&fusion, aten_inputs);
  auto cg_outputs = fe.runFusion(aten_inputs);

  testValidate(
      &fusion, cg_outputs, aten_inputs, {t0, t0, t0}, __LINE__, __FILE__);
}

// Test detection of partially trivial reduction
TEST_F(NVFuserTest, FusionDetectTrivialReduction2_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = sum(tv0, {1});
  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
  fusion.addOutput(tv2);

  tv1->split(1, 1);
  // tv1->axis(1): non-trivial
  // tv1->axis(2): trivial

  auto tv3 = tv1->rFactor({-1});

  // Just to suppress register-allocation warning
  tv0->computeAt(tv2, 1);
  tv3->computeAt(tv1, -1);

  GpuLower gpulw(&fusion);

  // tv3's reduction axis is a trivial reduction. The only
  // ReductionOp should be for tv1.
  for (const auto expr : gpulw.kernel()->as<Fusion>()->exprs()) {
    if (expr->isA<ReductionOp>()) {
      auto reduction_out =
          expr->as<ReductionOp>()->outputs()[0]->as<TensorView>();
      TORCH_CHECK(reduction_out->name() == 1);
    }
  }
}

TEST_F(NVFuserTest, FusionInputsIdLookup_CUDA) {
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({16, 8, 8}, options);
  at::Tensor t1 = at::randn({8, 8}, options);
  at::Tensor t2 = at::randn({6, 4}, options);

  // create a cache with max size 2;
  torch::jit::fuser::cuda::InputsIdLookup inputs_id_lookup(2);

  // basic check: inputs that differ only in scalar value share an encoding
  auto id_0 = inputs_id_lookup.lookupId({t0, t1, 5.0});
  auto id_0_lookup = inputs_id_lookup.lookupId({t0, t1, 2.5});
  TORCH_CHECK(id_0.id == id_0_lookup.id);
  TORCH_CHECK(inputs_id_lookup.size() == 1);
  TORCH_CHECK(id_0.eviction == false);

  // new input: same shapes, but a different signature because the scalar
  // input is missing
  auto id_1 = inputs_id_lookup.lookupId({t0, t1});
  auto id_1_lookup = inputs_id_lookup.lookupId({t0, t1});
  TORCH_CHECK(id_1.id == id_1_lookup.id);
  TORCH_CHECK(inputs_id_lookup.size() == 2);
  TORCH_CHECK(id_1.eviction == false);

  // eviction should happen at this point
  auto id_2 = inputs_id_lookup.lookupId({t2, t1});
  TORCH_CHECK(id_2.id != id_0.id);
  TORCH_CHECK(id_2.id != id_1.id);
  TORCH_CHECK(inputs_id_lookup.size() == 2);
  TORCH_CHECK(id_2.eviction == true);
  TORCH_CHECK(id_2.evict_id == id_0.id);

  // look at input 1 again
  auto id_1_relook = inputs_id_lookup.lookupId({t0, t1});
  TORCH_CHECK(id_1_relook.id == id_1.id);
  TORCH_CHECK(id_1_relook.eviction == false);
}

TEST_F(NVFuserTest, FusionGroupGuardSimpleTensor_CUDA) {
  std::vector<int64_t> sizes_vec({16, 8, 8});
  std::vector<int64_t> strides_vec({64, 8, 1});
  auto tensor_type = TensorType::create(
      at::kFloat, c10::nullopt, sizes_vec, strides_vec, c10::nullopt);
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);

  // pass with identical shape
  auto t0 = at::randn({16, 8, 8}, options);
  TORCH_CHECK(complyWith(t0, tensor_type));

  // pass with dynamic shape
  auto t1 = at::randn({16, 16, 8}, options);
  TORCH_CHECK(complyWith(t1, tensor_type));

  // broadcasting semantic change failure
  auto t2 = at::randn({16, 1, 8}, options);
  TORCH_CHECK(!complyWith(t2, tensor_type));

  // contiguity failure via slicing
  auto t3 = t0.slice(1, 0, 8, 2);
  TORCH_CHECK(!complyWith(t3, tensor_type));

  // contiguity failure via slicing
  auto t4 = t0.slice(2, 0, 8, 2);
  TORCH_CHECK(!complyWith(t4, tensor_type));

  // rank failure
  auto t5 = at::randn({16, 8, 8, 8}, options);
  TORCH_CHECK(!complyWith(t5, tensor_type));

  // contiguity on stride 1 dimension with implicit broadcasting
  auto t = at::randn({4}, options);
  auto t6 = t.unsqueeze(1).expand({4, 8});
  TORCH_CHECK(complyWith(t6, TensorType::create(t6)));
}

TEST_F(NVFuserTest, FusionGroupGuardBroadcastTensor_CUDA) {
  std::vector<int64_t> sizes_vec({16, 1, 8});
  std::vector<int64_t> strides_vec({8, 8, 1});
  auto tensor_type = TensorType::create(
      at::kFloat, c10::nullopt, sizes_vec, strides_vec, c10::nullopt);
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);

  // broadcasting semantic change
  auto t0 = at::randn({16, 8, 8}, options);
  TORCH_CHECK(!complyWith(t0, tensor_type));

  // dtype failure
  auto t1 = at::randn({16, 1, 8}, options.dtype(at::kHalf));
  TORCH_CHECK(!complyWith(t1, tensor_type));

  // matching dtype passes
  auto t2 = at::randn({16, 1, 8}, options);
  TORCH_CHECK(complyWith(t2, tensor_type));

  // device inconsistency shouldn't fail
  auto t3 = at::randn({16, 1, 8}, options.device(at::kCPU, 0));
  TORCH_CHECK(complyWith(t3, tensor_type));
}

TEST_F(NVFuserTest, FusionGroupGuardPermutedTensor_CUDA) {
  std::vector<int64_t> sizes_vec({16, 8, 8});
  std::vector<int64_t> strides_vec({64, 1, 8});
  auto tensor_type = TensorType::create(
      at::kFloat, c10::nullopt, sizes_vec, strides_vec, c10::nullopt);
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);

  // failing permutation
  auto t0 = at::randn({16, 8, 8}, options);
  TORCH_CHECK(!complyWith(t0, tensor_type));

  // passing with dynamic shape
  auto t1 = t0.permute({0, 2, 1});
  TORCH_CHECK(complyWith(t1, tensor_type));
}

TEST_F(NVFuserTest, FusionGroupGuardRelaxedCheck_CUDA) {
  std::vector<int64_t> sizes_vec({16, 8, 8});
  std::vector<int64_t> strides_vec({128, 16, 1});
  auto tensor_type = TensorType::create(
      at::kFloat, c10::nullopt, sizes_vec, strides_vec, c10::nullopt);
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);

  // contiguity check passes even though the strides differ
  auto t0 = at::randn({16, 16, 8}, options);
  TORCH_CHECK(complyWith(t0, tensor_type));

  // passing with dynamic shape
  auto t1 = t0.slice(1, 0, 16, 2);
  TORCH_CHECK(complyWith(t1, tensor_type));
}

TEST_F(NVFuserTest, FusionDisjointSet_CUDA) {
  DisjointSets<int> set;

  const std::set<int> group_x({0, 1, 2});
  const std::set<int> group_y({3, 4, 5});
  const std::set<int> group_z({6, 7, 8});
  const std::vector<std::set<int>> groups({group_x, group_y, group_z});
  std::set<int> group_all;
  std::for_each(groups.begin(), groups.end(), [&](const auto& g) {
    group_all.insert(g.begin(), g.end());
  });

  // Initially, nothing should be considered equivalent
  for (auto i : group_all) {
    for (auto j : group_all) {
      TORCH_CHECK(!set.permissiveAreMapped(i, j));
    }
  }

  // Declare the values in group_x equivalent
  for (auto i : group_x) {
    for (auto j : group_x) {
      set.mapEntries(i, j);
      TORCH_CHECK(set.mappingExists(i));
      TORCH_CHECK(set.mappingExists(j));
    }
  }

  // All values in group_x should be equivalent with each other
  for (auto i : group_x) {
    for (auto j : group_x) {
      TORCH_CHECK(set.permissiveAreMapped(i, j));
    }
  }
  // But nothing else should be equivalent
  for (auto i : group_all) {
    for (auto j : group_y) {
      TORCH_CHECK(!set.permissiveAreMapped(i, j));
    }
    for (auto j : group_z) {
      TORCH_CHECK(!set.permissiveAreMapped(i, j));
    }
  }

  // Declare the values in group_y equivalent
  for (auto i : group_y) {
    for (auto j : group_y) {
      set.mapEntries(i, j);
      TORCH_CHECK(set.mappingExists(i));
      TORCH_CHECK(set.mappingExists(j));
    }
  }

  // group_x should still be equivalent
  for (auto i : group_x) {
    for (auto j : group_x) {
      TORCH_CHECK(set.permissiveAreMapped(i, j));
    }
  }
  // group_y should now be equivalent
  for (auto i : group_y) {
    for (auto j : group_y) {
      TORCH_CHECK(set.permissiveAreMapped(i, j));
    }
  }
  // But group_z should not be equivalent with anything yet
  for (auto i : group_all) {
    for (auto j : group_z) {
      TORCH_CHECK(!set.permissiveAreMapped(i, j));
    }
  }

  // Declare the values in group_z equivalent
  for (auto i : group_z) {
    for (auto j : group_z) {
      set.mapEntries(i, j);
      TORCH_CHECK(set.mappingExists(i));
      TORCH_CHECK(set.mappingExists(j));
    }
  }

  // Now each of the three groups should be equivalent within each
  // group
  for (const auto gi : c10::irange(groups.size())) {
    for (const auto gj : c10::irange(groups.size())) {
      for (auto i : groups[gi]) {
        for (auto j : groups[gj]) {
          TORCH_CHECK(
              (gi == gj && set.permissiveAreMapped(i, j)) ||
              (gi != gj && !set.permissiveAreMapped(i, j)));
        }
      }
    }
  }

  std::vector<int> all_elements = set.getAllElements().vector();
  std::sort(all_elements.begin(), all_elements.end());
  std::vector<int> group_all_vec(group_all.begin(), group_all.end());
  std::sort(group_all_vec.begin(), group_all_vec.end());
  TORCH_CHECK(all_elements == group_all_vec);

  set.clear();
  TORCH_CHECK(set.getAllElements().vector().size() == 0);

  // All cleared. Nothing should be considered equivalent.
  for (auto i : group_all) {
    for (auto j : group_all) {
      TORCH_CHECK(!set.permissiveAreMapped(i, j));
    }
  }
}
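
// A minimal follow-up sketch (an addition, not part of the original suite):
// equivalence is transitive, so chaining pairwise mapEntries calls joins a
// whole group even when some pairs are never mapped directly.
TEST_F(NVFuserTest, FusionDisjointSetTransitive_CUDA) {
  DisjointSets<int> set;
  set.mapEntries(0, 1);
  set.mapEntries(1, 2);
  // 0 and 2 were never mapped directly, but share a set through 1
  TORCH_CHECK(set.permissiveAreMapped(0, 2));
  TORCH_CHECK(!set.permissiveAreMapped(0, 3));
}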

TEST_F(NVFuserTest, FusionNonUniqueBroadcastSize_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(1);
  auto tv1 = makeSymbolicTensor(2);
  auto tv2 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  fusion.addInput(tv1);
  fusion.addInput(tv2);

  auto tv3 = broadcast(tv0, {true, false});
  auto tv4 = add(tv3, tv1);
  auto tv5 = add(tv3, tv2);

  fusion.addOutput(tv4);
  fusion.addOutput(tv5);

  // In order to do this, tv1->axis(1) and tv2->axis(1) must have the
  // same size, but we can't prove it, so this should throw an error.
  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
  ASSERT_ANY_THROW(tv3->computeAt(tv4, -1));
}

TEST_F(NVFuserTest, FusionBiasGeluFwd_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

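  // Constants of the tanh GELU approximation,
  //   gelu(x) ~= 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))),
  // with k_079 ~= sqrt(2/pi) and k_004 the cubic coefficient.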
  const float k_079 = 0.79788456;
  const float k_004 = 0.044715;

  // bias vector
  auto t0 = makeSymbolicTensor(1, DataType::Half);
  fusion.addInput(t0);
  auto t1 = castOp(DataType::Float, t0);
  // input tensor
  auto t2 = makeSymbolicTensor(3, DataType::Half);
  fusion.addInput(t2);
  auto t3 = castOp(DataType::Float, t2);
  auto t4 = broadcast(t1, {true, true, false});
  auto t5 = add(t4, t3);
  auto t6 = mul(t5, IrBuilder::create<Double>(0.5));
  auto t7 = mul(t5, IrBuilder::create<Double>(k_079));
  auto t8 = mul(t5, IrBuilder::create<Double>(k_004));
  auto t9 = mul(t8, t5);
  auto t10 = add(t9, IrBuilder::create<Int>(1));
  auto t11 = mul(t7, t10);
  auto t12 = unaryOp(UnaryOpType::Tanh, t11);
  auto t13 = add(t12, IrBuilder::create<Double>(1));
  auto t14 = mul(t6, t13);
  auto t15 = castOp(DataType::Half, t14);
  fusion.addOutput(t15);

  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
  at::manual_seed(0);
  std::vector<int64_t> input_shape{6, 512, 4096};
  std::vector<int64_t> bias_shape{4096};

  auto at_input = at::randn(input_shape, options);
  auto at_bias = at::randn(bias_shape, options);

  auto at_x =
      at_bias.to(c10::ScalarType::Float) + at_input.to(c10::ScalarType::Float);
  auto aten_output_float =
      at_x * 0.5 * (1.0 + (k_079 * at_x * (1 + k_004 * at_x * at_x)).tanh());
  auto aten_output = aten_output_float.to(c10::ScalarType::Half);

  std::vector<IValue> aten_inputs = {at_bias, at_input};
  auto lparams = schedulePointwise(&fusion, aten_inputs);

  FusionExecutor fe;
  fe.compileFusion(&fusion, aten_inputs, lparams);
  auto cg_outputs = fe.runFusion(aten_inputs, lparams);

  testValidate(
      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionBiasGeluBwd_CUDA) {
  if (at::cuda::getDeviceProperties(0)->major < 6) {
    return;
  }
  Fusion fusion;
  FusionGuard fg(&fusion);

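  // Same constants as the forward pass; k_010 = 3 * k_079 * k_004 arises from
  // differentiating the tanh argument k_079 * (x + k_004 * x^3).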
  const float k_079 = 0.79788456;
  const float k_004 = 0.044715;
  const float k_010 = 0.1070322243;

  // gradient tensor
  auto t0 = makeSymbolicTensor(3, DataType::Half);
  fusion.addInput(t0);
  auto t1 = castOp(DataType::Float, t0);
  // bias tensor
  auto t2 = makeSymbolicTensor(1, DataType::Half);
  fusion.addInput(t2);
  auto t3 = castOp(DataType::Float, t2);
  // input tensor
  auto t4 = makeSymbolicTensor(3, DataType::Half);
  fusion.addInput(t4);
  auto t5 = castOp(DataType::Float, t4);
  auto t6 = broadcast(t3, {true, true, false});
  auto t7 = add(t6, t5);
  auto t8 = mul(t7, IrBuilder::create<Double>(k_079));
  auto t9 = mul(t7, IrBuilder::create<Double>(k_004));
  auto t10 = mul(t9, t7);
  auto t11 = add(t10, IrBuilder::create<Int>(1));
  auto t12 = mul(t8, t11);
  auto t13 = unaryOp(UnaryOpType::Tanh, t12);
  auto t14 = mul(t7, IrBuilder::create<Double>(0.5));
  auto t15 = mul(t13, t13);
  auto t16 = unaryOp(UnaryOpType::Neg, t15);
  auto t17 = add(t16, IrBuilder::create<Int>(1));
  auto t18 = mul(t7, IrBuilder::create<Double>(k_010));
  auto t19 = mul(t18, t7);
  auto t20 = add(t19, IrBuilder::create<Double>(k_079));
  auto t21 = mul(t17, t20);
  auto t22 = mul(t14, t21);
  auto t23 = add(t13, IrBuilder::create<Int>(1));
  auto t24 = mul(t23, IrBuilder::create<Double>(0.5));
  auto t25 = add(t22, t24);
  auto t26 = mul(t25, t1);
  // Save float output for validation
  fusion.addOutput(t26);
  auto t27 = castOp(DataType::Half, t26);
  fusion.addOutput(t27);

  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
  at::manual_seed(1);
  std::vector<int64_t> input_shape{6, 512, 4096};
  std::vector<int64_t> bias_shape{4096};
  auto at_input = at::randn(input_shape, options);
  auto at_bias = at::randn(bias_shape, options);
  auto at_grad = at::randn(input_shape, options);

  auto at_x =
      at_bias.to(c10::ScalarType::Float) + at_input.to(c10::ScalarType::Float);
  auto at_tanh_out = (k_079 * at_x * (1 + k_004 * at_x * at_x)).tanh();
  auto at_ff = 0.5 * at_x *
          ((1 - at_tanh_out * at_tanh_out) * (k_079 + k_010 * at_x * at_x)) +
      0.5 * (1 + at_tanh_out);
  auto at_out = at_ff * at_grad;
  auto at_out_half = at_out.to(c10::ScalarType::Half);

  std::vector<IValue> aten_inputs = {at_grad, at_bias, at_input};
  std::vector<at::Tensor> aten_outputs = {at_out, at_out_half};

  auto lparams = schedulePointwise(&fusion, aten_inputs);

  FusionExecutor fe;
  fe.compileFusion(&fusion, aten_inputs, lparams);
  auto cg_outputs = fe.runFusion(aten_inputs, lparams);

  testValidate(
      &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__);
}

// Reproducer of issue #459
TEST_F(NVFuserTest, FusionIssue459_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);
  auto tv1 = makeSymbolicTensor(2);
  fusion.addInput(tv1);

  auto tv2 = add(tv0, IrBuilder::create<Double>(1));
  auto tv3 = broadcast(tv2, {true, false});
  auto tv4 = add(tv1, tv3);

  // Create two outputs from the final arithmetic result
  auto tv5 = add(tv4, IrBuilder::create<Double>(1));
  fusion.addOutput(tv5);
  auto tv6 = add(tv4, IrBuilder::create<Double>(1));
  fusion.addOutput(tv6);

  // Scheduling
  for (auto output : ir_utils::filterByType<TensorView>(fusion.outputs())) {
    output->merge(-2, -1);
  }
  for (auto output : ir_utils::filterByType<TensorView>(fusion.outputs())) {
    output->split(0, 128);
  }

  tv0->computeAt(tv5, -1);

  tv6->axis(0)->parallelize(ParallelType::BIDx);
  tv6->axis(1)->parallelize(ParallelType::TIDx);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::manual_seed(0);
  const int numel_x = 10;
  const int numel_y = 20;
  auto t0 = at::randn({numel_x}, options);
  auto t1 = at::randn({numel_y, numel_x}, options);
  auto aten_output = (t0 + 1).unsqueeze(0) + t1 + 1;

  std::vector<IValue> aten_inputs = {t0, t1};

  torch::jit::fuser::cuda::FusionExecutor fe;
  fe.compileFusion(&fusion, aten_inputs);
  auto cg_outputs = fe.runFusion(aten_inputs);

  testValidate(
      &fusion,
      cg_outputs,
      aten_inputs,
      {aten_output, aten_output},
      __LINE__,
      __FILE__);
}

TEST_F(NVFuserTest, FusionSmemIndexingSimple_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
  fusion.addOutput(tv3);

  tv3->axis(0)->parallelize(ParallelType::BIDx);
  tv3->axis(1)->parallelize(ParallelType::TIDx);

  tv0->computeAt(tv3, -1);

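  // Place the two intermediates in different memory spaces so both the
  // shared- and global-memory indexing paths are exercised.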
  tv1->setMemoryType(MemoryType::Shared);
  tv2->setMemoryType(MemoryType::Global);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);

  auto aten_input = at::randn({12, 34}, options);
  at::Tensor aten_output = aten_input + 1.0 + 1.0 + 1.0;

  FusionExecutor fe;
  fe.compileFusion(&fusion, {aten_input});
  auto cg_outputs = fe.runFusion({aten_input});

  testValidate(
      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionSmemIndexing_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Symbolic integers we will use for runtime tiling
  Int* symbolic_m_tile_dim = IrBuilder::create<Int>();
  Int* symbolic_split_k_tile_dim = IrBuilder::create<Int>();
  Int* symbolic_block_k_tile_dim = IrBuilder::create<Int>();
  // Compile-time integer for tiling
  int n_smem_tile = 32;

  // Symbolic 2D tensors TV0[M, K], TV1[K, N]
  TensorView* tv0 = makeSymbolicTensor(2);
  TensorView* tv1 = makeSymbolicTensor(2);

  // Broadcast tv0 to [M, K, *]
  TensorView* tv2 = broadcast(tv0, {false, false, true});
  // Broadcast tv1 to [*, K, N]
  TensorView* tv3 = broadcast(tv1, {true, false, false});

  // Pointwise multiplication resulting in tv4[M, K, N]
  TensorView* tv4 = mul(tv2, tv3);

  // Sum the K-dim
  TensorView* tv5 = sum(tv4, {1});

  // Register inputs and outputs
  fusion.addInput(tv0);
  fusion.addInput(tv1);
  fusion.addOutput(tv5);

  // Register runtime tile dims as inputs
  fusion.addInput(symbolic_m_tile_dim);
  fusion.addInput(symbolic_split_k_tile_dim);
  fusion.addInput(symbolic_block_k_tile_dim);

  // Make a 3D tile, mix of symbolic and constant, do in reverse order because
  // dims are inserted
  // [M, rK, N]
  tv5->split(2, n_smem_tile);
  // [M, rK, No, Ni{32}]
  tv5->split(1, symbolic_block_k_tile_dim);
  // [M, rKo, rKi{i2}, No, Ni{32}]
  tv5->split(1, symbolic_split_k_tile_dim);
  // [M, rKoo, rKoi{i1}, rKi{i2}, No, Ni{32}]
  tv5->split(0, symbolic_m_tile_dim);
  // [Mo, Mi{i0}, rKoo, rKoi{i1}, rKi{i2}, No, Ni{32}]

  // Reorder so all outer tiles are in the leftmost 3 positions
  // [Mo, Mi{i0}, rKoo, rKoi{i1}, rKi{i2}, No, Ni{32}]
  // [Mo, No, rKoo, rKoi{i1}, rKi{i2}, Mi{i0}, Ni{32}]
  tv5->reorder({{1, 5}, {5, 1}});

  // Factor out the outer reduction IterDomain, then perform the inter-CTA
  // reduction followed by the intra-CTA reduction
  // [Mo, No, rKoo, Koi{i1}, Ki{i2}, Mi{i0}, Ni{32}]
  // [Mo, No, rKoi{i1}, rKi{i2}, Mi{i0}, Ni{32}]
  auto tv6 = tv5->rFactor({2});

  // Scope computations
  tv6->computeAt(tv5, 2);

  // [Mo, No, rKoo, Koi{i1}, Ki{i2}, Mi{i0}, Ni{32}]
  // [Mo, No, Ki{i2}, Mi{i0}, Ni{32}, rKoo, Koi{i1}]
  tv6->reorder({
      {5, -2},
      {6, -1},
      {2, 2},
      {3, 3},
      {4, 4},
  });

  // Setup compute at schedule
  tv0->computeAt(tv6, 3);
  tv1->computeAt(tv6, 3);
  tv4->computeAt(tv6, -1);

  // Cache smem tiles
  tv2->setMemoryType(MemoryType::Shared);
  tv3->setMemoryType(MemoryType::Shared);
  tv4->setMemoryType(MemoryType::Shared);
  tv6->setMemoryType(MemoryType::Shared);

  tv5->axis(0)->parallelize(ParallelType::BIDz);
  tv5->axis(1)->parallelize(ParallelType::BIDy);

  std::vector<TensorView*> tv_list = {tv2, tv3, tv4, tv5, tv6};
  for (auto tv : tv_list) {
    tv->axis(-2)->parallelize(ParallelType::TIDz);
    tv->axis(-1)->parallelize(ParallelType::TIDy);
  }

  constexpr int M = 31, K = 65, N = 32;

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({M, K}, options);
  at::Tensor t1 = at::randn({K, N}, options);

  at::Tensor aten_output =
      mul(t0.unsqueeze(2), t1.unsqueeze(0)).to(at::kDouble).sum(1);

  // A, B, m_tile_dim, split_k, intra_cta_tile
  std::vector<IValue> aten_inputs = {t0, t1, 3, 4, 5};

  FusionExecutor fe;
  fe.compileFusion(&fusion, aten_inputs);
  auto cg_outputs = fe.runFusion(aten_inputs);

  testValidate(
      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
}

// Reproducer of issue 408
TEST_F(NVFuserTest, FusionCacheBeforeReduction_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
  auto tv2 = sum(tv1, {1});
  fusion.addOutput(tv2);

  tv2->split(0, 4);

1775 auto tv3 = tv2->cacheBefore();
1776
1777 tv0->computeAt(tv3, -1);
1778 tv3->computeAt(tv2, -1);
1779
1780 tv3->axis(-1)->parallelize(ParallelType::TIDx);
1781
1782 const int numel_x = 100;
1783 const int numel_y = 200;
1784 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1785
1786 at::Tensor aten_input = at::randn({numel_x, numel_y}, options);
1787 at::Tensor cg_output = at::empty({numel_x}, options);
1788
1789 auto aten_output = (aten_input + 1).to(at::kDouble).sum({1});
1790
1791 FusionExecutor fe;
1792 fe.compileFusion(&fusion, {aten_input});
1793 fe.runFusion({aten_input}, {cg_output});
1794
1795 testValidate(
1796 &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__);
1797}
1798
1799TEST_F(NVFuserTest, FusionCacheBeforeReduction2_CUDA) {
1800 Fusion fusion;
1801 FusionGuard fg(&fusion);
1802
1803 auto tv0 = makeSymbolicTensor(3);
1804 fusion.addInput(tv0);
1805 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
1806 auto tv2 = sum(tv1, {1});
1807 auto tv3 = add(tv2, IrBuilder::create<Double>(1));
1808 fusion.addOutput(tv2);
1809 fusion.addOutput(tv3);
1810
1811 auto tv4 = tv2->cacheBefore();
1812
1813 tv4->computeAt(tv3, 1);
1814 tv0->computeAt(tv4, -1);
1815
1816 tv3->axis(0)->parallelize(ParallelType::BIDx);
1817 tv1->axis(-1)->parallelize(ParallelType::TIDx);
1818 tv2->axis(-1)->parallelize(ParallelType::TIDx);
1819 tv3->axis(-1)->parallelize(ParallelType::TIDx);
1820 tv4->axis(-1)->parallelize(ParallelType::TIDx);
1821
1822 const int numel_x = 10;
1823 const int numel_y = 20;
1824 const int numel_z = 30;
1825 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1826
1827 at::Tensor aten_input = at::randn({numel_x, numel_y, numel_z}, options);
1828 auto t2 = (aten_input + 1).to(at::kDouble).sum({1});
1829 auto t3 = t2 + 1;
1830 std::vector<at::Tensor> aten_outputs = {t2, t3};
1831
1832 FusionExecutor fe;
1833 fe.compileFusion(&fusion, {aten_input});
1834 auto cg_outputs = fe.runFusion({aten_input});
1835
1836 testValidate(
1837 &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
1838}
1839
1840TEST_F(NVFuserTest, FusionIssue367_CUDA) {
1841 Fusion fusion;
1842 FusionGuard fg(&fusion);
1843
1844 // Symbolic integers we will use for runtime tiling
1845 Int* symbolic_m_tile_dim = IrBuilder::create<Int>();
1846 Int* symbolic_split_k_tile_dim = IrBuilder::create<Int>();
1847 Int* symbolic_block_k_tile_dim = IrBuilder::create<Int>();
1848 // Compile-time integer for tiling
1849 int n_smem_tile = 32;
1850
1851 // Symbolic 2D tensors TV0[M, K], TV1[K, N]
1852 TensorView* tv0 = makeSymbolicTensor(2);
1853 TensorView* tv1 = makeSymbolicTensor(2);
1854
1855 // Broadcast tv0 to [M, K, *]
1856 TensorView* tv2 = broadcast(tv0, {false, false, true});
1857 // Broadcast tv1 to [*, K, N]
1858 TensorView* tv3 = broadcast(tv1, {true, false, false});
1859
  // Pointwise multiplication resulting in tv4[M, K, N]
1861 TensorView* tv4 = mul(tv2, tv3);
1862
1863 // Sum the K-dim
1864 TensorView* tv5 = sum(tv4, {1});
1865
1866 // Register inputs and outputs
1867 fusion.addInput(tv0);
1868 fusion.addInput(tv1);
1869 fusion.addOutput(tv5);
1870
1871 // Register runtime tile dims as inputs
1872 fusion.addInput(symbolic_m_tile_dim);
1873 fusion.addInput(symbolic_split_k_tile_dim);
1874 fusion.addInput(symbolic_block_k_tile_dim);
1875
1876 // Make a 3D tile, mix of symbolic and constant, do in reverse order because
1877 // dims are inserted
  // [M, rK, N]
1879 tv5->split(2, n_smem_tile);
1880 tv5->split(1, symbolic_block_k_tile_dim);
1881 tv5->split(1, symbolic_split_k_tile_dim);
1882 tv5->split(0, symbolic_m_tile_dim);
1883 // [Mo, Mi, Koo, Koi, Ki, No, Ni]
1884 tv5->reorder({{1, 5}, {5, 1}});
1885 // [Mo, No, Koo, Koi, Ki, Mi, Ni]
1886
1887 auto tv6 = tv5->rFactor({2});
1888 auto tv7 = tv5->rFactor({2});
1889 // [Mo, No, rKoo, Koi, Ki, Mi, Ni]
1890 // [Mo, No, rKoi, rKi, Mi, Ni]
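  // Calling rFactor twice builds a three-stage reduction cascade: tv6
  // reduces rKoo into a first partial result, tv7 reduces rKoi into a
  // second, and tv5 performs the final rKi reduction. Roughly:
  //   tv6 = partial_reduce(tv4, Koo)
  //   tv7 = partial_reduce(tv6, Koi)
  //   tv5 = reduce(tv7, Ki)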
1891
1892 // Scope computations
1893 tv6->computeAt(tv5, 2);
1894
1895 tv0->computeAt(tv6, 3);
1896 tv1->computeAt(tv6, 3);
1897 tv4->computeAt(tv6, -1);
1898
1899 // Cache smem tiles
1900 tv2->setMemoryType(MemoryType::Shared);
1901 tv3->setMemoryType(MemoryType::Shared);
1902 tv4->setMemoryType(MemoryType::Local);
1903 tv6->setMemoryType(MemoryType::Local);
1904 tv7->setMemoryType(MemoryType::Local);
1905
1906 tv5->axis(0)->parallelize(ParallelType::BIDz);
1907 tv5->axis(1)->parallelize(ParallelType::BIDy);
1908
1909 std::vector<TensorView*> tv_list = {tv2, tv3, tv4, tv5, tv6, tv7};
1910 for (auto tv : tv_list) {
1911 tv->axis(-2)->parallelize(ParallelType::TIDz);
1912 tv->axis(-1)->parallelize(ParallelType::TIDy);
1913 }
1914 tv2->axis(3)->parallelize(ParallelType::TIDx);
1915 tv3->axis(3)->parallelize(ParallelType::TIDx);
1916 tv4->axis(3)->parallelize(ParallelType::TIDx);
1917 tv6->axis(3)->parallelize(ParallelType::TIDx);
1918 tv7->axis(2)->parallelize(ParallelType::TIDx);
1919
1920 tv2->axis(4)->parallelize(ParallelType::BIDx);
1921 tv3->axis(4)->parallelize(ParallelType::BIDx);
1922 tv4->axis(4)->parallelize(ParallelType::BIDx);
1923 tv6->axis(4)->parallelize(ParallelType::BIDx);
1924 tv7->axis(3)->parallelize(ParallelType::BIDx);
1925 tv5->axis(2)->parallelize(ParallelType::BIDx);
1926
1927 constexpr int M = 3, K = 6, N = 16;
1928
1929 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1930
1931 at::Tensor t0 = at::randn({M, K}, options);
1932 at::Tensor t1 = at::randn({K, N}, options);
1933
1934 // A, B, m, split_k, block_k
1935 std::vector<IValue> aten_inputs = {t0, t1, 2, 2, 3};
1936 at::Tensor aten_output =
1937 mul(t0.unsqueeze(2), t1.unsqueeze(0)).to(at::kDouble).sum(1);
1938
1939 torch::jit::fuser::cuda::FusionExecutor fe;
1940 fe.compileFusion(&fusion, aten_inputs);
1941 auto cg_outputs = fe.runFusion(aten_inputs);
1942
1943 testValidate(
1944 &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
1945}
1946
1947TEST_F(NVFuserTest, FusionIssue468_CUDA) {
1948 Fusion fusion;
1949 FusionGuard fg(&fusion);
1950
1951 auto tv0 = makeSymbolicTensor(2);
1952 fusion.addInput(tv0);
1953 auto tv1 = sum(tv0, {1});
1954 auto tv2 = sum(tv1, {0});
1955 fusion.addOutput(tv2);
1956
1957 tv1->axis(0)->parallelize(ParallelType::TIDy);
1958 tv1->axis(1)->parallelize(ParallelType::TIDx);
1959
1960 tv2->axis(0)->parallelize(ParallelType::TIDy);
1961
1962 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1963 at::Tensor aten_input = at::randn({10, 100}, options);
1964 at::Tensor aten_output = aten_input.to(at::kDouble).sum({1}).sum({0});
1965
1966 FusionExecutor fe;
1967 fe.compileFusion(&fusion, {aten_input});
1968 auto cg_outputs = fe.runFusion({aten_input});
1969
1970 testValidate(
1971 &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
1972}
1973
1974TEST_F(NVFuserTest, FusionIssue363_CUDA) {
1975 Fusion fusion;
1976 FusionGuard fg(&fusion);
1977
1978 // Symbolic 2D tensors TV0[M, K], TV1[K, N]
1979 TensorView* tv0 = makeSymbolicTensor(2);
1980 TensorView* tv1 = makeSymbolicTensor(2);
1981
1982 // Broadcast tv0 to [M, K, *]
1983 TensorView* tv2 = broadcast(tv0, {false, false, true});
1984 // Broadcast tv1 to [*, K, N]
1985 TensorView* tv3 = broadcast(tv1, {true, false, false});
1986
  // Pointwise multiplication resulting in tv4[M, K, N]
1988 TensorView* tv4 = mul(tv2, tv3);
1989
1990 // Sum the K-dim
1991 TensorView* tv5 = sum(tv4, {1});
1992
1993 // Register inputs and outputs
1994 fusion.addInput(tv0);
1995 fusion.addInput(tv1);
1996 fusion.addOutput(tv5);
1997
1998 tv2->setMemoryType(MemoryType::Global);
1999 tv3->setMemoryType(MemoryType::Global);
2000 tv4->setMemoryType(MemoryType::Global);
2001
2002 tv0->computeAt(tv5, -1);
2003 tv1->computeAt(tv5, -1);
2004
2005 tv5->axis(0)->parallelize(ParallelType::BIDz);
2006 tv5->axis(1)->parallelize(ParallelType::BIDy);
2007
2008 tv5->axis(2)->parallelize(ParallelType::BIDx);
2009
2010 constexpr int M = 3, K = 6, N = 16;
2011
2012 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2013
2014 at::Tensor t0 = at::randn({M, K}, options);
2015 at::Tensor t1 = at::randn({K, N}, options);
2016 at::Tensor aten_output =
2017 mul(t0.unsqueeze(2), t1.unsqueeze(0)).to(at::kDouble).sum(1);
2018
2019 std::vector<IValue> aten_inputs = {t0, t1};
2020
2021 torch::jit::fuser::cuda::FusionExecutor fe;
2022 fe.compileFusion(&fusion, aten_inputs);
2023 auto cg_outputs = fe.runFusion(aten_inputs);
2024
2025 testValidate(
2026 &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
2027}
2028
2029TEST_F(NVFuserTest, FusionIssue484_CUDA) {
2030 Fusion fusion;
2031 FusionGuard fg(&fusion);
2032
2033 auto tv0 = makeSymbolicTensor(2);
2034 fusion.addInput(tv0);
2035 auto tv1 = sum(tv0, {1});
2036 auto tv2 = add(tv1, IrBuilder::create<Double>(0));
2037 fusion.addOutput(tv2);
2038
2039 tv1->setMemoryType(MemoryType::Global);
2040 tv1->axis(1)->parallelize(ParallelType::TIDx);
2041
2042 constexpr int M = 100;
2043
2044 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2045
2046 at::Tensor aten_input = at::randn({M, M}, options);
2047 at::Tensor aten_output = aten_input.to(at::kDouble).sum({1});
2048
2049 torch::jit::fuser::cuda::FusionExecutor fe;
2050 fe.compileFusion(&fusion, {aten_input});
2051 auto cg_outputs = fe.runFusion({aten_input});
2052
2053 testValidate(
2054 &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
2055}
2056
2057TEST_F(NVFuserTest, FusionIssue329_CUDA) {
2058 Fusion fusion;
2059 FusionGuard fg(&fusion);
2060
2061 auto tv0 = makeSymbolicTensor(2);
2062 fusion.addInput(tv0);
2063 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
2064 auto tv2 = sum(tv1, {1});
2065 fusion.addOutput(tv2);
2066 auto tv3 = sum(tv1, {1});
2067 fusion.addOutput(tv3);
2068
2069 tv1->computeAt(tv2, -1);
2070
2071 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2072
2073 std::vector<int64_t> t0_shape{17, 19};
2074 auto aten_input = at::randn(t0_shape, options);
2075 auto t2 = (aten_input + 1).to(at::kDouble).sum({1});
2076 auto t3 = (aten_input + 1).to(at::kDouble).sum({1});
2077 std::vector<at::Tensor> aten_outputs = {t2, t3};
2078
2079 FusionExecutor fe;
2080 fe.compileFusion(&fusion, {aten_input});
2081 auto cg_outputs = fe.runFusion({aten_input});
2082
2083 testValidate(
2084 &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
2085}
2086
2087TEST_F(NVFuserTest, FusionIssue382_CUDA) {
2088 Fusion fusion;
2089 FusionGuard fg(&fusion);
2090
2091 auto tv0 = makeSymbolicTensor(2);
2092 fusion.addInput(tv0);
2093
2094 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
2095 auto tv2 = broadcast(tv1, {false, false, true});
2096 auto tv3 = makeSymbolicTensor(3);
2097 fusion.addInput(tv3);
2098 auto tv4 = add(tv2, tv3);
2099 fusion.addOutput(tv4);
2100
2101 tv2->merge(1);
2102 tv4->merge(1);
2103
2104 tv1->computeAt(tv4, 1);
2105
2106 tv4->axis(0)->parallelize(ParallelType::BIDx);
2107
2108 tv1->setMemoryType(MemoryType::Global);
2109 tv2->setMemoryType(MemoryType::Global);
2110
2111 const int numel_x = 12;
2112 const int numel_y = 34;
2113 const int numel_z = 56;
2114
2115 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2116 at::manual_seed(0);
2117 auto t0 = at::randn({numel_x, numel_y}, options);
2118 auto t3 = at::randn({numel_x, numel_y, numel_z}, options);
2119
2120 std::vector<IValue> aten_inputs = {t0, t3};
2121 auto aten_output = (t0 + 1).unsqueeze(-1) + t3;
2122
2123 FusionExecutor fe;
2124 fe.compileFusion(&fusion, aten_inputs);
2125 auto cg_outputs = fe.runFusion(aten_inputs);
2126
2127 testValidate(
2128 &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
2129}
2130
2131TEST_F(NVFuserTest, FusionIssue507_CUDA) {
2132 Fusion fusion;
2133 FusionGuard fg(&fusion);
2134
2135 auto tv0 = makeSymbolicTensor(2);
2136 fusion.addInput(tv0);
2137 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
2138 auto tv2 = add(tv1, IrBuilder::create<Double>(1));
2139 fusion.addOutput(tv2);
2140
2141 tv1->setMemoryType(MemoryType::Shared);
2142
2143 tv1->axis(1)->parallelize(ParallelType::TIDx);
2144 tv2->axis(1)->parallelize(ParallelType::TIDx);
2145 tv1->axis(0)->parallelize(ParallelType::BIDx);
2146 tv2->axis(0)->parallelize(ParallelType::BIDx);
2147
2148 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2149
2150 std::vector<int64_t> t0_shape{17, 19};
2151 auto aten_input = at::randn(t0_shape, options);
2152 auto t1 = (aten_input + 1);
2153 auto aten_output = (t1 + 1);
2154
2155 FusionExecutor fe;
2156 fe.compileFusion(&fusion, {aten_input});
2157 auto cg_outputs = fe.runFusion({aten_input});
2158
2159 testValidate(
2160 &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
2161}
2162
2163TEST_F(NVFuserTest, FusionIssue532_CUDA) {
2164 Fusion fusion;
2165 FusionGuard fg(&fusion);
2166
2167 // Algorithm
2168 TensorView* tv0 = makeSymbolicTensor(1);
2169 TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1));
2170 TensorView* tv2 = add(tv1, IrBuilder::create<Double>(1));
2171 fusion.addInput(tv0);
2172 fusion.addOutput(tv2);
2173
2174 const int M_BLOCK = 64;
2175 const int M_THREAD = 4;
2176
2177 tv2->split(0, M_BLOCK);
2178 // tv2: [M/M_BLOCK, M_BLOCK]
2179 tv1->computeAt(tv2, 1);
2180 // tv1: [M/M_BLOCK, M_BLOCK]
2181
2182 tv1->split(-1, M_BLOCK / M_THREAD);
2183 // tv1: [M/M_BLOCK, M_THREAD, M_BLOCK / M_THREAD]
2184
2185 tv2->split(-1, M_THREAD);
2186 // tv2: [M/M_BLOCK, M_BLOCK / M_THREAD, M_THREAD]
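  // Note the mismatched inner factors: tv1's inner extent is
  // M_BLOCK / M_THREAD (16) while tv2's is M_THREAD (4), so producer and
  // consumer deliberately end up with different inner tilings under the
  // same outer M_BLOCK tile.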
2187
2188 constexpr int M = 1000;
2189
2190 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2191 at::manual_seed(0);
2192 at::Tensor t0 = at::randn({M}, options);
2193 std::vector<IValue> aten_inputs = {t0};
2194
2195 FusionExecutor fe;
2196 fe.compileFusion(&fusion, aten_inputs);
2197 auto outputs = fe.runFusion(aten_inputs);
2198
2199 at::Tensor aten_output = t0 + 1 + 1;
2200
2201 testValidate(
2202 &fusion, outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
2203}
2204
2205TEST_F(NVFuserTest, FusionLoopUnswitch_CUDA) {
2206 Fusion fusion;
2207 FusionGuard fg(&fusion);
2208
2209 // Algorithm
2210 TensorView* tv0 = makeSymbolicTensor(1);
2211 TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1));
2212 TensorView* tv2 = add(tv1, IrBuilder::create<Double>(1));
2213 fusion.addInput(tv0);
2214 fusion.addOutput(tv2);
2215
2216 tv2->split(0, 32);
2217 tv1->computeAt(tv2, -1);
2218
2219 tv2->axis(1)->parallelize(ParallelType::Unswitch);
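  // ParallelType::Unswitch hoists predicate evaluation out of the loop
  // nest: codegen emits, roughly,
  //   if (entire tile in bounds) { loop without per-iteration predicates }
  //   else                       { loop with per-iteration predicates }
  // so the common case runs without per-element bounds checks.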
2220
2221 constexpr int M = 1000;
2222
2223 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2224 at::manual_seed(0);
2225 at::Tensor t0 = at::randn({M}, options);
2226 std::vector<IValue> aten_inputs = {t0};
2227
2228 FusionExecutor fe;
2229 fe.compileFusion(&fusion, aten_inputs);
2230 auto outputs = fe.runFusion(aten_inputs);
2231
2232 at::Tensor aten_output = t0 + 1 + 1;
2233
2234 testValidate(
2235 &fusion, outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
2236}
2237
2238TEST_F(NVFuserTest, FusionIssue549_CUDA) {
2239 Fusion fusion;
2240 FusionGuard fg(&fusion);
2241
2242 // Set up your input tensor views
2243 TensorView* tv0 = makeSymbolicTensor(2); // M, K
2244 TensorView* tv1 = makeSymbolicTensor(2); // K, N
2245 fusion.addInput(tv0);
2246 fusion.addInput(tv1);
2247
2248 auto tv2 = add(tv0, IrBuilder::create<Double>(1));
2249
2250 TensorView* tv3 = broadcast(tv2, {false, false, true});
  // tv3[I0, I1, B] = tv2[I0, I1]
2252
2253 TensorView* tv4 = broadcast(tv1, {true, false, false});
2254 // tv4[B, I1, I2] = tv1[I1, I2]
2255
2256 // tv5[I0, I1, I2] = tv3[I0, I1, B] * tv4[B, I1, I2]
2257 TensorView* tv5 = mul(tv3, tv4);
2258 // tv6[I0, R1, I2] = tv5[I0, I1, I2]
2259 TensorView* tv6 = sum(tv5, {1});
2260 fusion.addOutput(tv6);
2261
2262 tv6->split(1, 32);
2263 // tv6[I0, R1o, R1i{32}, I2]
2264
2265 auto tv7 = tv6->rFactor({1});
2266 // tv7[I0, R1o, I1i{32}, I2] = tv5[I0, I1, I2]
2267 // tv6[I0, , R1i{32}, I2] = tv7[I0, R1o, I1i{32}, I2]
2268
2269 tv6->split(0, 4);
2270 tv6->split(-1, 4);
  // tv6[I0o, I0i{4}, R1i{32}, I2o, I2i{4}]
2273
2274 tv0->computeAt(tv6, -1);
2275 tv1->computeAt(tv6, -1);
2276
2277 // tv7[I0o, I0i{4}, R1o, I1i{32}, I2o, I2i{4}]
2278 // tv6[I0o, I0i{4}, , R1i{32}, I2o, I2i{4}]
  // --> (the '|' marks the compute-at position)
2280 // tv5[I0o, I0i{4}, I1i{32}, I2o, I2i{4}|, I1o]
2281 // tv7[I0o, I0i{4}, I1i{32}, I2o, I2i{4}|, R1o]
2282 // tv6[I0o, I0i{4}, R1i{32}, I2o, I2i{4}|]
2283
2284 tv0->computeAt(tv7, -1);
2285 tv1->computeAt(tv7, -1);
2286 // tv5[I0o, I0i{4}, I1i{32}, I2o, I2i{4}, I1o |]
2287 // tv7[I0o, I0i{4}, I1i{32}, I2o, I2i{4}, R1o |]
2288 // tv6[I0o, I0i{4}, R1i{32}, I2o, I2i{4}|]
2289
2290 tv6->axis(0)->parallelize(ParallelType::BIDz);
2291 tv6->axis(1)->parallelize(ParallelType::TIDz);
2292
2293 tv6->axis(-2)->parallelize(ParallelType::BIDy);
2294 tv6->axis(-1)->parallelize(ParallelType::TIDy);
2295
2296 tv6->axis(2)->parallelize(ParallelType::TIDx);
2297 tv7->axis(2)->parallelize(ParallelType::TIDx);
2298
2299 constexpr int M = 65, K = 33, N = 17;
2300
2301 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2302
2303 at::Tensor t0 = at::randn({M, K}, options);
2304 at::Tensor t1 = at::randn({K, N}, options);
2305
  // Let's specify a few bounds in launch params to make sure it works
2307 LaunchParams lparams(1, -1, -1, 32, 4, 4);
2308
2309 FusionExecutor fe;
2310 fe.compileFusion(&fusion, {t0, t1}, lparams);
2311 fe.runFusion({t0, t1}, lparams);
2312
2313 // Make sure bad launch params throws
2314 // TODO: Re-enable once we have parallelization validation in.
2315 // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
2316 // ASSERT_ANY_THROW(fe.runFusion({t0, t1}, LaunchParams(1, 2, 3, 4, 5, 6)));
2317
2318 // Don't specify any launch params
2319 auto cg_outputs = fe.runFusion({t0, t1});
2320
2321 auto aten_output = (t0 + 1).to(at::kDouble).matmul(t1.to(at::kDouble));
2322
2323 testValidate(
2324 &fusion, cg_outputs, {t0, t1}, {aten_output}, __LINE__, __FILE__);
2325}
2326
2327TEST_F(NVFuserTest, FusionSimpleCompileRtc_CUDA) {
2328 FusionExecutor fe;
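  // compileRtc/runRtc compile and launch a handwritten kernel string
  // directly, skipping fusion IR construction, scheduling, and codegen, so
  // the runtime entry points can be exercised in isolation.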
2329 std::string kernel = R"(
2330__global__ void kernel1(Tensor<float, 1> T0, Tensor<float, 1> T1) {
2331 if(threadIdx.x==0){
2332 for(size_t ki28 = 0; ki28 < T0.size[0]; ++ki28) {
2333 T1[ki28*T1.stride[0]] = T0[ki28*T0.stride[0]]*2;
2334 }
2335 }
2336}
2337 )";
2338 fe.compileRtc(kernel, "CudaCodeGen::kernel1");
2339 LaunchParams lp(
2340 256, // gdimx
2341 1, // gdimy
2342 1, // gdimz
2343 1, // bdimx
2344 1, // bdimy
2345 1 // bdimz
2346 );
2347 lp.setSmem(0);
2348 const auto options =
2349 at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2350 const std::vector<int64_t> tensor_dims = {8};
2351 auto in0 = at::randn(tensor_dims, options);
2352 auto out0 = at::empty_like(in0);
2353 fe.runRtc(lp, {in0, out0});
2354
2355 auto out_ref = in0 * 2;
2356 TORCH_CHECK(out_ref.allclose(out0));
2357}
2358
2359TEST_F(NVFuserTest, FusionSerialWelford_CUDA) {
2360 FusionExecutor fe;
2361 int x = 128, y = 64, z = 64;
2362
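  // welfordCombine merges two partial (avg, M2, N) triples using the
  // standard parallel variance update (cf. Chan et al.); a sketch:
  //   delta = avg_b - avg_a;
  //   N     = N_a + N_b;
  //   avg   = avg_a + delta * N_b / N;
  //   M2    = M2_a + M2_b + delta * delta * N_a * N_b / N;
  // Each input element enters the reduction as the triple (x, 0.f, 1).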
2363 std::string kernel = R"(
2364__global__ void kernel1(
2365 Tensor<float,3> inp,
2366 Tensor<float,1> out_var,
2367 Tensor<float,1> out_avg
2368){
2369 for(int i0=0;i0<inp.size[0];i0++){
2370 float tmp_M2=0;
2371 float tmp_avg=0;
2372 long tmp_N=0;
2373 for(int i1=0;i1<inp.size[1];i1++){
2374 for(int i2=0;i2<inp.size[2];i2++){
2375 welfordCombine(
2376 tmp_avg,
2377 tmp_M2,
2378 tmp_N,
2379 inp[i0*inp.stride[0]+
2380 i1*inp.stride[1]+
2381 i2*inp.stride[2]],
2382 0.f,
2383 (long)1
2384 );
2385 }
2386 }
2387 out_var[i0*out_var.stride[0]]=
2388 tmp_M2/(tmp_N);
2389 out_avg[i0*out_avg.stride[0]]=
2390 tmp_avg;
2391 }
2392}
2393 )";
2394 fe.compileRtc(kernel, "CudaCodeGen::kernel1");
2395 LaunchParams lp(
2396 1, // gdimx
2397 1, // gdimy
2398 1, // gdimz
2399 1, // bdimx
2400 1, // bdimy
2401 1 // bdimz
2402 );
2403 lp.setSmem(0);
2404 const auto options =
2405 at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2406 const std::vector<int64_t> tensor_dims = {x, y, z};
2407 auto in0 = at::randn(tensor_dims, options);
2408 auto out_var = at::empty({x}, options);
2409 auto out_avg = at::empty({x}, options);
2410 fe.runRtc(lp, {in0, out_var, out_avg});
2411
2412 TORCH_CHECK(in0.var({1, 2}, false).allclose(out_var));
2413 TORCH_CHECK(in0.mean({1, 2}).allclose(out_avg, /*rtol*/ 1e-5, /*atol*/ 1e-6));
2414}
2415
2416TEST_F(NVFuserTest, FusionBlockWelford_CUDA) {
2417 FusionExecutor fe;
2418 int x = 7, y = 8, z = 9;
2419
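  // This kernel folds precomputed per-row statistics (init_avg, init_var,
  // init_N) into a block-wide Welford over in0. Since Welford carries
  // M2 = var * N rather than the variance itself, init_var is scaled by
  // init_N before being passed to welfordCombine.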
2420 std::string kernel = R"(
2421__global__ void kernel1(
2422 Tensor<float,2> inp,
2423 Tensor<float,1> out_avg,
2424 Tensor<float,1> out_var,
2425 Tensor<float,1> init_avg,
2426 Tensor<float,1> init_var,
2427 Tensor<long,0> init_N
2428){
  // The actual generated kernel will use dynamic shared memory;
  // static buffers here are just for prototyping
2431 __shared__ float mem_avg[512];
2432 __shared__ float mem_M2[512];
2433 __shared__ long mem_N[512];
2434 float in=inp[threadIdx.x*inp.stride[0]+
2435 threadIdx.y*inp.stride[1]];
2436 float tmp_avg=0;
2437 float tmp_M2=0;
2438 long tmp_N=0;
2439 blockWelford<false,true,false>(
2440 tmp_avg,
2441 tmp_M2,
2442 tmp_N,
2443 in,
2444 0.f,
2445 (long)1,
2446 threadIdx,
2447 blockDim,
2448 (float*)mem_avg,
2449 (float*)mem_M2,
2450 (long*)mem_N,
2451 (bool)(threadIdx.x<inp.size[0]),
2452 0.f);
2453 __syncthreads();
2454 if(threadIdx.x<out_var.size[0] && threadIdx.y==0){
2455 welfordCombine(
2456 tmp_avg,
2457 tmp_M2,
2458 tmp_N,
2459 init_avg[threadIdx.x*init_avg.stride[0]],
2460 init_var[threadIdx.x*init_var.stride[0]]*init_N[0],
2461 init_N[0]
2462 );
2463 out_avg[threadIdx.x*out_avg.stride[0]]=tmp_avg;
2464 out_var[threadIdx.x*out_var.stride[0]]=tmp_M2/(tmp_N);
2465 }
2466}
2467 )";
2468 fe.compileRtc(kernel, "CudaCodeGen::kernel1");
2469 LaunchParams lp(
2470 1, // gdimx
2471 1, // gdimy
2472 1, // gdimz
2473 x, // bdimx
2474 y, // bdimy
2475 1 // bdimz
2476 );
2477 lp.setSmem(0);
2478 const auto options =
2479 at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2480 const std::vector<int64_t> tensor_dims = {x, y};
2481 const std::vector<int64_t> init_dims = {x, z};
2482
2483 // generate initial values
2484 auto init_in = at::randn(init_dims, options);
2485 auto init_var = init_in.var({1}, false);
2486 auto init_avg = init_in.mean({1});
2487 auto init_N =
2488 at::tensor(z, at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0));
2489
2490 auto in0 = at::randn(tensor_dims, options);
2491
2492 // run kernel
2493 auto out_var = at::zeros({x}, options);
2494 auto out_avg = at::zeros({x}, options);
2495 fe.runRtc(lp, {in0, out_avg, out_var, init_avg, init_var, init_N});
2496
2497 // compare with reference output
2498 auto cat_tensor = at::cat({init_in, in0}, 1);
2499 TORCH_CHECK(cat_tensor.var({1}, false).allclose(out_var));
2500 TORCH_CHECK(
2501 cat_tensor.mean({1}).allclose(out_avg, /*rtol*/ 1e-5, /*atol*/ 1e-6));
2502}
2503
2504TEST_F(NVFuserTest, FusionBlockWelfordNoInit_CUDA) {
2505 FusionExecutor fe;
2506 int x = 7, y = 8, z = 9;
2507
  // Needs IValue support for an integer input as the initial count
2509 std::string kernel = R"(
2510__global__ void kernel1(
2511 Tensor<float,3> inp,
2512 Tensor<float,1> out_avg,
2513 Tensor<float,1> out_var
2514){
  // The actual generated kernel will use dynamic shared memory;
  // static buffers here are just for prototyping
2517 __shared__ float mem_avg[512];
2518 __shared__ float mem_M2[512];
2519 __shared__ long mem_N[512];
2520 float in=inp[threadIdx.x*inp.stride[0]+
2521 threadIdx.y*inp.stride[1]+
2522 threadIdx.z*inp.stride[2]];
2523 float tmp_avg=0;
2524 float tmp_M2=0;
2525 long tmp_N=0;
2526 block_sync::init();
2527 blockWelford<false,true,true>(
2528 tmp_avg,
2529 tmp_M2,
2530 tmp_N,
2531 in,
2532 0.f,
2533 (long) 1,
2534 threadIdx,
2535 blockDim,
2536 (float*)mem_avg,
2537 (float*)mem_M2,
2538 (long*)mem_N,
2539 (bool)(threadIdx.x<inp.size[0]),
2540 0.f);
2541 __syncthreads();
2542 if(threadIdx.x<out_var.size[0] && threadIdx.y==0 && threadIdx.z==0){
    out_avg[threadIdx.x*out_avg.stride[0]]=tmp_avg;
2544 out_var[threadIdx.x*out_var.stride[0]]=tmp_M2/(tmp_N);
2545 }
2546}
2547 )";
2548 fe.compileRtc(kernel, "CudaCodeGen::kernel1");
2549 LaunchParams lp(
2550 1, // gdimx
2551 1, // gdimy
2552 1, // gdimz
2553 x, // bdimx
2554 y, // bdimy
2555 z // bdimz
2556 );
2557 lp.setSmem(0);
2558 const auto options =
2559 at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2560 const std::vector<int64_t> tensor_dims = {x, y, z};
2561 auto in0 = at::randn(tensor_dims, options);
2562 auto out_var = at::empty({x}, options);
2563 auto out_avg = at::empty({x}, options);
2564 fe.runRtc(lp, {in0, out_avg, out_var});
2565
2566 TORCH_CHECK(in0.var({1, 2}, false).allclose(out_var));
2567 TORCH_CHECK(in0.mean({1, 2}).allclose(out_avg, /*rtol*/ 1e-5, /*atol*/ 1e-6));
2568}
2569
2570TEST_F(NVFuserTest, FusionGridWelfordNoInit_CUDA) {
2571 FusionExecutor fe;
2572 int x = 128, y = 64, z = 128;
2573
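  // gridWelford performs a cross-grid reduction: each block writes its
  // partial (avg, M2, N) triple to the global work buffers, and sync_flag
  // lets the last-arriving block combine all partials, which is why only
  // the last block writes the final result in the kernel below.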
2574 std::string kernel = R"(
2575__global__ void kernel1(
2576 Tensor<float,3> inp,
2577 Tensor<float,1> out_avg,
2578 Tensor<float,1> out_var,
2579 Tensor<float,1> work_buf_avg,
2580 Tensor<float,1> work_buf_M2,
2581 Tensor<long,1> work_buf_N,
2582 Tensor<int64_t,1> sync_flag
2583){
2584 __shared__ float shared_buf_avg[512];
2585 __shared__ float shared_buf_M2[512];
2586 __shared__ long shared_buf_N[512];
2587 float tmp_avg=0;
2588 float tmp_M2=0;
2589 long tmp_N=0;
2590 float in = inp[ blockIdx.x * inp.stride[0]+
2591 blockIdx.y * inp.stride[1]+
2592 threadIdx.x * inp.stride[2]];
2593 block_sync::init();
2594 welford::gridWelford<
2595 true,true,false,
2596 true,false,false,
2597 false
2598 >(
2599 tmp_avg,
2600 tmp_M2,
2601 tmp_N,
2602 in,
2603 0.f,
2604 (long) 1,
2605 &work_buf_avg[0],
2606 &work_buf_M2[0],
2607 &work_buf_N[0],
2608 sync_flag,
2609 (float*)shared_buf_avg,
2610 (float*)shared_buf_M2,
2611 (long*)shared_buf_N,
2612 threadIdx.x<out_var.size[0],
2613 threadIdx.x<out_var.size[0],
2614 0.f,
2615 0,
2616 1);
2617 if(blockIdx.x == gridDim.x - 1 && blockIdx.y == gridDim.y - 1){
2618 out_avg[threadIdx.x*out_avg.stride[0]]=tmp_avg;
2619 out_var[threadIdx.x*out_var.stride[0]]=tmp_M2/tmp_N;
2620 }
2621}
2622 )";
2623 fe.compileRtc(kernel, "CudaCodeGen::kernel1");
2624 LaunchParams lp(
2625 x, // gdimx
2626 y, // gdimy
2627 1, // gdimz
2628 z, // bdimx
2629 1, // bdimy
2630 1 // bdimz
2631 );
2632 lp.setSmem(0);
2633 const auto options =
2634 at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2635 const auto options_int =
2636 at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
2637
2638 const std::vector<int64_t> tensor_dims = {x, y, z};
2639 auto in0 = at::randn(tensor_dims, options);
2640
2641 auto out_avg = at::empty({z}, options);
2642 auto out_var = at::empty({z}, options);
2643 auto work_buf_avg = at::empty({x * y * z}, options);
2644 auto work_buf_var = at::empty({x * y * z}, options);
2645 auto work_buf_N = at::empty({x * y * z}, options_int);
2646 auto sync_flag = at::zeros({1}, options_int);
2647 fe.runRtc(
2648 lp,
2649 {in0,
2650 out_avg,
2651 out_var,
2652 work_buf_avg,
2653 work_buf_var,
2654 work_buf_N,
2655 sync_flag});
2656 std::vector<int64_t> dims{0, 1};
2657
2658 TORCH_CHECK(in0.mean(dims).allclose(out_avg, /*rtol*/ 1e-5, /*atol*/ 1e-6));
2659 TORCH_CHECK(in0.var(dims, false).allclose(out_var));
2660}
2661
2662TEST_F(NVFuserTest, FusionWelfordOp_CUDA) {
2663 Fusion fusion;
2664 FusionGuard fg(&fusion);
2665
2666 int M = 64, N = 128;
2667
2668 auto tv0 = makeSymbolicTensor(2);
2669 fusion.addInput(tv0);
2670 auto tv1 = mul(tv0, IrBuilder::create<Double>(1));
2671 auto tvs = Welford(tv1, {1});
2672 auto tv_avg = tvs.avg;
2673 auto tv_M2 = tvs.var_sum;
2674 auto tv_N = tvs.n;
2675 fusion.addOutput(tv_avg);
2676 fusion.addOutput(tv_M2);
2677 fusion.addOutput(tv_N);
2678
2679 tv_avg->split(1, 32);
2680 tv_avg->split(0, 32);
2681 tv_avg->split(0, 4);
2682 tv_avg->reorder({{-1, -3}, {-3, -1}});
2683 tv1->computeAt(tv_avg, -1);
2684
2685 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2686 auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
2687 at::manual_seed(0);
2688 at::Tensor t0 = at::randn({M, N}, options);
2689
2690 FusionExecutor fe;
2691 fe.compileFusion(&fusion, {t0});
2692 auto outputs = fe.runFusion({t0});
2693
  // By default, Welford outputs the sum of squared differences (M2), so
  // divide by N to get the variance
2695 outputs[1] /= N;
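  // Worked example (population variance): for x = {1, 2, 3}, mean = 2 and
  // M2 = (1-2)^2 + (2-2)^2 + (3-2)^2 = 2, so M2 / N = 2 / 3, which is what
  // t0.var({1}, /*unbiased=*/false) computes.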
2696
2697 testValidate(
2698 fe.kernel(),
2699 outputs,
2700 {t0},
2701 {t0.mean({1}), t0.var({1}, false), at::ones({M}, options_int) * N},
2702 __LINE__,
2703 __FILE__);
2704}
2705
2706TEST_F(NVFuserTest, FusionBlockWelfordOp_CUDA) {
2707 Fusion fusion;
2708 FusionGuard fg(&fusion);
2709
2710 int M = 64, N = 128;
2711
2712 auto tv0 = makeSymbolicTensor(2);
2713 fusion.addInput(tv0);
2714 auto tv1 = mul(tv0, IrBuilder::create<Double>(1));
2715 auto tvs = Welford(tv1, {1});
2716 auto tv_avg = tvs.avg;
2717 auto tv_M2 = tvs.var_sum;
2718 auto tv_N = tvs.n;
2719 fusion.addOutput(tv_avg);
2720 fusion.addOutput(tv_M2);
2721 fusion.addOutput(tv_N);
2722
2723 tv_avg->axis(-1)->parallelize(ParallelType::TIDx);
2724
2725 tv1->computeAt(tv_avg, -1);
2726
2728 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2729 auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
2730 at::manual_seed(0);
2731 at::Tensor t0 = at::randn({M, N}, options);
2732 at::Tensor t_var = at::empty({M}, options);
2733 at::Tensor t_avg = at::empty({M}, options);
2734 at::Tensor t_N = at::empty({M}, options_int);
2735
2736 FusionExecutor fe;
2737 fe.compileFusion(&fusion, {t0});
2738 auto outputs = fe.runFusion({t0});
2739
  // By default, Welford outputs the sum of squared differences (M2), so
  // divide by N to get the variance
2741 outputs[1] /= N;
2742
2743 testValidate(
2744 fe.kernel(),
2745 outputs,
2746 {t0},
2747 {t0.mean({1}), t0.var({1}, false), at::ones({M}, options_int) * N},
2748 __LINE__,
2749 __FILE__);
2750}
2751
2752TEST_F(NVFuserTest, FusionGridWelfordOp_CUDA) {
2753 Fusion fusion;
2754 FusionGuard fg(&fusion);
2755
2756 int M = 64, N = 128;
2757
2758 auto tv0 = makeSymbolicTensor(2);
2759 fusion.addInput(tv0);
2760 auto tv1 = mul(tv0, IrBuilder::create<Double>(1));
2761 auto tvs = Welford(tv1, {1});
2762 auto tv_avg = tvs.avg;
2763 auto tv_M2 = tvs.var_sum;
2764 auto tv_N = tvs.n;
2765 fusion.addOutput(tv_avg);
2766 fusion.addOutput(tv_M2);
2767 fusion.addOutput(tv_N);
2768
2769 tv_avg->axis(0)->parallelize(ParallelType::TIDx);
2770 tv_avg->axis(-1)->parallelize(ParallelType::BIDx);
2771
2772 tv1->computeAt(tv_avg, -1);
2773
2774 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2775 auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
2776 at::manual_seed(0);
2777 at::Tensor t0 = at::randn({M, N}, options);
2778 at::Tensor t_avg = at::empty({M}, options);
2779 at::Tensor t_var = at::empty({M}, options);
2780 at::Tensor t_N = at::empty({M}, options_int);
2781
2782 FusionExecutor fe;
2783 fe.compileFusion(&fusion, {t0});
2784 auto outputs = fe.runFusion({t0});
2785
  // By default, Welford outputs the sum of squared differences (M2), so
  // divide by N to get the variance
2787 outputs[1] /= N;
2788
2789 testValidate(
2790 fe.kernel(),
2791 outputs,
2792 {t0},
2793 {t0.mean({1}), t0.var({1}, false), at::ones({M}, options_int) * N},
2794 __LINE__,
2795 __FILE__);
2796}
2797
2798TEST_F(NVFuserTest, FusionRfactorWelfordOp_CUDA) {
2799 Fusion fusion;
2800 FusionGuard fg(&fusion);
2801
2802 int M = 64, N = 128;
2803
2804 auto tv0 = makeSymbolicTensor(2);
2805 fusion.addInput(tv0);
2806 auto tv1 = mul(tv0, IrBuilder::create<Double>(1));
2807 auto tvs = Welford(tv1, {1});
2808 auto tv_avg = tvs.avg;
2809 auto tv_M2 = tvs.var_sum;
2810 auto tv_N = tvs.n;
2811 fusion.addOutput(tv_avg);
2812 fusion.addOutput(tv_M2);
2813 fusion.addOutput(tv_N);
2814
2815 tv_avg->split(1, 4);
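  // ir_utils::rfactorHelper is used here instead of TensorView::rFactor
  // because Welford has three sibling outputs (avg, var_sum, n) that must
  // be rfactored together so their transformations stay consistent.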
2816 ir_utils::rfactorHelper(tvs.avg, {2});
2817 tv1->computeAt(tv_avg, -1);
2818
2819 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2820 auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
2821 at::manual_seed(0);
2822 at::Tensor t0 = at::randn({M, N}, options);
2823 at::Tensor t_avg = at::empty({M}, options);
2824 at::Tensor t_var = at::empty({M}, options);
2825 at::Tensor t_N = at::empty({M}, options_int);
2826
2827 FusionExecutor fe;
2828 fe.compileFusion(&fusion, {t0});
2829 auto outputs = fe.runFusion({t0});
2830
  // By default, Welford outputs the sum of squared differences (M2), so
  // divide by N to get the variance
2832 outputs[1] /= N;
2833
2834 testValidate(
2835 fe.kernel(),
2836 outputs,
2837 {t0},
2838 {t0.mean({1}), t0.var({1}, false), at::ones({M}, options_int) * N},
2839 __LINE__,
2840 __FILE__);
2841}
2842
2843TEST_F(NVFuserTest, FusionWelfordSchedule_CUDA) {
2844 Fusion fusion;
2845 FusionGuard fg(&fusion);
2846
2847 int M = 64, N = 128;
2848
2849 auto tv0 = makeSymbolicTensor(2);
2850 fusion.addInput(tv0);
2851 auto tv1 = mul(tv0, IrBuilder::create<Double>(1));
2852 auto tvs = Welford(tv1, {1});
2853 auto tv_avg = tvs.avg;
2854 auto tv_M2 = tvs.var_sum;
2855 auto tv_N = tvs.n;
2856 fusion.addOutput(tv_avg);
2857 fusion.addOutput(tv_M2);
2858 fusion.addOutput(tv_N);
2859
2860 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2861 auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
2862 at::manual_seed(0);
2863 at::Tensor t0 = at::randn({M, N}, options);
  // TODO: Why do we use launch params from here but not the scheduling?
2865 auto reduction_params = getReductionHeuristics(&fusion, {t0});
2866 scheduleReduction(&fusion, *reduction_params);
2867
2868 auto lparams = reduction_params->lparams;
2869 FusionExecutor fe;
2870 fe.compileFusion(&fusion, {t0}, lparams);
2871 auto outputs = fe.runFusion({t0}, lparams);
2872
  // By default, Welford outputs the sum of squared differences (M2), so
  // divide by N to get the variance
2874 outputs[1] /= N;
2875
2876 auto at_avg = t0.mean({1});
2877 auto at_var = t0.var({1}, false);
2878 auto at_n = at::ones({M}, options_int) * N;
2879
2880 testValidate(
2881 fe.kernel(),
2882 outputs,
2883 {t0},
2884 {at_avg, at_var, at_n},
2885 __LINE__,
2886 __FILE__,
2887 "validate welford",
2888 reduction_params->lparams);
2889}
2890
2891namespace {
2892void testWelford(DataType dtype, int red_axis, int odim, int rdim) {
2893 const int axis = red_axis;
2894 at::ScalarType aten_dtype = data_type_to_aten(dtype);
2895
2896 Fusion fusion;
2897 FusionGuard fg(&fusion);
2898 TensorView* tv0 = makeSymbolicTensor(2, dtype);
2899 bool is_fp16 = dtype == DataType::Half;
2900 bool is_bf16 = dtype == DataType::BFloat16;
2901 TensorView* tv0_cast = tv0;
2902 if (is_fp16 || is_bf16) {
2903 tv0_cast = castOp(DataType::Float, tv0);
2904 }
2905 fusion.addInput(tv0);
2906 auto tv1 = mul(tv0_cast, IrBuilder::create<Double>(1));
2907 auto tvs = Welford(tv1, {axis});
2908 auto tv_avg = tvs.avg;
2909 auto tv_M2 = tvs.var_sum;
2910 auto tv_N = tvs.n;
2911
2912 TensorView* avg_cast = tv_avg;
2913 TensorView* M2_cast = tv_M2;
2914
2915 if (is_fp16) {
2916 avg_cast = castOp(DataType::Half, tv_avg);
2917 M2_cast = castOp(DataType::Half, tv_M2);
2918 }
2919 if (is_bf16) {
2920 avg_cast = castOp(DataType::BFloat16, tv_avg);
2921 M2_cast = castOp(DataType::BFloat16, tv_M2);
2922 }
2923
2924 fusion.addOutput(avg_cast);
2925 fusion.addOutput(M2_cast);
2926 fusion.addOutput(tv_N);
2927
2928 auto options = at::TensorOptions().dtype(aten_dtype).device(at::kCUDA, 0);
2929 auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
2930 at::manual_seed(0);
2931 std::vector<TensorView*> outputs_of_red;
2932 at::Tensor aten_input =
2933 (axis ? at::randn({odim, rdim}, options)
2934 : at::randn({rdim, odim}, options));
2935
2936 if (is_fp16 || is_bf16) {
2937 outputs_of_red.push_back(avg_cast);
2938 outputs_of_red.push_back(M2_cast);
2939 }
2940
2941 auto reduction_params = getReductionHeuristics(&fusion, {aten_input});
2942 scheduleReduction(&fusion, *reduction_params);
2943
2944 auto lparams = reduction_params->lparams;
2945
2946 FusionExecutor fe;
2947 fe.compileFusion(&fusion, {aten_input}, lparams);
2948 auto outputs = fe.runFusion({aten_input}, lparams);
2949
  // By default, Welford outputs the sum of squared differences (M2), so
  // divide by rdim to get the variance
2952
2953 outputs[1] /= rdim;
2954
2955 auto at_avg = aten_input.mean({axis});
2956 auto at_var = aten_input.var({axis}, false);
2957 auto at_n =
2958 (axis ? at::ones({odim, rdim}, options)
2959 : at::ones({rdim, odim}, options));
2960 at_n = at_n.sum({axis});
2961
2962 testValidate(
2963 fe.kernel(),
2964 outputs,
2965 {aten_input},
2966 {at_avg, at_var, at_n},
2967 __LINE__,
2968 __FILE__,
2969 "validate welford",
2970 reduction_params->lparams);
2971}
2972} // namespace
2973
2974TEST_F(NVFuserTest, FusionWelfordShmoo_CUDA) {
2975 std::vector<DataType> dtypes = {
2976 DataType::Double, DataType::Float, DataType::Half};
  // TODO: enable this for complex. Currently, complex silently yields
  // wrong results:
2979 // Detected abs error of: 3.8062
2980 // absolute tolerance was set to 2.23704e-06
2981 // and relative tolerance set to 2.23704e-08
2982#if !defined(USE_ROCM)
2983 if (at::cuda::getDeviceProperties(0)->major >= 8) {
2984 dtypes.insert(dtypes.end(), DataType::BFloat16);
2985 }
2986#endif
2987
2988 std::vector<int> red_axis = {1, 0};
2989 std::vector<int> output_dims = {160, 320};
2990 std::vector<int> red_dims;
2991
  // Cut down the number of iterations by testing only every other power
  // of 2.
2994 for (int i = 1; i <= 1024 * 1024; i <<= 2) {
2995 red_dims.push_back(i);
2996 }
2997
2998 for (auto dtype : dtypes) {
2999 for (auto& axis : red_axis) {
3000 for (auto& odim : output_dims) {
3001 for (auto& rdim : red_dims) {
        // TODO: The original Welford algorithm keeps a running sum of
        // squared differences, i.e. M_{2,n} in the notation of
        // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance,
        // which can overflow to inf for large reduction sizes in half
        // precision. Skip too-large volumes for half for now; further
        // numerical experiments may be needed to re-design this.
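        // For instance, with E[x^2] = 1 for standard-normal inputs, a raw
        // sum of squares over 65536 elements already sits near the
        // half-precision maximum of ~65504.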
3010 if (rdim > 32768 &&
3011 (dtype == DataType::Half || dtype == DataType::BFloat16)) {
3012 continue;
3013 }
3014 testWelford(dtype, axis, odim, rdim);
3015 }
3016 }
3017 }
3018 }
3019}
3020
3021namespace {
3022void testVarMean(at::ScalarType dtype, int correction, bool keepdim) {
3023 auto fusion = std::make_unique<Fusion>();
3024 FusionGuard fg(fusion.get());
3025
3026 int M = 64, N = 128;
3027
3028 auto tv0 = makeSymbolicTensor(2, aten_to_data_type(dtype));
3029 fusion->addInput(tv0);
3030 auto tvs = variance_mean(tv0, {1}, correction, keepdim);
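  // variance_mean computes var = sum((x - mean)^2) / (N - correction):
  // correction = 0 gives the population variance, correction = 1 the
  // unbiased sample variance, matching at::var_mean below.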
3031 auto tv_mean = tvs.mean;
3032 auto tv_var = tvs.var;
3033 fusion->addOutput(tv_var);
3034 fusion->addOutput(tv_mean);
3035
3036 auto options = at::TensorOptions().dtype(dtype).device(at::kCUDA, 0);
3037 at::manual_seed(0);
3038 at::Tensor t0 = at::randn({M, N}, options);
3039
3040 FusionExecutorCache executor_cache(std::move(fusion));
3041 auto outputs = executor_cache.runFusionWithInputs({t0});
3042
3043 auto at_var_mean = at::var_mean(t0, {1}, correction, keepdim);
3044 std::vector<at::Tensor> aten_outputs = {
3045 std::get<0>(at_var_mean), std::get<1>(at_var_mean)};
3046
3047 testValidate(
3048 executor_cache.fusion(), outputs, {t0}, aten_outputs, __LINE__, __FILE__);
3049}
3050} // namespace
3051
3052TEST_F(NVFuserTest, FusionVarMean_CUDA) {
3053 std::vector<at::ScalarType> dtypes = {at::kFloat, at::kDouble};
3054 std::vector<int> corrections = {0, 1};
3055 std::vector<bool> keepdims = {false, true};
3056 for (auto correction : corrections) {
3057 for (auto keepdim : keepdims) {
3058 for (auto dtype : dtypes) {
3059 testVarMean(dtype, correction, keepdim);
3060 }
3061 }
3062 }
3063}
3064
3065TEST_F(NVFuserTest, FusionSimpleGemmTransposed_CUDA) {
3066 Fusion fusion;
3067 FusionGuard fg(&fusion);
3068
3069 // Set up your input tensor views
3070
3071 TensorView* tv0 = makeSymbolicTensor(2); // K, M
3072 TensorView* tv1 = makeSymbolicTensor(2); // N, K
3073 fusion.addInput(tv0);
3074 fusion.addInput(tv1);
3075
3076 TensorView* tv0_t = transpose(tv0);
3077 TensorView* tv1_t = transpose(tv1);
3078
3079 TensorView* tv2 = broadcast(tv0_t, {false, false, true});
  // tv2[I0, I1, B] = tv0_t[I0, I1]
3081
3082 TensorView* tv3 = broadcast(tv1_t, {true, false, false});
  // tv3[B, I1, I2] = tv1_t[I1, I2]
3084
3085 // tv4[I0, I1, I2] = tv2[I0, I1, B] * tv3[B, I1, I2]
3086 TensorView* tv4 = mul(tv2, tv3);
3087 // tv5[I0, R1, I2] = tv4[I0, I1, I2]
3088 TensorView* tv5 = sum(tv4, {1});
3089 fusion.addOutput(tv5);
3090
3091 tv5->split(1, 32);
3092 // tv5[I0, R1o, R1i{32}, I2]
3093
3094 auto tv6 = tv5->rFactor({1});
3095 // tv6[I0, R1o, I1i{32}, I2] = tv4[I0, I1, I2]
3096 // tv5[I0, , R1i{32}, I2] = tv6[I0, R1o, I1i{32}, I2]
3097
3098 tv5->split(0, 4);
3099 tv5->split(-1, 4);
  // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}]
3102
3103 tv0_t->computeAt(tv5, -1);
3104 tv1_t->computeAt(tv5, -1);
3105
3106 // tv6[I0o, I0i{4}, R1o, I1i{32}, I2o, I2i{4}]
3107 // tv5[I0o, I0i{4}, , R1i{32}, I2o, I2i{4}]
  // --> (the '|' marks the compute-at position)
3109 // tv4[I0o, I0i{4}, I1i{32}, I2o, I2i{4}|, I1o]
3110 // tv6[I0o, I0i{4}, I1i{32}, I2o, I2i{4}|, R1o]
3111 // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}|]
3112
3113 tv0_t->computeAt(tv6, -1);
3114 tv1_t->computeAt(tv6, -1);
3115 // tv4[I0o, I0i{4}, I1i{32}, I2o, I2i{4}, I1o |]
3116 // tv6[I0o, I0i{4}, I1i{32}, I2o, I2i{4}, R1o |]
3117 // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}|]
3118
3119 tv5->axis(0)->parallelize(ParallelType::BIDz);
3120 tv5->axis(1)->parallelize(ParallelType::TIDz);
3121
3122 tv5->axis(-2)->parallelize(ParallelType::BIDy);
3123 tv5->axis(-1)->parallelize(ParallelType::TIDy);
3124
3125 tv5->axis(2)->parallelize(ParallelType::TIDx);
3126 tv6->axis(2)->parallelize(ParallelType::TIDx);
3127
3128 constexpr int M = 65, K = 33, N = 17;
3129
3130 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3131
3132 at::Tensor t0 = at::randn({K, M}, options);
3133 at::Tensor t1 = at::randn({N, K}, options);
3134
  // Let's specify a few bounds in launch params to make sure it works
3136 LaunchParams lparams(1, -1, -1, 32, 4, 4);
3137 FusionExecutor fe;
3138 fe.compileFusion(&fusion, {t0, t1}, lparams);
3139 fe.runFusion({t0, t1}, lparams);
3140
3141 // Don't specify any launch params
3142 auto cg_outputs = fe.runFusion({t0, t1});
3143
3144 auto aten_output = t0.t().to(at::kDouble).matmul(t1.t().to(at::kDouble));
3145
3146 testValidate(
3147 &fusion, cg_outputs, {t0, t1}, {aten_output}, __LINE__, __FILE__);
3148}
3149
3150TEST_F(NVFuserTest, FusionSoftmax3DTransposed_CUDA) {
3151 Fusion fusion;
3152 FusionGuard fg(&fusion);
3153
3154 const int tidx = 32;
3155 const int dimx = 32;
3156 const int dimy = 16;
3157 const int dimz = 130;
3158
3159 // Set up your input tensor views
3160 TensorView* input_tv0 = makeSymbolicTensor(3);
3161 fusion.addInput(input_tv0);
3162
3163 TensorView* input_t = transpose(input_tv0, 1, 2);
3164
3165 TensorView* exp_tv1 = unaryOp(UnaryOpType::Exp, input_t);
3166 TensorView* sum_exp_tv2 = sum(exp_tv1, {-1});
3167 TensorView* bcast_sum_tv3 = broadcast(sum_exp_tv2, {false, false, true});
3168
  // Replicate input_t and exp_tv1 as input_t_copy and exp_tv1_copy because
  // exp_tv1 is going to be computed at sum_exp_rf_tv5.
3171 TensorView* input_t_copy = transpose(input_tv0, 1, 2);
3172 TensorView* exp_tv1_copy = unaryOp(UnaryOpType::Exp, input_t_copy);
3173
3174 TensorView* output_tv4 = div(exp_tv1_copy, bcast_sum_tv3);
3175
3176 fusion.addOutput(output_tv4);
3177
3178 bcast_sum_tv3->split(-1, tidx);
3179
3180 sum_exp_tv2->split(-1, tidx);
3181 TensorView* sum_exp_rf_tv5 = sum_exp_tv2->rFactor({-2});
3182
3183 output_tv4->split(-1, tidx);
3184
3185 input_t->computeAt(sum_exp_rf_tv5, -1);
3186 input_t_copy->computeAt(output_tv4, -1);
3187
3188 TensorView* tensors_to_parallelize[] = {
3189 sum_exp_tv2, bcast_sum_tv3, output_tv4, sum_exp_rf_tv5};
3190
3191 for (auto tv : tensors_to_parallelize) {
3192 tv->axis(0)->parallelize(ParallelType::BIDx);
3193 tv->axis(1)->parallelize(ParallelType::BIDy);
3194 tv->axis(-1)->parallelize(ParallelType::TIDx);
3195 }
3196
3197 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3198 at::Tensor input = at::randn({dimx, dimz, dimy}, options);
3199
3200 at::Tensor cg_output = at::empty({dimx, dimy, dimz}, options);
3201
3202 FusionExecutor fe;
3203 fe.compileFusion(&fusion, {input});
3204 fe.runFusion({input}, {cg_output});
3205
3206 auto aten_input_t = at::transpose(input, 1, 2);
3207 auto aten_output = at::_softmax(aten_input_t.to(at::kDouble), -1, false);
3208
3209 testValidate(
3210 &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__);
3211}
3212
3213TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed1_CUDA) {
3214 // Case 1
3215 // tv1 = tv0 * 0.5
3216 // tv2 = tv1 * -1
3217 // tv3 = tv1 + 3
3218 // tv4 = tv1 * 2
3219 // tv5 = tv3 + tv2
3220 // tv6 = tv5 + tv4
3221 // tv7 = tv1 + tv4
3222 Fusion fusion;
3223 FusionGuard fg(&fusion);
3224
3225 TensorView* tv0 = makeSymbolicTensor(2);
3226 fusion.addInput(tv0);
3227
3228 tv0 = transpose(tv0);
3229
3230 TensorView* tv1 = mul(tv0, IrBuilder::create<Double>(0.5));
3231 TensorView* tv2 = mul(tv1, IrBuilder::create<Double>(-1.0));
3232 TensorView* tv3 = add(tv1, IrBuilder::create<Double>(3.0));
3233 TensorView* tv4 = mul(tv1, IrBuilder::create<Double>(2.0));
3234 TensorView* tv5 = add(tv3, tv2);
3235
3236 TensorView* tv6 = add(tv5, tv4);
3237 TensorView* tv7 = add(tv1, tv4);
3238
3239 fusion.addOutput(tv6);
3240 fusion.addOutput(tv7);
3241
  // Let's set up to actually run
3243 tv7->merge(0);
3244 tv7->split(0, 128);
3245 tv7->split(0, 4);
3246
3247 tv7->axis(0)->parallelize(ParallelType::BIDx);
3248
3249 tv0->computeAt(tv7, 1);
3250
  // The compute-at position of the last tensor should be zero.
3252 TORCH_CHECK(
3253 tv7->nDims() == 3 && tv7->getComputeAtPosition() == 0 &&
3254 tv7->getMaxProducerPosition() == 1);
3255 TORCH_CHECK(
3256 tv6->nDims() == 3 && tv6->getComputeAtPosition() == 0 &&
3257 tv6->getMaxProducerPosition() == 1);
  // The compute-at position of every other tensor should be 1.
3259 for (auto tv : {tv1, tv2, tv3, tv4, tv5}) {
3260 TORCH_CHECK(tv->nDims() == 3 && tv->getComputeAtPosition() == 1);
3261 }
3262
3263 for (Val* val : fusion.vals()) {
3264 if (!val->isFusionInput() &&
3265 val->getValType().value() == ValType::TensorView) {
3266 TensorView* tv = static_cast<TensorView*>(val);
3267 tv->axis(1)->parallelize(ParallelType::Unroll);
3268 tv->axis(-1)->parallelize(ParallelType::TIDx);
3269 }
3270 }
3271
3272 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3273
3274 at::Tensor aten_input = at::randn({129, 127}, options);
3275
3276 FusionExecutor fe;
3277 fe.compileFusion(&fusion, {aten_input});
3278 auto cg_outputs = fe.runFusion({aten_input});
3279
3280 at::Tensor aten_input_t = aten_input.t();
3281
3282 auto t1 = aten_input_t.mul({0.5});
3283 auto t2 = t1.mul({-1.0});
3284 auto t3 = t1.add({3.0});
3285 auto t4 = t1.mul({2.0});
3286 auto t5 = t3.add(t2);
3287 auto t6 = t5.add(t4);
3288 auto t7 = t1.add(t4);
3289
3290 std::vector<at::Tensor> aten_outputs = {t6, t7};
3291
3292 testValidate(
3293 &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
3294}
3295
3296TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed2_CUDA) {
3297 // Case 2
3298 // tv1 = tv0 * -1
3299 // tv2 = tv0 + 3
3300 // tv3 = tv0 * 2
3301 // tv4 = tv2 + tv1
3302 // tv5 = tv4 + tv3
3303 // tv6 = tv5 + tv3
3304 Fusion fusion;
3305 FusionGuard fg(&fusion);
3306
3307 TensorView* tv0 = makeSymbolicTensor(2);
3308 fusion.addInput(tv0);
3309
3310 tv0 = transpose(tv0);
3311
3312 TensorView* tv1 = mul(tv0, IrBuilder::create<Double>(-1.0));
3313 TensorView* tv2 = add(tv0, IrBuilder::create<Double>(3.0));
3314 TensorView* tv3 = mul(tv0, IrBuilder::create<Double>(2.0));
3315 TensorView* tv4 = add(tv2, tv1);
3316
3317 TensorView* tv5 = add(tv4, tv3);
3318 TensorView* tv6 = add(tv5, tv3);
3319
3320 fusion.addOutput(tv5);
3321 fusion.addOutput(tv6);
3322
  // Let's set up to actually run
3324 tv6->merge(0);
3325 tv6->split(0, 128);
3326 tv6->split(0, 4);
3327
3328 tv6->axis(0)->parallelize(ParallelType::BIDx);
3329
3330 tv0->computeAt(tv6, 1);
3331
3332 for (Val* val : fusion.vals()) {
3333 if (!val->isFusionInput() &&
3334 val->getValType().value() == ValType::TensorView) {
3335 TensorView* tv = static_cast<TensorView*>(val);
3336
3337 tv->axis(1)->parallelize(ParallelType::Unroll);
3338 tv->axis(-1)->parallelize(ParallelType::TIDx);
3339 }
3340 }
3341
3342 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3343 at::Tensor input = at::randn({129, 127}, options);
3344
3345 FusionExecutor fe;
3346 fe.compileFusion(&fusion, {input});
3347 auto cg_outputs = fe.runFusion({input});
3348
3349 auto input_t = input.t();
3350 auto t1 = input_t.mul({-1.0});
3351 auto t2 = input_t.add({3.0});
3352 auto t3 = input_t.mul({2.0});
3353 auto t4 = t2.add(t1);
3354 auto t5 = t4.add(t3);
3355 auto t6 = t5.add(t3);
3356
3357 std::vector<at::Tensor> aten_outputs = {t5, t6};
3358
3359 testValidate(&fusion, cg_outputs, {input}, aten_outputs, __LINE__, __FILE__);
3360}
3361
3362TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed3_CUDA) {
3363 // Case 3
3364 // T2 = T1 * 0.979361
3365 // T3 = T2 * T0
3366 Fusion fusion;
3367 FusionGuard fg(&fusion);
3368
3369 TensorView* tv0 = makeSymbolicTensor(4);
3370 fusion.addInput(tv0);
3371
3372 tv0 = permute(tv0, {3, 0, 1, 2});
3373
3374 TensorView* tv1 = makeSymbolicTensor(4);
3375 fusion.addInput(tv1);
3376
3377 tv1 = permute(tv1, {3, 0, 1, 2});
3378
3379 TensorView* tv2 = mul(tv1, IrBuilder::create<Double>(.979361));
3380 TensorView* tv3 = mul(tv2, tv0);
3381
3382 fusion.addOutput(tv3);
3383
  // Let's set up to actually run
3385 while (tv3->nDims() > 1)
3386 tv3->merge(0);
3387 tv3->split(0, 128);
3388 tv3->split(0, 4);
3389
3390 tv0->computeAt(tv3, 1);
3391 tv1->computeAt(tv3, 1);
3392
3393 tv3->axis(0)->parallelize(ParallelType::BIDx);
3394
3395 for (Val* val : fusion.vals()) {
3396 if (!val->isFusionInput() &&
3397 val->getValType().value() == ValType::TensorView) {
3398 TensorView* tv = static_cast<TensorView*>(val);
3399
3400 tv->axis(1)->parallelize(ParallelType::Unroll);
3401 tv->axis(-1)->parallelize(ParallelType::TIDx);
3402 }
3403 }
3404
3405 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3406 at::Tensor t0 = at::randn({129, 127, 63, 65}, options);
3407 at::Tensor t1 = at::rand_like(t0, options);
3408
3409 std::vector<IValue> aten_inputs = {t0, t1};
3410
3411 FusionExecutor fe;
3412 fe.compileFusion(&fusion, aten_inputs);
3413 auto cg_outputs = fe.runFusion(aten_inputs);
3414
3415 auto t0_t = t0.permute({3, 0, 1, 2});
3416 auto t1_t = t1.permute({3, 0, 1, 2});
3417 auto t2 = t1_t.mul({0.979361});
3418 auto aten_output = t2.mul(t0_t);
3419
3420 testValidate(
3421 &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
3422}
3423
3424TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed4_CUDA) {
3425 // Case 4
3426 // T4 = T2 - T3
3427 // T5 = T1 + T4
3428 // T6 = T5 - T0
3429 Fusion fusion;
3430 FusionGuard fg(&fusion);
3431
3432 TensorView* tv0 = makeSymbolicTensor(4);
3433 fusion.addInput(tv0);
3434
3435 tv0 = permute(tv0, {3, 0, 1, 2});
3436
3437 TensorView* tv1 = makeSymbolicTensor(4);
3438 fusion.addInput(tv1);
3439
3440 tv1 = permute(tv1, {3, 0, 1, 2});
3441
3442 TensorView* tv2 = makeSymbolicTensor(4);
3443 fusion.addInput(tv2);
3444
3445 tv2 = permute(tv2, {3, 0, 1, 2});
3446
3447 TensorView* tv3 = makeSymbolicTensor(4);
3448 fusion.addInput(tv3);
3449
3450 tv3 = permute(tv3, {3, 0, 1, 2});
3451
3452 TensorView* tv4 = sub(tv2, tv3);
3453 TensorView* tv5 = add(tv1, tv4);
3454 TensorView* tv6 = sub(tv5, tv0);
3455
3456 fusion.addOutput(tv6);
3457
  // Let's set up to actually run
3459 while (tv6->nDims() > 1)
3460 tv6->merge(0);
3461 tv6->split(0, 128);
3462 tv6->split(0, 4);
3463
3464 tv0->computeAt(tv6, 1);
3465 tv1->computeAt(tv6, 1);
3466 tv2->computeAt(tv6, 1);
3467 tv3->computeAt(tv6, 1);
3468
3469 tv6->axis(0)->parallelize(ParallelType::BIDx);
3470
3471 for (Val* val : fusion.vals()) {
3472 if (!val->isFusionInput() &&
3473 val->getValType().value() == ValType::TensorView) {
3474 TensorView* tv = static_cast<TensorView*>(val);
3475
3476 tv->axis(1)->parallelize(ParallelType::Unroll);
3477 tv->axis(-1)->parallelize(ParallelType::TIDx);
3478 }
3479 }
3480
3481 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3482 at::Tensor t0 = at::randn({129, 127, 63, 65}, options);
3483 at::Tensor t1 = at::rand_like(t0, options);
3484 at::Tensor t2 = at::rand_like(t0, options);
3485 at::Tensor t3 = at::rand_like(t0, options);
3486
3487 std::vector<IValue> aten_inputs = {t0, t1, t2, t3};
3488
3489 FusionExecutor fe;
3490 fe.compileFusion(&fusion, aten_inputs);
3491 auto cg_outputs = fe.runFusion(aten_inputs);
3492
3493 auto t0_t = t0.permute({3, 0, 1, 2});
3494 auto t1_t = t1.permute({3, 0, 1, 2});
3495 auto t2_t = t2.permute({3, 0, 1, 2});
3496 auto t3_t = t3.permute({3, 0, 1, 2});
3497 auto t4 = t2_t.sub(t3_t);
3498 auto t5 = t1_t.add(t4);
3499 auto aten_output = t5.sub(t0_t);
3500
3501 testValidate(
3502 &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
3503}
3504
3505TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed5_CUDA) {
3506 // Case 5
3507 // tv2 = tv0 + 2.0
3508 // tv3 = tv1 * tv2
3509 Fusion fusion;
3510 FusionGuard fg(&fusion);
3511
3512 // Set up your input tensor views
3513 TensorView* tv0 = makeSymbolicTensor(2);
3514 fusion.addInput(tv0);
3515 tv0 = transpose(tv0);
3516 TensorView* tv1 = makeSymbolicTensor(2);
3517 fusion.addInput(tv1);
3518 tv1 = transpose(tv1);
3519 TensorView* tv2 = add(tv0, IrBuilder::create<Double>(2.0));
3520 TensorView* tv3 = mul(tv1, tv2);
3521 fusion.addOutput(tv3);
3522
3523 tv3->merge(0);
3524 tv3->split(-1, 8);
3525 tv3->split(-1, 4);
3526
3527 tv0->computeAt(tv3, 1);
3528 tv1->computeAt(tv3, 1);
3529 tv3->axis(0)->parallelize(ParallelType::BIDx);
3530
3531 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3532 at::Tensor t0 = at::randn({63, 65}, options);
3533 at::Tensor t1 = at::rand_like(t0, options);
3534
3535 std::vector<IValue> aten_inputs = {t0, t1};
3536
3537 FusionExecutor fe;
3538 fe.compileFusion(&fusion, aten_inputs);
3539 auto cg_outputs = fe.runFusion(aten_inputs);
3540
3541 auto t2 = t0.t().add(2.0);
3542 auto aten_output = t1.t().mul(t2);
3543
3544 testValidate(
3545 &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
3546}
3547
3548TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed6_CUDA) {
3549 Fusion fusion;
3550 FusionGuard fg(&fusion);
3551
3552 TensorView* tv0 = makeSymbolicTensor(2);
3553 fusion.addInput(tv0);
3554 tv0 = transpose(tv0);
3555 TensorView* tv1 = makeSymbolicTensor(2);
3556 fusion.addInput(tv1);
3557 tv1 = transpose(tv1);
3558 TensorView* tv2 = add(tv0, IrBuilder::create<Double>(2.0));
3559 TensorView* tv3 = mul(tv1, tv2);
3560 fusion.addOutput(tv3);
3561
3562 tv2->merge(0);
3563 tv2->split(-1, 8);
3564 tv2->split(-1, 4);
3565 tv3->merge(0);
3566 tv3->split(-1, 8);
3567
3568 tv0->computeAt(tv3, 1);
3569 tv1->computeAt(tv3, 1);
3570
3571 tv3->axis(0)->parallelize(ParallelType::BIDx);
3572
3573 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3574 at::Tensor t0 = at::randn({63, 65}, options);
3575 at::Tensor t1 = at::rand_like(t0, options);
3576
3577 std::vector<IValue> aten_inputs = {t0, t1};
3578
3579 FusionExecutor fe;
3580 fe.compileFusion(&fusion, aten_inputs);
3581 auto cg_outputs = fe.runFusion(aten_inputs);
3582
3583 auto t2 = t0.t().add(2.0);
3584 auto aten_output = t1.t().mul(t2);
3585
3586 testValidate(
3587 &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
3588}
3589
3590TEST_F(NVFuserTest, FusionSegmentReducePointwise_CUDA) {
3591 auto fusion = std::make_unique<Fusion>();
3592 FusionGuard fg(fusion.get());
3593
3594 TensorView* tv0 = makeSymbolicTensor(2);
3595 TensorView* tv1 = makeSymbolicTensor(1);
3596 TensorView* tv2 = makeSymbolicTensor(2);
3597
3598 fusion->addInput(tv0);
3599 fusion->addInput(tv1);
3600 fusion->addInput(tv2);
3601
3602 TensorView* tv3 = add(tv0, IrBuilder::create<Double>(1)); // Group 0
3603 TensorView* tv4 =
3604 max(tv3, {0}); // Group 0 (use max instead to avoid numerical issues)
3605 TensorView* tv5 = add(tv4, tv1); // Group 0 (Non Broadcast after reduce,
3606 // keeps normalization scheduler away)
3607 TensorView* tv6 = add(tv5, tv2); // Group 1 (Broadcast after reduce)
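  // The post-reduction broadcast pattern in tv6 cannot be scheduled together
  // with the reduction group, so the segmenter is expected to split this
  // fusion into exactly two kernels (checked below).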
3608
3609 fusion->addOutput(tv6);
3610
3611 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3612 at::Tensor t0 = at::randn({128, 65}, options);
3613 at::Tensor t1 = at::randn({65}, options);
3614 at::Tensor t2 = at::randn({128, 65}, options);
3615
3616 auto t3 = t0.add(1.0);
3617 auto t4 = std::get<0>(at::max(t3, 0));
3618 auto t5 = t4.add(t1);
3619 auto t6 = t5.add(t2);
3620
3621 FusionExecutorCache executor_cache(std::move(fusion));
3622
3623 auto outputs = executor_cache.runFusionWithInputs({t0, t1, t2});
3624
3625 TORCH_CHECK(
3626 executor_cache.getMostRecentKernelRuntime()->isSegmented(),
3627 "segmentation didn't happen");
3628 TORCH_CHECK(
3629 executor_cache.getMostRecentKernelRuntime()
3630 ->fusionSegments()
3631 ->groups()
3632 .size() == 2,
3633 "segmentation didn't happen as expected");
3634
3635 testValidate(
3636 executor_cache.fusion(), outputs, {t0, t1, t2}, {t6}, __LINE__, __FILE__);
3637}
3638
3639TEST_F(NVFuserTest, FusionMultipleVectorize_CUDA) {
3640 auto fusion = std::make_unique<Fusion>();
3641 FusionGuard fg(fusion.get());
3642
3643 TensorView* tv0 = makeContigTensor(1);
3644 TensorView* tv1 = makeContigTensor(1);
3645
3646 fusion->addInput(tv0);
3647 fusion->addInput(tv1);
3648
3649 TensorView* tv3 = add(tv0, tv1);
3650 fusion->addOutput(tv3);
3651
3652 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3653 at::Tensor t0 = at::randn({40960}, options);
3654 at::Tensor t1 = at::randn({40960}, options);
3655 auto t2 = t0 + t1;
3656
3657 FusionExecutorCache executor_cache(std::move(fusion));
3658 executor_cache.profile(true);
3659
3660 auto outputs = executor_cache.runFusionWithInputs({t0, t1});
3661 auto runtime1 = executor_cache.getMostRecentKernelRuntime();
3662 auto log1 =
3663 executor_cache.getMostRecentExecutorInfo().params->as<PointwiseParams>();
3664 TORCH_CHECK(log1 != nullptr);
3665 TORCH_CHECK(log1->vectorize);
3666
3667 testValidate(
3668 executor_cache.fusion(), outputs, {t0, t1}, {t2}, __LINE__, __FILE__);
3669
3670 t0 = at::randn({40964}, options);
3671 t1 = at::randn({40964}, options);
3672 t2 = t0 + t1;
3673
3674 outputs = executor_cache.runFusionWithInputs({t0, t1});
3675 auto runtime2 = executor_cache.getMostRecentKernelRuntime();
3676 auto log2 =
3677 executor_cache.getMostRecentExecutorInfo().params->as<PointwiseParams>();
3678 TORCH_CHECK(log2 != nullptr);
3679 TORCH_CHECK(log2->vectorize);
3680
3681 testValidate(
3682 executor_cache.fusion(), outputs, {t0, t1}, {t2}, __LINE__, __FILE__);
3683
3684 t0 = at::randn({40962}, options);
3685 t1 = at::randn({40962}, options);
3686 t2 = t0 + t1;
3687
3688 outputs = executor_cache.runFusionWithInputs({t0, t1});
3689 auto runtime3 = executor_cache.getMostRecentKernelRuntime();
3690 auto log3 =
3691 executor_cache.getMostRecentExecutorInfo().params->as<PointwiseParams>();
3692 TORCH_CHECK(log3 != nullptr);
3693 TORCH_CHECK(log3->vectorize);
3694
3695 testValidate(
3696 executor_cache.fusion(), outputs, {t0, t1}, {t2}, __LINE__, __FILE__);
3697
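  // 40960 and 40964 are both divisible by 4, so they can reuse the same
  // 4-wide vectorized kernel; 40962 is only divisible by 2, which should
  // force a different compiled runtime.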
3698 TORCH_CHECK(runtime1 == runtime2);
3699 TORCH_CHECK(runtime1 != runtime3);
3700}
3701
3702TEST_F(NVFuserTest, FusionVectorizeSimple_CUDA) {
3703 Fusion fusion;
3704 FusionGuard fg(&fusion);
3705
3706 TensorView* tv0 = makeContigTensor(3);
3707
3708 fusion.addInput(tv0);
3709
3710 auto tv1 = unaryOp(UnaryOpType::Sin, tv0);
3711
3712 fusion.addOutput(tv1);
3713
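  // Stage the global read of tv0 and the global write of tv1 through
  // register caches; the Vectorize axes set below then apply to the
  // contiguous global accesses.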
3714 auto tv0_cache = tv0->cacheAfter();
3715
3716 auto tv1_cache = tv1->cacheBefore();
3717
3718 tv1->merge(0);
3719 tv1->merge(0);
3720 tv1->split(0, 4);
3721 tv1->split(0, 128);
3722
3723 tv1->axis(0)->parallelize(ParallelType::BIDx);
3724 tv1->axis(1)->parallelize(ParallelType::TIDx);
3725
3726 tv0->computeAt(tv1, 2);
3727
3728 tv0_cache->axis(2)->parallelize(ParallelType::Vectorize);
3729 tv1->axis(2)->parallelize(ParallelType::Vectorize);
3730
3731 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3732
3733 at::Tensor aten_input = at::empty({2, 6, 32}, options);
3734
3735 FusionExecutor fe;
3736 fe.compileFusion(&fusion, {aten_input});
3737 auto cg_outputs = fe.runFusion({aten_input});
3738
3739 at::Tensor aten_output = aten_input.sin();
3740
3741 testValidate(
3742 &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
3743}
3744
3745TEST_F(NVFuserTest, FusionSimpleVectorizeUnroll_CUDA) {
3746 Fusion fusion;
3747 FusionGuard fg(&fusion);
3748 // dimensionality of the problem
3749 int nDims = 3;
3750
3751 // Set up your input tensor views
3752 TensorView* tv0 = makeContigTensor(nDims);
3753 TensorView* tv1 = makeContigTensor(nDims);
3754
3755 // Register your inputs
3756 fusion.addInput(tv0);
3757 fusion.addInput(tv1);
3758
3759 // Do math with it; it returns a `Val*` that can be static_cast back to a
3760 // TensorView
3761 TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2.0));
3762 TensorView* tv3 = add(tv0, tv2);
3763
3764 // Register your outputs
3765 fusion.addOutput(tv3);
3766
3767 auto tv0_cache = tv0->cacheAfter();
3768 auto tv1_cache = tv1->cacheAfter();
3769 auto tv3_cache = tv3->cacheBefore();
3770
3771 // Do transformations; remember, transformations are replayed from outputs
3772 // to inputs. The statements don't have to be in this exact order.
3773 tv3->merge(1);
3774
3775 // Inner split is the vectorization width; the outer splits set up unroll/unswitch
3776 tv3->split(1, 2);
3777 tv3->split(0, 3);
3778 tv3->split(0, 1);
3779
3780 // [bidx, unswitch, unroll{3}, tidx, vectorize{2}]
3781
3782 // Parallelize TV3
3783 tv3->axis(0)->parallelize(ParallelType::BIDx);
3784 tv3->axis(1)->parallelize(ParallelType::Unswitch);
3785 tv3->axis(2)->parallelize(ParallelType::Unroll);
3786 tv3->axis(3)->parallelize(ParallelType::TIDx);
3787
3788 tv3->reorder({{4, 2}});
3789 // [bidx, unswitch, vectorize{2}, unroll{3}, tidx]
3790
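  // Replay tv3's transformations onto all other tensors in the fusion and
  // copy its parallelization.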
3791 TransformPropagatorWithCheck propagator(tv3);
3792 MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator);
3793 scheduler_utils::parallelizeAllLike(tv3);
3794
3795 tv0_cache->axis(2)->parallelize(ParallelType::Vectorize);
3796 tv1_cache->axis(2)->parallelize(ParallelType::Vectorize);
3797 tv3->axis(2)->parallelize(ParallelType::Vectorize);
3798
3799 // For all inputs, computeAt into the output inline; temporaries should be
3800 // squeezed between them
3801 tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined);
3802 tv1->computeAt(tv3, -1, ComputeAtMode::MostInlined);
3803
3804 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3805
3806 at::Tensor input1 = at::randn({64, 2, 128}, options);
3807 at::Tensor input2 = at::rand_like(input1);
3808 at::Tensor output = at::empty_like(input1);
3809
3810 FusionExecutor fe;
3811 fe.compileFusion(&fusion, {input1, input2});
3812 fe.runFusion({input1, input2}, {output});
3813
3814 at::Tensor tv2_ref = input2 + 2.0;
3815 at::Tensor output_ref = input1 + tv2_ref;
3816
3817 TORCH_CHECK(output_ref.equal(output));
3818}
3819
3820TEST_F(NVFuserTest, FusionSegmentReduceSoftmax_CUDA) {
3821 auto fusion = std::make_unique<Fusion>();
3822 FusionGuard fg(fusion.get());
3823
3824 std::vector<int64_t> input_shape{32, 64, 8};
3825 const int kReductionAxis = 1;
3826
3827 auto tv0 = TensorViewBuilder()
3828 .ndims(input_shape.size())
3829 .dtype(DataType::Double)
3830 .build();
3831
3832 fusion->addInput(tv0);
3833
3834 auto tv1 = add(tv0, IrBuilder::create<Double>(1.0));
3835 auto tv2 = sum(tv1, {2}); // Group 0
3836
3837 auto output = softmax(tv2, kReductionAxis); // Group 1
3838 fusion->addOutput(output);
3839
3840 auto options = at::TensorOptions().dtype(at::kDouble).device(at::kCUDA, 0);
3841 at::Tensor at_x = at::randn(input_shape, options);
3842
3843 FusionExecutorCache executor_cache(std::move(fusion));
3844
3845 auto outputs = executor_cache.runFusionWithInputs({at_x});
3846
3847 auto t1 = at_x.add(1.0);
3848 auto t2 = t1.sum({2});
3849 auto t3 = at::_softmax(t2.to(at::kDouble), -1, false);
3850
3851 auto optimized_fusion = executor_cache.getMostRecentKernelRuntime();
3852 TORCH_CHECK(optimized_fusion->isSegmented(), "segmentation didn't happen");
3853 TORCH_CHECK(
3854 optimized_fusion->fusionSegments()->groups().size() == 2,
3855 "segmentation didn't happen as expected");
3856
3857 testValidate(
3858 executor_cache.fusion(), outputs, {at_x}, {t3}, __LINE__, __FILE__);
3859}
3860
3861TEST_F(NVFuserTest, FusionSwizzle1_CUDA) {
3862 Fusion fusion;
3863 FusionGuard fg(&fusion);
3864
3865 auto tv0 = makeSymbolicTensor(1);
3866 fusion.addInput(tv0);
3867 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
3868 auto tv2 = mul(tv1, IrBuilder::create<Double>(2));
3869 fusion.addOutput(tv2);
3870
3871 tv2->split(0, 7);
3872 tv2->split(0, 9);
3873
3874 tv0->computeAt(tv2, 1);
3875
3876 tv2->axis(0)->parallelize(ParallelType::BIDx);
3877
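  // tv1 is written and read with transposed thread mappings; the transpose
  // swizzle permutes its shared-memory layout (roughly, to avoid bank
  // conflicts on the transposed access).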
3878 tv1->setMemoryType(MemoryType::Shared);
3879 tv1->swizzle(SwizzleType::Transpose, {1, 2});
3880
3881 tv1->axis(1)->parallelize(ParallelType::TIDx);
3882 tv1->axis(2)->parallelize(ParallelType::TIDy);
3883
3884 tv2->axis(1)->parallelize(ParallelType::TIDx);
3885 tv2->axis(2)->parallelize(ParallelType::TIDy);
3886
3887 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3888 at::Tensor t0 = at::randn({100}, options);
3889
3890 std::vector<IValue> aten_inputs = {t0};
3891
3892 FusionExecutor fe;
3893 fe.compileFusion(&fusion, aten_inputs);
3894 auto cg_outputs = fe.runFusion(aten_inputs);
3895
3896 auto aten_output = (t0 + 1) * 2;
3897
3898 testValidate(
3899 &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
3900}
3901
3902TEST_F(NVFuserTest, FusionSwizzle2_CUDA) {
3903 Fusion fusion;
3904 FusionGuard fg(&fusion);
3905
3906 auto tv0 = makeSymbolicTensor(1);
3907 fusion.addInput(tv0);
3908 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
3909 auto tv2 = mul(tv1, IrBuilder::create<Double>(2));
3910 fusion.addOutput(tv2);
3911
3912 tv1->split(-1, 4);
3913 tv1->split(-2, 4);
3914
3915 tv2->split(-1, 4);
3916 tv2->split(-2, 4);
3917
3918 tv0->computeAt(tv2, 1);
3919
3920 tv2->reorder({{-1, -2}});
3921
3922 tv1->setMemoryType(MemoryType::Shared);
3923 tv1->swizzle(SwizzleType::Transpose, {-2, -1});
3924
3925 tv2->axis(0)->parallelize(ParallelType::BIDx);
3926 tv2->axis(-1)->parallelize(ParallelType::TIDx);
3927 tv2->axis(-2)->parallelize(ParallelType::TIDy);
3928 tv1->axis(-1)->parallelize(ParallelType::TIDx);
3929 tv1->axis(-2)->parallelize(ParallelType::TIDy);
3930
3931 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3932 at::Tensor t0 = at::randn({123}, options);
3933
3934 std::vector<IValue> aten_inputs = {t0};
3935
3936 FusionExecutor fe;
3937 fe.compileFusion(&fusion, aten_inputs);
3938 auto cg_outputs = fe.runFusion(aten_inputs);
3939
3940 auto aten_output = (t0 + 1) * 2;
3941
3942 testValidate(
3943 &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
3944}
3945
3946TEST_F(NVFuserTest, FusionGridPersistence_CUDA) {
3947 Fusion fusion;
3948 FusionGuard fg(&fusion);
3949
3950 auto tv0 = makeSymbolicTensor(1);
3951 fusion.addInput(tv0);
3952
3953 auto tv1 = sum(tv0, {0});
3954 auto tv2 = broadcast(tv1, {true});
3955 auto tv3 = add(tv0, tv2);
3956 fusion.addOutput(tv3);
3957
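  // All axes are bound to grid dimensions with no block-level parallelism,
  // so the reduction and its broadcast consumer communicate across the whole
  // grid: a grid-persistent kernel.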
3958 std::vector<TensorView*> tvs = {tv1, tv2, tv3};
3959 for (auto tv : tvs) {
3960 tv->split(0, 2);
3961 tv->axis(0)->parallelize(ParallelType::BIDx);
3962 tv->axis(1)->parallelize(ParallelType::BIDy);
3963 }
3964
3965 const int numel_x = 10;
3966
3967 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3968 at::Tensor input = at::randn({numel_x}, options);
3969
3970 FusionExecutor fe;
3971 fe.compileFusion(&fusion, {input});
3972 auto out = fe.runFusion({input});
3973
3974 auto aten_output = input.sum({0}).unsqueeze(-1).add(input);
3975
3976 testValidate(&fusion, out, {input}, {aten_output}, __LINE__, __FILE__);
3977}
3978
3979TEST_F(NVFuserTest, FusionGridPersistence2_CUDA) {
3980 Fusion fusion;
3981 FusionGuard fg(&fusion);
3982
3983 auto tv0 = makeSymbolicTensor(2);
3984 fusion.addInput(tv0);
3985
3986 auto tv1 = sum(tv0, {0});
3987 auto tv2 = broadcast(tv1, {true, false});
3988 auto tv3 = add(tv0, tv2);
3989 fusion.addOutput(tv3);
3990
3991 std::vector<TensorView*> tvs = {tv1, tv2, tv3};
3992 for (auto tv : tvs) {
3993 tv->split(0, 2);
3994 tv->axis(0)->parallelize(ParallelType::BIDx);
3995 tv->axis(1)->parallelize(ParallelType::TIDy);
3996 tv->axis(2)->parallelize(ParallelType::TIDx);
3997 }
3998
3999 const int numel_x = 10;
4000 const int numel_y = 3;
4001
4002 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4003 at::Tensor input = at::randn({numel_x, numel_y}, options);
4004
4005 FusionExecutor fe;
4006 fe.compileFusion(&fusion, {input});
4007 auto out = fe.runFusion({input});
4008
4009 auto aten_output = input.sum({0}).unsqueeze(0).add(input);
4010
4011 testValidate(&fusion, out, {input}, {aten_output}, __LINE__, __FILE__);
4012}
4013
4014TEST_F(NVFuserTest, FusionWelfordPersistence_CUDA) {
4015 Fusion fusion;
4016 FusionGuard fg(&fusion);
4017
4018 auto tv0 = makeSymbolicTensor(1);
4019 fusion.addInput(tv0);
4020
4021 auto tvs = Welford(tv0, {0});
4022 auto tv4 = add(tvs.avg, tvs.var_sum);
4023 auto tv5 = broadcast(tv4, {true});
4024 auto tv6 = add(tv0, tv5);
4025 fusion.addOutput(tv6);
4026
4027 std::vector<TensorView*> schedule_tvs = {
4028 tvs.avg, tvs.var_sum, tvs.n, tv5, tv6};
4029
4030 for (auto tv : schedule_tvs) {
4031 tv->split(0, 2);
4032 tv->axis(0)->parallelize(ParallelType::BIDx);
4033 tv->axis(1)->parallelize(ParallelType::BIDy);
4034 }
4035
4036 const int numel_x = 10;
4037
4038 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4039 at::Tensor input = at::randn({numel_x}, options);
4040
4041 FusionExecutor fe;
4042 fe.compileFusion(&fusion, {input});
4043 auto out = fe.runFusion({input});
4044
4045 auto aten_output = (input.mean({0}) + (input.var({0}, false) * numel_x))
4046 .unsqueeze(-1)
4047 .add(input);
4048
4049 testValidate(&fusion, out, {input}, {aten_output}, __LINE__, __FILE__);
4050}
4051
4052TEST_F(NVFuserTest, FusionWelfordPersistence2_CUDA) {
4053 Fusion fusion;
4054 FusionGuard fg(&fusion);
4055
4056 auto tv0 = makeSymbolicTensor(2);
4057 fusion.addInput(tv0);
4058
4059 auto tvs = Welford(tv0, {0});
4060 auto tv4 = add(tvs.avg, tvs.var_sum);
4061 auto tv5 = broadcast(tv4, {true, false});
4062 auto tv6 = add(tv0, tv5);
4063 fusion.addOutput(tv6);
4064
4065 std::vector<TensorView*> schedule_tvs = {
4066 tvs.avg, tvs.var_sum, tvs.n, tv5, tv6};
4067 for (auto tv : schedule_tvs) {
4068 tv->split(0, 2);
4069 tv->axis(0)->parallelize(ParallelType::BIDx);
4070 tv->axis(1)->parallelize(ParallelType::TIDy);
4071 tv->axis(2)->parallelize(ParallelType::TIDx);
4072 }
4073 tv4->axis(0)->parallelize(ParallelType::TIDx);
4074
4075 const int numel_x = 10;
4076 const int numel_y = 3;
4077
4078 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4079 at::Tensor input = at::randn({numel_x, numel_y}, options);
4080
4081 FusionExecutor fe;
4082 fe.compileFusion(&fusion, {input});
4083 auto out = fe.runFusion({input});
4084
4085 auto aten_output = (input.mean({0}) + (input.var({0}, false) * numel_x))
4086 .unsqueeze(0)
4087 .add(input);
4088
4089 testValidate(&fusion, out, {input}, {aten_output}, __LINE__, __FILE__);
4090}
4091
4092TEST_F(NVFuserTest, FusionIssue633_CUDA) {
4093 Fusion fusion;
4094 FusionGuard fg(&fusion);
4095
4096 const int dx = 10;
4097 const int dy = 11;
4098 const int dz = 12;
4099
4100 auto tv0 = makeConcreteTensor({dx, dy, dz});
4101 fusion.addInput(tv0);
4102 auto tv1 = makeConcreteTensor({dx, dy, 1});
4103 fusion.addInput(tv1);
4104 auto tv2 = add(tv0, tv1);
4105 fusion.addOutput(tv2);
4106
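  // tv1's innermost dimension has extent 1; merging it into the other dims
  // exercises the indexing/predication path from issue #633.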
4107 tv2->merge(1);
4108 tv2->merge(0);
4109 tv2->split(-1, 128);
4110
4111 tv2->axis(0)->parallelize(ParallelType::BIDx);
4112 tv2->axis(1)->parallelize(ParallelType::TIDx);
4113
4114 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4115 at::Tensor t0 = at::randn({dx, dy, dz}, options);
4116 at::Tensor t1 = at::randn({dx, dy, 1}, options);
4117 std::vector<IValue> aten_inputs = {t0, t1};
4118
4119 FusionExecutor fe;
4120 fe.compileFusion(&fusion, aten_inputs);
4121 auto cg_outputs = fe.runFusion(aten_inputs);
4122
4123 auto aten_output = t0 + t1;
4124
4125 testValidate(
4126 &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
4127}
4128
4129TEST_F(NVFuserTest, FusionBroadcastAcrossComputeAt_CUDA) {
4130 Fusion fusion;
4131 FusionGuard fg(&fusion);
4132
4133 std::vector<int64_t> shape{17, 19};
4134
4135 auto tv0 = makeSymbolicTensor(1);
4136 fusion.addInput(tv0);
4137 auto tv1 = makeSymbolicTensor(2);
4138 fusion.addInput(tv1);
4139 auto tv2 = broadcast(tv0, {false, true});
4140 auto tv3 = add(tv1, tv2);
4141 fusion.addOutput(tv3);
4142
4143 tv3->split(1, 128);
4144 tv0->computeAt(tv3, 2);
4145
4146 for (auto tv : {tv2, tv3}) {
4147 tv->axis(-1)->parallelize(ParallelType::TIDx);
4148 }
4149
4150 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4151 at::Tensor t0 = at::randn({shape[0]}, options);
4152 at::Tensor t1 = at::randn(shape, options);
4153 std::vector<IValue> aten_inputs = {t0, t1};
4154
4155 FusionExecutor fe;
4156 fe.compileFusion(&fusion, aten_inputs);
4157 auto cg_outputs = fe.runFusion(aten_inputs);
4158
4159 auto t3 = t0.unsqueeze(-1).expand(shape) + t1;
4160
4161 testValidate(&fusion, cg_outputs, aten_inputs, {t3}, __LINE__, __FILE__);
4162}
4163
4164TEST_F(NVFuserTest, FusionVectorizeMisalignedPointwise_CUDA) {
4165 Fusion fusion;
4166 FusionGuard fg(&fusion);
4167
4168 auto tv0 = makeContigTensor(2);
4169 auto tv1 = makeContigTensor(2);
4170 fusion.addInput(tv0);
4171 fusion.addInput(tv1);
4172
4173 auto tv2 = add(tv0, tv1);
4174 fusion.addOutput(tv2);
4175
4176 const int kTDX = 64;
4177 const int kVecSize = 4;
4178 const int kNumElems = kTDX * kVecSize;
4179
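  // MisalignedVectorize relaxes Vectorize's base-address alignment
  // requirement. A minimal sketch of the idea (hypothetical shape of the
  // generated code, not actual codegen output):
  //   for (i = 0;    i < head; ++i)    out[i] = in[i];        // scalar head
  //   for (i = head; i < tail; i += 4) vec4(out + i, in + i); // aligned body
  //   for (i = tail; i < n;    ++i)    out[i] = in[i];        // scalar tail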
4180 tv2->split(1, kNumElems);
4181
4182 auto c0 = tv0->cacheAfter();
4183 auto c1 = tv1->cacheAfter();
4184 auto c2 = tv2->cacheBefore();
4185
4186 tv2->split(-1, kVecSize);
4187
4188 c0->computeAt(tv2, -2);
4189 c1->computeAt(tv2, -2);
4190
4191 c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
4192 c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
4193
4194 tv2->axis(0)->parallelize(ParallelType::BIDx);
4195 tv2->axis(-2)->parallelize(ParallelType::TIDx);
4196 tv2->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
4197
4198 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4199 const int bx = 128;
4200 const int by = 457;
4201 at::Tensor t0 = at::randn({bx, by}, options);
4202 at::Tensor t1 = at::randn({bx, by}, options);
4203
4204 std::vector<IValue> aten_inputs = {t0, t1};
4205
4206 FusionExecutor fe;
4207 fe.compileFusion(&fusion, aten_inputs);
4208 auto cg_outputs = fe.runFusion(aten_inputs);
4209
4210 auto aten_output = t0 + t1;
4211 testValidate(
4212 &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
4213}
4214
4215TEST_F(NVFuserTest, FusionVectorizeMisalignedPointwiseMergeContig_CUDA) {
4216 Fusion fusion;
4217 FusionGuard fg(&fusion);
4218
4219 auto tv0 = makeContigTensor(4);
4220 auto tv1 = makeContigTensor(4);
4221 fusion.addInput(tv0);
4222 fusion.addInput(tv1);
4223
4224 auto tv2 = add(tv0, tv1);
4225 fusion.addOutput(tv2);
4226
4227 tv2->reorder({{0, 1}, {1, 0}});
4228 tv2->merge(-2);
4229
4230 const int kTDX = 64;
4231 const int kVecSize = 2;
4232 const int kNumElems = kTDX * kVecSize;
4233
4234 tv2->split(-1, kNumElems);
4235
4236 auto c0 = tv0->cacheAfter();
4237 auto c1 = tv1->cacheAfter();
4238 auto c2 = tv2->cacheBefore();
4239
4240 tv2->split(0, 128);
4241 tv2->split(-1, kVecSize);
4242
4243 c0->computeAt(tv2, -2);
4244 c1->computeAt(tv2, -2);
4245
4246 c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
4247 c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
4248
4249 tv2->axis(0)->parallelize(ParallelType::BIDx);
4250 tv2->axis(1)->parallelize(ParallelType::BIDy);
4251 tv2->axis(-2)->parallelize(ParallelType::TIDx);
4252 tv2->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
4253
4254 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4255 const int n = 32;
4256 const int c = 127;
4257 const int h = 51;
4258 const int w = 23;
4259 at::Tensor t0 = at::randn({n, c, h, w}, options);
4260 at::Tensor t1 = at::randn({n, c, h, w}, options);
4261
4262 std::vector<IValue> aten_inputs = {t0, t1};
4263
4264 FusionExecutor fe;
4265 fe.compileFusion(&fusion, aten_inputs);
4266 auto cg_outputs = fe.runFusion(aten_inputs);
4267
4268 auto aten_output = t0 + t1;
4269 testValidate(
4270 &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
4271}
4272
4273TEST_F(NVFuserTest, FusionVectorizeMisalignedPointwiseMergeSymbolicPass_CUDA) {
4274 Fusion fusion;
4275 FusionGuard fg(&fusion);
4276
4277 constexpr int kNumDims = 4;
4278 constexpr int kTDX = 64;
4279 constexpr int kVecSize = 2;
4280 constexpr int kNumElems = kTDX * kVecSize;
4281
4282 auto tv0 = makeSymbolicTensor(kNumDims);
4283 auto tv1 = makeSymbolicTensor(kNumDims);
4284 fusion.addInput(tv0);
4285 fusion.addInput(tv1);
4286
4287 auto tv2 = add(tv0, tv1);
4288 fusion.addOutput(tv2);
4289
4290 // Create caches for vectorization
4291 auto c0 = tv0->cacheAfter();
4292 auto c1 = tv1->cacheAfter();
4293 auto c2 = tv2->cacheBefore();
4294
4295 // Merge all dimensions together except inner-most dim
4296 for (const auto idx : c10::irange(kNumDims - 2)) {
4297 tv2->merge(0);
4298 }
4299 // Split inner-most dim
4300 tv2->split(-1, kNumElems);
4301 tv2->split(-1, kVecSize);
4302 TransformPropagatorWithCheck propagator(tv2);
4303 MaxRootDomainInfoSpanningTree(tv2).traverse(&propagator);
4304
4305 c0->computeAt(tv2, -2);
4306 c1->computeAt(tv2, -2);
4307
4308 // Parallelization Strategy
4309 c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
4310 c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
4311
4312 tv2->axis(0)->parallelize(ParallelType::BIDx);
4313 tv2->axis(2)->parallelize(ParallelType::TIDx);
4314 tv2->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
4315
4316 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4317 const int n = 5;
4318 const int c = 3;
4319 const int h = 51;
4320 const int w = 257;
4321 at::Tensor t0 = at::randn({n, c, h, w}, options);
4322 at::Tensor t1 = at::randn({n, c, h, w}, options);
4323
4324 std::vector<IValue> aten_inputs = {t0, t1};
4325
4326 FusionExecutor fe;
4327 fe.compileFusion(&fusion, aten_inputs);
4328 auto cg_outputs = fe.runFusion(aten_inputs);
4329
4330 auto aten_output = t0 + t1;
4331 testValidate(
4332 &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
4333}
4334
4335TEST_F(NVFuserTest, FusionVectorizeMisalignedPointwiseMergeSymbolicFail_CUDA) {
4336 Fusion fusion;
4337 FusionGuard fg(&fusion);
4338
4339 constexpr int kNumDims = 4;
4340 constexpr int kTDX = 64;
4341 constexpr int kVecSize = 2;
4342 constexpr int kNumElems = kTDX * kVecSize;
4343 std::vector<int64_t> bcast_shape{1, 1, 1, -1};
4344
4345 auto tv0 = makeContigTensor(kNumDims);
4346 auto tv1 = TensorViewBuilder().shape(bcast_shape).build();
4347 fusion.addInput(tv0);
4348 fusion.addInput(tv1);
4349
4350 auto tv2 = add(tv0, tv1);
4351 fusion.addOutput(tv2);
4352
4353 // Create caches for vectorization
4354 auto c0 = tv0->cacheAfter();
4355 auto c1 = tv1->cacheAfter();
4356 auto c2 = tv2->cacheBefore();
4357
4358 // Merge all dimensions together
4359 // Backward merge order is necessary for vectorize validation
4360 for (int idx = kNumDims - 1; idx > 0; --idx) {
4361 tv2->merge(idx - 1);
4362 }
4363 tv2->split(-1, kNumElems);
4364 tv2->split(-1, kVecSize);
4365 TransformPropagatorWithCheck propagator(tv2);
4366 MaxRootDomainInfoSpanningTree(tv2).traverse(&propagator);
4367
4368 c0->computeAt(tv2, -2);
4369 c1->computeAt(tv2, -2);
4370
4371 // Parallelization Strategy
4372 c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
4373 c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
4374
4375 tv2->axis(0)->parallelize(ParallelType::BIDx);
4376 tv2->axis(1)->parallelize(ParallelType::TIDx);
4377 tv2->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
4378
4379 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4380 const int n = 32;
4381 const int c = 128;
4382 const int h = 51;
4383 const int w = 23;
4384 at::Tensor t0 = at::randn({n, c, h, w}, options);
4385 at::Tensor t1 = at::randn({1, 1, 1, w}, options);
4386
4387 std::vector<IValue> aten_inputs = {t0, t1};
4388
4389 FusionExecutor fe;
4390 // TODO: throw assertion - cannot merge non-contiguous vectorization axes
4391 // Make sure compilation fails
4392 // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
4393 ASSERT_ANY_THROW(fe.compileFusion(&fusion));
4394}
4395
4396TEST_F(NVFuserTest, FusionVectorizeMisalignedRFactor_CUDA) {
4397 Fusion fusion;
4398 FusionGuard fg(&fusion);
4399
4400 auto tv0 = makeContigTensor(2);
4401 auto tv1 = makeContigTensor(2);
4402
4403 fusion.addInput(tv0);
4404 fusion.addInput(tv1);
4405
4406 auto tv2 = add(tv0, tv1);
4407
4408 auto tv3 = sum(tv2, {-1});
4409
4410 fusion.addOutput(tv3);
4411
4412 auto c0 = tv0->cacheAfter();
4413 auto c1 = tv1->cacheAfter();
4414
4415 tv3->split(-1, 128 * 4);
4416 tv3->split(-1, 4);
4417 // Reduce outer dim first
4418 auto tv4 = tv3->rFactor({-3, -1});
4419 // tv3 will do the cross-thread (TIDx) reduction
4420
4421 tv0->computeAt(tv3, 1);
4422 tv1->computeAt(tv3, 1);
4423
4424 tv3->axis(0)->parallelize(ParallelType::BIDx);
4425
4426 tv0->computeAt(tv4, -2);
4427 tv1->computeAt(tv4, -2);
4428
4429 c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
4430 c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
4431
4432 tv4->axis(-2)->parallelize(ParallelType::TIDx);
4433 tv3->axis(1)->parallelize(ParallelType::TIDx);
4434
4435 tv2->computeAt(tv4, -1);
4436
4437 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4438 const int bx = 128;
4439 const int by = 2050;
4440 at::Tensor t0 = at::randn({bx, by}, options);
4441 at::Tensor t1 = at::randn({bx, by}, options);
4442
4443 std::vector<IValue> aten_inputs = {t0, t1};
4444
4445 FusionExecutor fe;
4446 fe.compileFusion(&fusion, aten_inputs);
4447 auto cg_outputs = fe.runFusion(aten_inputs);
4448
4449 auto aten_output = t0.add(t1).sum(1);
4450 testValidate(
4451 &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
4452}
4453
4454TEST_F(NVFuserTest, FusionVectorizeMisalignedWrongDimFail_CUDA) {
4455 Fusion fusion;
4456 FusionGuard fg(&fusion);
4457
4458 auto tv0 = makeContigTensor(2);
4459 auto tv1 = makeContigTensor(2);
4460
4461 fusion.addInput(tv0);
4462 fusion.addInput(tv1);
4463
4464 auto tv2 = add(tv0, tv1);
4465 fusion.addOutput(tv2);
4466
4467 tv2->split(1, 16);
4468 tv2->split(1, 64);
4469
4470 tv2->axis(0)->parallelize(ParallelType::BIDx);
4471 tv2->axis(2)->parallelize(ParallelType::TIDx);
4472
4473 auto c0 = tv0->cacheAfter();
4474 auto c1 = tv1->cacheAfter();
4475 auto c2 = tv2->cacheBefore();
4476
4477 c0->computeAt(tv2, -2);
4478 c1->computeAt(tv2, -2);
4479
4480 std::vector<TensorView*> vectorized_tvs = {c0, c1, tv2};
4481 for (auto tv : vectorized_tvs) {
4482 tv->split(-1, 4);
4483 // Vectorize the wrong dimension
4484 tv->axis(-2)->parallelize(ParallelType::MisalignedVectorize);
4485 }
4486
4487 FusionExecutor fe;
4488 // Make sure compilation fails
4489 // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
4490 ASSERT_ANY_THROW(fe.compileFusion(&fusion));
4491}
4492
4493TEST_F(NVFuserTest, FusionVectorizeMisalignedStride_CUDA) {
4494 Fusion fusion;
4495 FusionGuard fg(&fusion);
4496
4497 auto tv0 = makeSymbolicTensor(2);
4498 auto tv1 = makeSymbolicTensor(2);
4499
4500 fusion.addInput(tv0);
4501 fusion.addInput(tv1);
4502
4503 auto tv2 = add(tv0, tv1);
4504 fusion.addOutput(tv2);
4505
4506 const int kTDX = 64;
4507 const int kVecSize = 4;
4508 const int kNumElems = kTDX * kVecSize;
4509
4510 tv2->split(1, kNumElems);
4511
4512 auto c0 = tv0->cacheAfter();
4513 auto c1 = tv1->cacheAfter();
4514
4515 tv2->split(-1, kVecSize);
4516
4517 c0->computeAt(tv2, -2);
4518 c1->computeAt(tv2, -2);
4519
4520 c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
4521 c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
4522
4523 tv2->axis(0)->parallelize(ParallelType::BIDx);
4524 tv2->axis(-2)->parallelize(ParallelType::TIDx);
4525
4526 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4527 const int bx = 128;
4528 const int by = 2049;
4529 at::Tensor t0 = at::randn({bx, by}, options).index({"...", Slice(3)});
4530 at::Tensor t1 = at::randn({bx, by}, options).index({"...", Slice(3)});
4531 std::vector<IValue> aten_inputs = {t0, t1};
4532
4533 FusionExecutor fe;
4534 fe.compileFusion(&fusion, aten_inputs);
4535 auto cg_outputs = fe.runFusion(aten_inputs);
4536
4537 auto aten_output = t0 + t1;
4538 testValidate(
4539 &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
4540}
4541
4542TEST_F(NVFuserTest, FusionVectorizeMisalignedStrideFail_CUDA) {
4543 Fusion fusion;
4544 FusionGuard fg(&fusion);
4545
4546 auto tv0 = makeSymbolicTensor(2);
4547 auto tv1 = makeSymbolicTensor(2);
4548
4549 fusion.addInput(tv0);
4550 fusion.addInput(tv1);
4551
4552 auto tv2 = add(tv0, tv1);
4553 fusion.addOutput(tv2);
4554
4555 const int kTDX = 64;
4556 const int kVecSize = 4;
4557 const int kNumElems = kTDX * kVecSize;
4558
4559 tv2->split(1, kNumElems);
4560
4561 auto c0 = tv0->cacheAfter();
4562 auto c1 = tv1->cacheAfter();
4563 auto c2 = tv2->cacheBefore();
4564
4565 tv2->split(-1, kVecSize);
4566
4567 c0->computeAt(tv2, -2);
4568 c1->computeAt(tv2, -2);
4569
4570 c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
4571 c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
4572
4573 tv2->axis(0)->parallelize(ParallelType::BIDx);
4574 tv2->axis(-2)->parallelize(ParallelType::TIDx);
4575 tv2->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
4576
4577 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4578 const int bx = 128;
4579 const int by = 2049;
4580 at::Tensor t0 = at::randn({bx, by}, options).index({"...", Slice(3)});
4581 at::Tensor t1 = at::randn({bx, by}, options).index({"...", Slice(3)});
4582 std::vector<IValue> aten_inputs = {t0, t1};
4583
4584 FusionExecutor fe;
4585 fe.compileFusion(&fusion, aten_inputs);
4586
4587 // Failure because the input and output tensors do not have the same stride
4588 // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
4589 ASSERT_ANY_THROW(fe.runFusion(aten_inputs));
4590}
4591
4592TEST_F(NVFuserTest, FusionVectorization1_CUDA) {
4593 Fusion fusion;
4594 FusionGuard fg(&fusion);
4595
4596 auto tv0 = makeSymbolicTensor(2);
4597
4598 auto tv1 = makeSymbolicTensor(2);
4599 fusion.addInput(tv0);
4600 fusion.addInput(tv1);
4601
4602 auto tv2 = add(tv0, tv1);
4603 fusion.addOutput(tv2);
4604
4605 tv2->split(1, 16);
4606 tv2->split(1, 64);
4607
4608 tv2->axis(0)->parallelize(ParallelType::BIDx);
4609 tv2->axis(2)->parallelize(ParallelType::TIDx);
4610
4611 auto c0 = tv0->cacheAfter();
4612 auto c1 = tv1->cacheAfter();
4613 auto c2 = tv2->cacheBefore();
4614
4615 c0->computeAt(tv2, -2);
4616 c1->computeAt(tv2, -2);
4617
4618 std::vector<TensorView*> vectorized_tvs = {c0, c1, tv2};
4619 for (auto tv : vectorized_tvs) {
4620 tv->split(-1, 4);
4621 tv->axis(-1)->parallelize(ParallelType::Vectorize);
4622 }
4623
4624 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4625 const int bx = 128;
4626 const int by = 2048;
4627 at::Tensor t0 = at::randn({bx, by}, options);
4628 at::Tensor t1 = at::randn({bx, by}, options);
4629
4630 std::vector<IValue> aten_inputs = {t0, t1};
4631
4632 FusionExecutor fe;
4633 fe.compileFusion(&fusion, aten_inputs);
4634 auto cg_outputs = fe.runFusion(aten_inputs);
4635
4636 auto aten_output = t0 + t1;
4637 testValidate(
4638 &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
4639}
4640
4641TEST_F(NVFuserTest, FusionVectorization2_CUDA) {
4642 Fusion fusion;
4643 FusionGuard fg(&fusion);
4644
4645 auto tv0 = makeSymbolicTensor(2);
4646
4647 auto tv1 = makeSymbolicTensor(2);
4648 fusion.addInput(tv0);
4649 fusion.addInput(tv1);
4650
4651 auto tv2 = add(tv0, tv1);
4652 fusion.addOutput(tv2);
4653
4654 tv2->split(1, 16);
4655 tv2->split(1, 64);
4656
4657 tv2->axis(0)->parallelize(ParallelType::BIDx);
4658 tv2->axis(2)->parallelize(ParallelType::TIDx);
4659
4660 auto c0 = tv0->cacheAfter();
4661 auto c1 = tv1->cacheAfter();
4662 auto c2 = tv2->cacheBefore();
4663
4664 c0->computeAt(tv2, -2);
4665 c1->computeAt(tv2, -2);
4666
4667 std::vector<TensorView*> vectorized_tvs = {c0, c1, tv2};
4668 for (auto tv : vectorized_tvs) {
4669 tv->split(-1, 4);
4670 // Vectorize the wrong dimension
4671 tv->axis(-2)->parallelize(ParallelType::Vectorize);
4672 }
4673
4674 FusionExecutor fe;
4675 // Make sure compilation fails
4676 // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
4677 ASSERT_ANY_THROW(fe.compileFusion(&fusion));
4678}
4679
4680TEST_F(NVFuserTest, FusionVectorization3_CUDA) {
4681 Fusion fusion;
4682 FusionGuard fg(&fusion);
4683
4684 auto tv0 = makeSymbolicTensor(2);
4685
4686 auto tv1 = makeSymbolicTensor(2);
4687 fusion.addInput(tv0);
4688 fusion.addInput(tv1);
4689
4690 auto tv2 = add(tv0, tv1);
4691 fusion.addOutput(tv2);
4692
4693 tv2->split(1, 16);
4694 tv2->split(1, 64);
4695
4696 tv2->axis(0)->parallelize(ParallelType::BIDx);
4697 tv2->axis(2)->parallelize(ParallelType::TIDx);
4698
4699 auto c0 = tv0->cacheAfter();
4700 auto c1 = tv1->cacheAfter();
4701 auto c2 = tv2->cacheBefore();
4702
4703 c0->computeAt(tv2, -2);
4704 c1->computeAt(tv2, -2);
4705
4706 std::vector<TensorView*> vectorized_tvs = {c0, c1, tv2};
4707 for (auto tv : vectorized_tvs) {
4708 tv->split(-1, 4);
4709 tv->axis(-1)->parallelize(ParallelType::Vectorize);
4710 }
4711
4712 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4713 const int bx = 128;
4714 const int by = 2049;
4715 at::Tensor t0 = at::randn({bx, by}, options);
4716 at::Tensor t1 = at::randn({bx, by}, options);
4717 std::vector<IValue> aten_inputs = {t0, t1};
4718
4719 FusionExecutor fe;
4720 fe.compileFusion(&fusion, aten_inputs);
4721 // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
4722 ASSERT_ANY_THROW(fe.runFusion(aten_inputs));
4723
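  // by = 2049 is not divisible by the vector width 4, so this run must
  // throw. Offsetting the base pointer by one element (below) breaks the
  // 16-byte alignment of the 4-wide accesses, so that run throws as well.
  // Slicing from offset 4 keeps the base aligned and the extent (2044)
  // divisible by 4, so the final run succeeds.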
4724 aten_inputs[0] = t0.index({"...", Slice(1)});
4725 aten_inputs[1] = t1.index({"...", Slice(1)});
4726 // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
4727 ASSERT_ANY_THROW(fe.runFusion(aten_inputs));
4728
4729 t0 = at::randn({bx, 2048}, options).index({"...", Slice(4)});
4730 t1 = at::randn({bx, 2048}, options).index({"...", Slice(4)});
4731 aten_inputs = {t0, t1};
4732 auto cg_outputs = fe.runFusion(aten_inputs);
4733
4734 auto aten_output = t0 + t1;
4735 testValidate(
4736 &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
4737}
4738
4739TEST_F(NVFuserTest, FusionVectorizationRFactor_CUDA) {
4740 Fusion fusion;
4741 FusionGuard fg(&fusion);
4742
4743 auto tv0 = makeSymbolicTensor(2);
4744
4745 auto tv1 = makeSymbolicTensor(2);
4746 fusion.addInput(tv0);
4747 fusion.addInput(tv1);
4748
4749 auto tv2 = add(tv0, tv1);
4750
4751 auto tv3 = sum(tv2, {-1});
4752
4753 fusion.addOutput(tv3);
4754
4755 tv3->split(-1, 128 * 4);
4756 tv3->split(-1, 4);
4757 // Reduce outer dim first
4758 auto tv4 = tv3->rFactor({-3, -1});
4759 // tv3 will do the cross-thread (TIDx) reduction
4760
4761 auto tv6 = tv0->cacheAfter();
4762 auto tv7 = tv1->cacheAfter();
4763
4764 tv0->computeAt(tv3, 1);
4765 tv1->computeAt(tv3, 1);
4766
4767 tv3->axis(0)->parallelize(ParallelType::BIDx);
4768
4769 tv0->computeAt(tv4, -2);
4770 tv1->computeAt(tv4, -2);
4771
4772 tv6->axis(-1)->parallelize(ParallelType::Vectorize);
4773 tv7->axis(-1)->parallelize(ParallelType::Vectorize);
4774
4775 tv4->axis(-2)->parallelize(ParallelType::TIDx);
4776 tv3->axis(1)->parallelize(ParallelType::TIDx);
4777
4778 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4779 const int bx = 128;
4780 const int by = 2048;
4781 at::Tensor t0 = at::randn({bx, by}, options);
4782 at::Tensor t1 = at::randn({bx, by}, options);
4783
4784 std::vector<IValue> aten_inputs = {t0, t1};
4785
4786 FusionExecutor fe;
4787 fe.compileFusion(&fusion, aten_inputs);
4788 auto cg_outputs = fe.runFusion(aten_inputs);
4789
4790 auto aten_output = t0.add(t1).sum(1);
4791 testValidate(
4792 &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
4797}
4798
4799// Unswitched loops with extent one may omit the else clause.
4800TEST_F(NVFuserTest, FusionSizeOneLoop1_CUDA) {
4801 Fusion fusion;
4802 FusionGuard fg(&fusion);
4803
4804 // Progressively broadcast tensors
4805 TensorView* tv0 = makeSymbolicTensor(1);
4806 fusion.addInput(tv0);
4807 TensorView* tv1 = makeSymbolicTensor(2);
4808 fusion.addInput(tv1);
4809 TensorView* tv2 = makeSymbolicTensor(3);
4810 fusion.addInput(tv2);
4811
4812 TensorView* tv3 = broadcast(tv0, {false, true});
4813 TensorView* tv4 = add(tv3, tv1);
4814 TensorView* tv5 = add(tv4, tv2);
4815
4816 fusion.addOutput(tv5);
4817
4818 // Split inner dimension
4819 tv5->split(1, 8);
4820 // Merge middle dims with outer dimensions
4821 tv5->merge(2);
4822 tv5->merge(0);
4823
4824 // tv5[I0*I1o, I1i*I2]
4825 // Get a dim of size 1 to unswitch
4826 tv5->split(0, 1, false);
4827
4828 // Compute everything inline
4829 tv0->computeAt(tv5, -1);
4830
4831 tv5->axis(0)->parallelize(ParallelType::Unswitch);
4832 tv5->axis(1)->parallelize(ParallelType::BIDx);
4833 tv5->axis(2)->parallelize(ParallelType::TIDx);
4834
4835 // Make sure the unswitched loop does not have an else clause.
4836 GpuLower gpulw(&fusion);
4837 TORCH_CHECK(!UnswitchInElseChecker::check(gpulw));
4838
4839 const int x = 11;
4840 const int y = 12;
4841 const int z = 13;
4842 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4843 at::Tensor t0 = at::randn({x}, options);
4844 at::Tensor t1 = at::randn({x, y}, options);
4845 at::Tensor t2 = at::randn({z, x, y}, options);
4846 std::vector<IValue> aten_inputs = {t0, t1, t2};
4847
4848 FusionExecutor fe;
4849 fe.compileFusion(&fusion, aten_inputs);
4850 auto cg_outputs = fe.runFusion(aten_inputs);
4851 auto t6 = (t0.unsqueeze(-1) + t1).unsqueeze(0) + t2;
4852
4853 testValidate(&fusion, cg_outputs, aten_inputs, {t6}, __LINE__, __FILE__);
4854}
4855
4856// The unswitched loop has extent one but inner loops don't. The else
4857// part should not be omitted.
4858TEST_F(NVFuserTest, FusionSizeOneLoop2_CUDA) {
4859 Fusion fusion;
4860 FusionGuard fg(&fusion);
4861
4862 const int x = 15;
4863 auto tv0 = makeConcreteTensor({x});
4864 fusion.addInput(tv0);
4865
4866 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
4867 fusion.addOutput(tv1);
4868
4869 tv1->split(-1, 4);
4870 tv1->split(-2, 1);
4871
4872 tv1->axis(-2)->parallelize(ParallelType::Unswitch);
4873
4874 // Make sure the size-one unswitched loop does not omit the else clause.
4875 GpuLower gpulw(&fusion);
4876 TORCH_CHECK(UnswitchInElseChecker::check(gpulw));
4877
4878 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4879 at::Tensor t0 = at::randn({x}, options);
4880 std::vector<IValue> aten_inputs = {t0};
4881
4882 FusionExecutor fe;
4883 fe.compileFusion(&fusion, aten_inputs);
4884 auto cg_outputs = fe.runFusion(aten_inputs);
4885 auto t1 = t0 + 1;
4886
4887 testValidate(&fusion, cg_outputs, aten_inputs, {t1}, __LINE__, __FILE__);
4888}
4889
4890TEST_F(NVFuserTest, FusionValidateParallelize1_CUDA) {
4891 Fusion fusion;
4892 FusionGuard fg(&fusion);
4893
4894 auto tv0 = makeSymbolicTensor(1);
4895 fusion.addInput(tv0);
4896
4897 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
4898 auto tv2 = add(tv1, IrBuilder::create<Double>(1));
4899 fusion.addOutput(tv2);
4900
4901 tv1->axis(-1)->parallelize(ParallelType::TIDx);
4902 tv2->axis(-1)->parallelize(ParallelType::TIDy);
4903
4904 // Invalid as tv1 and tv2 do not have the same ParallelType
4905 FusionExecutor fe;
4906 // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
4907 ASSERT_ANY_THROW(fe.compileFusion(&fusion));
4908}
4909
4910TEST_F(NVFuserTest, FusionValidateParallelize2_CUDA) {
4911 Fusion fusion;
4912 FusionGuard fg(&fusion);
4913
4914 auto tv0 = makeSymbolicTensor(1);
4915 fusion.addInput(tv0);
4916
4917 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
4918 auto tv2 = add(tv1, IrBuilder::create<Double>(1));
4919 fusion.addOutput(tv2);
4920
4921 tv1->axis(-1)->parallelize(ParallelType::TIDx);
4922 tv2->axis(-1)->parallelize(ParallelType::TIDy);
4923 tv1->setMemoryType(MemoryType::Shared);
4924
4925 // tv1 and tv2 do not have the same ParallelType, but tv1 is on shared
4926 // memory, so it is valid
4927 FusionExecutor fe;
4928 fe.compileFusion(&fusion);
4929}
4930
4931TEST_F(NVFuserTest, FusionValidateParallelize3_CUDA) {
4932 Fusion fusion;
4933 FusionGuard fg(&fusion);
4934
4935 auto tv0 = makeSymbolicTensor(1);
4936 fusion.addInput(tv0);
4937
4938 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
4939 auto tv2 = add(tv1, IrBuilder::create<Double>(1));
4940 fusion.addOutput(tv2);
4941
4942 tv1->split(-1, 4);
4943 tv1->axis(-1)->parallelize(ParallelType::TIDx);
4944 tv2->split(-1, 4);
4945 tv2->axis(-1)->parallelize(ParallelType::TIDx);
4946
4947 tv1->setMemoryType(MemoryType::Global);
4948
4949 // tv1 and tv2 have the same shape and ParallelType
4950 FusionExecutor fe;
4951 fe.compileFusion(&fusion);
4952}
4953
4954TEST_F(NVFuserTest, FusionValidateParallelize4_CUDA) {
4955 Fusion fusion;
4956 FusionGuard fg(&fusion);
4957
4958 auto tv0 = makeSymbolicTensor(1);
4959 fusion.addInput(tv0);
4960
4961 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
4962 auto tv2 = add(tv1, IrBuilder::create<Double>(1));
4963 fusion.addOutput(tv2);
4964
4965 tv1->split(-1, 4);
4966 tv1->axis(-1)->parallelize(ParallelType::TIDx);
4967 tv2->split(-1, 8);
4968 tv2->axis(-1)->parallelize(ParallelType::TIDx);
4969
4970 tv1->setMemoryType(MemoryType::Global);
4971
4972 // tv1 and tv2 do not have the same split, but tv1 is in global memory, so it is valid
4973 FusionExecutor fe;
4974 fe.compileFusion(&fusion);
4975}
4976
4977TEST_F(NVFuserTest, FusionValidateParallelize5_CUDA) {
4978 Fusion fusion;
4979 FusionGuard fg(&fusion);
4980
4981 auto tv0 = makeSymbolicTensor(1);
4982 fusion.addInput(tv0);
4983
4984 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
4985 auto tv2 = add(tv1, IrBuilder::create<Double>(1));
4986 fusion.addOutput(tv2);
4987
4988 tv1->split(-1, 4);
4989 tv1->axis(-1)->parallelize(ParallelType::TIDx);
4990 tv1->setMemoryType(MemoryType::Shared);
4991
4992 tv2->split(-1, 8);
4993 tv2->axis(-1)->parallelize(ParallelType::TIDx);
4994
4995 // tv1 and tv2 do not have the same shape, but tv1 is on shared
4996 // memory, so it is valid
4997 FusionExecutor fe;
4998 fe.compileFusion(&fusion);
4999}
5000
5001// See issue #995
5002TEST_F(NVFuserTest, FusionValidateParallelize6_CUDA) {
5003 Fusion fusion;
5004 FusionGuard fg(&fusion);
5005
5006 int64_t W = 5, X = 6, Y = 7, Z = 8;
5007
5008 auto tv0 = makeConcreteTensor({X, Y, Z});
5009 auto tv1 = makeConcreteTensor({W, X, Y, Z});
5010 fusion.addInput(tv0);
5011 fusion.addInput(tv1);
5012
5013 auto tv2 = add(tv0, IrBuilder::create<Double>(1));
5014 auto tv3 = broadcast(tv2, {true, false, false, false});
5015 auto tv4 = add(tv3, tv1);
5016 fusion.addOutput(tv4);
5017
5018 tv4->merge(0);
5019 tv4->merge(0);
5020 tv4->merge(0);
5021 tv4->split(0, 4);
5022 tv4->split(0, 3);
5023 tv4->split(0, 2);
5024
5025 TransformPropagatorWithCheck propagator(tv4);
5026 MaxRootDomainInfoSpanningTree(tv4).traverse(&propagator);
5027
5028 tv0->computeAt(tv2, 2);
5029 tv3->computeAt(tv4, 2);
5030
5031 tv4->axis(0)->parallelize(ParallelType::BIDx);
5032 tv4->axis(-1)->parallelize(ParallelType::TIDx);
5033 tv2->axis(0)->parallelize(ParallelType::BIDx);
5034 tv2->axis(-1)->parallelize(ParallelType::TIDx);
5035 tv3->axis(-1)->parallelize(ParallelType::TIDx);
5036
5037 // Validation should throw an exception saying the first axes of tv2
5038 // and tv3 have incompatible parallelization. See also issue #995.
5039 // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
5040 ASSERT_ANY_THROW(fusion.printKernel());
5041}
5042
5043// Repro of #2046
5044TEST_F(NVFuserTest, FusionValidateParallelize7_CUDA) {
5045 Fusion fusion;
5046 FusionGuard fg(&fusion);
5047
5048 auto tv0 = makeSymbolicTensor(2);
5049 fusion.addInput(tv0);
5050
5051 auto tv1 = set(tv0);
5052 auto tv2 = set(tv1);
5053 auto tv3 = set(tv1);
5054 fusion.addOutput(tv2);
5055 fusion.addOutput(tv3);
5056
5057 tv1->setMemoryType(MemoryType::Global);
5058
5059 tv1->axis(0)->parallelize(ParallelType::BIDx);
5060 tv1->axis(1)->parallelize(ParallelType::TIDx);
5061
5062 tv2->axis(1)->parallelize(ParallelType::TIDy);
5063 tv3->axis(0)->parallelize(ParallelType::BIDx);
5064
5065 // tv2 uses tv1 but is not parallelized with BIDx, so a grid sync is
5066 // required. It should be placed as a top-level expression.
5067
5068 GpuLower gpulw(&fusion);
5069 TORCH_CHECK(
5070 std::any_of(
5071 gpulw.kernel()->topLevelExprs().begin(),
5072 gpulw.kernel()->topLevelExprs().end(),
5073 [](Expr* expr) { return expr->isA<kir::GridSync>(); }),
5074 "Grid sync not found");
5075}
5076
5077TEST_F(NVFuserTest, FusionDAGMerging_CUDA) {
5078 Fusion fusion;
5079 FusionGuard fg(&fusion);
5080
5081 auto tv0 = makeSymbolicTensor(5);
5082 auto tv1 = makeSymbolicTensor(1);
5083 fusion.addInput(tv0);
5084 fusion.addInput(tv1);
5085
5086 // Branch 0
5087 auto tv2 = sum(tv0, {0}); // 0
5088 auto tv3 = sum(tv2, {0}); // 1
5089 auto tv4 = sum(tv3, {0}); // 2
5090 auto tv5 = sum(tv4, {0}); // 3
5091
5092 // Branch 1
5093 auto tv6 = add(tv1, IrBuilder::create<Double>(1)); // 4
5094
5095 // Merge
5096 auto tv7 = add(tv6, tv5); // 5
5097
5098 // Maximum expected output groups (can improve over time):
5099 // {0}, {1}, {2}, {3,4,5}
5100 // without final merge would have been {0}, {1}, {2}, {3,4}, {5}
5101
5102 fusion.addOutput(tv7);
5103
5104 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5105 at::Tensor t0 = at::randn({2, 2, 2, 2, 2}, options);
5106 at::Tensor t1 = at::randn({2}, options);
5107
5108 std::vector<at::Tensor> aten_inputs = {t0, t1};
5109
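  // Wrap the ATen inputs in a KernelArgumentHolder so the segmenter can
  // query runtime sizes and the device.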
5110 KernelArgumentHolder args(KernelIndexMode::INT32);
5111 args.setDeviceIndex(0);
5112 args.push(aten_inputs);
5113
5114 auto fusion_segments = fusion.segment(args);
5115 TORCH_CHECK(fusion_segments->groups().size() <= 4);
5116}
5117
5118TEST_F(NVFuserTest, FusionDAGScalarMerging_CUDA) {
5119 auto fusion = std::make_unique<Fusion>();
5120 FusionGuard fg(fusion.get());
5121
5122 auto tv0 = makeSymbolicTensor(3);
5123 auto i0 = IrBuilder::create<Double>();
5124
5125 fusion->addInput(tv0);
5126 fusion->addInput(i0);
5127
5128 auto i1 = add(i0, IrBuilder::create<Double>(1.0));
5129 auto i2 = mul(i1, i1);
5130 auto i3 = add(i2, i1);
5131
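  // i1/i2/i3 are scalar expressions consumed on both sides of the segment
  // boundary; the segmenter should duplicate them into each kernel rather
  // than let them prevent the split, so two groups are still expected.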
5132 // Branch 0
5133 auto tv1 = sum(tv0, {0}); // 0
5134 auto tv2 = add(tv1, i2);
5135 // Branch 1
5136 auto tv3 = sum(tv2, {0}); // 1
5137 auto tv4 = add(tv3, i3);
5138
5139 auto tv5 = add(tv4, i0);
5140
5141 fusion->addOutput(tv5);
5142
5143 FusionExecutorCache executor_cache(std::move(fusion));
5144
5145 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5146 at::Tensor t0 = at::randn({16, 16, 16}, options);
5147 double s0 = 0.5;
5148
5149 auto s1 = s0 + 1.0;
5150 auto s2 = s1 * s1;
5151 auto s3 = s2 + s1;
5152 auto t1 = t0.sum({0});
5153 auto t2 = t1 + s2;
5154 auto t3 = sum(t2, {0});
5155 auto t4 = t3 + s3;
5156 auto t5 = t4 + s0;
5157
5158 auto outputs = executor_cache.runFusionWithInputs({t0, s0});
5159
5160 TORCH_CHECK(
5161 executor_cache.getMostRecentKernelRuntime()->isSegmented(),
5162 "segmentation didn't happen");
5163 TORCH_CHECK(
5164 executor_cache.getMostRecentKernelRuntime()
5165 ->fusionSegments()
5166 ->groups()
5167 .size() == 2,
5168 "segmentation didn't happen as expected");
5169
5170 testValidate(
5171 executor_cache.fusion(), outputs, {t0, s0}, {t5}, __LINE__, __FILE__);
5172}
5173
5174TEST_F(NVFuserTest, FusionBlockReduceInSerialLoop_CUDA) {
5175 Fusion fusion;
5176 FusionGuard fg(&fusion);
5177
5178 constexpr int M = 10;
5179 constexpr int N = 20;
5180 constexpr int K = 20;
5181
5182 auto tv0 = makeSymbolicTensor(3);
5183 auto tv1 = sum(tv0, {1, 2});
5184 fusion.addInput(tv0);
5185 fusion.addOutput(tv1);
5186
5187 tv1->axis(-1)->parallelize(ParallelType::TIDx);
5188 tv1->axis(0)->parallelize(ParallelType::BIDx);
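  // tv1 is [I0, R1, R2]: only the innermost reduction axis is bound to TIDx
  // and the middle reduction axis stays serial, so the block reduction runs
  // inside a serial loop, as the test name says.
  // Roughly: for (i0 = BIDx) { for (r1 = 0..N) { blockReduce over TIDx } }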
5189
5190 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5191 at::manual_seed(0);
5192 at::Tensor t0 = at::randn({M, N, K}, options);
5193 std::vector<IValue> aten_inputs = {t0};
5194
5195 FusionExecutor fe;
5196 fe.compileFusion(&fusion, aten_inputs);
5197 auto outputs = fe.runFusion(aten_inputs);
5198 at::Tensor aten_output = t0.sum({1, 2});
5199 testValidate(
5200 &fusion, outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
5201}
5202
5203TEST_F(NVFuserTest, FusionBlockWelfordInSerialLoop_CUDA) {
5204 Fusion fusion;
5205 FusionGuard fg(&fusion);
5206
5207 constexpr int M = 10;
5208 constexpr int N = 20;
5209 constexpr int K = 20;
5210
5211 auto tv0 = makeSymbolicTensor(3);
5212 auto tvs = Welford(tv0, {1, 2});
5213 fusion.addInput(tv0);
5214 auto tv_avg = tvs.avg;
5215 auto tv_M2 = tvs.var_sum;
5216 auto tv_N = tvs.n;
5217 fusion.addOutput(tv_avg);
5218 fusion.addOutput(tv_M2);
5219
5220 tv_avg->axis(-1)->parallelize(ParallelType::TIDx);
5221 tv_avg->axis(0)->parallelize(ParallelType::BIDx);
5222
5223 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5224 at::manual_seed(0);
5225 at::Tensor t0 = at::randn({M, N, K}, options);
5226 std::vector<IValue> aten_inputs = {t0};
5227
5228 FusionExecutor fe;
5229 fe.compileFusion(&fusion, aten_inputs);
5230 auto outputs = fe.runFusion(aten_inputs);
5231 at::Tensor aten_avg = t0.mean({1, 2});
5232 at::Tensor aten_M2 = t0.var({1, 2}, false) * N * K;
5233 testValidate(
5234 &fusion, outputs, aten_inputs, {aten_avg, aten_M2}, __LINE__, __FILE__);
5235}
5236
5237// See Issue #716
5238TEST_F(NVFuserTest, FusionIOTensorTrivialReductionRepro_CUDA) {
5239 Fusion fusion;
5240 FusionGuard fg(&fusion);
5241
5242 constexpr int M = 10;
5243 constexpr int N = 11;
5244
5245 auto tv0 = makeSymbolicTensor(1);
5246 fusion.addInput(tv0);
5247
5248 std::vector<int> reduction_axes = {1};
5249 std::vector<bool> broadcast_mask = {false, true};
5250
5251 auto tv0_bcast = broadcast(tv0, broadcast_mask);
5252 auto path1_bcast = add(tv0_bcast, IrBuilder::create<Double>(1.0));
5253 auto path1 = sum(path1_bcast, reduction_axes);
5254 fusion.addOutput(path1);
5255
5256 path1->split(1, 1);
5257 path1->rFactor({1});
5258 path1->axis(0)->parallelize(ParallelType::BIDx);
5259 tv0->computeAt(path1, 1);
5260
5261 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5262 at::manual_seed(0);
5263 at::Tensor t0 = at::randn({M}, options);
5264 at::Tensor t0_ref = t0.clone();
5265 std::vector<IValue> aten_inputs = {t0};
5266
5267 FusionExecutor fe;
5268 fe.compileFusion(&fusion, aten_inputs);
5269
5270 // in-place op: t0 is reused as the output buffer, so it becomes t0 + 1
5271 auto outputs = fe.runFusion(aten_inputs, {t0});
5272
5273 TORCH_CHECK(outputs[0].allclose(t0_ref.add(1)));
5274}
5275
5276TEST_F(NVFuserTest, FusionReductionPredicate_CUDA) {
5277 Fusion fusion;
5278 FusionGuard fg(&fusion);
5279
5280 auto tv0 = makeSymbolicTensor(2);
5281 fusion.addInput(tv0);
5282 auto tv1 = sum(tv0, {0});
5283 fusion.addOutput(tv1);
5284
5285 auto tv2 = tv0->cacheAfter();
5286
5287 const int bdimx = 128;
5288 tv1->split(1, bdimx);
5289 tv1->split(1, 4);
5290 tv1->split(1, 1);
5291
5292 tv1->axis(-1)->parallelize(ParallelType::TIDx);
5293 tv1->axis(2)->parallelize(ParallelType::Unroll);
5294 tv1->split(0, 10);
5295 tv0->computeAt(tv1, 4);
5296
5297 tv2->axis(-1)->parallelize(ParallelType::TIDx);
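  // With the unrolled split of the reduction, out-of-bounds iterations must
  // be predicated away so they do not contribute to the partial sums.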
5298
5299 int numel_x = 650;
5300 int numel_y = 102;
5301
5302 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5303 at::Tensor input = at::randn({numel_x, numel_y}, options);
5304 at::Tensor cg_output = at::empty({numel_y}, options);
5305
5306 FusionExecutor fe;
5307 fe.compileFusion(&fusion, {input});
5308 fe.runFusion({input}, {cg_output});
5309
5310 auto aten_output = input.to(at::kDouble).sum({0});
5311
5312 testValidate(
5313 &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__);
5314}
5315
5316TEST_F(NVFuserTest, FusionIssue728_CUDA) {
5317 Fusion fusion;
5318 FusionGuard fg(&fusion);
5319
5320 auto tv0 = makeSymbolicTensor(1);
5321 fusion.addOutput(tv0);
5322 auto tv1 = makeSymbolicTensor(1);
5323 fusion.addOutput(tv1);
5324 auto tv2 = makeSymbolicTensor(1);
5325 fusion.addOutput(tv2);
5326
5327 auto tv3 = add(tv0, IrBuilder::create<Double>(1));
5328 auto tv4 = add(tv3, tv1);
5329 auto tv5 = add(tv4, IrBuilder::create<Double>(1));
5330 auto tv6 = add(tv2, IrBuilder::create<Double>(1));
5331 fusion.addOutput(tv5);
5332 fusion.addOutput(tv6);
5333
5334 // tv0 -> tv3 -+
5335 // tv1 --------+-> tv4 -> tv5
5336 //
5337 // tv2 -> tv6
5338
5339 auto all_vals_under_tv3 =
5340 DependencyCheck::getAllValsBetween({tv3}, fusion.outputs());
5341 std::unordered_set<Val*> included_tensors({tv3, tv4, tv5});
5342 for (auto tv : included_tensors) {
5343 TORCH_CHECK(
5344 std::find(all_vals_under_tv3.begin(), all_vals_under_tv3.end(), tv) !=
5345 all_vals_under_tv3.end(),
5346 "TV",
5347 tv->name(),
5348 " not found");
5349 }
5350 for (auto tv : ir_utils::filterByType<TensorView>(fusion.vals())) {
5351 if (included_tensors.find(tv) == included_tensors.end()) {
5352 TORCH_CHECK(
5353 std::find(all_vals_under_tv3.begin(), all_vals_under_tv3.end(), tv) ==
5354 all_vals_under_tv3.end(),
5355 "TV",
5356 tv->name(),
5357 " should not be found");
5358 }
5359 }
5360
5361 auto no_dependency = DependencyCheck::getAllValsBetween({}, fusion.outputs());
5362 TORCH_CHECK(no_dependency.empty(), "No val should be returned");
5363
5364 auto no_dep_path = DependencyCheck::getAllValsBetween({tv0, tv1}, {tv6});
5365 TORCH_CHECK(no_dep_path.empty(), "No val should be returned");
5366
5367 auto no_dep_path2 = DependencyCheck::getAllValsBetween({tv2}, {tv5});
5368 TORCH_CHECK(no_dep_path2.empty(), "No val should be returned");
5369
5370 auto just_tv3 = DependencyCheck::getAllValsBetween({tv3}, {tv3});
5371 TORCH_CHECK(
5372 just_tv3.size() == 1 && *(just_tv3.begin()) == tv3,
5373 "Only tv3 should be included");
5374}
5375
5376TEST_F(NVFuserTest, FusionIssue757_CUDA) {
5377 Fusion fusion;
5378 FusionGuard fg(&fusion);
5379
5380 auto tv0 = makeSymbolicTensor(2);
5381 fusion.addInput(tv0);
5382 auto tv1 = sum(tv0, {1});
5383 auto tv2 = broadcast(tv1, {false, true});
5384 auto tv3 = makeSymbolicTensor(2);
5385 fusion.addInput(tv3);
5386 auto tv4 = add(tv2, tv3);
5387 fusion.addOutput(tv4);
5388
5389 tv1->computeAt(tv4, -1);
5390
5391 tv2->axis(-1)->parallelize(ParallelType::TIDx);
5392 tv4->axis(-1)->parallelize(ParallelType::TIDx);
5393 tv1->axis(-1)->parallelize(ParallelType::TIDx);
5394
5395 int numel_x = 650;
5396 int numel_y = 102;
5397
5398 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5399 at::Tensor t0 = at::randn({numel_x, numel_y}, options);
5400 at::Tensor t3 = at::randn({numel_x, numel_y}, options);
5401 std::vector<IValue> inputs = {t0, t3};
5402
5403 FusionExecutor fe;
5404 fe.compileFusion(&fusion, inputs);
5405 auto outputs = fe.runFusion(inputs);
5406
5407 auto t1 = t0.sum({1});
5408 auto t2 = t1.unsqueeze(-1).expand({numel_x, numel_y});
5409 auto t4 = t2 + t3;
5410
5411 testValidate(&fusion, outputs, inputs, {t4}, __LINE__, __FILE__);
5412}
5413
5414// See issue #759
5415TEST_F(NVFuserTest, FusionPredicatedBlockBroadcast_CUDA) {
5416 Fusion fusion;
5417 FusionGuard fg(&fusion);
5418
5419 auto tv0 = makeSymbolicTensor(2);
5420 fusion.addInput(tv0);
5421 auto tv1 = sum(tv0, {1});
5422 auto tv2 = broadcast(tv1, {false, true});
5423 auto tv3 = makeSymbolicTensor(2);
5424 fusion.addInput(tv3);
5425 auto tv4 = add(tv2, tv3);
5426 fusion.addOutput(tv4);
5427
5428 tv4->split(0, 4);
5429 tv1->computeAt(tv4, -1);
5430
5431 tv2->axis(-1)->parallelize(ParallelType::TIDx);
5432 tv2->axis(1)->parallelize(ParallelType::TIDy);
5433 tv4->axis(-1)->parallelize(ParallelType::TIDx);
5434 tv4->axis(1)->parallelize(ParallelType::TIDy);
5435 tv1->axis(-1)->parallelize(ParallelType::TIDx);
5436
5437 int numel_x = 100;
5438 int numel_y = 101;
5439
5440 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5441 at::Tensor t0 = at::randn({numel_x, numel_y}, options);
5442 at::Tensor t3 = at::randn({numel_x, numel_y}, options);
5443 std::vector<IValue> inputs = {t0, t3};
5444
5445 FusionExecutor fe;
5446 fe.compileFusion(&fusion, inputs);
5447 auto outputs = fe.runFusion(inputs);
5448
5449 auto t1 = t0.sum({1});
5450 auto t2 = t1.unsqueeze(-1).expand({numel_x, numel_y});
5451 auto t4 = t2 + t3;
5452
5453 testValidate(&fusion, outputs, inputs, {t4}, __LINE__, __FILE__);
5454}
5455
5456TEST_F(NVFuserTest, FusionSegmentVerticalMerge_CUDA) {
5457 auto fusion = std::make_unique<Fusion>();
5458 FusionGuard fg(fusion.get());
5459
5460 auto tv0 = makeSymbolicTensor(3);
5461
5462 fusion->addInput(tv0);
5463 // {first kernel}
5464 auto tv1 = sum(tv0, {0});
5465 auto tv2 = add(tv1, tv0);
5466 auto tv3 = sum(tv2, {0});
5467 auto tv4 = add(tv3, tv0);
5468 auto tv5 = sum(tv4, {0});
5469 auto tv6 = sum(tv5, {0});
5470 // {second kernel}
5471 auto tv7 = add(tv6, tv5);
5472 auto tv8 = add(tv7, tv5);
5473 auto tv9 = sum(tv8, {0});
5474
5475 fusion->addOutput(tv9);
5476
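  // Disable the Herrmann and final merge passes so only basic vertical
  // (producer-consumer) merging runs; the fusion should then segment
  // into exactly the two kernels marked above.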
5477 SegmentCandidateFinderOptions segment_options;
5478 segment_options.run_herrmann_merge = false;
5479 segment_options.run_final_merge = false;
5480
5481 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5482 at::Tensor t0 = at::randn({2, 2, 2}, options);
5483
5484 KernelArgumentHolder args(KernelIndexMode::INT32);
5485 args.setDeviceIndex(0);
5486 args.push(t0);
5487
5488 auto segmented_fusion =
5489 SegmentCandidateFinder::segment(fusion.get(), args, segment_options);
5490
5491 TORCH_CHECK(segmented_fusion->groups().size() == 2);
5492}
5493
5494TEST_F(NVFuserTest, FusionSegmentHorizontalMerge_CUDA) {
5495 auto fusion = std::make_unique<Fusion>();
5496 FusionGuard fg(fusion.get());
5497
5498 auto tv0 = makeSymbolicTensor(3);
5499 auto i0 = IrBuilder::create<Double>();
5500
5501 fusion->addInput(tv0);
5502 fusion->addInput(i0);
5503
5504 // Branch 0 {first kernel}
5505 auto tv1 = sum(tv0, {0});
5506 auto tv2 = add(tv0, i0);
5507 auto tv3 = unaryOp(UnaryOpType::Rsqrt, tv2);
5508 auto tv4 = sum(tv3, {0});
5509
5510 // Branch 1 {first kernel}
5511 auto tv5 = unaryOp(UnaryOpType::Rsqrt, tv3);
5512 auto tv6 = sum(tv5, {0});
5513
5514 // Incompatible {second kernel}
5515 auto tv7 = sum(tv6, {0});
5516
5517 fusion->addOutput(tv1);
5518 fusion->addOutput(tv4);
5519 fusion->addOutput(tv7);
5520
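  // With only basic merging enabled, branches 0 and 1 should merge
  // horizontally into the first kernel, while the incompatible tv7
  // forms the second.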
5521 SegmentCandidateFinderOptions segment_options;
5522 segment_options.run_herrmann_merge = false;
5523 segment_options.run_final_merge = false;
5524
5525 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5526 at::Tensor t0 = at::randn({2, 2, 2}, options);
5527
5528 KernelArgumentHolder args(KernelIndexMode::INT32);
5529 args.setDeviceIndex(0);
5530 args.push(t0);
5531 c10::IValue scalar = 1.0;
5532 args.push(scalar);
5533
5534 auto segmented_fusion =
5535 SegmentCandidateFinder::segment(fusion.get(), args, segment_options);
5536
5537 TORCH_CHECK(segmented_fusion->groups().size() == 2);
5538}
5539
5540TEST_F(NVFuserTest, FusionSegmentMixReduction_CUDA) {
5541 auto fusion = std::make_unique<Fusion>();
5542 FusionGuard fg(fusion.get());
5543
5544 auto tv0 = makeSymbolicTensor(3);
5545
5546 fusion->addInput(tv0);
5547
  // tv1 is defined in kernel 1, joined via horizontal merging
5549 auto tv1 = sum(tv0, {0, 1});
5550 // kernel 2
5551 auto tv2 = sum(tv0, {2});
5552 auto tv3 = broadcast(tv2, {false, false, true});
5553 auto tv4 = add(tv0, tv3);
5554 auto tv5 = sum(tv4, {2});
5555 // end of kernel 2
5556 // kernel 1
5557 auto tv6 = unaryOp(UnaryOpType::Rsqrt, tv0);
5558 auto tv7 = sum(tv6, {0, 1});
5559 auto tv8 = sum(tv6, {0, 1});
5560
5561 fusion->addOutput(tv1);
5562 fusion->addOutput(tv5);
5563 fusion->addOutput(tv7);
5564 fusion->addOutput(tv8);
5565
5566 SegmentCandidateFinderOptions segment_options;
5567 segment_options.run_herrmann_merge = false;
5568 segment_options.run_final_merge = false;
5569
5570 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5571 at::Tensor t0 = at::randn({2, 2, 2}, options);
5572
5573 KernelArgumentHolder args(KernelIndexMode::INT32);
5574 args.setDeviceIndex(0);
5575 args.push(t0);
5576
5577 auto segmented_fusion =
5578 SegmentCandidateFinder::segment(fusion.get(), args, segment_options);
5579
5580 TORCH_CHECK(segmented_fusion->groups().size() <= 2);
5581}
5582
5583TEST_F(NVFuserTest, FusionSBAR_CUDA) {
5584 Fusion fusion;
5585 FusionGuard fg(&fusion);
5586
5587 // N, H, W, C format
5588 std::vector<int64_t> input_shape{656, 7, 7, 64};
5589
5590 auto x = makeContigTensor(4);
5591 auto y = makeContigTensor(4);
5592 auto weight = makeContigTensor(1);
5593 auto bias = makeContigTensor(1);
5594
5595 fusion.addInput(x);
5596 fusion.addInput(y);
5597 fusion.addInput(weight);
5598 fusion.addInput(bias);
5599
5600 const size_t kNumberOfDims = x->nDims();
5601 std::vector<bool> broadcast_mask(kNumberOfDims, false);
5602 for (const auto axis : c10::irange(kNumberOfDims - 1)) {
5603 broadcast_mask[axis] = true;
5604 }
5605
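  // weight and bias are 1-D over channels; the mask broadcasts them
  // across the leading N, H, W dimensions.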
5606 auto weight_bcast = broadcast(weight, broadcast_mask);
5607 auto scale = mul(x, weight_bcast);
5608 auto bias_bcast = broadcast(bias, broadcast_mask);
5609 auto scale_bias = add(scale, bias_bcast);
5610 auto scale_bias_add = add(scale_bias, y);
5611 auto scale_bias_add_relu = unaryOp(UnaryOpType::Relu, scale_bias_add);
5612
5613 fusion.addOutput(scale_bias_add_relu);
5614
5615 // inputs
5616 at::manual_seed(0);
5617 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5618 at::Tensor at_x = at::randn(input_shape, options);
5619 at::Tensor at_y = at::randn(input_shape, options);
5620 at::Tensor at_weight = at::ones({input_shape[3]}, options);
5621 at::Tensor at_bias = at::zeros({input_shape[3]}, options);
5622
5623 // inputs
5624 std::vector<c10::IValue> inputs = {at_x, at_y, at_weight, at_bias};
5625
5626 // outputs
5627 std::vector<at::Tensor> outputs;
5628
5629 auto lparams = schedulePointwise(&fusion, inputs);
5630
5631 FusionExecutor executor;
5632 executor.compileFusion(&fusion, inputs, lparams);
5633 outputs = executor.runFusion(inputs, lparams);
5634
5635 auto at_scale = at::mul(at_x, at_weight);
5636 auto at_scale_bias = at::add(at_scale, at_bias);
5637 auto pwise_add = at::add(at_scale_bias, at_y);
5638 auto output = at::relu(pwise_add);
5639
5640 testValidate(&fusion, outputs, inputs, {output}, __LINE__, __FILE__);
5641}
5642
5643TEST_F(NVFuserTest, FusionSingleElement_CUDA) {
5644 Fusion fusion;
5645 FusionGuard fg(&fusion);
5646
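  // A 0-dim (scalar) tensor input; this checks that the pointwise
  // scheduler handles rank-0 tensors.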
5647 auto tv0 = makeSymbolicTensor(0);
5648 fusion.addInput(tv0);
5649
5650 auto tv1 = add(tv0, IrBuilder::create<Double>(2.5));
5651
5652 auto tv2 = add(tv1, IrBuilder::create<Double>(3.5));
5653 fusion.addOutput(tv2);
5654
5655 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5656 at::Tensor input = at::randn({}, options);
5657
5658 at::Tensor cg_output = at::empty({}, options);
5659
5660 auto lparams = schedulePointwise(&fusion, {input});
5661
5662 FusionExecutor fe;
5663 fe.compileFusion(&fusion, {input}, lparams);
5664 fe.runFusion({input}, {cg_output}, lparams);
5665
5666 auto aten_output = input.add(2.5).add(3.5);
5667
5668 testValidate(
5669 &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__);
5670}
5671
5672TEST_F(NVFuserTest, FusionBNBackwardRepro_CUDA) {
5673 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
5674 Fusion& fusion = *fusion_ptr.get();
5675 FusionGuard fg(&fusion);
5676
5677 int batch = 4;
5678 int c = 4;
5679 int h = 4;
5680 int w = 4;
5681 int numDims = 4;
5682
5683 auto input = makeSymbolicTensor(numDims);
5684 fusion.addInput(input);
5685 auto weight = makeSymbolicTensor(1);
5686 fusion.addInput(weight);
5687 auto running_mean = makeSymbolicTensor(1);
5688 fusion.addInput(running_mean);
5689 auto running_var = makeSymbolicTensor(1);
5690 fusion.addInput(running_var);
5691 auto save_mean = makeSymbolicTensor(1);
5692 fusion.addInput(save_mean);
5693 auto save_invstd = makeSymbolicTensor(1);
5694 fusion.addInput(save_invstd);
5695
5696 auto grad_out_prev = makeSymbolicTensor(numDims);
5697 fusion.addInput(grad_out_prev);
5698 auto gt_0 =
5699 makeSymbolicTensor(numDims); // single tensor broadcasted is dangerous.
5700 fusion.addInput(gt_0);
5701
5702 auto gt_bool = binaryOp(BinaryOpType::GT, gt_0, IrBuilder::create<Int>(1));
5703 auto gt_float = castOp(DataType::Float, gt_bool);
5704
5705 auto grad_out = mul(grad_out_prev, gt_float);
5706
5707 Val* eps_ptr = IrBuilder::create<Double>(1e-5);
5708
5709 auto grads = batch_norm_backward(
5710 input,
5711 grad_out,
5712 weight,
5713 running_mean,
5714 running_var,
5715 save_mean,
5716 save_invstd,
5717 true,
5718 eps_ptr,
5719 {true, true, true});
5720
5721 fusion.addOutput(grads.grad_input);
5722 fusion.addOutput(grads.grad_weight);
5723 fusion.addOutput(grads.grad_bias);
5724
5725 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5726 at::Tensor input0 = at::randn({batch, c, h, w}, options);
5727 at::Tensor input1 = at::randn({c}, options);
5728 at::Tensor input2 = at::randn_like(input1);
5729 at::Tensor input3 = at::randn_like(input1);
5730 at::Tensor input4 = at::randn_like(input1);
5731 at::Tensor input5 = at::randn_like(input1);
5732 at::Tensor input6 = at::randn_like(input0);
5733 at::Tensor input7 = at::randn_like(input0);
5734
5735 FusionExecutorCache fec(std::move(fusion_ptr));
5736 std::vector<IValue> inputs = {
5737 input0, input1, input2, input3, input4, input5, input6, input7};
5738 auto outputs = fec.runFusionWithInputs(inputs);
5739}
5740
5741// TODO: We only changed inputs, merge this with the test above.
5742TEST_F(NVFuserTest, FusionBNBackwardRepro2_CUDA) {
5743 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
5744 Fusion& fusion = *fusion_ptr.get();
5745 FusionGuard fg(&fusion);
5746
5747 int batch = 2;
5748 int c = 81;
5749 int h = 1;
5750 int w = 1;
5751 int numDims = 4;
5752
5753 // auto input = makeSymbolicTensor(numDims);
5754 auto input = makeConcreteTensor({-1, -1, 1, 1});
5755 fusion.addInput(input);
5756 auto weight = makeSymbolicTensor(1);
5757 fusion.addInput(weight);
5758 auto running_mean = makeSymbolicTensor(1);
5759 fusion.addInput(running_mean);
5760 auto running_var = makeSymbolicTensor(1);
5761 fusion.addInput(running_var);
5762 auto save_mean = makeSymbolicTensor(1);
5763 fusion.addInput(save_mean);
5764 auto save_invstd = makeSymbolicTensor(1);
5765 fusion.addInput(save_invstd);
5766
5767 // auto grad_out_prev = makeSymbolicTensor(numDims);
5768 auto grad_out_prev = makeConcreteTensor({-1, -1, 1, 1});
5769 fusion.addInput(grad_out_prev);
5770 // auto gt_0 =
5771 // makeSymbolicTensor(numDims); // single tensor broadcasted is dangerous.
5772 auto gt_0 = makeConcreteTensor({-1, -1, 1, 1});
5773 fusion.addInput(gt_0);
5774
5775 auto gt_bool = binaryOp(BinaryOpType::GT, gt_0, IrBuilder::create<Int>(1));
5776 auto gt_float = castOp(DataType::Float, gt_bool);
5777
5778 auto grad_out = mul(grad_out_prev, gt_float);
5779
5780 Val* eps_ptr = IrBuilder::create<Double>(1e-5);
5781
5782 auto grads = batch_norm_backward(
5783 input,
5784 grad_out,
5785 weight,
5786 running_mean,
5787 running_var,
5788 save_mean,
5789 save_invstd,
5790 true,
5791 eps_ptr,
5792 {true, true, true});
5793
5794 fusion.addOutput(grads.grad_input);
5795 fusion.addOutput(grads.grad_weight);
5796 fusion.addOutput(grads.grad_bias);
5797
5798 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5799 at::Tensor input0 = at::randn({batch, c, h, w}, options);
5800 at::Tensor input1 = at::randn({c}, options);
5801 at::Tensor input2 = at::randn_like(input1);
5802 at::Tensor input3 = at::randn_like(input1);
5803 at::Tensor input4 = at::randn_like(input1);
5804 at::Tensor input5 = at::randn_like(input1);
5805 at::Tensor input6 = at::randn_like(input0);
5806 at::Tensor input7 = at::randn_like(input0);
5807
5808 FusionExecutorCache fec(std::move(fusion_ptr));
5809 std::vector<IValue> inputs = {
5810 input0, input1, input2, input3, input4, input5, input6, input7};
5811 auto outputs = fec.runFusionWithInputs(inputs);
5812}
5813
5814TEST_F(NVFuserTest, FusionBNRepro_CUDA) {
5815 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
5816 Fusion& fusion = *fusion_ptr.get();
5817 FusionGuard fg(&fusion);
5818
5819 const bool kTraining = true;
5820 const float kMomentum = 0.1;
5821 const float kEps = 1e-5;
5822
5823 int batch = 14;
5824 int c = 65;
5825 int h = 7;
5826 int w = 7;
5827 int numDims = 4;
5828
5829 auto input = makeSymbolicTensor(numDims);
5830 fusion.addInput(input);
5831 auto weight = makeSymbolicTensor(1);
5832 fusion.addInput(weight);
5833 auto bias = makeSymbolicTensor(1);
5834 fusion.addInput(bias);
5835 auto running_mean = makeSymbolicTensor(1);
5836 fusion.addInput(running_mean);
5837 auto running_var = makeSymbolicTensor(1);
5838 fusion.addInput(running_var);
5839
5840 auto momentum_ptr = IrBuilder::create<Double>(kMomentum);
5841 auto eps_ptr = IrBuilder::create<Double>(kEps);
5842
5843 auto result = batch_norm(
5844 input,
5845 weight,
5846 bias,
5847 running_mean,
5848 running_var,
5849 kTraining,
5850 momentum_ptr,
5851 eps_ptr);
5852
5853 fusion.addOutput(result.output);
5854 fusion.addOutput(result.mean);
5855 fusion.addOutput(result.invstd);
5856
5857 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5858 at::Tensor input1 = at::randn({batch, c, h, w}, options);
5859 at::Tensor input2 = at::randn({c}, options);
5860 at::Tensor input3 = at::randn_like(input2);
5861 at::Tensor input4 = at::randn_like(input2);
5862 at::Tensor input5 = at::randn_like(input2);
5863
5864 auto input1_ref = input1.clone();
5865 auto input2_ref = input2.clone();
5866 auto input3_ref = input3.clone();
5867 auto input4_ref = input4.clone();
5868 auto input5_ref = input5.clone();
5869
5870 FusionExecutorCache fec(std::move(fusion_ptr));
5871 std::vector<IValue> aten_inputs = {input1, input2, input3, input4, input5};
5872 auto cg_outputs = fec.runFusionWithInputs(aten_inputs);
5873
5874 auto at_results = at::native_batch_norm(
5875 input1_ref,
5876 input2_ref,
5877 input3_ref,
5878 input4_ref,
5879 input5_ref,
5880 kTraining,
5881 kMomentum,
5882 kEps);
5883
5884 auto at_output = std::get<0>(at_results);
5885 auto at_mean = std::get<1>(at_results);
5886 auto at_invstd = std::get<2>(at_results);
5887
5888 std::vector<at::Tensor> aten_outputs = {at_output, at_mean, at_invstd};
5889
5890 testValidate(
5891 &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__);
5892}
5893
5894TEST_F(NVFuserTest, FusionBNRepro2_CUDA) {
5895 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
5896 Fusion& fusion = *fusion_ptr.get();
5897 FusionGuard fg(&fusion);
5898
5899 const bool kTraining = true;
5900 const float kMomentum = 0.1;
5901 const float kEps = 1e-5;
5902
5903 int batch = 2;
5904 int c = 4;
5905 int h = 17;
5906 int w = 17;
5907 int numDims = 4;
5908
5909 auto input = makeSymbolicTensor(numDims);
5910 fusion.addInput(input);
5911
5912 Val* momentum_ptr = IrBuilder::create<Double>(kMomentum);
5913 Val* eps_ptr = IrBuilder::create<Double>(kEps);
5914
5915 auto result = batch_norm(
5916 input,
5917 nullptr,
5918 nullptr,
5919 nullptr,
5920 nullptr,
5921 kTraining,
5922 momentum_ptr,
5923 eps_ptr);
5924
5925 fusion.addOutput(result.output);
5926 fusion.addOutput(result.mean);
5927 fusion.addOutput(result.invstd);
5928
5929 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5930 at::Tensor input1 = at::randn({batch, c, h, w}, options);
5931
5932 auto input1_ref = input1.clone();
5933 at::Tensor r_m;
5934 at::Tensor r_v;
5935 at::Tensor weight;
5936 at::Tensor bias;
5937
5938 FusionExecutorCache fec(std::move(fusion_ptr));
5939 std::vector<IValue> aten_inputs = {input1};
5940 auto cg_outputs = fec.runFusionWithInputs(aten_inputs);
5941
5942 auto at_results = at::native_batch_norm(
5943 input1_ref, r_m, r_v, weight, bias, kTraining, kMomentum, kEps);
5944
5945 auto at_output = std::get<0>(at_results);
5946 auto at_mean = std::get<1>(at_results);
5947 auto at_invstd = std::get<2>(at_results);
5948
5949 std::vector<at::Tensor> aten_outputs = {at_output, at_mean, at_invstd};
5950
5951 testValidate(
5952 &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__);
5953}
5954
5955TEST_F(NVFuserTest, FusionZeroSizeTensorPW_CUDA) {
5956 Fusion fusion;
5957 FusionGuard fg(&fusion);
5958
5959 auto tv0 = makeSymbolicTensor(1);
5960 fusion.addInput(tv0);
5961
5962 auto tv1 = makeConcreteTensor({0});
5963 fusion.addInput(tv1);
5964
5965 auto tv2 = add(tv0, IrBuilder::create<Double>(2.5));
5966 fusion.addOutput(tv2);
5967
  // This test used to just have:
  //   auto tv3 = makeConcreteTensor({0});
  // and somehow that ran through our system fine, but size-0 tensors
  // are not supported, so we make sure this fails.
5972 auto tv3 = set(tv1);
5973 fusion.addOutput(tv3);
5974
5975 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5976
5977 at::Tensor input0 = at::randn({2}, options);
5978 at::Tensor input1 = at::randn({0}, options);
5979 at::Tensor cg_output2 = at::empty({2}, options);
5980 at::Tensor cg_output3 = at::empty({0}, options);
5981
  // Fails at pointwise scheduling because our (maybe only) size-0 check
  // is in input size binding, which the scheduler ends up calling.
5984 // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
5985 ASSERT_ANY_THROW(schedulePointwise(&fusion, {input0, input1}));
5986}
5987
5988TEST_F(NVFuserTest, FusionZeroSizeTensorReduction_CUDA) {
5989 Fusion fusion;
5990 FusionGuard fg(&fusion);
5991
5992 auto tv0 = makeSymbolicTensor(2);
5993 fusion.addInput(tv0);
5994
5995 auto tv1 = makeConcreteTensor({0});
5996 fusion.addInput(tv1);
5997
5998 auto tv2 = sum(tv0, {1});
5999 fusion.addOutput(tv2);
6000
6001 auto tv3 = makeConcreteTensor({0});
6002 fusion.addOutput(tv3);
6003
6004 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6005
6006 at::Tensor input0 = at::randn({2, 4}, options);
6007 at::Tensor input1 = at::randn({0}, options);
6008 at::Tensor cg_output2 = at::empty({2}, options);
6009 at::Tensor cg_output3 = at::empty({0}, options);
6010
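  // The zero-size input and output should not prevent scheduling and
  // running the reduction on tv0.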
6011 auto reduction_params = getReductionHeuristics(&fusion, {input0, input1});
6012 TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
6013 scheduleReduction(&fusion, *reduction_params);
6015
6016 auto lparams = reduction_params->lparams;
6017 FusionExecutor fe;
6018 fe.compileFusion(&fusion, {input0, input1}, lparams);
6019 auto cg_outputs = fe.runFusion({input0, input1}, lparams);
6020 auto aten_output2 = input0.sum({1});
6021 at::Tensor aten_output3 = at::empty({0}, options);
6022
6023 testValidate(
6024 &fusion,
6025 cg_outputs,
6026 {input0, input1},
6027 {aten_output2, aten_output3},
6028 __LINE__,
6029 __FILE__,
6030 "",
6031 lparams);
6032}
6033
6034TEST_F(NVFuserTest, FusionZeroSizeTensorNormalization_CUDA) {
6035 Fusion fusion;
6036 FusionGuard fg(&fusion);
6037
6038 auto tv0 = makeSymbolicTensor(2);
6039 fusion.addInput(tv0);
6040
6041 auto tv1 = makeConcreteTensor({0});
6042 fusion.addInput(tv1);
6043
6044 auto tv2 = sum(tv0, {0});
6045 auto tv3 = broadcast(tv2, {true, false});
6046 auto tv4 = add(tv0, tv3);
6047 fusion.addOutput(tv4);
6048
6049 auto tv5 = makeConcreteTensor({0});
6050 fusion.addOutput(tv5);
6051
6052 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6053
6054 at::Tensor input0 = at::randn({2, 4}, options);
6055 at::Tensor input1 = at::randn({0}, options);
6056 at::Tensor cg_output2 = at::empty({2, 4}, options);
6057 at::Tensor cg_output3 = at::empty({0}, options);
6058
6059 auto reduction_params = getPersistentHeuristics(&fusion, {input0, input1});
6060 TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
6061 schedulePersistentKernel(&fusion, *reduction_params);
6062
6063 auto lparams = reduction_params->lparams;
6064 FusionExecutor fe;
6065 fe.compileFusion(&fusion, {input0, input1}, lparams);
6066 auto cg_outputs = fe.runFusion({input0, input1}, lparams);
6067 auto aten_output2 = input0.sum({0}).add(input0);
6068 at::Tensor aten_output3 = at::empty({0}, options);
6069
6070 testValidate(
6071 &fusion,
6072 cg_outputs,
6073 {input0, input1},
6074 {aten_output2, aten_output3},
6075 __LINE__,
6076 __FILE__,
6077 "",
6078 lparams);
6079}
6080
6081TEST_F(NVFuserTest, FusionSegmentIoAlias_CUDA) {
6082 auto fusion = std::make_unique<Fusion>();
6083 FusionGuard fg(fusion.get());
6084
6085 TensorView* tv0 = makeSymbolicTensor(2);
6086 TensorView* tv1 = makeSymbolicTensor(1);
6087 TensorView* tv2 = makeSymbolicTensor(2);
6088
6089 fusion->addInput(tv0);
6090 fusion->addInput(tv1);
6091 fusion->addInput(tv2);
6092
6093 TensorView* tv3 = add(tv0, IrBuilder::create<Double>(1)); // Group 0
6094 TensorView* tv4 =
6095 max(tv3, {0}); // Group 0 (use max instead to avoid numerical issues)
6096 TensorView* tv5 = add(tv4, tv1); // Group 0 (Non Broadcast after reduce,
6097 // keeps normalization scheduler away)
6098 TensorView* tv6 = add(tv5, tv2); // Group 1 (Broadcast after reduce)
6099
  // Note: tests output-to-input aliasing.
6101 fusion->aliasOutputToInput(tv6, tv0);
6102 // TODO: support output on aliased fusion #1488
6103 // remove tv7 after #1488
6104 // fusion->addOutput(tv6);
6105 TensorView* tv7 = add(tv6, IrBuilder::create<Double>(1)); // Group 0
6106 fusion->addOutput(tv7);
6107
6108 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6109 at::Tensor t0 = at::randn({128, 65}, options);
6110 at::Tensor t1 = at::randn({65}, options);
6111 at::Tensor t2 = at::randn({128, 65}, options);
6112
6113 auto t3 = t0.add(1.0);
6114 auto t4 = std::get<0>(at::max(t3, 0));
6115 auto t5 = t4.add(t1);
6116 auto t6 = t5.add(t2);
6117 auto t7 = t6.add(1.0);
6118
6119 FusionExecutorCache executor_cache(std::move(fusion));
6120
6121 auto outputs = executor_cache.runFusionWithInputs({t0, t1, t2});
6122
6123 // TODO: support output on aliased fusion #1488
6124 // validating aliasing
6125 // TORCH_INTERNAL_ASSERT(outputs[0].data_ptr() == t0.data_ptr());
6126
6127 TORCH_CHECK(
6128 executor_cache.getMostRecentKernelRuntime()->isSegmented(),
6129 "segmentation didn't happen");
6130 TORCH_CHECK(
6131 executor_cache.getMostRecentKernelRuntime()
6132 ->fusionSegments()
6133 ->groups()
6134 .size() == 2,
6135 "segmentation didn't happen as expected");
6136
6137 testValidate(
6138 executor_cache.fusion(), outputs, {t0, t1, t2}, {t7}, __LINE__, __FILE__);
6139}
6140
6141TEST_F(NVFuserTest, FusionWelford1Output_CUDA) {
6142 auto fusion_ptr = std::make_unique<Fusion>();
6143 auto fusion = fusion_ptr.get();
6144 FusionGuard fg(fusion);
6145
6146 auto tv0 = makeSymbolicTensor(2);
6147 fusion->addInput(tv0);
6148
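  // Welford returns avg, var_sum, and n; var_sum is the running sum of
  // squared deviations (M2), so the reference below is var * N.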
6149 auto tvs = Welford(tv0, {1});
6150 fusion->addOutput(tvs.var_sum);
6151 FusionExecutorCache executor_cache(std::move(fusion_ptr));
6152
6153 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6154 at::Tensor t0 = at::randn({128, 65}, options);
6155 auto outputs = executor_cache.runFusionWithInputs({t0});
6156
6157 auto t1 = t0.var({1}, false) * 65;
6158 testValidate(fusion, outputs, {t0}, {t1}, __LINE__, __FILE__);
6159}
6160
6161TEST_F(NVFuserTest, FusionTranslate1Welford_CUDA) {
6162 auto fusion_ptr = std::make_unique<Fusion>();
6163 auto fusion = fusion_ptr.get();
6164 FusionGuard fg(fusion);
6165
6166 auto tv0 = makeSymbolicTensor(2);
6167 fusion->addInput(tv0);
6168
6169 auto tvs = Welford(tv0, {1});
6170 auto tv_out = add(tv0, broadcast(tvs.avg, {false, true}));
6171 fusion->addOutput(tv_out);
6172 FusionExecutorCache executor_cache(std::move(fusion_ptr));
6173
6174 auto run_test = [&executor_cache,
6175 fusion](auto inner_size) -> FusionKernelRuntime* {
6176 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6177 at::Tensor t0 = at::randn({128, inner_size}, options);
6178 auto outputs = executor_cache.runFusionWithInputs({t0});
    // Square sums do not fit well into the testValidate assumptions,
    // so we just compare the mean-based output here.
6181 testValidate(
6182 fusion,
6183 outputs,
6184 {t0},
6185 {t0.add(t0.mean({1}).unsqueeze(1))},
6186 __LINE__,
6187 __FILE__);
6188
6189 return executor_cache.getMostRecentKernelRuntime();
6190 };
6191
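  // The translation decision is size-dependent: for a small inner
  // dimension the Welford should be translated into equivalent
  // non-Welford expressions, while a large one should keep the
  // WelfordOp.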
6192 // Run a translated welford
6193 auto runtime1 = run_test(64);
6194 // Check it was translated
6195 TORCH_CHECK(
6196 runtime1->fusionSegments()->groups().size() == 1 &&
6197 runtime1->fusionSegments()->groups()[0]->exprs().size() > 2);
6198
6199 // Run an un-translated welford
6200 auto runtime2 = run_test(65536);
6201
6202 bool found_welford = false;
6203 for (auto group : runtime2->fusionSegments()->groups()) {
6204 for (auto expr : group->exprs()) {
6205 if (expr->isA<WelfordOp>()) {
6206 found_welford = true;
6207 }
6208 }
6209 }
6210 TORCH_CHECK(found_welford);
6211}
6212
6213TEST_F(NVFuserTest, FusionTranslate2Welford_CUDA) {
6214 auto fusion_ptr = std::make_unique<Fusion>();
6215 auto fusion = fusion_ptr.get();
6216 FusionGuard fg(fusion);
6217
6218 auto tv0 = makeSymbolicTensor(2);
6219 fusion->addInput(tv0);
6220
6221 auto tvs1 = Welford(tv0, {1});
6222 auto tv_out1 = add(tv0, broadcast(tvs1.avg, {false, true}));
6223 fusion->addOutput(tv_out1);
6224
6225 auto tvs2 = Welford(tv0, {1});
6226 auto tv_out2 = add(tv0, broadcast(tvs2.avg, {false, true}));
6227 fusion->addOutput(tv_out2);
6228
6229 FusionExecutorCache executor_cache(std::move(fusion_ptr));
6230
6231 auto run_test = [&executor_cache,
6232 fusion](auto inner_size) -> FusionKernelRuntime* {
6233 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6234 at::Tensor t0 = at::randn({128, inner_size}, options);
6235 auto outputs = executor_cache.runFusionWithInputs({t0});
6236
    // Square sums do not fit well into the testValidate assumptions,
    // so we just compare the mean-based output here.
6239 auto out = t0.add(t0.mean({1}).unsqueeze(1));
6240 testValidate(fusion, outputs, {t0}, {out, out}, __LINE__, __FILE__);
6241
6242 return executor_cache.getMostRecentKernelRuntime();
6243 };
6244
6245 // Run a translated welford
6246 auto runtime1 = run_test(64);
6247 // Check it was translated
6248 TORCH_CHECK(
6249 runtime1->fusionSegments()->groups().size() == 1 &&
6250 runtime1->fusionSegments()->groups()[0]->exprs().size() > 4);
6251
6252 // Run an un-translated welford
6253 auto runtime2 = run_test(65536);
  // Check it was not translated
6255 bool found_welford = false;
6256 for (auto group : runtime2->fusionSegments()->groups()) {
6257 for (auto expr : group->exprs()) {
6258 if (expr->isA<WelfordOp>()) {
6259 found_welford = true;
6260 }
6261 }
6262 }
6263 TORCH_CHECK(found_welford);
6264}
6265
6266TEST_F(NVFuserTest, FusionLargeWelfordNormalization_CUDA) {
6267 auto fusion_ptr = std::make_unique<Fusion>();
6268 auto fusion = fusion_ptr.get();
6269 FusionGuard fg(fusion);
6270
6271 auto tv0 = makeSymbolicTensor(2);
6272 fusion->addInput(tv0);
6273
6274 auto tvs1 = Welford(tv0, {1});
6275 auto sum_of_tv0 = sum(tv0, {1});
6276
6277 fusion->addOutput(tvs1.var_sum);
6278 fusion->addOutput(sum_of_tv0);
6279
6280 FusionExecutorCache executor_cache(std::move(fusion_ptr));
6281
6282 auto run_test = [&executor_cache,
6283 fusion](auto inner_size) -> FusionKernelRuntime* {
6284 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6285 at::Tensor t0 = at::randn({128, inner_size}, options);
6286 auto outputs = executor_cache.runFusionWithInputs({t0});
6287
6288 auto t1 = t0.var({1}, false) * inner_size;
6289 auto t2 = t0.sum({1});
6290 testValidate(fusion, outputs, {t0}, {t1, t2}, __LINE__, __FILE__);
6291
6292 return executor_cache.getMostRecentKernelRuntime();
6293 };
6294
6295 auto runtime = run_test(65536);
6296 TORCH_CHECK(!runtime->isSegmented());
6297}
6298
6299TEST_F(NVFuserTest, FusionWelfordOuterPersistence_CUDA) {
6300 auto fusion_ptr = std::make_unique<Fusion>();
6301 auto fusion = fusion_ptr.get();
6302 FusionGuard fg(fusion);
6303
6304 auto tv0 = makeSymbolicTensor(2);
6305 fusion->addInput(tv0);
6306
6307 auto tvs1 = Welford(tv0, {1});
6308 auto sum_of_tv0 = sum(tv0, {1});
6309 auto sum_bcasted = broadcast(sum_of_tv0, {false, true});
6310 auto avg_bcasted = broadcast(tvs1.avg, {false, true});
6311 auto tv0_plus_sum = add(tv0, sum_bcasted);
6312 auto tv0_plus_avg = add(tv0, avg_bcasted);
6313
6314 fusion->addOutput(tv0_plus_sum);
6315 fusion->addOutput(tv0_plus_avg);
6316
6317 FusionExecutorCache executor_cache(std::move(fusion_ptr));
6318
6319 auto run_test = [&executor_cache,
6320 fusion](auto inner_size) -> FusionKernelRuntime* {
6321 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6322 at::Tensor t0 = at::randn({128, inner_size}, options);
6323 auto outputs = executor_cache.runFusionWithInputs({t0});
6324
6325 auto t1 = t0.to(c10::kDouble).mean({1}).unsqueeze(1) + t0;
6326 auto t2 = t0.to(c10::kDouble).sum({1}).unsqueeze(1) + t0;
6327 testValidate(fusion, outputs, {t0}, {t2, t1}, __LINE__, __FILE__);
6328
6329 return executor_cache.getMostRecentKernelRuntime();
6330 };
6331
6332 for (auto inner_size : {4096, 8192, 32768}) {
6333 auto runtime = run_test(inner_size);
6334 TORCH_CHECK(!runtime->isSegmented());
6335 }
6336}
6337
6338TEST_F(NVFuserTest, FusionSegmentIslands_CUDA) {
6339 auto fusion = std::make_unique<Fusion>();
6340 FusionGuard fg(fusion.get());
6341
6342 auto tv0 = makeSymbolicTensor(2);
6343 auto tv1 = makeSymbolicTensor(2);
6344 fusion->addInput(tv0);
6345 fusion->addInput(tv1);
6346
6347 auto tv2 = sum(tv0, {0});
6348 auto tv3 = sum(tv1, {1});
6349 fusion->addOutput(tv2);
6350 fusion->addOutput(tv3);
6351
6352 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6353 at::Tensor t0 = at::randn({16, 16}, options);
6354 at::Tensor t1 = at::randn({16, 16}, options);
6355
6356 FusionExecutorCache fusion_executor_cache(std::move(fusion));
6357 fusion_executor_cache.runFusionWithInputs({t0, t1});
6358}
6359
6360TEST_F(NVFuserTest, FusionBackOffInnerBroadcast_CUDA) {
6361 auto fusion = std::make_unique<Fusion>();
6362 FusionGuard fg(fusion.get());
6363
6364 auto tv0 = makeSymbolicTensor(1);
6365 auto tv1 = makeSymbolicTensor(2);
6366 auto tv2 = makeSymbolicTensor(4);
6367 fusion->addInput(tv0);
6368 fusion->addInput(tv1);
6369
6370 auto tv3 = broadcast(tv0, {false, true, true, true});
6371 auto tv4 = broadcast(tv1, {false, false, true, true});
6372 auto tv5 = unaryOp(UnaryOpType::Rsqrt, tv2);
6373
6374 auto tv6 = add(tv3, tv5);
6375 auto tv7 = add(tv4, tv5);
6376 auto tv8 = add(tv3, tv4);
6377
6378 auto tv9 = add(tv6, tv7);
6379 auto tv10 = add(tv9, tv8);
6380
6381 fusion->addOutput(tv10);
6382
6383 tv0->computeAt(tv10, -2);
6384 tv1->computeAt(tv10, -2);
6385 tv2->computeAt(tv10, -2);
6386
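  // computeAt backs off at inner broadcast domains: tv3 and tv4 stop at
  // their first broadcast axis, while tv5 (no broadcasts) keeps the
  // requested position.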
6387 TORCH_CHECK(tv3->getComputeAtPosition() == 1);
6388 TORCH_CHECK(tv4->getComputeAtPosition() == 2);
6389 TORCH_CHECK(tv5->getComputeAtPosition() == 3);
6390
6391 TORCH_CHECK(tv6->getMaxProducerPosition() == 3);
6392 TORCH_CHECK(tv7->getMaxProducerPosition() == 3);
6393 TORCH_CHECK(tv8->getMaxProducerPosition() == 2);
6394}
6395
6396TEST_F(NVFuserTest, FusionBackOffInnerBroadcast2_CUDA) {
6397 auto fusion = std::make_unique<Fusion>();
6398 FusionGuard fg(fusion.get());
6399
6400 auto tv0 = makeSymbolicTensor(2);
6401 auto tv1 = makeSymbolicTensor(3);
6402 fusion->addInput(tv0);
6403 fusion->addInput(tv1);
6404 auto tv2 = broadcast(tv0, {false, false, true});
6405 auto tv3 = add(tv2, tv1);
6406
6407 fusion->addOutput(tv3);
6408 tv3->split(-2, 4);
6409 tv3->reorder({{-1, -2}});
6410 tv0->computeAt(tv3, -2);
6411 tv1->computeAt(tv3, -2);
6412 TORCH_CHECK(tv2->getComputeAtPosition() == 2);
6413 TORCH_CHECK(tv3->getMaxProducerPosition() == 2);
6414}
6415
6416TEST_F(NVFuserTest, FusionBackOffInnerBroadcast3_CUDA) {
6417 auto fusion = std::make_unique<Fusion>();
6418 FusionGuard fg(fusion.get());
6419
6420 auto tv0 = makeSymbolicTensor(2);
6421 auto tv1 = makeSymbolicTensor(4);
6422 fusion->addInput(tv0);
6423 fusion->addInput(tv1);
6424 auto tv2 = broadcast(tv0, {false, false, true});
6425 auto tv3 = broadcast(tv2, {false, true, false, false});
6426 auto tv4 = add(tv3, tv1);
6427
6428 fusion->addOutput(tv4);
6429 tv0->computeAt(tv4, -1);
6430 tv1->computeAt(tv4, -1);
6431 TORCH_CHECK(tv2->getComputeAtPosition() == 2);
6432 TORCH_CHECK(tv3->getMaxProducerPosition() == 3);
6433}
6434
6435TEST_F(NVFuserTest, FusionSimpleWarp_CUDA) {
6436 auto fusion = std::make_unique<Fusion>();
6437 FusionGuard fg(fusion.get());
6438
6439 auto tv0 = makeSymbolicTensor(2);
6440 fusion->addInput(tv0);
6441
6442 auto tv1 = sum(tv0, {1});
6443 auto tv2 = broadcast(tv1, {false, true});
6444 auto tv3 = add(tv2, tv0);
6445
6446 fusion->addOutput(tv3);
6447
6448 tv1->split(1, 32);
6449 auto tv1_rf = tv1->rFactor({1});
6450 TransformPropagatorWithCheck propagator(tv1_rf);
6451 MaxRootDomainInfoSpanningTree(tv1_rf).traverse(&propagator);
6452 tv1_rf->axis(-1)->parallelize(ParallelType::TIDx);
6453 tv1->axis(0)->parallelize(ParallelType::BIDx);
6454 tv1->axis(-1)->parallelize(ParallelType::TIDx);
6455 tv2->axis(0)->parallelize(ParallelType::BIDx);
6456 tv2->axis(-1)->parallelize(ParallelType::TIDx);
6457 tv3->axis(0)->parallelize(ParallelType::BIDx);
6458 tv3->axis(-1)->parallelize(ParallelType::TIDx);
6459 tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined);
6460
6461 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6462 at::Tensor input1 = at::randn({16, 128}, options);
6463
6464 auto at_output = input1.sum({1}, true).add(input1);
6465
6466 FusionExecutor fe;
6467 fe.compileFusion(fusion.get(), {input1});
6468 auto outputs = fe.runFusion({input1});
6469
6470 testValidate(
6471 fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__);
6472}
6473
6474TEST_F(NVFuserTest, FusionSimpleWarpPad_CUDA) {
6475 auto fusion = std::make_unique<Fusion>();
6476 FusionGuard fg(fusion.get());
6477
6478 auto tv0 = makeSymbolicTensor(2);
6479
6480 fusion->addInput(tv0);
6481
6482 auto tv1 = sum(tv0, {1});
6483 auto tv2 = broadcast(tv1, {false, true});
6484 auto tv3 = add(tv2, tv0);
6485
6486 fusion->addOutput(tv3);
6487
6488 // Schedule a persistent kernel
6489 auto tv0_cache = tv0->cacheAfter();
6490 tv1->split(1, 8, false);
6491 auto tv1_rf = tv1->rFactor({1});
6492 tv1_rf->axis(0)->parallelize(ParallelType::BIDx);
6493 tv1_rf->axis(-1)->parallelize(ParallelType::TIDx);
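  // Pad the TIDx extent up to a multiple of 32 so warp-level reduction
  // primitives can be used even when the split leaves a partial warp.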
6494 tv1_rf->axis(-1)->padToMultipleOfWarp(32);
6495 tv1->axis(-1)->parallelize(ParallelType::TIDx);
6496 tv1->axis(-1)->padToMultipleOfWarp(32);
6497 TransformPropagatorWithCheck propagator(tv1_rf);
6498 MaxRootDomainInfoSpanningTree(tv1_rf).traverse(&propagator);
6499 tv0->axis(-1)->parallelize(ParallelType::TIDx);
6500 tv0->axis(-1)->padToMultipleOfWarp(32);
6501 tv0_cache->axis(-1)->parallelize(ParallelType::TIDx);
6502 tv0_cache->axis(-1)->padToMultipleOfWarp(32);
6503 tv2->axis(-1)->parallelize(ParallelType::TIDx);
6504 tv2->axis(-1)->padToMultipleOfWarp(32);
6505 tv3->axis(-1)->parallelize(ParallelType::TIDx);
6506 tv3->axis(-1)->padToMultipleOfWarp(32);
6507
6508 tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined);
6509
6510 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6511 at::Tensor input1 = at::randn({16, 127}, options);
6512
6513 auto at_output = input1.sum({1}, true).add(input1);
6514
6515 FusionExecutor fe;
6516 fe.compileFusion(fusion.get(), {input1});
6517 auto outputs = fe.runFusion({input1});
6518 testValidate(
6519 fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__);
6520}
6521
6522TEST_F(NVFuserTest, FusionWarpPadMergeSplit_CUDA) {
6523 auto fusion = std::make_unique<Fusion>();
6524 FusionGuard fg(fusion.get());
6525
6526 auto tv0 = makeSymbolicTensor(3);
6527
6528 fusion->addInput(tv0);
6529
6530 auto tv1 = sum(tv0, {1, 2});
6531 auto tv2 = broadcast(tv1, {false, true, true});
6532 auto tv3 = add(tv2, tv0);
6533
6534 fusion->addOutput(tv3);
6535
6536 // Schedule a persistent kernel
6537 auto tv0_cache = tv0->cacheAfter();
6538 tv1->merge(1);
6539 tv1->split(1, 8, false);
6540
6541 auto tv1_rf = tv1->rFactor({1});
6542 tv1_rf->axis(0)->parallelize(ParallelType::BIDx);
6543 tv1_rf->axis(-1)->parallelize(ParallelType::TIDx);
6544 tv1->axis(-1)->parallelize(ParallelType::TIDx);
6545 tv1->axis(-1)->padToMultipleOfWarp();
6546 TransformPropagatorWithCheck propagator(tv1_rf);
6547 MaxRootDomainInfoSpanningTree(tv1_rf).traverse(&propagator);
6548 tv0->axis(-1)->parallelize(ParallelType::TIDx);
6549 tv0_cache->axis(-1)->parallelize(ParallelType::TIDx);
6550 tv2->axis(-1)->parallelize(ParallelType::TIDx);
6551 tv3->axis(-1)->parallelize(ParallelType::TIDx);
6552
6553 tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined);
6554
6555 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6556 at::Tensor input1 = at::randn({16, 17, 128}, options);
6557
6558 auto at_output = input1.sum({1, 2}, true).add(input1);
6559
6560 FusionExecutor fe;
6561 fe.compileFusion(fusion.get(), {input1});
6562 auto outputs = fe.runFusion({input1});
6563 testValidate(
6564 fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__);
6565}
6566
6567TEST_F(NVFuserTest, FusionSerialWarpReduction_CUDA) {
6568 auto fusion = std::make_unique<Fusion>();
6569 FusionGuard fg(fusion.get());
6570
6571 auto tv0 = makeSymbolicTensor(3);
6572
6573 fusion->addInput(tv0);
6574
6575 auto tv1 = sum(tv0, {1, 2});
6576 auto tv2 = broadcast(tv1, {false, true, true});
6577 auto tv3 = add(tv2, tv0);
6578
6579 fusion->addOutput(tv3);
6580
6581 // Schedule a persistent kernel
6582 auto tv0_cache = tv0->cacheAfter();
6583 tv1->merge(1);
6584 tv1->split(1, 8, false);
6585
6586 tv1->axis(-1)->parallelize(ParallelType::TIDx);
6587 tv1->axis(-1)->padToMultipleOfWarp();
6588 TransformPropagatorWithCheck propagator(tv1);
6589 MaxRootDomainInfoSpanningTree(tv1).traverse(&propagator);
6590 tv0->axis(-1)->parallelize(ParallelType::TIDx);
6591 tv0_cache->axis(-1)->parallelize(ParallelType::TIDx);
6592 tv2->axis(-1)->parallelize(ParallelType::TIDx);
6593 tv3->axis(-1)->parallelize(ParallelType::TIDx);
6594
6595 tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined);
6596
6597 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6598 at::Tensor input1 = at::randn({16, 17, 128}, options);
6599
6600 auto at_output = input1.sum({1, 2}, true).add(input1);
6601
6602 FusionExecutor fe;
6603 fe.compileFusion(fusion.get(), {input1});
6604 auto outputs = fe.runFusion({input1});
6605 testValidate(
6606 fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__);
6607}
6608
6609TEST_F(NVFuserTest, FusionTrivialWarpReduction_CUDA) {
6610 auto fusion = std::make_unique<Fusion>();
6611 FusionGuard fg(fusion.get());
6612
6613 auto tv0 = makeConcreteTensor({17, 18, 128, 1});
6614
6615 fusion->addInput(tv0);
6616
6617 auto tv1 = sum(tv0, {1, 2, 3});
6618 auto tv2 = broadcast(tv1, {false, true, true, true});
6619 auto tv3 = add(tv2, tv0);
6620
6621 fusion->addOutput(tv3);
6622
6623 // Schedule a persistent kernel
6624 auto tv0_cache = tv0->cacheAfter();
6625 tv1->merge(1);
6626 tv1->split(1, 8, false);
6627
6628 auto tv1_rf = tv1->rFactor({1});
6629 tv1_rf->axis(0)->parallelize(ParallelType::BIDx);
6630 tv1_rf->axis(-2)->parallelize(ParallelType::TIDx);
6631 tv1->axis(-2)->parallelize(ParallelType::TIDx);
6632 tv1->axis(-2)->padToMultipleOfWarp();
6633 TransformPropagatorWithCheck propagator(tv1_rf);
6634 MaxRootDomainInfoSpanningTree(tv1_rf).traverse(&propagator);
6635 tv0->axis(-2)->parallelize(ParallelType::TIDx);
6636 tv0_cache->axis(-2)->parallelize(ParallelType::TIDx);
6637 tv2->axis(-2)->parallelize(ParallelType::TIDx);
6638 tv3->axis(-2)->parallelize(ParallelType::TIDx);
6639
6640 tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined);
6641
6642 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6643 at::Tensor input1 = at::randn({17, 18, 128, 1}, options);
6644
6645 auto at_output = input1.sum({1, 2, 3}, true).add(input1);
6646
6647 FusionExecutor fe;
6648 fe.compileFusion(fusion.get(), {input1});
6649 auto outputs = fe.runFusion({input1});
6650 testValidate(
6651 fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__);
6652}
6653
6654TEST_F(NVFuserTest, FusionMultipleDimBinding_CUDA) {
6655 auto fusion = std::make_unique<Fusion>();
6656 FusionGuard fg(fusion.get());
6657
6658 auto tv0 = makeSymbolicTensor(2);
6659 auto tv_add = makeSymbolicTensor(2);
6660
6661 fusion->addInput(tv0);
6662 fusion->addInput(tv_add);
6663
6664 auto tv1 = sum(tv0, {1});
6665 auto tv2 = broadcast(tv1, {false, true});
6666 auto tv3 = add(tv2, tv0);
6667 auto tv4 = add(tv0, tv_add);
6668
6669 fusion->addOutput(tv3);
6670 fusion->addOutput(tv4);
6671
6672 // Schedule a persistent kernel
6673 auto tv0_cache = tv0->cacheAfter();
6674 tv1->split(1, 8, false);
6675 auto tv1_rf = tv1->rFactor({1});
6676 tv1_rf->axis(0)->parallelize(ParallelType::BIDx);
6677 tv1_rf->axis(-1)->parallelize(ParallelType::TIDx);
6678 tv1_rf->axis(-1)->padToMultipleOfWarp(32);
6679 tv1->axis(-1)->parallelize(ParallelType::TIDx);
6680 tv1->axis(-1)->padToMultipleOfWarp(32);
6681 TransformPropagatorWithCheck propagator(tv1_rf);
6682 MaxRootDomainInfoSpanningTree(tv1_rf).traverse(&propagator);
6683 tv0->axis(-1)->parallelize(ParallelType::TIDx);
6684 tv0->axis(-1)->padToMultipleOfWarp(32);
6685 tv0_cache->axis(-1)->parallelize(ParallelType::TIDx);
6686 tv0_cache->axis(-1)->padToMultipleOfWarp(32);
6687 tv2->axis(-1)->parallelize(ParallelType::TIDx);
6688 tv2->axis(-1)->padToMultipleOfWarp(32);
6689 tv3->axis(-1)->parallelize(ParallelType::TIDx);
6690 tv3->axis(-1)->padToMultipleOfWarp(32);
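  // tv4 requests padding to a multiple of 64 while the others request
  // 32, exercising consistent binding of the shared TIDx extent.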
6691 tv4->axis(-1)->parallelize(ParallelType::TIDx);
6692 tv4->axis(-1)->padToMultipleOfWarp(64);
6693
6694 tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined);
6695
6696 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6697 at::Tensor input1 = at::randn({16, 128}, options);
6698 at::Tensor input2 = at::randn({16, 128}, options);
6699
6700 auto at_output = input1.sum({1}, true).add(input1);
6701
6702 FusionExecutor fe;
6703 fe.compileFusion(fusion.get(), {input1, input2});
6704 auto outputs = fe.runFusion({input1, input2});
6705 testValidate(
6706 fusion.get(),
6707 outputs,
6708 {input1, input2},
6709 {at_output, input1 + input2},
6710 __LINE__,
6711 __FILE__);
6712}
6713
6714TEST_F(NVFuserTest, FusionPadNoWarpReduce_CUDA) {
6715 auto fusion = std::make_unique<Fusion>();
6716 FusionGuard fg(fusion.get());
6717
6718 auto tv0 = makeSymbolicTensor(2);
6719
6720 fusion->addInput(tv0);
6721
6722 auto tv1 = sum(tv0, {1});
6723 auto tv2 = broadcast(tv1, {false, true});
6724 auto tv3 = add(tv2, tv0);
6725
6726 fusion->addOutput(tv3);
6727
6728 tv1->axis(-1)->parallelize(ParallelType::TIDx);
6729 tv1->axis(-1)->padToMultipleOfWarp();
6730 tv2->axis(-1)->parallelize(ParallelType::TIDx);
6731 tv3->axis(-1)->parallelize(ParallelType::TIDx);
6732
6733 tv1->axis(0)->parallelize(ParallelType::TIDy);
6734 tv2->axis(0)->parallelize(ParallelType::TIDy);
6735 tv3->axis(0)->parallelize(ParallelType::TIDy);
6736
6737 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6738 at::Tensor input1 = at::randn({16, 31}, options);
6739
6740 auto at_output = input1.sum({1}, true).add(input1);
6741
6742 FusionExecutor fe;
6743 fe.compileFusion(fusion.get(), {input1});
6744 auto outputs = fe.runFusion({input1});
6745 testValidate(
6746 fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__);
6747}
6748
6749TEST_F(NVFuserTest, FusionWarpMutipleThreadDim_CUDA) {
6750 auto fusion = std::make_unique<Fusion>();
6751 FusionGuard fg(fusion.get());
6752
6753 auto tv0 = makeSymbolicTensor(2);
6754 fusion->addInput(tv0);
6755 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
6756 auto tv2 = sum(tv1, {1});
6757 fusion->addOutput(tv2);
6758
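  // Use both TIDx (padded for warp reduction) and TIDy to check warp
  // reductions when multiple thread dimensions are bound.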
6759 tv2->split(1, 8);
6760 auto tv2_rf = tv2->rFactor({-1});
6761 tv2_rf->axis(-1)->parallelize(ParallelType::TIDx);
6762 tv2_rf->axis(-1)->padToMultipleOfWarp();
6763
6764 TransformPropagatorWithCheck propagator(tv2_rf);
6765 MaxRootDomainInfoSpanningTree(tv2_rf).traverse(&propagator);
6766
6767 tv0->axis(-1)->parallelize(ParallelType::TIDx);
6768 tv1->axis(-1)->parallelize(ParallelType::TIDx);
6769 tv2->axis(0)->parallelize(ParallelType::BIDx);
6770 tv2->axis(1)->parallelize(ParallelType::TIDy);
6771 tv0->computeAt(tv2, 2);
6772
6773 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6774 at::Tensor input1 = at::randn({16, 31}, options);
6775
6776 auto at_output = (input1 + 1).sum({1});
6777
6778 FusionExecutor fe;
6779 fe.compileFusion(fusion.get(), {input1});
6780 auto outputs = fe.runFusion({input1});
6781 testValidate(
6782 fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__);
6783}
6784
6785TEST_F(NVFuserTest, FusionWarpReduceUnrollOuterLoop_CUDA) {
6786 auto fusion = std::make_unique<Fusion>();
6787 FusionGuard fg(fusion.get());
6788
6789 auto tv0 = makeSymbolicTensor(2);
6790
6791 fusion->addInput(tv0);
6792
6793 auto tv1 = sum(tv0, {1});
6794 auto tv2 = broadcast(tv1, {false, true});
6795 auto tv3 = add(tv2, tv0);
6796
6797 fusion->addOutput(tv3);
6798
6799 // Schedule a persistent kernel
6800 auto tv0_cache = tv0->cacheAfter();
6801 tv1->split(1, 8, false);
6802 tv1->split(0, 4);
6803 auto tv1_rf = tv1->rFactor({2});
6804
6805 tv1_rf->axis(0)->parallelize(ParallelType::BIDx);
6806 tv1_rf->axis(1)->parallelize(ParallelType::Unroll);
6807 tv1_rf->axis(-1)->parallelize(ParallelType::TIDx);
6808 tv1->axis(-1)->parallelize(ParallelType::TIDx);
6809 tv1->axis(-1)->padToMultipleOfWarp();
6810 tv1->axis(1)->parallelize(ParallelType::Unroll);
6811 TransformPropagatorWithCheck propagator(tv1_rf);
6812 MaxRootDomainInfoSpanningTree(tv1_rf).traverse(&propagator);
6813 tv0->axis(-1)->parallelize(ParallelType::TIDx);
6814 tv0->axis(1)->parallelize(ParallelType::Unroll);
6815 tv0_cache->axis(-1)->parallelize(ParallelType::TIDx);
6816 tv0_cache->axis(1)->parallelize(ParallelType::Unroll);
6817 tv2->axis(-1)->parallelize(ParallelType::TIDx);
6818 tv2->axis(1)->parallelize(ParallelType::Unroll);
6819 tv3->axis(-1)->parallelize(ParallelType::TIDx);
6820 tv3->axis(1)->parallelize(ParallelType::Unroll);
6821
6822 tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined);
6823
6824 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6825 at::Tensor input1 = at::randn({16, 128}, options);
6826
6827 auto at_output = input1.sum({1}, true).add(input1);
6828
6829 FusionExecutor fe;
6830 fe.compileFusion(fusion.get(), {input1});
6831 auto outputs = fe.runFusion({input1});
6832 testValidate(
6833 fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__);
6834}
6835
6836// Repro of issue #1579
6837TEST_F(NVFuserTest, FusionWarpReducePredication_CUDA) {
6838 Fusion fusion;
6839 FusionGuard fg(&fusion);
6840
6841 std::vector<int64_t> shape1 = {1024};
6842 std::vector<int64_t> shape2 = {50};
6843
6844 auto tv0 = makeConcreteTensor(shape1);
6845 fusion.addInput(tv0);
6846 auto tv1 = sum(tv0, {0});
6847 fusion.addOutput(tv1);
6848
6849 auto tv2 = makeConcreteTensor(shape2);
6850 fusion.addInput(tv2);
6851 auto tv3 = add(tv2, IrBuilder::create<Double>(1));
6852 auto tv4 = sum(tv3, {0});
6853 auto tv5 = add(tv4, IrBuilder::create<Double>(1));
6854 fusion.addOutput(tv5);
6855
  // Just to fill the smem buffer with some values, using a thread block
  // of 1024 threads
6858 tv1->axis(-1)->parallelize(ParallelType::TIDx);
6859
6860 // Make the tv4_rf reduction a warp reduction to trigger the
6861 // bug. Since the smem buffer is filled with some values due to the
6862 // reduction of tv1, those values would be used by predicated-out
6863 // threads.
6864 tv4->split(-1, 10);
6865 auto tv4_rf = tv4->rFactor({-1});
6866 tv4_rf->axis(-1)->parallelize(ParallelType::TIDx);
6867 tv4_rf->axis(-1)->padToMultipleOfWarp();
6868
6869 tv4_rf->computeAt(tv4, 1);
6870
6871 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6872 auto t0 = at::randn(shape1, options);
6873 auto t2 = at::randn(shape2, options);
6874
6875 FusionExecutor fe;
6876 fe.compileFusion(&fusion, {t0, t2});
6877 auto cg_outputs = fe.runFusion({t0, t2});
6878
6879 auto t1 = t0.sum({0});
6880 auto t4 = (t2 + 1).sum({0}) + 1;
6881
6882 testValidate(&fusion, cg_outputs, {t0, t2}, {t1, t4}, __LINE__, __FILE__);
6883}
6884
6885TEST_F(NVFuserTest, FusionSegfaultReduction_CUDA) {
6886 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
6887 Fusion& fusion = *fusion_ptr.get();
6888 FusionGuard fg(&fusion);
6889
6890 int batch = 2;
6891 int c = 1;
6892 int h = 1;
6893 int w = 1;
6894 int numDims = 4;
6895
6896 auto input = makeConcreteTensor({-1, 1, 1, 1});
6897 fusion.addInput(input);
6898 auto bcast_bias = makeConcreteTensor({-1, 1, 1, 1});
6899 fusion.addInput(bcast_bias);
6900
6901 std::vector<int64_t> at_sum_axes;
6902 std::vector<int> outer_reduction_axes;
6903 std::vector<bool> outer_broadcast_mask(numDims, false);
6904 Val* N = IrBuilder::create<Double>(1);
6905 for (const auto axis : c10::irange(numDims)) {
6906 if (axis != 1) {
6907 outer_reduction_axes.push_back(axis);
6908 at_sum_axes.push_back(axis);
6909 outer_broadcast_mask[axis] = true;
6910 N = mul(N, input->domain()->domain()[axis]->extent());
6911 }
6912 }
6913
6914 auto output0 = mul(input, bcast_bias);
6915 fusion.addOutput(output0);
6916 auto output1 = sum(output0, outer_reduction_axes);
6917 fusion.addOutput(output1);
6918
6919 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6920 at::Tensor input0 = at::randn({batch, c, h, w}, options);
6921 at::Tensor input1 = at::randn({batch, c, h, w}, options);
6922
6923 auto at_output0 = input0.mul(input1);
6924 auto at_output1 = at_output0.sum(at_sum_axes);
6925
6926 FusionExecutorCache fec(std::move(fusion_ptr));
6927 std::vector<IValue> inputs = {input0, input1};
6928 auto outputs = fec.runFusionWithInputs(inputs);
6929
6930 testValidate(
6931 &fusion, outputs, inputs, {at_output0, at_output1}, __LINE__, __FILE__);
6932}
6933
6934TEST_F(NVFuserTest, FusionPredicateElimination1_CUDA) {
6935 Fusion fusion;
6936 FusionGuard fg(&fusion);
6937
6938 auto tv0 = makeSymbolicTensor(1);
6939 fusion.addInput(tv0);
6940
6941 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
6942 auto tv2 = add(tv1, IrBuilder::create<Double>(2));
6943 auto tv3 = add(tv2, IrBuilder::create<Double>(3));
6944
6945 fusion.addOutput(tv3);
6946
6947 tv3->split(0, 32);
6948 tv0->computeAt(tv3, 1);
6949
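  // Unswitch hoists the bounds check out of the loop, so tv2 itself
  // should not need an inline predicate.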
6950 tv2->axis(1)->parallelize(ParallelType::Unswitch);
6951
6952 {
6953 GpuLower gpulw(&fusion);
6954 TORCH_CHECK(!PredicatedChecker::isPredicated(tv2, gpulw));
6955 }
6956
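  // Reverting to a serial axis and re-splitting by a non-divisible
  // factor should force a predicate again.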
6957 tv2->axis(1)->parallelize(ParallelType::Serial);
6958 tv2->split(1, 5);
6959
6960 {
6961 GpuLower gpulw(&fusion);
6962 TORCH_CHECK(PredicatedChecker::isPredicated(tv2, gpulw));
6963 }
6964}
6965
6966// Repro of issue #1571
6967TEST_F(NVFuserTest, FusionPredicateElimination2_CUDA) {
6968 Fusion fusion;
6969 FusionGuard fg(&fusion);
6970
6971 std::vector<int64_t> shape({10, 11});
6972
6973 auto tv0 = makeConcreteTensor(shape);
6974 fusion.addInput(tv0);
6975
6976 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
6977 auto tv2 = sum(tv1, {1});
6978 auto tv3 = add(tv2, IrBuilder::create<Double>(1));
6979
6980 fusion.addOutput(tv3);
6981
6982 tv1->split(1, 4);
6983 tv1->split(0, 4);
6984 tv2->split(1, 4);
6985 tv2->split(0, 4);
6986
6987 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6988 auto t0 = at::randn(shape, options);
6989
6990 FusionExecutor fe;
6991 fe.compileFusion(&fusion, {t0});
6992 auto cg_outputs = fe.runFusion({t0});
6993
6994 auto ref = (t0 + 1).sum({1}) + 1;
6995
6996 testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
6997}
6998
6999TEST_F(NVFuserTest, FusionPredicateElimination3_CUDA) {
7000 Fusion fusion;
7001 FusionGuard fg(&fusion);
7002
7003 auto tv0 = makeSymbolicTensor(1);
7004 fusion.addInput(tv0);
7005
7006 auto tv1 = sum(tv0, {0});
7007 auto tv2 = add(tv1, IrBuilder::create<Double>(1));
7008 fusion.addOutput(tv2);
7009
7010 auto tv3 = tv0->cacheAfter();
7011
7012 tv1->split(0, 10);
7013 tv1->split(0, 33);
7014 TransformPropagatorWithCheck propagator(tv1);
7015 MaxRootDomainInfoSpanningTree(tv1).traverse(&propagator);
7016
7017 auto tv4 = tv1->rFactor({-1});
7018 auto tv5 = tv1->rFactor({-1});
7019
7020 tv4->axis(0)->parallelize(ParallelType::BIDx);
7021 tv4->axis(1)->parallelize(ParallelType::TIDx);
7022 scheduler_utils::parallelizeAllLike(tv4);
7023
7024 GpuLower gpulw(&fusion);
7025
  // The fusion has three reductions: one within each thread, one
  // within each block, and another across the whole grid. None of them
  // should need to be predicated, as they all use the same init value
  // and the same reduction op.
7030 TORCH_CHECK(!PredicatedChecker::isPredicated(tv4, gpulw));
7031 TORCH_CHECK(!PredicatedChecker::isPredicated(tv5, gpulw));
7032 TORCH_CHECK(!PredicatedChecker::isPredicated(tv1, gpulw));
7033
7034 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7035
7036 for (auto size : {1, 2, 999, 1001, 1234, 10000}) {
7037 auto t0 = at::randn({size}, options);
7038
7039 FusionExecutor fe;
7040 fe.compileFusion(&fusion, {t0});
7041 auto cg_outputs = fe.runFusion({t0});
7042
7043 auto ref = sum(t0) + 1;
7044 testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
7045 }
7046}
7047
7048TEST_F(NVFuserTest, FusionPredicateElimination4_CUDA) {
7049 Fusion fusion;
7050 FusionGuard fg(&fusion);
7051
7052 auto tv0 = makeSymbolicTensor(2);
7053 fusion.addInput(tv0);
7054
7055 auto tv1 = sum(tv0, {1});
7056
7057 auto tv2 = sum(tv1, {0});
7058 auto tv3 = add(tv2, IrBuilder::create<Double>(1));
7059 fusion.addOutput(tv3);
7060
7061 auto tv4 = max(tv1, {0});
7062 auto tv5 = add(tv4, IrBuilder::create<Double>(1));
7063 fusion.addOutput(tv5);
7064
7065 tv1->split(1, 7);
7066 tv1->split(0, 11);
7067 tv1->reorder({{1, 2}, {2, 1}});
7068 TransformPropagatorWithCheck propagator(tv1);
7069 MaxRootDomainInfoSpanningTree(tv1).traverse(&propagator);
7070
7071 tv1->axis(0)->parallelize(ParallelType::TIDy);
7072 tv1->axis(1)->parallelize(ParallelType::TIDx);
7073 scheduler_utils::parallelizeAllLike(tv1);
7074
7075 GpuLower gpulw(&fusion);
7076
  // tv2 uses the same op and init value as tv1, so tv2 should be fine
  // without a predicate. However, while tv4 uses tv1 as its input, its
  // reduction op and init value differ from those of tv1, so tv4 needs
  // to be predicated.
7081 TORCH_CHECK(!PredicatedChecker::isPredicated(tv2, gpulw));
7082 TORCH_CHECK(PredicatedChecker::isPredicated(tv4, gpulw));
7083
7084 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7085
7086 std::vector<int64_t> sizes = {1, 2, 33, 34, 64, 99};
7087 for (auto s0 : sizes) {
7088 for (auto s1 : sizes) {
7089 auto t0 = at::randn({s0, s1}, options);
7090
7091 FusionExecutor fe;
7092 fe.compileFusion(&fusion, {t0});
7093 auto cg_outputs = fe.runFusion({t0});
7094
7095 auto t1 = t0.sum({1});
7096 auto t3 = t1.sum({0}) + 1;
7097 auto t5 = std::get<0>(t1.max(0)) + 1;
7098
7099 testValidate(&fusion, cg_outputs, {t0}, {t3, t5}, __LINE__, __FILE__);
7100 }
7101 }
7102}
7103
7104TEST_F(NVFuserTest, FusionPredicateElimination5_CUDA) {
7105 Fusion fusion;
7106 FusionGuard fg(&fusion);
7107
7108 auto tv0 = makeSymbolicTensor(1);
7109 fusion.addInput(tv0);
7110
7111 auto tv1 = set(tv0);
7112 auto tvs2 = Welford(tv1, {0});
7113 auto tv3 = set(tvs2.avg);
7114 fusion.addOutput(tv3);
7115
7116 tvs2.avg->split(0, 4);
7117 TransformPropagatorWithCheck propagator(tvs2.avg);
7118 MaxRootDomainInfoSpanningTree(tvs2.avg).traverse(&propagator);
7119 auto avg_rf = ir_utils::rfactorHelper(tvs2.avg, {1});
7120
7121 avg_rf->axis(0)->parallelize(ParallelType::TIDx);
7122 scheduler_utils::parallelizeAllLike(avg_rf);
7123
7124 GpuLower gpulw(&fusion);
7125
7126 // The first per-thread welford needs to be predicated as the N
7127 // input is different from its init value. The second welford op
7128 // does not need a predicate.
7129 TORCH_CHECK(PredicatedChecker::isPredicated(avg_rf, gpulw));
7130 TORCH_CHECK(!PredicatedChecker::isPredicated(tvs2.avg, gpulw));
7131
7132 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7133
7134 std::vector<int64_t> sizes = {1, 2, 33, 34, 64, 99};
7135 for (auto s0 : sizes) {
7136 auto t0 = at::randn({s0}, options);
7137
7138 FusionExecutor fe;
7139 fe.compileFusion(&fusion, {t0});
7140 auto cg_outputs = fe.runFusion({t0});
7141
7142 auto ref = t0.mean({0});
7143
7144 testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
7145 }
7146}
7147
7148TEST_F(NVFuserTest, FusionPredicateElimination6_CUDA) {
7149 Fusion fusion;
7150 FusionGuard fg(&fusion);
7151
7152 auto tv0 = makeConcreteTensor({2, 3});
7153 fusion.addInput(tv0);
7154
7155 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
7156 auto tv2 = add(tv1, IrBuilder::create<Double>(1));
7157 auto tv3 = add(tv2, IrBuilder::create<Double>(1));
7158 auto tv4 = add(tv3, IrBuilder::create<Double>(1));
7159 fusion.addOutput(tv4);
7160
7161 tv4->split(1, 5);
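  // The second axis has a static extent of 3, so the split by 5 pads it to
  // a single extent-5 tile.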
7162 TransformPropagatorWithCheck propagator(tv4);
7163 MaxRootDomainInfoSpanningTree(tv4).traverse(&propagator);
7164
7165 tv4->reorder({{0, 1}, {1, 0}});
7166 tv3->computeAt(tv4, 1);
7167
7168 GpuLower gpulw(&fusion);
7169
  // The expression for tv2 is a local-to-local expression. It
  // satisfies all the requirements of predicate elimination, except
  // for the one on split root domains. As the second root axis of tv2
  // is split, its index would exceed its extent (i.e., 3 in this case)
  // without its predicate.
7175 TORCH_CHECK(PredicatedChecker::isPredicated(tv2, gpulw));
7176
7177 // Unlike tv2, tv3 is computed at tv4, so the second root axis does
7178 // have a zero domain. Its index should look like "i * 5 + j", where
7179 // i comes from the first root domain and j comes from the split
7180 // inner domain.
7181 TORCH_CHECK(!PredicatedChecker::isPredicated(tv3, gpulw));
7182
7183 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7184 auto t0 = at::randn({2, 3}, options);
7185
7186 FusionExecutor fe;
7187 fe.compileFusion(&fusion, {t0});
7188 auto cg_outputs = fe.runFusion({t0});
7189
7190 auto ref = t0 + 4;
7191
7192 testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
7193}
7194
7195TEST_F(NVFuserTest, FusionPredicateElimination7_CUDA) {
7196 Fusion fusion;
7197 FusionGuard fg(&fusion);
7198
7199 auto tv0 = makeSymbolicTensor(1);
7200 fusion.addInput(tv0);
7201
7202 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
7203 auto tv2 = add(tv1, IrBuilder::create<Double>(1));
7204 auto tv3 = add(tv2, IrBuilder::create<Double>(1));
7205 fusion.addOutput(tv3);
7206
7207 tv3->split(-1, 5);
7208 tv3->split(-1, 4);
7209 tv3->split(-1, 3);
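  // The nested splits leave inner extents of 5, 4, and 3; both the 5-by-4
  // and the 4-by-3 splits are non-divisible, so the padded elements must be
  // masked by predicates.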
7210 TransformPropagatorWithCheck propagator(tv3);
7211 MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator);
7212
7213 tv0->computeAt(tv3, 1);
7214
  // The last split of tv2 is a non-divisible split, so omitting its
  // predicate is invalid.
7217 GpuLower gpulw(&fusion);
7218 TORCH_CHECK(PredicatedChecker::isPredicated(tv2, gpulw));
7219
7220 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7221 auto t0 = at::randn({123}, options);
7222
7223 FusionExecutor fe;
7224 fe.compileFusion(&fusion, {t0});
7225 auto cg_outputs = fe.runFusion({t0});
7226
7227 auto ref = t0 + 3;
7228
7229 testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
7230}
7231
7232TEST_F(NVFuserTest, FusionForceFp16Simple_CUDA) {
7233 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
7234 auto fusion = fusion_ptr.get();
7235 FusionGuard fg(fusion);
7236
7237 auto tv0 = makeSymbolicTensor(2);
7238 auto tv1 = makeSymbolicTensor(2);
7239
7240 fusion->addInput(tv0);
7241 fusion->addInput(tv1);
7242
7243 // Group 1
7244 auto tv2 = sum(tv0, {1});
7245 auto tv3 = broadcast(tv2, {false, true});
7246
7247 // Group 2
7248 auto tv4 = add(tv3, tv1); // Edge: tv3: expect cast
7249 auto tv5 = castOp(DataType::Half, tv4);
7250
7251 fusion->addOutput(tv5);
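  // The reduction in group 1 and the pointwise ops in group 2 should be
  // segmented apart, with the segmenter expected to cast the segment edge
  // (tv3) to fp16 to reduce intermediate global-memory traffic.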
7252
7253 FusionExecutorCache fec(std::move(fusion_ptr));
7254
7255 std::vector<int64_t> shape{15, 16};
7256
7257 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7258 auto in0 = at::randn(shape, options);
7259 auto in1 = at::randn(shape, options);
7260 fec.runFusionWithInputs({in0, in1});
7261
7262 // Check the segmented edge is fp16
7263 auto segmented_fusion = fec.getMostRecentKernelRuntime()->fusionSegments();
7264 for (auto edge : segmented_fusion->edges()) {
7265 auto edge_tv = edge->val->as<TensorView>();
7266 TORCH_CHECK(edge_tv->getDataType() == DataType::Half);
7267 }
7268}
7269
7270TEST_F(NVFuserTest, FusionForceBf16Simple_CUDA) {
7271#if !defined(USE_ROCM)
  // requires Ampere+ GPU
  if (!deviceMajorMinorCheck(8)) {
    GTEST_SKIP() << "skipping tests on pre-Ampere GPUs";
7275 return;
7276 }
7277
7278 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
7279 auto fusion = fusion_ptr.get();
7280 FusionGuard fg(fusion);
7281
7282 auto tv0 = makeSymbolicTensor(2);
7283 auto tv1 = makeSymbolicTensor(2);
7284
7285 fusion->addInput(tv0);
7286 fusion->addInput(tv1);
7287
7288 // Group 1
7289 auto tv2 = sum(tv0, {1});
7290 auto tv3 = broadcast(tv2, {false, true});
7291
7292 // Group 2
7293 auto tv4 = add(tv3, tv1); // Edge: tv3: expect cast
7294 auto tv5 = castOp(DataType::BFloat16, tv4);
7295
7296 fusion->addOutput(tv5);
7297
7298 FusionExecutorCache fec(std::move(fusion_ptr));
7299
7300 std::vector<int64_t> shape{15, 16};
7301
7302 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7303 auto in0 = at::randn(shape, options);
7304 auto in1 = at::randn(shape, options);
7305 fec.runFusionWithInputs({in0, in1});
7306
7307 // Check the segmented edge is bf16
7308 auto segmented_fusion = fec.getMostRecentKernelRuntime()->fusionSegments();
7309 for (auto edge : segmented_fusion->edges()) {
7310 auto edge_tv = edge->val->as<TensorView>();
7311 TORCH_CHECK(edge_tv->getDataType() == DataType::BFloat16);
7312 }
7313#else
  GTEST_SKIP() << "bfloat16 is not supported on ROCm";
7315#endif
7316}
7317
7318TEST_F(NVFuserTest, FusionForceFp16NotAllCast_CUDA) {
7319 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
7320 auto fusion = fusion_ptr.get();
7321 FusionGuard fg(fusion);
7322
7323 auto tv0 = makeSymbolicTensor(3);
7324 auto tv1 = makeSymbolicTensor(3);
7325
7326 fusion->addInput(tv0);
7327 fusion->addInput(tv1);
7328
7329 // Group 1
7330 auto tv3 = sum(tv0, {1});
7331 auto tv4 = broadcast(tv3, {false, true, false});
7332 auto tv5 = sum(tv0, {1});
7333
7334 // Group 2
7335 auto tv6 = add(tv4, tv1); // edge tv4, expect cast
7336 auto tv7 = castOp(DataType::Half, tv6);
7337
7338 // Group 3
7339 auto tv8 = sum(tv5, {1}); // edge tv5, don't expect cast
7340
7341 fusion->addOutput(tv7);
7342 fusion->addOutput(tv8);
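  // tv5 feeds the reduction in group 3, so that segment edge is expected to
  // stay in fp32, presumably to keep the reduction accumulating at full
  // precision.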
7343
7344 FusionExecutorCache fec(std::move(fusion_ptr));
7345
7346 std::vector<int64_t> shape{16, 16, 16};
7347
7348 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7349 auto in0 = at::randn(shape, options);
7350 auto in1 = at::randn(shape, options);
7351 fec.runFusionWithInputs({in0, in1});
7352
7353 auto segmented_fusion = fec.getMostRecentKernelRuntime()->fusionSegments();
7354 auto complete_fusion = segmented_fusion->completeFusion();
7355
  // Check that the edge that wasn't fp16 is the producer of the
  // reduction op, i.e. tv8 = sum(tv5, {1}).
7358 for (auto edge : segmented_fusion->edges()) {
7359 auto edge_tv = edge->val->as<TensorView>();
7360 if (edge_tv->getDataType() == DataType::Float) {
7361 auto consumer = *(complete_fusion->unordered_uses(edge_tv).begin());
7362 TORCH_CHECK(consumer->isA<ReductionOp>());
7363 }
7364 }
7365}
7366
7367TEST_F(NVFuserTest, FusionForceBf16NotAllCast_CUDA) {
7368#if !defined(USE_ROCM)
  // requires Ampere+ GPU
  if (!deviceMajorMinorCheck(8)) {
    GTEST_SKIP() << "skipping tests on pre-Ampere GPUs";
7372 return;
7373 }
7374
7375 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
7376 auto fusion = fusion_ptr.get();
7377 FusionGuard fg(fusion);
7378
7379 auto tv0 = makeSymbolicTensor(3);
7380 auto tv1 = makeSymbolicTensor(3);
7381
7382 fusion->addInput(tv0);
7383 fusion->addInput(tv1);
7384
7385 // Group 1
7386 auto tv3 = sum(tv0, {1});
7387 auto tv4 = broadcast(tv3, {false, true, false});
7388 auto tv5 = sum(tv0, {1});
7389
7390 // Group 2
7391 auto tv6 = add(tv4, tv1); // edge tv4, expect cast
7392 auto tv7 = castOp(DataType::BFloat16, tv6);
7393
7394 // Group 3
7395 auto tv8 = sum(tv5, {1}); // edge tv5, don't expect cast
7396
7397 fusion->addOutput(tv7);
7398 fusion->addOutput(tv8);
7399
7400 FusionExecutorCache fec(std::move(fusion_ptr));
7401
7402 std::vector<int64_t> shape{16, 16, 16};
7403
7404 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7405 auto in0 = at::randn(shape, options);
7406 auto in1 = at::randn(shape, options);
7407 fec.runFusionWithInputs({in0, in1});
7408
7409 auto segmented_fusion = fec.getMostRecentKernelRuntime()->fusionSegments();
7410 auto complete_fusion = segmented_fusion->completeFusion();
7411
  // Check that the edge that wasn't bf16 is the producer of the
  // reduction op, i.e. tv8 = sum(tv5, {1}).
7414 for (auto edge : segmented_fusion->edges()) {
7415 auto edge_tv = edge->val->as<TensorView>();
7416 if (edge_tv->getDataType() == DataType::Float) {
7417 auto consumer = *(complete_fusion->unordered_uses(edge_tv).begin());
7418 TORCH_CHECK(consumer->isA<ReductionOp>());
7419 }
7420 }
7421#else
  GTEST_SKIP() << "bfloat16 is not supported on ROCm";
7423#endif
7424}
7425
7426TEST_F(NVFuserTest, FusionBufferReuseBroadCastMultiVisit_CUDA) {
7427 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
7428 auto fusion = fusion_ptr.get();
7429 FusionGuard fg(fusion);
7430
7431 auto tv0 = makeConcreteTensor({2, 2});
7432 auto tv1 = makeConcreteTensor({2, 2, 2});
7433
7434 fusion->addInput(tv0);
7435 fusion->addInput(tv1);
7436
7437 auto tv2 = mul(tv0, IrBuilder::create<Double>(2));
7438 auto tv3 = broadcast(tv2, {false, false, true});
7439 auto tv4 = add(tv3, tv1);
7440 auto tv5 = mul(tv4, IrBuilder::create<Double>(3));
7441 fusion->addOutput(tv5);
7442
  // tv4 cannot inner-reuse tv2's buffer, because there's a broadcast
  // between them.
7445 tv0->computeAt(tv5, 1, ComputeAtMode::BestEffort);
7446 tv3->computeAt(tv5, 2, ComputeAtMode::BestEffort);
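  // ComputeAtMode::BestEffort positions the tensors as deeply as the
  // existing transformations allow instead of erroring out when the exact
  // requested position is infeasible.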
7447
7448 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7449 auto in0 = at::randn({2, 2}, options);
7450 auto in1 = at::randn({2, 2, 2}, options);
7451
7452 auto at_output = ((in0 * 2).unsqueeze(2) + in1) * 3;
7453 FusionExecutor fe;
7454 fe.compileFusion(fusion, {in0, in1});
7455 auto outputs = fe.runFusion({in0, in1});
7456
7457 testValidate(fusion, outputs, {in0, in1}, {at_output}, __LINE__, __FILE__);
7458}
7459
7460TEST_F(NVFuserTest, FusionBufferReuseStressTest_CUDA) {
7461 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
7462 auto fusion = fusion_ptr.get();
7463 FusionGuard fg(fusion);
7464
7465 auto tv0 = makeConcreteTensor({2, 2});
7466 auto tv1 = makeConcreteTensor({2, 2, 2});
7467
7468 fusion->addInput(tv0);
7469 fusion->addInput(tv1);
7470
7471 auto tv2 = mul(tv0, IrBuilder::create<Double>(2));
7472 auto tv3 = mul(tv0, IrBuilder::create<Double>(3));
7473 auto tv4 = mul(tv2, tv3);
7474 // Broadcast buffer can be reused through outer sharing
7475 auto tv5 = broadcast(tv4, {true, false, false});
7476 auto tv6 = mul(tv5, IrBuilder::create<Double>(5));
7477 auto tv7 = mul(tv6, tv1);
7478 auto tv8 = mul(tv7, IrBuilder::create<Double>(7));
7479 // tv9 shouldn't alias to avoid buffer over-subscription
7480 auto tv9 = broadcast(tv4, {true, false, false});
7481 auto tv10 = mul(tv9, IrBuilder::create<Double>(9));
7482 auto tv11 = add(tv5, tv9);
7483 fusion->addOutput(tv7);
7484 fusion->addOutput(tv11);
7485
7486 tv0->computeAt(tv5, 1, ComputeAtMode::BestEffort);
7487 tv0->computeAt(tv9, 1, ComputeAtMode::BestEffort);
7488
7489 tv5->computeAt(tv7, 1, ComputeAtMode::BestEffort);
7490 tv5->computeAt(tv11, 1, ComputeAtMode::BestEffort);
7491 tv9->computeAt(tv11, 1, ComputeAtMode::BestEffort);
7492
7493 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7494 auto in0 = at::randn({2, 2}, options);
7495 auto in1 = at::randn({2, 2, 2}, options);
7496 auto t2 = in0 * 2;
7497 auto t3 = in0 * 3;
7498 auto t4 = t2 * t3;
7499 auto t5 = t4.unsqueeze(0);
7500 auto t6 = t5 * 5;
7501 auto t7 = t6 * in1;
7502 auto t8 = t7 * 7;
7503 auto t9 = t4.unsqueeze(0);
7504 auto t10 = t9 * 9;
7505 auto t11 = t5 + t9;
7506 FusionExecutor fe;
7507 fe.compileFusion(fusion, {in0, in1});
7508
7510 auto outputs = fe.runFusion({in0, in1});
7511
7512 testValidate(fusion, outputs, {in0, in1}, {t7, t11}, __LINE__, __FILE__);
7513}
7514
7515TEST_F(NVFuserTest, FusionBufferReuseLargeBuffer_CUDA) {
7516 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
7517 auto fusion = fusion_ptr.get();
7518 FusionGuard fg(fusion);
7519
7520 auto tv0 = makeConcreteTensor({256, 512});
7521
7522 fusion->addInput(tv0);
7523
7524 auto tv1 = mul(tv0, IrBuilder::create<Double>(2));
7525 auto tv2 = mul(tv1, IrBuilder::create<Double>(2));
7526 auto tv3 = mul(tv2, IrBuilder::create<Double>(2));
7527 auto tv4 = mul(tv3, IrBuilder::create<Double>(2));
7528 auto tv5 = mul(tv4, IrBuilder::create<Double>(2));
7529 auto tv6 = mul(tv5, IrBuilder::create<Double>(2));
7530
7531 fusion->addOutput(tv6);
7532
7533 tv0->computeAt(tv6, 1, ComputeAtMode::BestEffort);
7534 tv6->axis(0)->parallelize(ParallelType::TIDx);
7535
7536 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7537 auto in0 = at::randn({256, 512}, options);
7538
7539 FusionExecutor fe;
7540 fe.compileFusion(fusion, {in0});
7541 auto outputs = fe.runFusion({in0});
7542
7543 auto at_out = in0.mul(2).mul(2).mul(2).mul(2).mul(2).mul(2);
7544
7545 testValidate(fusion, outputs, {in0}, {at_out}, __LINE__, __FILE__);
7546}
7547
7548TEST_F(NVFuserTest, FusionBufferReuseNo2hop_CUDA) {
7549 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
7550 auto fusion = fusion_ptr.get();
7551 FusionGuard fg(fusion);
7552
7553 auto tv0 = makeConcreteTensor({2, 2});
7554 auto tv1 = makeConcreteTensor({2, 2, 2});
7555
7556 fusion->addInput(tv0);
7557 fusion->addInput(tv1);
7558
7559 auto tv2 = mul(tv0, IrBuilder::create<Double>(2));
7560 auto tv3 = broadcast(tv2, {false, false, true});
  auto tv4 = add(tv3, tv1); // tv4 is inner-aliased first and shouldn't
                            // be outer-aliased on top of that
7563 auto tv5 = mul(tv4, IrBuilder::create<Double>(3));
7564 auto tv6 = mul(tv5, IrBuilder::create<Double>(3));
7565 fusion->addOutput(tv6);
7566
7567 tv0->computeAt(tv6, 1, ComputeAtMode::BestEffort);
7568 tv4->computeAt(tv6, 2, ComputeAtMode::BestEffort);
7569
7570 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7571 auto in0 = at::randn({2, 2}, options);
7572 auto in1 = at::randn({2, 2, 2}, options);
7573 FusionExecutor fe;
7574 fe.compileFusion(fusion, {in0, in1});
7575 auto outputs = fe.runFusion({in0, in1});
7576
7577 auto at_out = (in0.mul(2.0).unsqueeze(2) + in1).mul(3.0).mul(3.0);
7578
7579 testValidate(fusion, outputs, {in0, in1}, {at_out}, __LINE__, __FILE__);
7580}
7581
7582TEST_F(NVFuserTest, FusionBufferReuseAllocationOrder_CUDA) {
7583 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
7584 auto fusion = fusion_ptr.get();
7585 FusionGuard fg(fusion);
7586
7587 auto tv0 = makeConcreteTensor({3, 3, 3});
7588
7589 fusion->addInput(tv0);
7590
7591 auto tv1 = sum(tv0, {1});
7592 auto tv2 = mul(tv1, IrBuilder::create<Double>(2));
7593 auto tv3 = mul(tv2, IrBuilder::create<Double>(2));
7594
7595 fusion->addOutput(tv3);
7596
  // In this case tv1 "reuses" the allocation of tv2
  // due to the switched allocation order
7599 tv1->computeAt(tv2, 1, ComputeAtMode::BestEffort);
7600
7601 tv0->axis(0)->parallelize(ParallelType::TIDx);
7602 tv1->axis(0)->parallelize(ParallelType::TIDx);
7603 tv2->axis(0)->parallelize(ParallelType::TIDx);
7604 tv3->axis(0)->parallelize(ParallelType::TIDx);
7605
7606 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7607 auto in0 = at::randn({3, 3, 3}, options);
7608
7609 FusionExecutor fe;
7610 fe.compileFusion(fusion, {in0});
7611 auto outputs = fe.runFusion({in0});
7612
7613 auto at_out = in0.sum(1).mul(2).mul(2);
7614
7615 testValidate(fusion, outputs, {in0}, {at_out}, __LINE__, __FILE__);
7616}
7617
7618TEST_F(NVFuserTest, FusionBufferReuseLiveInterval_CUDA) {
7619 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
7620 auto fusion = fusion_ptr.get();
7621 FusionGuard fg(fusion);
7622
7623 auto tv0 = makeConcreteTensor({16, 16});
7624
7625 fusion->addInput(tv0);
7626
7627 auto tv1 = mul(tv0, IrBuilder::create<Double>(3));
7628 auto tv2 = mul(tv1, IrBuilder::create<Double>(2));
7629 auto tv3 = mul(tv2, IrBuilder::create<Double>(2));
  // tv1 is live until here, so its buffer cannot be reused by tv2 or tv3
7631 auto tv4 = mul(tv3, tv1);
7632
7633 fusion->addOutput(tv4);
7634
7635 tv0->computeAt(tv4, 1);
7636
7637 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7638 auto in0 = at::randn({16, 16}, options);
7639
7640 FusionExecutor fe;
7641 fe.compileFusion(fusion, {in0});
7642 auto cg_outputs = fe.runFusion({in0});
7643
7644 auto at_t0 = in0 * 3.0;
7645 auto at_out = at_t0 * 2.0 * 2.0 * at_t0;
7646
7647 testValidate(fusion, cg_outputs, {in0}, {at_out}, __LINE__, __FILE__);
7648}
7649
7650TEST_F(NVFuserTest, FusionBufferReuseNoAcrossBroadcast_CUDA) {
7651 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
7652 auto fusion = fusion_ptr.get();
7653 FusionGuard fg(fusion);
7654
7655 auto tv0 = makeConcreteTensor({2, 2});
7656 auto tv1 = makeConcreteTensor({2, 2, 2});
7657
7658 fusion->addInput(tv0);
7659 fusion->addInput(tv1);
7660
7661 auto tv2 = mul(tv0, IrBuilder::create<Double>(2));
7662 auto tv3 = mul(tv0, IrBuilder::create<Double>(3));
7663 auto tv4 = mul(tv2, tv3);
7664 auto tv5 = broadcast(tv4, {false, false, true});
7665 auto tv6 = mul(tv5, tv1);
7666 auto tv7 = mul(tv6, IrBuilder::create<Double>(7));
7667 fusion->addOutput(tv7);
7668
  // tv6 shouldn't reuse tv2 or tv3 because of
  // the broadcast in between
7671 tv0->computeAt(tv4, 1, ComputeAtMode::BestEffort);
7672 tv4->computeAt(tv7, 2, ComputeAtMode::BestEffort);
7673
7674 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7675 auto in0 = at::randn({2, 2}, options);
7676 auto in1 = at::randn({2, 2, 2}, options);
7677 FusionExecutor fe;
7678 fe.compileFusion(fusion, {in0, in1});
7679 auto outputs = fe.runFusion({in0, in1});
7680
7681 auto t2 = in0 * 2;
7682 auto t3 = in0 * 3;
7683 auto t4 = t2 * t3;
7684 auto t5 = t4.unsqueeze(2);
7685 auto t6 = t5 * in1;
7686 auto t7 = t6 * 7;
7687 testValidate(fusion, outputs, {in0, in1}, {t7}, __LINE__, __FILE__);
7688}
7689
7690TEST_F(NVFuserTest, FusionIssue970_CUDA) {
7691 Fusion fusion;
7692 FusionGuard fg(&fusion);
7693
7694 const int nelm = 10;
7695
7696 // tv3 = tv0 + sum(tv0)
7697 auto tv0 = makeConcreteTensor({nelm, nelm});
7698 fusion.addInput(tv0);
7699 auto tv1 = sum(tv0, {1});
7700 auto tv2 = broadcast(tv1, {false, true});
7701 auto tv3 = add(tv2, tv0);
7702 fusion.addOutput(tv3);
7703
7704 tv1->split(1, 4);
7705
7706 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7708 at::manual_seed(0);
7709 at::Tensor t0 = at::randn({nelm, nelm}, options);
7710
7711 FusionExecutor fe;
7712 fe.compileFusion(&fusion, {t0});
7713 auto outputs = fe.runFusion({t0});
7714
7715 auto ref = sum(t0, {1}).unsqueeze(-1).expand({nelm, nelm}) + t0;
7716
7717 testValidate(&fusion, outputs, {t0}, {ref}, __LINE__, __FILE__);
7718}
7719
7720// Reproducer of #1016
7721TEST_F(NVFuserTest, FusionIssue1016_CUDA) {
7722 Fusion fusion;
7723 FusionGuard fg(&fusion);
7724
7725 auto tv0 = makeSymbolicTensor(2);
7726 fusion.addInput(tv0);
7727
7728 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
7729 auto tv2 = add(tv1, IrBuilder::create<Double>(2));
7730
7731 fusion.addOutput(tv2);
7732
7733 tv1->setMemoryType(MemoryType::Shared);
7734
7735 tv2->split(-1, 8);
7736
7737 int numel_x = 10;
7738 int numel_y = 11;
7739
7740 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7741 at::Tensor t0 = at::randn({numel_x, numel_y}, options);
7742 std::vector<IValue> inputs = {t0};
7743
7744 FusionExecutor fe;
7745 fe.compileFusion(&fusion, inputs);
7746 auto outputs = fe.runFusion(inputs);
7747
7748 auto ref = t0 + 1 + 2;
7749
7750 testValidate(&fusion, outputs, {t0}, {ref}, __LINE__, __FILE__);
7751}
7752
7753// Reproducer of #1021
7754TEST_F(NVFuserTest, FusionIssue1021_CUDA) {
7755 Fusion fusion;
7756 FusionGuard fg(&fusion);
7757
7758 auto tv0 = makeSymbolicTensor(1);
7759 fusion.addInput(tv0);
7760 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
7761 auto tv2 = broadcast(tv1, {false, true});
7762 fusion.addOutput(tv2);
7763
7764 auto tv3 = tv2->cacheBefore();
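  // cacheBefore interposes tv3 between the broadcast and the output: tv3
  // takes over tv2's definition, and tv2 becomes a copy that can be
  // vectorized below.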
7765
7766 tv2->split(0, 2);
7767
7768 tv1->computeAt(tv2, 1);
7769
7770 tv2->axis(0)->parallelize(ParallelType::TIDx);
7771 tv2->axis(1)->parallelize(ParallelType::Vectorize);
7772
7773 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7774 at::Tensor t0 = at::randn({10}, options);
7775 std::vector<IValue> inputs = {t0};
7776
7777 FusionExecutor fe;
7778 fe.compileFusion(&fusion, inputs);
7779 auto outputs = fe.runFusion(inputs);
7780
7781 auto ref = (t0 + 1).unsqueeze(-1);
7782
7783 testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
7784}
7785
7786// Reproducer of issue #1053
7787TEST_F(NVFuserTest, FusionNonUniqueThreadDim_CUDA) {
7788 auto fusion = std::make_unique<Fusion>();
7789 FusionGuard fg(fusion.get());
7790
7791 auto tv0 = makeSymbolicTensor(1);
7792 fusion->addInput(tv0);
7793 auto tv1 = sum(tv0, {0});
7794 fusion->addOutput(tv1);
7795
7796 auto tv2 = add(tv0, IrBuilder::create<Double>(1));
7797 fusion->addOutput(tv2);
7798
7799 tv1->split(0, 8);
7800 auto tv1_rf = tv1->rFactor({-1});
7801
7802 tv1_rf->computeAt(tv1, 1);
7803
7804 tv1_rf->axis(-1)->parallelize(ParallelType::TIDx);
7805
7806 tv2->axis(0)->parallelize(ParallelType::TIDx);
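  // TIDx is bound to the extent-8 rfactor domain of tv1_rf but to the full
  // input extent for tv2, exercising the non-unique blockDim.x handling
  // reported in issue #1053.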
7807
7808 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7809 at::Tensor input1 = at::randn({32}, options);
7810
7811 auto at_tv1 = (input1).sum({0});
7812 auto at_tv2 = input1 + 1;
7813
7814 FusionExecutor fe;
7815 fe.compileFusion(fusion.get(), {input1});
7816 auto outputs = fe.runFusion({input1});
7817 testValidate(
7818 fusion.get(), outputs, {input1}, {at_tv1, at_tv2}, __LINE__, __FILE__);
7819}
7820
7821TEST_F(NVFuserTest, FusionParallelDimensionMap1_CUDA) {
7822 auto fusion = std::make_unique<Fusion>();
7823 FusionGuard fg(fusion.get());
7824
7825 auto tv0 = makeSymbolicTensor(1);
7826 fusion->addInput(tv0);
7827 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
7828 auto tv2 = add(tv0, IrBuilder::create<Double>(1));
7829 fusion->addOutput(tv1);
7830 fusion->addOutput(tv2);
7831
7832 tv1->split(0, 8, false);
7833 tv1->axis(1)->parallelize(ParallelType::TIDx);
7834 tv2->split(0, 8, false);
7835 tv2->axis(1)->parallelize(ParallelType::TIDx);
7836
7837 // The extents of tv1 and tv2 axes are equal even though their
7838 // actual values are not statically known
7839 GpuLower gpulw(fusion.get());
7840 const auto& pdmap = gpulw.parallelDimensionMap();
7841 for (const auto i : c10::irange(tv1->domain()->domain().size())) {
7842 auto dom1 = tv1->domain()->domain()[i];
7843 auto dom2 = tv2->domain()->domain()[i];
7844 TORCH_INTERNAL_ASSERT(pdmap.equalDim(dom1->extent(), dom2->extent()));
7845 }
7846
7847 TORCH_CHECK(pdmap.isExact(ParallelType::TIDx));
7848 TORCH_CHECK(
7849 pdmap.get(ParallelType::TIDx)->isA<NamedScalar>() &&
7850 pdmap.get(ParallelType::TIDx)->as<NamedScalar>()->name() == "blockDim.x");
7851
7852 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7853 at::Tensor input1 = at::randn({32}, options);
7854
7855 FusionExecutor fe;
7856 fe.compileFusion(fusion.get(), {input1});
7857 auto outputs = fe.runFusion({input1});
7858
7859 testValidate(
7860 fusion.get(),
7861 outputs,
7862 {input1},
7863 {input1 + 1, input1 + 1},
7864 __LINE__,
7865 __FILE__);
7866}
7867
7868TEST_F(NVFuserTest, FusionParallelDimensionMap2_CUDA) {
7869 auto fusion = std::make_unique<Fusion>();
7870 FusionGuard fg(fusion.get());
7871
7872 auto tv0 = makeSymbolicTensor(1);
7873 fusion->addInput(tv0);
7874 auto tv1 = makeSymbolicTensor(2);
7875 fusion->addInput(tv1);
7876 auto tv2 = broadcast(tv0, {false, true});
7877 auto tv3 = add(tv1, tv2);
7878 fusion->addOutput(tv3);
7879
7880 tv3->split(-1, 8, false);
7881 tv2->computeAt(tv3, -1);
7882
7883 tv3->axis(-1)->parallelize(ParallelType::TIDx);
7884 tv2->axis(-1)->parallelize(ParallelType::TIDx);
7885
7886 GpuLower gpulw(fusion.get());
7887 const auto& pdmap = gpulw.parallelDimensionMap();
7888 TORCH_CHECK(pdmap.isExact(ParallelType::TIDx));
7889 TORCH_CHECK(
7890 pdmap.get(ParallelType::TIDx)->isA<NamedScalar>() &&
7891 pdmap.get(ParallelType::TIDx)->as<NamedScalar>()->name() == "blockDim.x");
7892
7893 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7894 at::Tensor input1 = at::randn({11}, options);
7895 at::Tensor input2 = at::randn({11, 13}, options);
7896
7897 FusionExecutor fe;
7898 fe.compileFusion(fusion.get(), {input1, input2});
7899 auto outputs = fe.runFusion({input1, input2});
7900
7901 auto ref = input1.unsqueeze(-1) + input2;
7902
7903 testValidate(
7904 fusion.get(), outputs, {input1, input2}, {ref}, __LINE__, __FILE__);
7905}
7906
7907// Mix symbolic and concrete tensors
7908TEST_F(NVFuserTest, FusionParallelDimensionMap3_CUDA) {
7909 auto fusion = std::make_unique<Fusion>();
7910 FusionGuard fg(fusion.get());
7911
7912 auto tv0 = makeSymbolicTensor(1);
7913 fusion->addInput(tv0);
7914
7915 auto tv2 = add(tv0, IrBuilder::create<Double>(1));
7916 fusion->addOutput(tv2);
7917 auto tv3 = add(tv0, IrBuilder::create<Double>(1));
7918 fusion->addOutput(tv3);
7919
7920 tv2->split(0, 10);
7921 tv3->split(0, 20);
7922
7923 auto tv4 = add(tv0, IrBuilder::create<Double>(1));
7924 fusion->addOutput(tv4);
7925 auto tv5 = add(tv0, IrBuilder::create<Double>(1));
7926 fusion->addOutput(tv5);
7927
7928 // Not mapped but equal extent
7929 tv4->split(0, 10);
7930 tv5->split(0, 10);
7931
7932 tv2->axis(-1)->parallelize(ParallelType::TIDx);
7933 tv3->axis(-1)->parallelize(ParallelType::TIDx);
7934
7935 tv4->axis(-1)->parallelize(ParallelType::TIDy);
7936 tv5->axis(-1)->parallelize(ParallelType::TIDy);
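  // TIDx is bound to inner extents of 10 and 20, so it is non-exact and is
  // lowered as blockDim.x, while TIDy is bound to extent 10 on both tv4 and
  // tv5 and resolves to the constant 10.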
7937
7938 GpuLower gpulw(fusion.get());
7939 const auto& pdmap = gpulw.parallelDimensionMap();
7940 TORCH_CHECK(!pdmap.isExact(ParallelType::TIDx));
7941 TORCH_CHECK(
7942 pdmap.get(ParallelType::TIDx)->isA<NamedScalar>() &&
7943 pdmap.get(ParallelType::TIDx)->as<NamedScalar>()->name() == "blockDim.x");
7944 TORCH_CHECK(pdmap.isExact(ParallelType::TIDy));
7945 TORCH_CHECK(
7946 pdmap.get(ParallelType::TIDy)->isConst() &&
7947 pdmap.get(ParallelType::TIDy)->as<Int>()->value().value() == 10);
7948
7949 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
7950 at::Tensor input1 = at::randn({13}, options);
7951
7952 FusionExecutor fe;
7953 fe.compileFusion(fusion.get(), {input1});
7954 auto outputs = fe.runFusion({input1});
7955
7956 testValidate(
7957 fusion.get(),
7958 outputs,
7959 {input1},
7960 {input1 + 1, input1 + 1, input1 + 1, input1 + 1},
7961 __LINE__,
7962 __FILE__);
7963}
7964
7965// Parallelizing merged broadcast domains
7966TEST_F(NVFuserTest, FusionParallelDimensionMap4_CUDA) {
7967 Fusion fusion;
7968 FusionGuard fg(&fusion);
7969
7970 auto tv0 = makeSymbolicTensor(1);
7971 fusion.addInput(tv0);
7972 auto tv1 = makeSymbolicTensor(2);
7973 fusion.addInput(tv1);
7974 auto tv2 = add(tv0, IrBuilder::create<Double>(1));
7975 auto tv3 = broadcast(tv2, {true, false});
7976 auto tv4 = add(tv3, tv1);
7977 fusion.addOutput(tv4);
7978
7979 tv4->split(1, 4);
7980 tv4->reorder({{1, 2}, {2, 1}});
7981 tv4->merge(0);
7982 tv0->computeAt(tv4, 1);
7983 tv1->computeAt(tv4, 1);
7984
7985 // TIDx is mapped to tv4.axis(0) as well as tv2.axis(0), so it's not
7986 // exact.
7987 tv4->axis(0)->parallelize(ParallelType::TIDx);
7988
7989 tv2->setMemoryType(MemoryType::Shared);
7990 tv3->setMemoryType(MemoryType::Shared);
7991
7992 GpuLower gpulw(&fusion);
7993 const auto& pdmap = gpulw.parallelDimensionMap();
7994 TORCH_CHECK(!pdmap.isExact(ParallelType::TIDx));
7995 TORCH_CHECK(
7996 pdmap.get(ParallelType::TIDx)->isA<NamedScalar>() &&
7997 pdmap.get(ParallelType::TIDx)->as<NamedScalar>()->name() == "blockDim.x");
7998
7999 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
8000 at::Tensor input1 = at::randn({13}, options);
8001 at::Tensor input2 = at::randn({15, 13}, options);
8002
8003 FusionExecutor fe;
8004 fe.compileFusion(&fusion, {input1, input2});
8005 auto outputs = fe.runFusion({input1, input2});
8006
8007 auto ref = (input1 + 1).unsqueeze(0) + input2;
8008
8009 testValidate(&fusion, outputs, {input1, input2}, {ref}, __LINE__, __FILE__);
8010}
8011
8012TEST_F(NVFuserTest, FusionParallelDimensionMap5_CUDA) {
8013 Fusion fusion;
8014 FusionGuard fg(&fusion);
8015
8016 auto tv0 = makeSymbolicTensor(1);
8017 fusion.addInput(tv0);
8018 auto tv1 = makeSymbolicTensor(2);
8019 fusion.addInput(tv1);
8020 auto tv3 = broadcast(tv0, {false, true});
8021 auto tv4 = add(tv3, tv1);
8022 fusion.addOutput(tv4);
8023
8024 tv4->split(1, 4);
8025 tv0->computeAt(tv4, -1);
8026 tv1->computeAt(tv4, -1);
8027
8028 tv4->axis(-1)->parallelize(ParallelType::TIDx);
8029 tv3->axis(-1)->parallelize(ParallelType::TIDx);
8030 tv4->axis(-2)->parallelize(ParallelType::TIDy);
8031 tv3->axis(-2)->parallelize(ParallelType::TIDy);
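  // Both bindings of TIDx map to the same extent-4 inner domain, so TIDx is
  // exact and constant; TIDy maps to the same symbolic outer extent on tv3
  // and tv4, so it is exact but only known as blockDim.y at runtime.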
8032
8033 GpuLower gpulw(&fusion);
8034 const auto& pdmap = gpulw.parallelDimensionMap();
8035 TORCH_CHECK(pdmap.isExact(ParallelType::TIDx));
8036 TORCH_CHECK(pdmap.isExact(ParallelType::TIDy));
8037 TORCH_CHECK(
8038 pdmap.get(ParallelType::TIDx)->isConst() &&
8039 pdmap.get(ParallelType::TIDx)->as<Int>()->value().value() == 4);
8040 TORCH_CHECK(
8041 pdmap.get(ParallelType::TIDy)->isA<NamedScalar>() &&
8042 pdmap.get(ParallelType::TIDy)->as<NamedScalar>()->name() == "blockDim.y");
8043
8044 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
8045 at::Tensor input1 = at::randn({13}, options);
8046 at::Tensor input2 = at::randn({13, 15}, options);
8047
8048 FusionExecutor fe;
8049 fe.compileFusion(&fusion, {input1, input2});
8050 auto outputs = fe.runFusion({input1, input2});
8051
8052 auto ref = (input1).unsqueeze(-1) + input2;
8053
8054 testValidate(&fusion, outputs, {input1, input2}, {ref}, __LINE__, __FILE__);
8055}
8056
8057TEST_F(NVFuserTest, FusionSegmenterCombineReductionsCycleRepro_CUDA) {
8058 auto fusion_ptr = std::make_unique<Fusion>();
8059 auto& fusion = *fusion_ptr.get();
8060 FusionGuard fg(&fusion);
8061
8062 auto t0 = makeSymbolicTensor(3, DataType::Float);
8063 auto t1 = makeSymbolicTensor(3, DataType::Half);
8064 auto t3 = makeSymbolicTensor(3, DataType::Half);
8065 auto t5 = makeSymbolicTensor(3, DataType::Half);
8066 auto t7 = makeSymbolicTensor(1, DataType::Half);
8067 auto t11 = makeSymbolicTensor(3, DataType::Half);
8068 auto t13 = makeSymbolicTensor(3, DataType::Half);
8069 auto t15 = makeSymbolicTensor(3, DataType::Half);
8070 auto t17 = makeSymbolicTensor(3, DataType::Half);
8071 auto d56 = IrBuilder::create<Double>();
8072
8073 fusion.addInput(t0);
8074 fusion.addInput(t1);
8075 fusion.addInput(t3);
8076 fusion.addInput(t5);
8077 fusion.addInput(t7);
8078 fusion.addInput(t11);
8079 fusion.addInput(t13);
8080 fusion.addInput(t15);
8081 fusion.addInput(t17);
8082 fusion.addInput(d56);
8083
8084 auto t2 = castOp(DataType::Float, t1);
8085 auto t4 = castOp(DataType::Float, t3);
8086 auto t22 = sub(t2, t4);
8087 auto t6 = castOp(DataType::Float, t5);
8088 auto t23 = mul(t22, t6);
8089 auto t16 = castOp(DataType::Float, t15);
8090 auto t18 = castOp(DataType::Float, t17);
8091 auto t19 = add(t16, t18);
8092 auto t14 = castOp(DataType::Float, t13);
8093 auto t20 = add(t19, t14);
8094 auto t12 = castOp(DataType::Float, t11);
8095 auto t21 = add(t20, t12);
8096 auto t8 = castOp(DataType::Float, t7);
8097 auto t24 = broadcast(t8, {true, true, false});
8098 auto t25 = mul(t21, t24);
8099 auto t27 = sum(t25, {2});
8100 auto t28 = broadcast(t27, {false, false, true});
8101 auto t29 = mul(t25, t23);
8102 auto t30 = sum(t29, {2});
8103 auto t31 = broadcast(t30, {false, false, true});
8104 auto d59 =
8105 mul(t1->getRootDomain()[2]->extent(), IrBuilder::create<Double>(1));
8106 auto t26 = mul(d59, t25);
8107 auto txx = mul(t26, IrBuilder::create<Double>(1));
8108 auto t33 = sub(txx, t28);
8109 auto d70 = unaryOp(UnaryOpType::Reciprocal, d59);
8110 auto t35 = mul(d70, t6);
8111 auto t39 = sum(t21, {0, 1});
8112 auto t47 = castOp(DataType::Half, t39);
8113 auto t37 = mul(t21, t23);
8114 auto t38 = sum(t37, {0, 1});
8115 auto t46 = castOp(DataType::Half, t38);
8116 auto t32 = mul(t23, t31);
8117 auto t34 = sub(t33, t32);
8118 auto t36 = mul(t35, t34);
8119 auto t45 = castOp(DataType::Half, t36);
8120 auto t40 = mul(t36, t0);
8121 auto t41 = mul(t40, d56);
8122 auto t44 = castOp(DataType::Half, t41);
8123 auto t42 = sum(t41, {0, 1});
8124 auto t43 = castOp(DataType::Half, t42);
8125
8126 fusion.addOutput(t43);
8127 fusion.addOutput(t44);
8128 fusion.addOutput(t45);
8129 fusion.addOutput(t46);
8130 fusion.addOutput(t47);
8131
8132 auto options_half = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
8133 auto options_float =
8134 at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
8135 at::Tensor at_t0 = at::randn({128, 64, 1024}, options_float);
8136 at::Tensor at_t1 = at::randn({128, 64, 1024}, options_half);
8137 at::Tensor at_t3 = at::randn({128, 64, 1024}, options_half);
8138 at::Tensor at_t5 = at::randn({128, 64, 1024}, options_half);
8139 at::Tensor at_t7 = at::randn({1024}, options_half);
8140 at::Tensor at_t11 = at::randn({128, 64, 1024}, options_half);
8141 at::Tensor at_t13 = at::randn({128, 64, 1024}, options_half);
8142 at::Tensor at_t15 = at::randn({128, 64, 1024}, options_half);
8143 at::Tensor at_t17 = at::randn({128, 64, 1024}, options_half);
8144 double at_d56 = 1.1111;
8145
8146 std::vector<at::Tensor> aten_inputs = {
8147 at_t0, at_t1, at_t3, at_t5, at_t7, at_t11, at_t13, at_t15, at_t17};
8148
8149 c10::IValue val = at_d56;
8150
8151 KernelArgumentHolder args(KernelIndexMode::INT32);
8152 args.setDeviceIndex(0);
8153 args.push(aten_inputs);
8154 args.push(val);
8155
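  // Combining reductions while segmenting this fusion used to create a
  // cycle in the segment graph; repeatedly segmenting without hanging or
  // asserting is the regression check, so no outputs are validated here.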
8156 for (auto _ : c10::irange(5)) {
8157 auto segmented_fusion =
8158 SegmentCandidateFinder::segment(fusion_ptr.get(), args);
8159 }
8160}
8161
8162TEST_F(NVFuserTest, FusionSerialAndParallelIndexing_CUDA) {
8163 Fusion fusion;
8164 FusionGuard fg(&fusion);
8165
8166 auto tv0 = makeSymbolicTensor(1);
8167 fusion.addInput(tv0);
8168
8169 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
8170 auto tv2 = add(tv1, IrBuilder::create<Double>(1));
8171 fusion.addOutput(tv2);
8172
8173 auto tv3 = add(tv0, IrBuilder::create<Double>(1));
8174 auto tv4 = add(tv3, IrBuilder::create<Double>(1));
8175 fusion.addOutput(tv4);
8176
8177 auto tv5 = add(tv0, IrBuilder::create<Double>(1));
8178 auto tv6 = add(tv5, IrBuilder::create<Double>(1));
8179 fusion.addOutput(tv6);
8180
8181 // Case 1: local memory tensor computed serially and used by
8182 // parallel threads
8183 tv2->split(-1, 4);
8184 tv1->computeAt(tv2, -2);
8185 tv2->axis(-1)->parallelize(ParallelType::TIDx);
8186
8187 // Case 2: shared memory tensor computed serially and used by BID
8188 tv4->split(-1, 4);
8189 tv3->computeAt(tv4, -2);
8190 tv4->axis(-1)->parallelize(ParallelType::BIDx);
8191 tv3->setMemoryType(MemoryType::Shared);
8192
8193 // Case 3: shared memory tensor computed by TID and used by BID
8194 tv6->split(-1, 4);
8195 tv5->computeAt(tv6, -2);
8196 tv6->axis(-1)->parallelize(ParallelType::BIDx);
8197 tv5->axis(-1)->parallelize(ParallelType::TIDx);
8198 tv5->setMemoryType(MemoryType::Shared);
8199
8200 const int nx = 11;
8201 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
8202 at::Tensor t0 = at::randn({nx}, options);
8203 std::vector<IValue> aten_inputs = {t0};
8204
8205 FusionExecutor fe;
8206 fe.compileFusion(&fusion, aten_inputs);
8207 auto outputs = fe.runFusion(aten_inputs);
8208
8209 auto ref = t0 + 2;
8210
8211 testValidate(
8212 &fusion, outputs, aten_inputs, {ref, ref, ref}, __LINE__, __FILE__);
8213}
8214
8215// Repro of issue #1105
8216TEST_F(NVFuserTest, FusionWARSyncAliasedSmem_CUDA) {
8217 Fusion fusion;
8218 FusionGuard fg(&fusion);
8219
8220 auto tv0 = makeSymbolicTensor(1);
8221 fusion.addInput(tv0);
8222
8223 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
8224 auto tv2 = add(tv1, IrBuilder::create<Double>(1));
8225 auto tv3 = add(tv2, IrBuilder::create<Double>(1));
8226
8227 fusion.addOutput(tv3);
8228
8229 tv1->setMemoryType(MemoryType::Shared);
8230 tv2->setMemoryType(MemoryType::Shared);
8231
8232 tv3->split(0, 4);
8233 tv0->computeAt(tv3, 1);
8234
8235 tv1->axis(-1)->parallelize(ParallelType::TIDx);
8236 tv2->axis(-1)->parallelize(ParallelType::TIDy);
8237 tv3->axis(-1)->parallelize(ParallelType::TIDz);
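  // tv1 and tv2 are shared-memory buffers that may be aliased by the
  // buffer-reuse pass while being written and read under different thread
  // dimensions, so each iteration of the outer loop must end with a WAR
  // sync before the buffer is overwritten.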
8238
8239 // Make sure a WAR sync is inserted at the end of the outer loop
8240 GpuLower gpulw(&fusion);
8241 for (const auto& kir_node : gpulw.kernel()->topLevelExprs()) {
8242 if (auto loop = dynamic_cast<kir::ForLoop*>(kir_node)) {
8243 const auto& body = loop->body().exprs();
8244 TORCH_CHECK(!body.empty());
8245 auto last_expr = dynamic_cast<kir::BlockSync*>(body.back());
8246 TORCH_CHECK(last_expr != nullptr, "Invalid expr found");
8247 TORCH_CHECK(last_expr->isWarHazardSync(), "Not a sync for WAR hazard");
8248 }
8249 }
8250
8251 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
8252 at::Tensor t0 = at::randn({17}, options);
8253 std::vector<IValue> aten_inputs = {t0};
8254
8255 FusionExecutor fe;
8256 fe.compileFusion(&fusion, aten_inputs);
8257 auto outputs = fe.runFusion(aten_inputs);
8258
8259 auto ref1 = t0 + 3;
8260
8261 testValidate(&fusion, outputs, aten_inputs, {ref1}, __LINE__, __FILE__);
8262}
8263
8264TEST_F(NVFuserTest, FusionIssue1099_CUDA) {
8265 Fusion fusion;
8266 FusionGuard fg(&fusion);
8267
8268 auto tv0 = makeSymbolicTensor(1);
8269 fusion.addInput(tv0);
8270
8271 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
8272 auto tv2 = add(tv1, IrBuilder::create<Double>(1));
8273 fusion.addOutput(tv2);
8274
8275 auto tv3 = makeSymbolicTensor(1);
8276 fusion.addInput(tv3);
8277
8278 // Just to make TIDx/y/z non-exact
8279 auto tv4 = add(tv3, IrBuilder::create<Double>(1));
8280 auto tv5 = add(tv4, IrBuilder::create<Double>(1));
8281 auto tv6 = add(tv5, IrBuilder::create<Double>(1));
8282 fusion.addOutput(tv6);
8283
8284 tv2->split(0, 4);
8285 tv0->computeAt(tv2, 1);
8286
8287 tv0->axis(-1)->parallelize(ParallelType::TIDx);
8288 tv1->axis(-1)->parallelize(ParallelType::TIDy);
8289 tv2->axis(-1)->parallelize(ParallelType::TIDz);
8290 tv2->axis(0)->parallelize(ParallelType::BIDx);
8291
8292 tv1->setMemoryType(MemoryType::Shared);
8293
8294 tv4->split(0, 5);
8295 tv4->axis(-1)->parallelize(ParallelType::TIDx);
8296 tv4->setMemoryType(MemoryType::Shared);
8297 tv5->split(0, 6);
8298 tv5->axis(-1)->parallelize(ParallelType::TIDy);
8299 tv5->setMemoryType(MemoryType::Shared);
8300 tv6->split(0, 7);
8301 tv6->axis(-1)->parallelize(ParallelType::TIDz);
8302
8303 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
8304 at::Tensor t0 = at::randn({17}, options);
8305 at::Tensor t3 = at::randn({19}, options);
8306 std::vector<IValue> aten_inputs = {t0, t3};
8307
8308 FusionExecutor fe;
8309 fe.compileFusion(&fusion, aten_inputs);
8310 auto outputs = fe.runFusion(aten_inputs);
8311
8312 auto ref_t2 = t0 + 2;
8313 auto ref_t3 = t3 + 3;
8314
8315 testValidate(
8316 &fusion, outputs, aten_inputs, {ref_t2, ref_t3}, __LINE__, __FILE__);
8317}
8318
8319// Repro of issue #1080
8320TEST_F(NVFuserTest, FusionUnswitchPredicate_CUDA) {
8321 Fusion fusion;
8322 FusionGuard fg(&fusion);
8323
8324 auto tv0 = makeSymbolicTensor(2);
8325 fusion.addInput(tv0);
8326
8327 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
8328 auto tv2 = add(tv1, IrBuilder::create<Double>(1));
8329 fusion.addOutput(tv2);
8330
8331 tv2->split(0, 4);
8332 tv0->computeAt(tv2, 2);
8333
8334 tv2->split(-1, 8);
8335 tv1->split(-1, 8);
8336
8337 tv2->axis(1)->parallelize(ParallelType::Unswitch);
8338
8339 tv2->axis(-1)->parallelize(ParallelType::TIDx);
8340 tv2->axis(-2)->parallelize(ParallelType::TIDy);
8341
8342 // swap TIDx and TIDy
8343 tv1->axis(-1)->parallelize(ParallelType::TIDy);
8344 tv1->axis(-2)->parallelize(ParallelType::TIDx);
8345
8346 tv1->setMemoryType(MemoryType::Shared);
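  // tv1 swaps the TIDx/TIDy bindings relative to tv2, so the predicate
  // hoisted out of the Unswitch loop must cover both thread mappings (see
  // issue #1080).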
8347
8348 const int nx = 4;
8349 const int ny = 10;
8350 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
8351 at::Tensor t0 = at::randn({nx, ny}, options);
8352 std::vector<IValue> aten_inputs = {t0};
8353
8354 FusionExecutor fe;
8355 fe.compileFusion(&fusion, aten_inputs);
8356 auto outputs = fe.runFusion(aten_inputs);
8357
8358 auto ref = t0 + 2;
8359
8360 testValidate(&fusion, outputs, aten_inputs, {ref}, __LINE__, __FILE__);
8361}
8362
8363TEST_F(NVFuserTest, FusionIssue1189_CUDA) {
8364 Fusion fusion;
8365 FusionGuard fg(&fusion);
8366
8367 auto tv0 = makeConcreteTensor({16, 16});
8368 auto tv1 = makeConcreteTensor({16, 16});
8369
8370 auto tv0b = broadcast(tv0, {false, false, true});
8371 auto tv1b = broadcast(tv1, {false, false, true});
8372
8373 fusion.addInput(tv0b);
8374 fusion.addInput(tv1b);
8375
8376 auto tv2 = add(tv0b, tv1b);
8377 auto tv3 = sum(tv2, {1});
8378 fusion.addOutput(tv3);
8379
8380 auto parallelize = [](auto tv) {
8381 tv->axis(0)->parallelize(ParallelType::TIDx);
8382 tv->axis(1)->parallelize(ParallelType::BIDx);
8383 tv->axis(2)->parallelize(ParallelType::BIDy);
8384 };
8385
8386 parallelize(tv0b);
8387 parallelize(tv1b);
8388 parallelize(tv2);
8389 parallelize(tv3);
8390
8391 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
8392 at::Tensor t0 = at::randn({16, 16, 1}, options);
8393 at::Tensor t1 = at::randn({16, 16, 1}, options);
8394
8395 FusionExecutor fe;
8396 fe.compileFusion(&fusion, {t0, t1});
8397 auto outputs = fe.runFusion({t0, t1});
8398
8399 auto ref = (t0 + t1).sum({1});
8400
8401 testValidate(&fusion, outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
8402}
8403
8404TEST_F(NVFuserTest, FusionIssue1052_CUDA) {
8405 Fusion fusion;
8406 FusionGuard fg(&fusion);
8407
8408 auto tv0 = makeSymbolicTensor(1);
8409 fusion.addInput(tv0);
8410 auto tv1 = makeSymbolicTensor(1);
8411 fusion.addInput(tv1);
8412
8413 auto tv2 = add(tv0, IrBuilder::create<Double>(1));
8414 fusion.addOutput(tv2);
8415
8416 auto tv3 = add(tv1, IrBuilder::create<Double>(1));
8417 fusion.addOutput(tv3);
8418
8419 tv2->axis(-1)->parallelize(ParallelType::TIDx);
8420 tv3->axis(-1)->parallelize(ParallelType::TIDx);
8421
8422 scheduler_utils::parallelizeAllLike(tv2, {tv0});
8423 scheduler_utils::parallelizeAllLike(tv3, {tv1});
8424
8425 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
8426 at::Tensor t0 = at::randn({10}, options);
8427 at::Tensor t1 = at::randn({100}, options);
8428 std::vector<IValue> aten_inputs = {t0, t1};
8429
8430 FusionExecutor fe;
8431 fe.compileFusion(&fusion, aten_inputs);
8432 auto outputs = fe.runFusion(aten_inputs);
8433
8434 auto ref_t2 = t0 + 1;
8435 auto ref_t3 = t1 + 1;
8436
8437 testValidate(
8438 &fusion, outputs, aten_inputs, {ref_t2, ref_t3}, __LINE__, __FILE__);
8439}
8440
8441// Repro of issue #1115
8442TEST_F(NVFuserTest, FusionPointwiseBroadcast_CUDA) {
8443 Fusion fusion;
8444 FusionGuard fg(&fusion);
8445
8446 std::vector<int64_t> input_shape{3, 17, 80};
8447 std::vector<int64_t> output_shape{3, 17, 1, 80};
8448
8449 TensorView* x = makeSymbolicTensor(input_shape.size());
8450 TensorView* bias = makeSymbolicTensor(input_shape.size());
8451 fusion.addInput(x);
8452 fusion.addInput(bias);
8453
8454 auto x_add_bias = add(x, bias);
8455 auto x_bcast = broadcast(x_add_bias, {false, false, true, false});
8456 auto y = gelu(x_bcast);
8457 fusion.addOutput(y);
8458
8459 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
8460 at::Tensor at_x = at::randn(input_shape, options);
8461 at::Tensor at_bias = at::randn(input_shape, options);
8462 std::vector<IValue> aten_inputs = {at_x, at_bias};
8463
8464 schedulePointwise(&fusion, aten_inputs);
8465
8466 FusionExecutor fe;
8467 fe.compileFusion(&fusion, aten_inputs);
8468 auto outputs = fe.runFusion(aten_inputs);
8469
8470 auto at_x_add_bias = at_x + at_bias;
8471 auto at_x_view = at::native::view(at_x_add_bias, output_shape);
8472 auto aten_y = at::gelu(at_x_view);
8473
8474 testValidate(&fusion, outputs, aten_inputs, {aten_y}, __LINE__, __FILE__);
8475}
8476
8477TEST_F(NVFuserTest, FusionPointwiseVectorize_CUDA) {
8478 Fusion fusion;
8479 FusionGuard fg(&fusion);
8480
8481 const int size = 1024 * 64;
8482
8483 TensorView* x = makeContigTensor(1);
8484 fusion.addInput(x);
8485 auto y = sin(x);
8486 fusion.addOutput(y);
8487
8488 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
8489
  // PyTorch's CUDA caching allocator should always return an aligned pointer
  // for a freshly allocated tensor
8492 at::Tensor at_x = at::randn({size}, options);
8493
8494 schedulePointwise(&fusion, {at_x});
8495
8496 for (auto x_consumer : ir_utils::consumerTvsOf(x)) {
8497 bool found_vec_in_input = false;
8498 for (auto id : x_consumer->domain()->domain()) {
8499 if (isParallelTypeVectorize(id->getParallelType())) {
8500 found_vec_in_input = true;
8501 break;
8502 }
8503 }
8504 TORCH_CHECK(found_vec_in_input, "Expect input to be vectorized");
8505 }
8506
8507 for (auto id : y->domain()->domain()) {
8508 if (isParallelTypeVectorize(id->getParallelType())) {
8509 return;
8510 }
8511 }
8512 TORCH_CHECK(false, "Expect output to be vectorized");
8513}
8514
8515TEST_F(NVFuserTest, FusionSmemAliasSerial_CUDA) {
8516 Fusion fusion;
8517 FusionGuard fg(&fusion);
8518
8519 auto tv0 = makeSymbolicTensor(1);
8520 fusion.addInput(tv0);
8521
8522 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
8523 auto tv2 = add(tv1, IrBuilder::create<Double>(1));
8524 auto tv3 = add(tv2, IrBuilder::create<Double>(1));
8525
8526 fusion.addOutput(tv3);
8527
8528 // Just set the dimension of TIDx
8529 auto tv4 = makeSymbolicTensor(1);
8530 fusion.addInput(tv4);
8531 auto tv5 = add(tv4, IrBuilder::create<Double>(1));
8532 fusion.addOutput(tv5);
8533
8534 tv1->setMemoryType(MemoryType::Shared);
8535 tv2->setMemoryType(MemoryType::Shared);
8536
8537 tv5->axis(0)->parallelize(ParallelType::TIDx);
8538
8539 // tv1 and tv2 are on shared memory and are not parallelized with
8540 // TIDx. They should be predicated as they are redundant and can
8541 // interfere with smem aliasing (issue #1100).
8542
8543 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
8544 at::Tensor t0 = at::randn({10}, options);
8545 at::Tensor t4 = at::randn({1024}, options);
8546 std::vector<IValue> aten_inputs = {t0, t4};
8547
8548 FusionExecutor fe;
8549 fe.compileFusion(&fusion, aten_inputs);
8550 auto outputs = fe.runFusion(aten_inputs);
8551
8552 auto ref1 = t0 + 3;
8553 auto ref2 = t4 + 1;
8554
8555 testValidate(&fusion, outputs, aten_inputs, {ref1, ref2}, __LINE__, __FILE__);
8556}
8557
8558TEST_F(NVFuserTest, FusionGridReductionWithNonExactParallelDimensions_CUDA) {
8559 Fusion fusion;
8560 FusionGuard fg(&fusion);
8561
8562 auto tv0 = makeSymbolicTensor(1);
8563 fusion.addInput(tv0);
8564
8565 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
8566 fusion.addOutput(tv1);
8567
8568 auto tv2 = makeSymbolicTensor(1);
8569 fusion.addInput(tv2);
8570 auto tv3 = sum(tv2, {0});
8571 fusion.addOutput(tv3);
8572
8573 tv1->axis(0)->parallelize(ParallelType::TIDx);
8574 tv3->axis(0)->parallelize(ParallelType::BIDx);
8575
8576 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
8577 at::Tensor t0 = at::randn({17}, options);
8578 at::Tensor t2 = at::randn({19}, options);
8579 std::vector<IValue> aten_inputs = {t0, t2};
8580
8581 FusionExecutor fe;
8582 fe.compileFusion(&fusion, aten_inputs);
8583 auto outputs = fe.runFusion(aten_inputs);
8584
8585 auto ref1 = t0 + 1;
8586 auto ref2 = sum(t2);
8587
8588 testValidate(&fusion, outputs, aten_inputs, {ref1, ref2}, __LINE__, __FILE__);
8589}
8590
8591TEST_F(NVFuserTest, FusionGridWelfordWithNonExactParallelDimensions_CUDA) {
8592 Fusion fusion;
8593 FusionGuard fg(&fusion);
8594
8595 auto tv0 = makeSymbolicTensor(1);
8596 fusion.addInput(tv0);
8597
8598 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
8599 fusion.addOutput(tv1);
8600
8601 auto tv2 = makeSymbolicTensor(1);
8602 fusion.addInput(tv2);
8603 auto tv3 = Welford(tv2, {0}).avg;
8604 fusion.addOutput(tv3);
8605
8606 tv1->axis(0)->parallelize(ParallelType::TIDx);
8607 tv3->axis(0)->parallelize(ParallelType::BIDx);
8608
8609 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
8610 at::Tensor t0 = at::randn({17}, options);
8611 at::Tensor t2 = at::randn({19}, options);
8612 std::vector<IValue> aten_inputs = {t0, t2};
8613
8614 FusionExecutor fe;
8615 fe.compileFusion(&fusion, aten_inputs);
8616 auto outputs = fe.runFusion(aten_inputs);
8617
8618 auto ref1 = t0 + 1;
8619 auto ref2 = mean(t2, {0});
8620
8621 testValidate(&fusion, outputs, aten_inputs, {ref1, ref2}, __LINE__, __FILE__);
8622}
8623
8624TEST_F(NVFuserTest, FusionGridReductionWithNonExactParallelDimensions2_CUDA) {
8625 Fusion fusion;
8626 FusionGuard fg(&fusion);
8627
8628 auto tv0 = makeSymbolicTensor(2);
8629 fusion.addInput(tv0);
8630
8631 auto tv1 = sum(tv0, {0, 1});
8632 fusion.addOutput(tv1);
8633
8634 auto tv2 = makeSymbolicTensor(3);
8635 fusion.addInput(tv2);
8636 auto tv3 = add(tv2, IrBuilder::create<Double>(1));
8637 fusion.addOutput(tv3);
8638
8639 auto tv4 = makeSymbolicTensor(3);
8640 fusion.addInput(tv4);
8641 auto tv5 = add(tv4, IrBuilder::create<Double>(1));
8642 fusion.addOutput(tv5);
8643
8644 tv1->axis(0)->parallelize(ParallelType::BIDx);
8645 tv1->axis(1)->parallelize(ParallelType::TIDx);
8646
8647 tv3->axis(0)->parallelize(ParallelType::TIDx);
8648 tv3->axis(1)->parallelize(ParallelType::TIDy);
8649 tv3->axis(2)->parallelize(ParallelType::TIDz);
8650
8651 tv5->axis(0)->parallelize(ParallelType::BIDx);
8652 tv5->axis(1)->parallelize(ParallelType::BIDy);
8653 tv5->axis(2)->parallelize(ParallelType::BIDz);
8654
8655 // TODO: This needs a fix for issue #1102.
8656 // Also, need to allow predicated grid reductions.
8657#if 0
8658 FusionExecutor fe;
8659 fe.compileFusion(&fusion);
8660
8661 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
8662 at::Tensor t0 = at::randn({2, 3}, options);
8663 at::Tensor t2 = at::randn({5, 6, 7}, options);
8664 at::Tensor t4 = at::randn({8, 9, 10}, options);
8665 std::vector<IValue> aten_inputs = {t0, t2, t4};
8666 auto outputs = fe.runFusion(aten_inputs);
8667
8668 auto ref1 = t0.sum(at::IntArrayRef{0, 1});
8669 auto ref2 = t2 + 1;
8670 auto ref3 = t4 + 1;
8671
8672 testValidate(
8673 &fusion, outputs, aten_inputs, {ref1, ref2, ref3}, __LINE__, __FILE__);
8674#endif
8675}
8676
8677TEST_F(NVFuserTest, FusionGridWelfordWithNonExactParallelDimensions2_CUDA) {
8678 Fusion fusion;
8679 FusionGuard fg(&fusion);
8680
8681 auto tv0 = makeSymbolicTensor(2);
8682 fusion.addInput(tv0);
8683
8684 auto tvs = Welford(tv0, {0, 1});
8685 fusion.addOutput(tvs.avg);
8686
8687 auto tv2 = makeSymbolicTensor(3);
8688 fusion.addInput(tv2);
8689 auto tv3 = add(tv2, IrBuilder::create<Double>(1));
8690 fusion.addOutput(tv3);
8691
8692 auto tv4 = makeSymbolicTensor(3);
8693 fusion.addInput(tv4);
8694 auto tv5 = add(tv4, IrBuilder::create<Double>(1));
8695 fusion.addOutput(tv5);
8696
8697 tvs.avg->axis(0)->parallelize(ParallelType::BIDx);
8698 tvs.avg->axis(1)->parallelize(ParallelType::TIDx);
8699
8700 tv3->axis(0)->parallelize(ParallelType::TIDx);
8701 tv3->axis(1)->parallelize(ParallelType::TIDy);
8702 tv3->axis(2)->parallelize(ParallelType::TIDz);
8703
8704 tv5->axis(0)->parallelize(ParallelType::BIDx);
8705 tv5->axis(1)->parallelize(ParallelType::BIDy);
8706 tv5->axis(2)->parallelize(ParallelType::BIDz);
8707
8708 // TODO: needs a fix for issue #1102
8709 // Also, need to allow predicated grid reductions.
8710#if 0
8711 FusionExecutor fe;
8712 fe.compileFusion(&fusion);
8713
8714 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
8715 at::Tensor t0 = at::randn({2, 3}, options);
8716 at::Tensor t2 = at::randn({5, 6, 7}, options);
8717 at::Tensor t4 = at::randn({8, 9, 10}, options);
8718 std::vector<IValue> aten_inputs = {t0, t2, t4};
8719 auto outputs = fe.runFusion(aten_inputs);
8720
8721 auto ref1 = t0.mean(at::IntArrayRef{0, 1});
8722 auto ref2 = t2 + 1;
8723 auto ref3 = t4 + 1;
8724
8725 testValidate(
8726 &fusion, outputs, aten_inputs, {ref1, ref2, ref3}, __LINE__, __FILE__);
8727#endif
8728}
8729
8730// Repro of issue #1102
8731TEST_F(NVFuserTest, FusionPredicateParallelizedDomains_CUDA) {
8732 Fusion fusion;
8733 FusionGuard fg(&fusion);
8734
8735 auto tv0 = makeSymbolicTensor(1);
8736 fusion.addInput(tv0);
8737
8738 // Just to make TIDx/y/z non-exact
8739 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
8740 auto tv2 = add(tv1, IrBuilder::create<Double>(1));
8741 auto tv3 = add(tv2, IrBuilder::create<Double>(1));
8742 fusion.addOutput(tv3);
8743
8744 auto tv4 = makeSymbolicTensor(1);
8745 fusion.addInput(tv4);
8746
8747 auto tv5 = add(tv4, IrBuilder::create<Double>(1));
8748 auto tv6 = add(tv5, IrBuilder::create<Double>(1));
8749 auto tv7 = add(tv6, IrBuilder::create<Double>(1));
8750 auto tv8 = add(tv7, IrBuilder::create<Double>(1));
8751 auto tv9 = sum(tv8, {0});
8752 fusion.addOutput(tv9);
8753
8754 tv1->split(0, 5);
8755 tv1->axis(-1)->parallelize(ParallelType::TIDx);
8756 tv1->setMemoryType(MemoryType::Shared);
8757 tv2->split(0, 6);
8758 tv2->axis(-1)->parallelize(ParallelType::TIDy);
8759 tv2->setMemoryType(MemoryType::Shared);
8760 tv3->split(0, 7);
8761 tv3->axis(-1)->parallelize(ParallelType::TIDz);
8762
8763 tv9->split(0, 4);
8764 tv4->computeAt(tv9, 1);
8765
8766 tv4->axis(-1)->parallelize(ParallelType::TIDx);
8767 tv5->axis(-1)->parallelize(ParallelType::TIDy);
8768 tv6->axis(-1)->parallelize(ParallelType::TIDz);
8769 tv7->axis(-1)->parallelize(ParallelType::TIDz);
8770 tv8->axis(-1)->parallelize(ParallelType::TIDz);
8771 tv9->axis(-1)->parallelize(ParallelType::TIDz);
8772 tv9->axis(0)->parallelize(ParallelType::BIDx);
8773
8774 tv5->setMemoryType(MemoryType::Shared);
8775
8776 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
8777 at::Tensor t0 = at::randn({17}, options);
8778 at::Tensor t4 = at::randn({19}, options);
8779 std::vector<IValue> aten_inputs = {t0, t4};
8780
8781 FusionExecutor fe;
8782 fe.compileFusion(&fusion, aten_inputs);
8783 auto outputs = fe.runFusion(aten_inputs);
8784
8785 auto ref1 = t0 + 3;
8786 auto ref2 = sum(t4 + 4);
8787
8788 testValidate(&fusion, outputs, aten_inputs, {ref1, ref2}, __LINE__, __FILE__);
8789}
8790
8791// Repro of #1102 and #1129
8792TEST_F(NVFuserTest, FusionSmemPredicateUnswitch_CUDA) {
8793 if (!deviceMajorMinorCheck(7)) {
8794 GTEST_SKIP() << "skipping tests on pre-Volta GPUs";
8795 return;
8796 }
8797 Fusion fusion;
8798 FusionGuard fg(&fusion);
8799
8800 auto tv0 = makeSymbolicTensor(1);
8801 fusion.addInput(tv0);
8802 auto tv1 = makeSymbolicTensor(1);
8803 fusion.addInput(tv1);
8804
8805 auto tv2 = add(tv0, IrBuilder::create<Double>(1));
8806 auto tv3 = add(tv2, IrBuilder::create<Double>(1));
8807 auto tv4 = add(tv3, IrBuilder::create<Double>(1));
8808 auto tv5 = add(tv4, IrBuilder::create<Double>(1));
8809 fusion.addOutput(tv5);
8810
8811 // Just to make TIDx/y/z non-exact
8812 auto tvx = add(tv1, IrBuilder::create<Double>(1));
8813 auto tvy = add(tvx, IrBuilder::create<Double>(1));
8814 auto tvz = add(tvy, IrBuilder::create<Double>(1));
8815 fusion.addOutput(tvz);
8816
  tv5->split(0, 4);
  tv0->computeAt(tv5, 1);

  tv0->axis(-1)->parallelize(ParallelType::TIDx);
  tv2->axis(-1)->parallelize(ParallelType::TIDy);
  tv3->axis(-1)->parallelize(ParallelType::TIDz);
  tv4->axis(-1)->parallelize(ParallelType::TIDx);
  tv5->axis(-1)->parallelize(ParallelType::TIDy);
  tv5->axis(0)->parallelize(ParallelType::Unswitch);

  tvx->split(0, 5);
  tvx->axis(-1)->parallelize(ParallelType::TIDx);
  tvy->split(0, 6);
  tvy->axis(-1)->parallelize(ParallelType::TIDy);
  tvz->split(0, 7);
  tvz->axis(-1)->parallelize(ParallelType::TIDz);

  for (auto tv : {tv2, tv3, tv4, tvx, tvy}) {
    tv->setMemoryType(MemoryType::Shared);
  }

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({17}, options);
  at::Tensor t1 = at::randn({19}, options);
  std::vector<IValue> aten_inputs = {t0, t1};

  FusionExecutor fe;
  fe.compileFusion(&fusion, aten_inputs);
  auto outputs = fe.runFusion(aten_inputs);

  auto ref1 = t0 + 4;
  auto ref2 = t1 + 3;

  testValidate(&fusion, outputs, aten_inputs, {ref1, ref2}, __LINE__, __FILE__);
}

// Repro of issue #1136
TEST_F(NVFuserTest, FusionFloatPow_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);

  auto tv1 = binaryOp(BinaryOpType::Pow, tv0, IrBuilder::create<Int>(4));
  // To check if pow(tv0, 2) is replaced with tv0 * tv0
  auto tv2 = binaryOp(BinaryOpType::Pow, tv0, IrBuilder::create<Int>(2));
  // To check if pow(tv0, 2.0) is replaced with tv0 * tv0
  auto tv3 = binaryOp(BinaryOpType::Pow, tv0, IrBuilder::create<Double>(2));
  auto tv4 = binaryOp(BinaryOpType::Pow, tv0, IrBuilder::create<Int>(3));
  auto tv5 = binaryOp(BinaryOpType::Pow, tv0, IrBuilder::create<Double>(3));
  auto s = binaryOp(
      BinaryOpType::Pow,
      IrBuilder::create<Double>(3),
      IrBuilder::create<Double>(3));
  auto tv6 = add(tv0, s);

  fusion.addOutput(tv1);
  fusion.addOutput(tv2);
  fusion.addOutput(tv3);
  fusion.addOutput(tv4);
  fusion.addOutput(tv5);
  fusion.addOutput(tv6);

  tv1->split(0, 32);
  tv1->axis(0)->parallelize(ParallelType::BIDx);
  tv1->axis(1)->parallelize(ParallelType::TIDx);

  TransformPropagatorWithCheck propagator(tv1);
  MaxRootDomainInfoSpanningTree(tv1).traverse(&propagator);
  scheduler_utils::parallelizeAllLike(tv1, {tv2, tv3, tv4, tv5, tv6});

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({1000}, options);
  // Negative inputs cause NaNs in the fuser since use_fast_math is enabled
  t0 = abs(t0);
  std::vector<IValue> aten_inputs = {t0};

  FusionExecutor fe;
  fe.compileFusion(&fusion, aten_inputs);
  auto outputs = fe.runFusion(aten_inputs);

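  // tv2/tv3 both compute t0^2 and tv4/tv5 both compute t0^3, so the p2 and
  // p3 references are each used twice below.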
  auto p4 = at::pow(t0, 4);
  auto p2 = at::pow(t0, 2);
  auto p3 = at::pow(t0, 3);
  auto t6 = t0 + std::pow(3, 3);

  testValidate(
      &fusion,
      outputs,
      aten_inputs,
      {p4, p2, p2, p3, p3, t6},
      __LINE__,
      __FILE__);
}

TEST_F(NVFuserTest, FusionIssue1127_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  const int numel = 4;

  auto tv0 = makeConcreteTensor({numel});
  fusion.addInput(tv0);

  auto tv1 = sum(tv0, {0});
  auto tv2 = broadcast(tv1, {true});

  auto tv3 = makeConcreteTensor({numel, numel});
  fusion.addInput(tv3);

  auto tv4 = sum(tv3, {1});

  auto tv5 = add(tv2, tv4);
  fusion.addOutput(tv5);

  tv1->axis(0)->parallelize(ParallelType::TIDx);
  tv2->axis(0)->parallelize(ParallelType::TIDx);
  tv4->axis(1)->parallelize(ParallelType::TIDx);
  tv5->axis(0)->parallelize(ParallelType::TIDx);

  // Lowering should fail since tv5 is predicated and parallelized with TIDx.
  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
  ASSERT_ANY_THROW(fusion.printKernel());
}

TEST_F(NVFuserTest, FusionChannelsLastParser_CUDA) {
  // This test may not pass when using a custom block sync, as there may
  // be additional sync calls. Skip the test since it is not specifically
  // about block synchronization.
  if (std::getenv("PYTORCH_NVFUSER_USE_BLOCK_SYNC_ATOMIC")) {
    return;
  }
  auto g = std::make_shared<Graph>();
  const auto graph0_string = R"IR(
  graph(%0 : Half(8, 4, 10, 16, strides=[640, 1, 64, 4]),
        %1 : Half(8, 4, 10, 16, strides=[640, 160, 16, 1])):
    %o.1 : Half(8, 4, 10, 16, strides=[640, 1, 64, 4]) = aten::mul(%0, %1) # sum_dyn.py:5:6
    %3 : Half(8, 4, 10, 16, strides=[640, 1, 64, 4]) = aten::relu(%o.1) # sum_dyn.py:6:9
    return (%3))IR";
  parseIR(graph0_string, g.get());

  // Strides are not yet supported in the irparser, so set them explicitly.
  {
    auto val = g->block()->inputs()[0];
    val->setType(val->type()->castRaw<TensorType>()->withSizesStrides(
        {8, 4, 10, 16}, {640, 1, 64, 4}));
  }

  {
    auto val = g->block()->inputs()[1];
    val->setType(val->type()->castRaw<TensorType>()->withSizesStrides(
        {8, 4, 10, 16}, {640, 160, 16, 1}));
  }

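  // Give the outputs complete types as well, marking them channels-last to
  // match %o.1 and %3 in the IR string above.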
  for (auto node : g->block()->nodes()) {
    for (auto val : node->outputs()) {
      if (val->isCompleteTensor())
        val->setType(val->type()->castRaw<TensorType>()->withSizesStrides(
            {8, 4, 10, 16}, {640, 1, 64, 4}));
    }
  }

  auto fusion = parseJitIR(g);
  FusionGuard fg(fusion.get());
  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
  at::Tensor input0 =
      at::randn({2, 2, 2, 16}, options).clone(c10::MemoryFormat::ChannelsLast);
  at::Tensor input1 = at::randn({2, 2, 2, 16}, options);
  auto lparams = schedulePointwise(fusion.get(), {input0, input1});

  // CONSIDER:
  // 1. this can be moved to a dedicated "golden" file
  // 2. use a fuzzy compare (ignore non-significant whitespaces for example)
  const std::string expected_kernel = R"(
__global__ void CUDAGeneratedKernel(Tensor<__half, 4> T0, Tensor<__half, 4> T2, Tensor<__half, 4> T7) {
  int64_t i165;
  i165 = (((nvfuser_index_t)blockIdx.x) * 128) + ((nvfuser_index_t)threadIdx.x);
  if ((i165 < (T0.size[0] * (T0.size[1] * (T0.size[2] * T0.size[3]))))) {
    __half T9[1];
    T9[0] = 0;
    T9[0]
      = T2[((((((nvfuser_index_t)blockIdx.x) * 128) + ((nvfuser_index_t)threadIdx.x)) / (T0.size[1] * (T0.size[2] * T0.size[3]))) * ((T0.size[2] * T0.size[1]) * T0.size[3])) + ((((((((nvfuser_index_t)blockIdx.x) * 128) + ((nvfuser_index_t)threadIdx.x)) % (T0.size[1] * (T0.size[2] * T0.size[3]))) % (T0.size[2] * T0.size[3])) % T0.size[3]) * (T0.size[2] * T0.size[1])) + (((((((nvfuser_index_t)blockIdx.x) * 128) + ((nvfuser_index_t)threadIdx.x)) % (T0.size[1] * (T0.size[2] * T0.size[3]))) / (T0.size[2] * T0.size[3])) * T0.size[2]) + (((((((nvfuser_index_t)blockIdx.x) * 128) + ((nvfuser_index_t)threadIdx.x)) % (T0.size[1] * (T0.size[2] * T0.size[3]))) % (T0.size[2] * T0.size[3])) / T0.size[3])];
    __half T8[1];
    T8[0] = 0;
    T8[0]
      = T0[i165];
    float T3[1];
    T3[0]
      = __half2float(T9[0]);
    float T4[1];
    T4[0]
      = T3[0];
    float T1[1];
    T1[0]
      = __half2float(T8[0]);
    float T5[1];
    T5[0]
      = T1[0]
      * T4[0];
    float T6[1];
    T6[0]
      = relu(T5[0]);
    __half T10[1];
    T10[0]
      = __float2half(T6[0]);
    T7[i165]
      = T10[0];
  }
}
)";

  const std::string actual_kernel =
      "\n" + codegen::generateCudaKernel(GpuLower(fusion.get()).kernel());

  if (expected_kernel.size() != actual_kernel.size() ||
      expected_kernel.compare(actual_kernel) != 0) {
    std::cerr
        << " Codegen mismatch, codegen possibly changed, or is incorrect. "
        << " \n ========= EXPECTED ========= \n"
        << expected_kernel << "\n========= ACTUAL ========== \n"
        << actual_kernel << "\n=================" << std::endl;
    auto it = std::mismatch(
        expected_kernel.begin(),
        expected_kernel.end(),
        actual_kernel.begin(),
        actual_kernel.end());
    std::string actual_mismatched_snippet(it.second, actual_kernel.end());
    actual_mismatched_snippet = actual_mismatched_snippet.substr(0, 10);
    std::string expected_mismatched_snippet(it.first, expected_kernel.end());
    expected_mismatched_snippet = expected_mismatched_snippet.substr(0, 10);
    std::cerr << "First mismatch found at: " << actual_mismatched_snippet
              << ", expected: " << expected_mismatched_snippet << std::endl;
    TORCH_CHECK(false);
  }

  // TODO: runFusion hits an assertion. I'm probably doing something wrong here.
  // FusionExecutor fe;
  // fe.compileFusion(fusion.get());
  // auto outputs = fe.runFusion({input0, input1}, lparams);
  // at::Tensor output_ref = (input0 * input1).relu();
  // TORCH_CHECK(output_ref.equal(outputs[0]));
}

TEST_F(NVFuserTest, FusionThreadPredicateUnswitch_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeConcreteTensor({10, 1024});
  fusion.addInput(tv0);

  auto tv1 = sum(tv0, {1});
  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
  auto tv3 = add(tv2, IrBuilder::create<Double>(1));

  fusion.addOutput(tv3);

  tv1->axis(-1)->parallelize(ParallelType::TIDx);
  tv2->computeAt(tv3, -1);
  tv3->axis(0)->parallelize(ParallelType::Unswitch);
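  // tv2 and tv3 consume the TIDx-parallelized reduction tv1, so they are
  // effectively computed by a single thread; the unswitched loop still
  // needs to carry that thread predicate.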

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({10, 1024}, options);
  std::vector<IValue> aten_inputs = {t0};

  FusionExecutor fe;
  fe.compileFusion(&fusion, aten_inputs);
  auto outputs = fe.runFusion(aten_inputs);

  auto ref = sum(t0, {1}) + 2;

  testValidate(&fusion, outputs, aten_inputs, {ref}, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionNonContigOutputs_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);

  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
  fusion.addOutput(tv1);

  tv1->setContiguity(false);
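  // With the output marked non-contiguous, runFusion can write into the
  // explicitly strided at_output tensor allocated below.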

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor at_input = at::randn({10}, options);
  at::Tensor at_output = at::empty_strided({10}, {2}, options);

  FusionExecutor fe;
  fe.compileFusion(&fusion, {at_input});
  auto returned_outputs = fe.runFusion({at_input}, {at_output});

  // Returned outputs should only contain one tensor that is the same
  // as the output tensor given to runFusion
  TORCH_CHECK(returned_outputs.size() == 1);
  TORCH_CHECK(returned_outputs[0].is_same(at_output));
  TORCH_CHECK(!returned_outputs[0].is_contiguous());

  auto at_ref = at_input + 1;

  testValidate(&fusion, {at_output}, {at_input}, {at_ref}, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionTestWarpSoftMax_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Setup softmax fusion
  auto input = makeContigTensor(2);
  fusion.addInput(input);
  auto output = softmax(input, 1);
  fusion.addOutput(output);

  // Setup runtime input
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({8, 16 * 197}, options);
  std::vector<c10::IValue> aten_inputs({aten_input});

  // Schedule through magic scheduler
  SchedulerRuntimeInfo runtime_info(&fusion, aten_inputs, true);
  TORCH_CHECK(SchedulerEntry::canSchedule(
      ScheduleHeuristic::Persistent, &fusion, runtime_info));
  auto scheduler = SchedulerEntry::makeEntry(
      ScheduleHeuristic::Persistent, &fusion, runtime_info);
  scheduler->schedule(&fusion);

  // Modify the schedule to use warp reduction
  auto used_vals = fusion.usedMathVals();
  for (auto tv : ir_utils::filterByType<TensorView>(used_vals)) {
    for (IterDomain* id : tv->domain()->domain()) {
      if (id->getParallelType() == ParallelType::TIDx) {
        id->padToMultipleOfWarp();
      }
    }
  }
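  // Padding the TIDx extent to a multiple of the warp size allows the
  // reductions to be lowered with warp primitives.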

  // Test result
  FusionExecutor fe;
  fe.compileFusion(&fusion, aten_inputs);
  auto outputs = fe.runFusion(aten_inputs);
  auto ref_output = at::_softmax(aten_input, 1, false);
  testValidate(&fusion, outputs, aten_inputs, {ref_output}, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionIssue1133_CUDA) {
  if (!deviceMajorMinorCheck(7)) {
    GTEST_SKIP() << "skipping tests on pre-Volta GPUs";
    return;
  }
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);

  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
  auto tv2 = sum(tv1, {1});
  auto tv3 = add(tv2, IrBuilder::create<Double>(1));

  fusion.addOutput(tv3);

  tv0->computeAt(tv3, 1);

  const int split_factor = 32;

  tv2->split(-1, split_factor);
  tv1->computeAt(tv2, -2);

  tv1->axis(-1)->parallelize(ParallelType::TIDx);
  tv2->axis(-1)->parallelize(ParallelType::TIDx);

  tv3->axis(0)->parallelize(ParallelType::Unswitch);

  tv1->setMemoryType(MemoryType::Shared);
  tv2->setMemoryType(MemoryType::Shared);

  // Both tv1 and tv2 should be allocated at the top-level scope: tv1 with
  // split_factor elements and tv2 with a single element, since their TIDx
  // axes are parallelized away.
  GpuLower gpulw(&fusion);
  bool tv1_validated = false;
  bool tv2_validated = false;
  for (const auto& kir_node : gpulw.kernel()->topLevelExprs()) {
    if (auto alloc = dynamic_cast<kir::Allocate*>(kir_node)) {
      auto size = alloc->size();
      if (!(alloc->buffer()->name() == 1 || alloc->buffer()->name() == 2)) {
        // There should be no allocation other than those for tv1 and tv2
        TORCH_CHECK(false, "Invalid allocation detected");
      }
      TORCH_CHECK(size->isA<Int>(), "Invalid allocation size");
      TORCH_CHECK(size->as<Int>()->isConst(), "Allocation not constant");
      auto size_int = size->as<Int>()->value().value();
      if (alloc->buffer()->name() == 1) {
        TORCH_CHECK(
            size_int == split_factor,
            "Invalid allocation size: ",
            size->as<Int>()->value().value());
        tv1_validated = true;
      } else {
        TORCH_CHECK(
            size_int == 1,
            "Invalid allocation size: ",
            size->as<Int>()->value().value());
        tv2_validated = true;
      }
    }
  }

  TORCH_CHECK(tv1_validated, "Failed to validate tv1 allocation");
  TORCH_CHECK(tv2_validated, "Failed to validate tv2 allocation");

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({99, 101}, options);
  std::vector<IValue> aten_inputs = {t0};

  FusionExecutor fe;
  fe.compileFusion(&fusion, aten_inputs);
  auto outputs = fe.runFusion(aten_inputs);

  auto ref = (t0 + 1).sum({1}) + 1;

  testValidate(&fusion, outputs, aten_inputs, {ref}, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionRfactorContigIDs_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);

  auto tv1 = sum(tv0, {1});
  fusion.addOutput(tv1);

  tv1->split(1, 32);

  auto tv2 = tv1->rFactor({1});

  // This merged domain is not contiguous.
  tv2->merge(0, 2);

  tv2->setMemoryType(MemoryType::Shared);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({99, 101}, options);
  std::vector<IValue> aten_inputs = {t0};

  FusionExecutor fe;
  fe.compileFusion(&fusion, aten_inputs);
  auto outputs = fe.runFusion(aten_inputs);

  auto ref = t0.sum({1});

  testValidate(&fusion, outputs, aten_inputs, {ref}, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionPersistentBufferCalculation1_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);

  auto tv1 = set(tv0);
  auto tv2 = sum(tv1, {1});
  auto tv3 = broadcast(tv2, {false, true});
  auto tv4 = set(tv1);
  auto tv5 = add(tv3, tv4);
  fusion.addOutput(tv5);

  auto persistent_buffer_info = scheduler_utils::persistentBuffers(&fusion);

  auto isTvWithinVec = [](std::vector<TensorView*>& vec, TensorView* tv) {
    return std::find(vec.begin(), vec.end(), tv) != vec.end();
  };

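  // Find a buffer's entry in the resolution-point table; the resolution
  // vector is index-aligned with the buffer vector.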
  auto tvEntryInVecVec = [](std::vector<std::vector<TensorView*>>& vec_o_vec,
                            std::vector<TensorView*>& buffer_vec,
                            TensorView* tv) {
    auto buffer_it = std::find(buffer_vec.begin(), buffer_vec.end(), tv);
    return vec_o_vec.begin() + std::distance(buffer_vec.begin(), buffer_it);
  };

  auto& buffers = persistent_buffer_info.persistent_buffers;
  auto& resolution = persistent_buffer_info.persistent_buffer_resolution_points;
  auto& projectable = persistent_buffer_info.projectable_persistent_buffers;
  auto& projectable_inputs = persistent_buffer_info.projectable_buffer_inputs;

  TORCH_INTERNAL_ASSERT(buffers.size() == 1);
  TORCH_INTERNAL_ASSERT(resolution.size() == 1 && resolution[0].size() == 1);
  TORCH_INTERNAL_ASSERT(projectable.size() == 1);
  TORCH_INTERNAL_ASSERT(projectable_inputs.size() == 1);

  TORCH_INTERNAL_ASSERT(isTvWithinVec(buffers, tv1));
  TORCH_INTERNAL_ASSERT(isTvWithinVec(projectable, tv1));
  TORCH_INTERNAL_ASSERT(isTvWithinVec(projectable_inputs, tv0));

  auto tv1_resolution_it = tvEntryInVecVec(resolution, buffers, tv1);
  TORCH_INTERNAL_ASSERT(tv1_resolution_it != resolution.end());

  TORCH_INTERNAL_ASSERT(isTvWithinVec(*tv1_resolution_it, tv5));

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_t0 = at::randn({99, 101}, options);

  // Schedule through magic scheduler
  SchedulerRuntimeInfo runtime_info(&fusion, {aten_t0}, true);
  auto persistent_buffer_size =
      persistentBufferSize(&fusion, runtime_info, persistent_buffer_info);

  TORCH_INTERNAL_ASSERT(
      persistent_buffer_size.persistent_buffer_size ==
      static_cast<int64_t>(aten_t0.size(1) * dataTypeSize(DataType::Float)));
  TORCH_INTERNAL_ASSERT(
      persistent_buffer_size.projected_persistent_buffer_size ==
      static_cast<int64_t>(aten_t0.size(1) * dataTypeSize(DataType::Float)));
}

TEST_F(NVFuserTest, FusionPersistentBufferCalculation2_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(2, DataType::Half);
  fusion.addInput(tv0);

  auto tv1 = castOp(DataType::Float, tv0);
  auto tv2 = sum(tv1, {1});
  auto tv3 = broadcast(tv2, {false, true});
  auto tv4 = set(tv1);
  auto tv5 = add(tv3, tv4);
  auto tv6 = castOp(DataType::Half, tv5);
  fusion.addOutput(tv6);

  auto persistent_buffer_info = scheduler_utils::persistentBuffers(&fusion);

  auto isTvWithinVec = [](std::vector<TensorView*>& vec, TensorView* tv) {
    return std::find(vec.begin(), vec.end(), tv) != vec.end();
  };

  // Find a buffer's entry in the resolution-point table; the resolution
  // vector is index-aligned with the buffer vector.
  auto tvEntryInVecVec = [](std::vector<std::vector<TensorView*>>& vec_o_vec,
                            std::vector<TensorView*>& buffer_vec,
                            TensorView* tv) {
    auto buffer_it = std::find(buffer_vec.begin(), buffer_vec.end(), tv);
    return vec_o_vec.begin() + std::distance(buffer_vec.begin(), buffer_it);
  };

  auto& buffers = persistent_buffer_info.persistent_buffers;
  auto& resolution = persistent_buffer_info.persistent_buffer_resolution_points;
  auto& projectable = persistent_buffer_info.projectable_persistent_buffers;
  auto& projectable_inputs = persistent_buffer_info.projectable_buffer_inputs;

  TORCH_INTERNAL_ASSERT(buffers.size() == 1);
  TORCH_INTERNAL_ASSERT(resolution.size() == 1 && resolution[0].size() == 1);
  TORCH_INTERNAL_ASSERT(projectable.size() == 1);
  TORCH_INTERNAL_ASSERT(projectable_inputs.size() == 1);

  TORCH_INTERNAL_ASSERT(isTvWithinVec(buffers, tv1));
  TORCH_INTERNAL_ASSERT(isTvWithinVec(projectable, tv1));
  TORCH_INTERNAL_ASSERT(isTvWithinVec(projectable_inputs, tv0));

  auto tv1_resolution_it = tvEntryInVecVec(resolution, buffers, tv1);
  TORCH_INTERNAL_ASSERT(tv1_resolution_it != resolution.end());

  TORCH_INTERNAL_ASSERT(isTvWithinVec(*tv1_resolution_it, tv5));

  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
  at::Tensor aten_t0 = at::randn({99, 101}, options);

  // Schedule through magic scheduler
  SchedulerRuntimeInfo runtime_info(&fusion, {aten_t0}, true);
  auto persistent_buffer_size =
      persistentBufferSize(&fusion, runtime_info, persistent_buffer_info);

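  // The unprojected buffer is the fp32 cast of the input (4 bytes per
  // element); projecting back to the fp16 input halves that footprint.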
  TORCH_INTERNAL_ASSERT(
      persistent_buffer_size.persistent_buffer_size ==
      static_cast<int64_t>(aten_t0.size(1) * dataTypeSize(DataType::Float)));
  TORCH_INTERNAL_ASSERT(
      persistent_buffer_size.projected_persistent_buffer_size ==
      static_cast<int64_t>(aten_t0.size(1) * dataTypeSize(DataType::Half)));
}

TEST_F(NVFuserTest, FusionPersistentBufferCalculation3_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(2, DataType::Half);
  fusion.addInput(tv0);

  auto tv1 = castOp(DataType::Float, tv0);
  auto tv2 = set(tv1);
  auto tv3 = sum(tv2, {1});
  auto tv4 = broadcast(tv3, {false, true});

  auto tv5 = makeSymbolicTensor(2, DataType::Half);
  fusion.addInput(tv5);

  auto tv6 = castOp(DataType::Float, tv5);

  auto tv7 = add(tv6, tv4);
  auto tv8 = set(tv1);
  auto tv9 = add(tv7, tv8);
  auto tv10 = sum(tv9, {1});
  auto tv11 = broadcast(tv10, {false, true});
  auto tv12 = set(tv7);
  auto tv13 = add(tv12, tv11);

  fusion.addOutput(tv13);

  auto persistent_buffer_info = scheduler_utils::persistentBuffers(&fusion);

  auto isTvWithinVec = [](std::vector<TensorView*>& vec, TensorView* tv) {
    return std::find(vec.begin(), vec.end(), tv) != vec.end();
  };

  // Find a buffer's entry in the resolution-point table; the resolution
  // vector is index-aligned with the buffer vector.
  auto tvEntryInVecVec = [](std::vector<std::vector<TensorView*>>& vec_o_vec,
                            std::vector<TensorView*>& buffer_vec,
                            TensorView* tv) {
    auto buffer_it = std::find(buffer_vec.begin(), buffer_vec.end(), tv);
    return vec_o_vec.begin() + std::distance(buffer_vec.begin(), buffer_it);
  };

  auto& buffers = persistent_buffer_info.persistent_buffers;
  auto& resolution = persistent_buffer_info.persistent_buffer_resolution_points;
  auto& projectable = persistent_buffer_info.projectable_persistent_buffers;
  auto& projectable_inputs = persistent_buffer_info.projectable_buffer_inputs;

  TORCH_INTERNAL_ASSERT(buffers.size() == 2);
  TORCH_INTERNAL_ASSERT(
      resolution.size() == 2 && resolution[0].size() == 1 &&
      resolution[1].size() == 1);
  TORCH_INTERNAL_ASSERT(projectable.size() == 1);
  TORCH_INTERNAL_ASSERT(projectable_inputs.size() == 1);

  TORCH_INTERNAL_ASSERT(
      isTvWithinVec(buffers, tv1) && isTvWithinVec(buffers, tv7));
  TORCH_INTERNAL_ASSERT(
      isTvWithinVec(projectable, tv1) && !isTvWithinVec(projectable, tv7));

  TORCH_INTERNAL_ASSERT(isTvWithinVec(projectable_inputs, tv0));

  auto tv1_resolution_it = tvEntryInVecVec(resolution, buffers, tv1);
  TORCH_INTERNAL_ASSERT(tv1_resolution_it != resolution.end());
  TORCH_INTERNAL_ASSERT(isTvWithinVec(*tv1_resolution_it, tv9));

  auto tv7_resolution_it = tvEntryInVecVec(resolution, buffers, tv7);
  TORCH_INTERNAL_ASSERT(tv7_resolution_it != resolution.end());
  TORCH_INTERNAL_ASSERT(isTvWithinVec(*tv7_resolution_it, tv13));

  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
  at::Tensor aten_t0 = at::randn({99, 101}, options);
  at::Tensor aten_t5 = at::randn({99, 101}, options);

  // Schedule through magic scheduler
  SchedulerRuntimeInfo runtime_info(&fusion, {aten_t0, aten_t5}, true);
  auto persistent_buffer_size =
      persistentBufferSize(&fusion, runtime_info, persistent_buffer_info);

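  // tv1 can be projected back to the fp16 input, but tv7 cannot, so the
  // projected total per row is one Half buffer plus one Float buffer.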
  TORCH_INTERNAL_ASSERT(
      persistent_buffer_size.persistent_buffer_size ==
      static_cast<int64_t>(
          aten_t0.size(1) * dataTypeSize(DataType::Float) * 2));
  TORCH_INTERNAL_ASSERT(
      persistent_buffer_size.projected_persistent_buffer_size ==
      static_cast<int64_t>(
          aten_t0.size(1) *
          (dataTypeSize(DataType::Half) + dataTypeSize(DataType::Float))));
}

TEST_F(NVFuserTest, FusionPersistentBufferCalculation4_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(2, DataType::Half);
  fusion.addInput(tv0);

  auto tv1 = castOp(DataType::Float, tv0);
  auto tv2 = set(tv1);
  auto tv3 = sum(tv2, {1});
  auto tv4 = broadcast(tv3, {false, true});
  auto tv5 = set(tv1);
  auto tv6 = add(tv4, tv5);
  auto tv7 = set(tv2);
  auto tv8 = add(tv7, tv6);
  auto tv9 = castOp(DataType::Half, tv8);

  fusion.addOutput(tv9);

  auto persistent_buffer_info = scheduler_utils::persistentBuffers(&fusion);

  auto isTvWithinVec = [](std::vector<TensorView*>& vec, TensorView* tv) {
    return std::find(vec.begin(), vec.end(), tv) != vec.end();
  };

  // Find a buffer's entry in the resolution-point table; the resolution
  // vector is index-aligned with the buffer vector.
  auto tvEntryInVecVec = [](std::vector<std::vector<TensorView*>>& vec_o_vec,
                            std::vector<TensorView*>& buffer_vec,
                            TensorView* tv) {
    auto buffer_it = std::find(buffer_vec.begin(), buffer_vec.end(), tv);
    return vec_o_vec.begin() + std::distance(buffer_vec.begin(), buffer_it);
  };

  auto& buffers = persistent_buffer_info.persistent_buffers;
  auto& resolution = persistent_buffer_info.persistent_buffer_resolution_points;
  auto& projectable = persistent_buffer_info.projectable_persistent_buffers;
  auto& projectable_inputs = persistent_buffer_info.projectable_buffer_inputs;

  TORCH_INTERNAL_ASSERT(buffers.size() == 2);
  TORCH_INTERNAL_ASSERT(
      resolution.size() == 2 && resolution[0].size() == 1 &&
      resolution[1].size() == 1);

  TORCH_INTERNAL_ASSERT(projectable.size() == 2);
  TORCH_INTERNAL_ASSERT(projectable_inputs.size() == 1);

  TORCH_INTERNAL_ASSERT(
      isTvWithinVec(buffers, tv1) && isTvWithinVec(buffers, tv2));
  TORCH_INTERNAL_ASSERT(
      isTvWithinVec(projectable, tv1) && isTvWithinVec(projectable, tv2));

  TORCH_INTERNAL_ASSERT(isTvWithinVec(projectable_inputs, tv0));

  auto tv1_resolution_it = tvEntryInVecVec(resolution, buffers, tv1);
  TORCH_INTERNAL_ASSERT(tv1_resolution_it != resolution.end());
  TORCH_INTERNAL_ASSERT(isTvWithinVec(*tv1_resolution_it, tv6));

  auto tv2_resolution_it = tvEntryInVecVec(resolution, buffers, tv2);
  TORCH_INTERNAL_ASSERT(tv2_resolution_it != resolution.end());
  TORCH_INTERNAL_ASSERT(isTvWithinVec(*tv2_resolution_it, tv8));

  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
  at::Tensor aten_t0 = at::randn({99, 101}, options);

  // Schedule through magic scheduler
  SchedulerRuntimeInfo runtime_info(&fusion, {aten_t0}, true);
  auto persistent_buffer_size =
      persistentBufferSize(&fusion, runtime_info, persistent_buffer_info);

  TORCH_INTERNAL_ASSERT(
      persistent_buffer_size.persistent_buffer_size ==
      static_cast<int64_t>(
          aten_t0.size(1) * dataTypeSize(DataType::Float) * 2));

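  // Both buffers project back to the single fp16 input, so the projected
  // footprint is just one Half buffer per row.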
  TORCH_INTERNAL_ASSERT(
      persistent_buffer_size.projected_persistent_buffer_size ==
      static_cast<int64_t>(aten_t0.size(1) * dataTypeSize(DataType::Half)));
}

TEST_F(NVFuserTest, FusionPersistentBufferProjection_CUDA) {
  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
  Fusion& fusion = *fusion_ptr.get();
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(2, DataType::Half);
  fusion.addInput(tv0);

  auto tv1 = castOp(DataType::Float, tv0);
  auto tv2 = set(tv1);
  auto tv3 = sum(tv2, {1});
  auto tv4 = broadcast(tv3, {false, true});
  auto tv5 = set(tv1);
  auto tv6 = add(tv4, tv5);
  auto tv7 = set(tv2);
  auto tv8 = add(tv7, tv6);
  auto tv9 = castOp(DataType::Half, tv8);

  fusion.addOutput(tv9);

  reduction_scheduler_utils::projectPersistentBuffers(&fusion);

  auto tv5_producers = ir_utils::producerTvsOf(tv5);
  auto tv7_producers = ir_utils::producerTvsOf(tv7);

  // Projection should have broken these dependencies: tv5 and tv7 are now
  // recomputed from the fusion input rather than reading tv1 and tv2.

  TORCH_INTERNAL_ASSERT(
      std::find(tv5_producers.begin(), tv5_producers.end(), tv1) ==
      tv5_producers.end());
  TORCH_INTERNAL_ASSERT(
      std::find(tv7_producers.begin(), tv7_producers.end(), tv2) ==
      tv7_producers.end());

  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
  at::Tensor aten_t0 = at::randn({99, 101}, options);

  FusionExecutorCache fec(std::move(fusion_ptr));
  auto cg_outputs = fec.runFusionWithInputs({aten_t0});

  auto aten_t1 = aten_t0.to(c10::kDouble);
  auto aten_t3 = aten_t1.sum({1});
  auto aten_t4 = aten_t3.unsqueeze(1);
  auto aten_t7 = aten_t4.add(aten_t1).add(aten_t1);

  testValidate(&fusion, cg_outputs, {aten_t0}, {aten_t7}, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionIssue1223_CUDA) {
  if (!deviceMajorMinorCheck(7)) {
    GTEST_SKIP() << "skipping tests on pre-Volta GPUs";
    return;
  }
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeContigTensor(2);
  fusion.addInput(tv0);

  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
  auto tv2 = sum(tv1, {0, 1});
  fusion.addOutput(tv2);

  auto tv3 = add(tv0, IrBuilder::create<Double>(0));
  fusion.addOutput(tv3);

  tv2->split(0, 4);
  tv2->split(1, 1, false);
  tv2->split(-1, 4);

  tv2->axis(1)->parallelize(ParallelType::Unswitch);
  tv2->axis(-3)->parallelize(ParallelType::TIDx);
  tv2->axis(-1)->parallelize(ParallelType::TIDy);

  tv1->computeAt(tv2, -1);

  // Make TIDx and TIDy non-exact
  tv3->split(0, 32);
  tv3->split(-1, 32);
  tv3->axis(1)->parallelize(ParallelType::TIDx);
  tv3->axis(3)->parallelize(ParallelType::TIDy);

  // The second axes of tv1 and tv2 are fully unswitched, so they don't
  // need to predicate the parallel type usage of TIDy, whereas the first
  // axis is only partially unswitched, i.e., part of its split output
  // domains is outside the unswitched axis, so the first axis, which uses
  // TIDx, needs to predicate the parallel dimension. Previously, as
  // reported in issue #1223, unswitched expressions didn't predicate
  // parallel dimensions. It should be fixed by PR #1222.

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor at_t0 = at::ones({11, 10}, options);

  FusionExecutor fe;
  fe.compileFusion(&fusion, {at_t0});
  auto cg_outputs = fe.runFusion({at_t0});

  auto at_t1 = (at_t0 + 1).sum();

  testValidate(
      &fusion, cg_outputs, {at_t0}, {at_t1, at_t0}, __LINE__, __FILE__);
}

// See #1247 and #1250
TEST_F(NVFuserTest, FusionRfactorPredication1_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeContigTensor(1);
  fusion.addInput(tv0);

  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
  auto tv2 = min(tv1, {0});

  fusion.addOutput(tv2);

  // Make TIDx non-exact
  auto tv3 = makeContigTensor(1);
  fusion.addInput(tv3);

  auto tv4 = add(tv3, IrBuilder::create<Double>(1));
  fusion.addOutput(tv4);

  tv2->split(0, 4);
  auto tv5 = tv2->rFactor({1});

  tv0->computeAt(tv2, 1);

  tv2->axis(0)->parallelize(ParallelType::TIDx);

  tv4->axis(0)->parallelize(ParallelType::TIDx);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor at_t0 = at::randn({9}, options);
  at_t0 = at::abs(at_t0);
  at::Tensor at_t3 = at::randn({128}, options);

  FusionExecutor fe;
  fe.compileFusion(&fusion, {at_t0, at_t3});
  auto cg_outputs = fe.runFusion({at_t0, at_t3});

  auto at_t2 = (at_t0 + 1).min();
  auto at_t4 = at_t3 + 1;

  testValidate(
      &fusion, cg_outputs, {at_t0, at_t3}, {at_t2, at_t4}, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionRfactorPredication2_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeContigTensor(1);
  fusion.addInput(tv0);

  auto tv1 = min(tv0, {0});
  fusion.addOutput(tv1);

  // Make TIDx non-exact
  auto tv2 = makeContigTensor(1);
  fusion.addInput(tv2);

  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
  fusion.addOutput(tv3);

  tv1->split(0, 4);
  auto tv4 = tv1->rFactor({0});

  tv1->split(0, 3);

  // tv0->computeAt(tv1, 3);
  tv4->reorder({{0, 1}});
  tv4->split(0, 3);
  tv4->setMemoryType(MemoryType::Shared);

  // tv0: [I]
  // tv4: [4/3, 3, I/4]
  // tv1: [4/3, 3]

  tv1->axis(0)->parallelize(ParallelType::TIDx);
  scheduler_utils::parallelizeAllLike(tv1, {tv4});

  tv3->axis(0)->parallelize(ParallelType::TIDx);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::manual_seed(0);
  at::Tensor at_t0 = at::randn({9}, options);
  at_t0 = at::abs(at_t0);
  at::Tensor at_t3 = at::randn({128}, options);

  FusionExecutor fe;
  fe.compileFusion(&fusion, {at_t0, at_t3});
  auto cg_outputs = fe.runFusion({at_t0, at_t3});

  auto at_t2 = std::get<0>(at_t0.min(0));
  auto at_t4 = at_t3 + 1;

  testValidate(
      &fusion, cg_outputs, {at_t0, at_t3}, {at_t2, at_t4}, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionRfactorIndirectRoot_CUDA) {
  // https://github.com/csarofeen/pytorch/issues/1692
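  // Exercises an rfactor where the rfactored domain comes from a merge of
  // split domains, so its root axes are only reachable indirectly through
  // the rfactor transforms.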
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(3);
  fusion.addInput(tv0);

  auto tv1 = sum(tv0, {1, 2});
  fusion.addOutput(tv1);

  tv1->split(2, 4);
  tv1->split(1, 3);
  tv1->merge(2, 3);
  auto rf = tv1->rFactor({-1});

  tv1->split(0, 256);
  tv1->axis(0)->parallelize(ParallelType::BIDx);
  tv1->axis(1)->parallelize(ParallelType::TIDx);
  rf->computeAt(tv1, -1);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::manual_seed(0);

  auto at_in = at::randn({6, 6, 6}, options);
  auto at_out = at_in.sum({1, 2});

  FusionExecutor fe;
  fe.compileFusion(&fusion, {at_in});
  auto cg_outputs = fe.runFusion({at_in});

  testValidate(&fusion, cg_outputs, {at_in}, {at_out}, __LINE__, __FILE__);
}

} // namespace jit
} // namespace torch
#endif // #if defined(USE_CUDA)