1#if defined(USE_CUDA)
2#include <gmock/gmock-matchers.h>
3#include <gtest/gtest.h>
4
5#include <arith.h>
6#include <codegen.h>
7#include <disjoint_set.h>
8#include <executor.h>
9#include <executor_launch_params.h>
10#include <expr_evaluator.h>
11#include <fusion.h>
12#include <fusion_segmenter.h>
13#include <grouped_reduction.h>
14#include <inlining.h>
15#include <ir_all_nodes.h>
16#include <ir_builder.h>
17#include <ir_graphviz.h>
18#include <ir_iostream.h>
19#include <ir_utils.h>
20#include <iter_visitor.h>
21#include <kernel_cache.h>
22#include <kernel_expr_evaluator.h>
23#include <kernel_ir.h>
24#include <kernel_ir_dispatch.h>
25#include <lower2device.h>
26#include <lower_magic_zero.h>
27#include <mutator.h>
28#include <ops/all_ops.h>
29#include <parser.h>
30#include <register_interface.h>
31#include <root_domain_map.h>
32#include <scheduler/all_schedulers.h>
33#include <scheduler/reduction_utils.h>
34#include <scheduler/utils.h>
35#include <test/test_gpu_validator.h>
36#include <test/test_utils.h>
37#include <transform_replay.h>
38#include <transform_rfactor.h>
39
40#include <test/cpp/jit/test_utils.h>
41#include <torch/csrc/jit/api/function_impl.h>
42#include <torch/csrc/jit/codegen/cuda/interface.h>
43#include <torch/csrc/jit/ir/irparser.h>
44#include <torch/torch.h>
45
46#include <ATen/cuda/CUDAContext.h>
47#include <ATen/cuda/Exceptions.h>
48#include <c10/cuda/CUDAStream.h>
49
50#include <algorithm>
51#include <iostream>
52#include <sstream>
53#include <thread>
54
55// Tests go in torch::jit
56namespace torch {
57namespace jit {
58
59using namespace torch::jit::fuser::cuda;
60using namespace at::indexing;
61
62TEST_F(NVFuserTest, FusionNonDivisibleSplit1_CUDA) {
63 Fusion fusion;
64 FusionGuard fg(&fusion);
65
66 auto tv0 = makeSymbolicTensor(1);
67 fusion.addInput(tv0);
68
69 auto tv1 = sum(tv0, {0});
70 fusion.addOutput(tv1);
71
72 // [I]
73 tv1->split(0, 5);
74 // [ceilDiv(I, 5), 5]
75
76 // This second split is non-divisible. The split domain must be predicated.
77 tv1->split(1, 3);
78 // [ceilDiv(I, 5), 2, 3]
79
80 auto tv2 = sum(tv0, {0});
81 fusion.addOutput(tv2);
82
83 // tv2 shouldn't need to have another predicate
84 tv2->split(0, 4);
85 tv2->split(1, 2);
86
87 GpuLower gpulw(&fusion);
88 TORCH_CHECK(
89 gpulw.nonDivisibleSplitInfo().splitsToValidate().empty(),
90 "There must be no split to validate");
91 TORCH_CHECK(
92 gpulw.nonDivisibleSplitInfo().splitsToPredicate().size() == 1,
93 "Only tv1 should have a non-divisible predicate.");
94 for (auto tv : {loweredTv(tv1, gpulw)}) {
95 auto it = gpulw.nonDivisibleSplitInfo().splitsToPredicate().find(tv);
96 TORCH_CHECK(
97 it != gpulw.nonDivisibleSplitInfo().splitsToPredicate().end(),
98 "No info found for ",
99 tv);
100 const auto& splits_to_predicate = it->second;
101 TORCH_CHECK(
102 splits_to_predicate.size() == 1,
103 "There must be one split to predicate");
104 }
105
106 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
107 at::manual_seed(0);
108 at::Tensor t0 = at::randn({24}, options);
109
110 FusionExecutor fe;
111 fe.compileFusion(&fusion, {t0});
112 auto cg_outputs = fe.runFusion({t0});
113
114 auto ref = t0.sum();
115
116 testValidate(&fusion, cg_outputs, {t0}, {ref, ref}, __LINE__, __FILE__);
117}
118
119// Repro of issue #1074
120TEST_F(NVFuserTest, FusionNonDivisibleSplit2_CUDA) {
121 Fusion fusion;
122 FusionGuard fg(&fusion);
123
124 auto tv0 = makeSymbolicTensor(2);
125 fusion.addInput(tv0);
126 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
127 auto tv2 = add(tv1, IrBuilder::create<Double>(1));
128 fusion.addOutput(tv2);
129
130 tv2->split(0, 2);
131 tv2->split(-1, 4);
132 tv2->reorder({{1, 2}, {2, 1}});
133 tv0->computeAt(tv2, 2);
134
135 tv2->split(-1, 3);
136
137 // To make the sanitizer catch the invalid accesses. Not necessary
138 // to expose the bug.
139 tv1->setMemoryType(MemoryType::Shared);
140
141 GpuLower gpulw(&fusion);
142 TORCH_CHECK(
143 gpulw.nonDivisibleSplitInfo().splitsToValidate().empty(),
144 "There must be no split to validate");
145 TORCH_CHECK(
146 gpulw.nonDivisibleSplitInfo().splitsToPredicate().size() == 1,
147 "Only tv2 should have a non-divisible predicate.");
148 for (auto tv : {loweredTv(tv2, gpulw)}) {
149 auto it = gpulw.nonDivisibleSplitInfo().splitsToPredicate().find(tv);
150 TORCH_CHECK(
151 it != gpulw.nonDivisibleSplitInfo().splitsToPredicate().end(),
152 "No info found for ",
153 tv);
154 const auto& splits_to_predicate = it->second;
155 TORCH_CHECK(
156 splits_to_predicate.size() == 1,
157 "There must be one split to predicate");
158 }
159
160 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
161 at::manual_seed(0);
162 at::Tensor t0 = at::randn({13, 17}, options);
163
164 FusionExecutor fe;
165 fe.compileFusion(&fusion, {t0});
166 auto cg_outputs = fe.runFusion({t0});
167
168 auto ref = t0 + 2;
169
170 testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
171}
172
173// Similar to FusionNonDivisibleSplit1 but with unswitch
174TEST_F(NVFuserTest, FusionNonDivisibleSplit3_CUDA) {
175 Fusion fusion;
176 FusionGuard fg(&fusion);
177
178 auto tv0 = makeSymbolicTensor(1);
179 fusion.addInput(tv0);
180
181 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
182 auto tv2 = sum(tv1, {0});
183 fusion.addOutput(tv2);
184
185 tv2->split(0, 5);
186 tv2->split(1, 3);
187
188 tv0->computeAt(tv2, -1);
189
190 tv2->axis(0)->parallelize(ParallelType::Unswitch);
191
192 GpuLower gpulw(&fusion);
193 TORCH_CHECK(
194 gpulw.nonDivisibleSplitInfo().splitsToValidate().empty(),
195 "There must be no split to validate");
196 TORCH_CHECK(
197 gpulw.nonDivisibleSplitInfo().splitsToPredicate().size() == 2,
198 "Both tv1 and tv2 should have a non-divisible predicate.");
199 for (auto tv : {loweredTv(tv1, gpulw), loweredTv(tv2, gpulw)}) {
200 auto it = gpulw.nonDivisibleSplitInfo().splitsToPredicate().find(tv);
201 TORCH_CHECK(
202 it != gpulw.nonDivisibleSplitInfo().splitsToPredicate().end(),
203 "No info found for ",
204 tv);
205 const auto& splits_to_predicate = it->second;
206 TORCH_CHECK(
207 splits_to_predicate.size() == 1,
208 "There must be one split to predicate");
209 }
210
211 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
212 at::manual_seed(0);
213 at::Tensor t0 = at::randn({24}, options);
214
215 FusionExecutor fe;
216 fe.compileFusion(&fusion, {t0});
217 auto cg_outputs = fe.runFusion({t0});
218
219 auto ref = (t0 + 1).sum();
220
221 testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
222}
223
224// Non-divisible split through merge
225TEST_F(NVFuserTest, FusionNonDivisibleSplit4_CUDA) {
226 Fusion fusion;
227 FusionGuard fg(&fusion);
228
229 auto tv0 = makeSymbolicTensor(2);
230 fusion.addInput(tv0);
231
232 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
233 auto tv2 = sum(tv1, {0, 1});
234 fusion.addOutput(tv2);
235
236 tv2->split(0, 5);
237 tv2->merge(1, 2);
238 tv2->split(1, 3);
239
240 tv0->computeAt(tv2, -1);
241
242 GpuLower gpulw(&fusion);
243 TORCH_CHECK(
244 gpulw.nonDivisibleSplitInfo().splitsToValidate().empty(),
245 "There must be no split to validate");
246 TORCH_CHECK(
247 gpulw.nonDivisibleSplitInfo().splitsToPredicate().size() == 2,
248 "Both tv1 and tv2 should have a non-divisible predicate.");
249 for (auto tv : {loweredTv(tv1, gpulw), loweredTv(tv2, gpulw)}) {
250 auto it = gpulw.nonDivisibleSplitInfo().splitsToPredicate().find(tv);
251 TORCH_CHECK(
252 it != gpulw.nonDivisibleSplitInfo().splitsToPredicate().end(),
253 "No info found for ",
254 tv);
255 const auto& splits_to_predicate = it->second;
256 TORCH_CHECK(
257 splits_to_predicate.size() == 1,
258 "There must be one split to predicate");
259 }
260
261 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
262 at::manual_seed(0);
263 at::Tensor t0 = at::randn({24, 2}, options);
264
265 FusionExecutor fe;
266 fe.compileFusion(&fusion, {t0});
267 auto cg_outputs = fe.runFusion({t0});
268
269 auto ref = (t0 + 1).sum();
270
271 testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
272}
273
274// Nested splits
275TEST_F(NVFuserTest, FusionNonDivisibleSplit5_CUDA) {
276 Fusion fusion;
277 FusionGuard fg(&fusion);
278
279 auto tv0 = makeSymbolicTensor(1);
280 fusion.addInput(tv0);
281
282 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
283 auto tv2 = sum(tv1, {0});
284 fusion.addOutput(tv2);
285
286 // [I]
287 tv2->split(0, 8);
288 // [I/8, 8]
289 tv2->split(1, 2);
290 // [I/8, 4, 2]
291 tv2->split(1, 3); // non-divisible split of outer output
292 // [I/8, 2, 3, 2]
293
294 tv0->computeAt(tv2, -1);
295
296 GpuLower gpulw(&fusion);
297 TORCH_CHECK(
298 gpulw.nonDivisibleSplitInfo().splitsToValidate().empty(),
299 "There must be no split to validate");
300 TORCH_CHECK(
301 gpulw.nonDivisibleSplitInfo().splitsToPredicate().size() == 2,
302 "Both tv1 and tv2 should have a non-divisible predicate.");
303 for (auto tv : {loweredTv(tv1, gpulw), loweredTv(tv2, gpulw)}) {
304 auto it = gpulw.nonDivisibleSplitInfo().splitsToPredicate().find(tv);
305 TORCH_CHECK(
306 it != gpulw.nonDivisibleSplitInfo().splitsToPredicate().end(),
307 "No info found for ",
308 tv);
309 const auto& splits_to_predicate = it->second;
310 TORCH_CHECK(
311 splits_to_predicate.size() == 1,
312 "There must be one split to predicate");
313 }
314
315 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
316 at::manual_seed(0);
317 at::Tensor t0 = at::randn({24}, options);
318
319 FusionExecutor fe;
320 fe.compileFusion(&fusion, {t0});
321 auto cg_outputs = fe.runFusion({t0});
322
323 auto ref = (t0 + 1).sum();
324
325 testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
326}
327
328// Vectorized non-divisible split. Must be validated at run time
329TEST_F(NVFuserTest, FusionNonDivisibleSplitVectorize1_CUDA) {
330 Fusion fusion;
331 FusionGuard fg(&fusion);
332
333 auto tv0 = makeContigTensor(1);
334 fusion.addInput(tv0);
335
336 auto tv1 = set(tv0);
337 fusion.addOutput(tv1);
338
339 tv1->split(0, 8, false);
340 tv1->split(1, 4);
341
342 tv1->axis(-1)->parallelize(ParallelType::Vectorize);
343
344 GpuLower gpulw(&fusion);
345 TORCH_CHECK(
346 gpulw.nonDivisibleSplitInfo().splitsToValidate().size() == 1,
347 "There should be one split to validate");
348 for (const auto& kv : gpulw.nonDivisibleSplitInfo().splitsToPredicate()) {
349 const auto& splits_to_predicate = kv.second;
350 TORCH_CHECK(
351 splits_to_predicate.empty(),
352 "There must be no split to predicate, but tensor t",
353 kv.first->name(),
354 " has:",
355 splits_to_predicate);
356 }
357
358 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
359 at::manual_seed(0);
360 auto t0 = at::randn({32}, options);
361
362 FusionExecutor fe;
363 fe.compileFusion(&fusion, {t0});
364 auto cg_outputs = fe.runFusion({t0});
365
366 auto ref = t0;
367
368 testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
369
370 auto t0_non_divisible = at::randn({8}, options);
371 // Since ceilDiv(8, 8) is not divisible by 4, the vectorization is
372 // illegal. The run-time validation of vectorization should throw an error.
373 // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
374 ASSERT_ANY_THROW(fe.runFusion({t0_non_divisible}));
375}
376
377// If a split is validated at run time, it's not necessary to predicate.
378TEST_F(NVFuserTest, FusionNonDivisibleSplitVectorize2_CUDA) {
379 Fusion fusion;
380 FusionGuard fg(&fusion);
381
382 auto tv0 = makeContigTensor(1);
383 fusion.addInput(tv0);
384
385 auto tv1 = set(tv0);
386 auto tv2 = add(tv1, IrBuilder::create<Double>(1));
387 auto tv3 = sum(tv2, {0});
388 fusion.addOutput(tv3);
389
390 tv3->split(0, 8, false);
391 tv3->split(1, 4);
392 TransformPropagatorWithCheck propagator(tv3);
393 MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator);
394
395 tv3->axis(1)->parallelize(ParallelType::TIDx);
396 scheduler_utils::parallelizeAllLike(tv3, {tv1, tv2});
397
398 tv1->axis(2)->parallelize(ParallelType::Vectorize);
399
400 GpuLower gpulw(&fusion);
401 TORCH_CHECK(
402 gpulw.nonDivisibleSplitInfo().splitsToValidate().size() == 1,
403 "There should be one split to validate");
404 for (const auto& kv : gpulw.nonDivisibleSplitInfo().splitsToPredicate()) {
405 const auto& splits_to_predicate = kv.second;
406 TORCH_CHECK(
407 splits_to_predicate.empty(),
408 "There must be no split to predicate, but tensor t",
409 kv.first->name(),
410 " has:",
411 splits_to_predicate);
412 }
413
414 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
415 at::manual_seed(0);
416
417 auto t0 = at::randn({1024}, options);
418
419 FusionExecutor fe;
420 fe.compileFusion(&fusion, {t0});
421 auto cg_outputs = fe.runFusion({t0});
422
423 auto ref = (t0 + 1).sum();
424
425 testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
426}
427
428TEST_F(NVFuserTest, FusionIssue1284Repro_CUDA) {
429 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
430 Fusion& fusion = *fusion_ptr.get();
431 FusionGuard fg(&fusion);
432
433 std::vector<int64_t> input_shape_0 = {10, 20};
434 std::vector<int64_t> input_shape_1 = {15};
435
436 TensorView* in_0 = makeSymbolicTensor(input_shape_0.size());
437 TensorView* in_1 = makeSymbolicTensor(input_shape_1.size());
438 fusion.addInput(in_0);
439 fusion.addInput(in_1);
440
441 TensorView* out_0 = add(in_0, IrBuilder::create<Double>(0.f));
442 TensorView* out_1 = add(in_1, IrBuilder::create<Double>(2.f));
443
444 fusion.addOutput(out_0);
445 fusion.addOutput(out_1);
446
447 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
448 at::Tensor at_in_0 = at::randn(input_shape_0, options);
449 at::Tensor at_in_1 = at::randn(input_shape_1, options);
450 std::vector<IValue> aten_inputs = {at_in_0, at_in_1};
451
452 FusionExecutorCache fec(std::move(fusion_ptr));
453 auto outputs = fec.runFusionWithInputs(aten_inputs);
454
455 auto t1 = at_in_1 + 2;
456
457 auto runtime = fec.getMostRecentKernelRuntime();
458 TORCH_INTERNAL_ASSERT(runtime->isSegmented());
459 TORCH_INTERNAL_ASSERT(runtime->fusionSegments()->groups().size() == 2);
460
461 testValidate(
462 &fusion, outputs, {at_in_0, at_in_1}, {at_in_0, t1}, __LINE__, __FILE__);
463}
464
465TEST_F(NVFuserTest, FusionIssue1284Repro2_CUDA) {
466 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
467 Fusion& fusion = *fusion_ptr.get();
468 FusionGuard fg(&fusion);
469
470 std::vector<int64_t> input_shape_0 = {4, 4};
471 std::vector<int64_t> input_shape_1 = {3, 4, 4};
472 std::vector<int64_t> input_shape_2 = {2, 8, 4, 4};
473
474 TensorView* in_0 = makeSymbolicTensor(input_shape_0.size());
475 TensorView* in_1 = makeSymbolicTensor(input_shape_1.size());
476 TensorView* in_2 = makeSymbolicTensor(input_shape_2.size());
477
478 fusion.addInput(in_0);
479 fusion.addInput(in_1);
480 fusion.addInput(in_2);
481
482 TensorView* out_0 = add(in_0, in_1);
483 TensorView* out_1 = add(in_0, in_2);
484
485 fusion.addOutput(out_0);
486 fusion.addOutput(out_1);
487
488 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
489 at::Tensor at_in_0 = at::randn(input_shape_0, options);
490 at::Tensor at_in_1 = at::randn(input_shape_1, options);
491 at::Tensor at_in_2 = at::randn(input_shape_2, options);
492
493 std::vector<IValue> aten_inputs = {at_in_0, at_in_1, at_in_2};
494
495 FusionExecutorCache fec(std::move(fusion_ptr));
496 auto outputs = fec.runFusionWithInputs(aten_inputs);
497
498 auto t0 = at_in_0 + at_in_1;
499 auto t1 = at_in_0 + at_in_2;
500
501 auto runtime = fec.getMostRecentKernelRuntime();
502 TORCH_INTERNAL_ASSERT(runtime->isSegmented());
503 TORCH_INTERNAL_ASSERT(runtime->fusionSegments()->groups().size() == 2);
504
505 testValidate(
506 &fusion,
507 outputs,
508 {at_in_0, at_in_1, at_in_2},
509 {t0, t1},
510 __LINE__,
511 __FILE__);
512}
513
514TEST_F(NVFuserTest, FusionIssue1305Repro_CUDA) {
515 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
516 Fusion& fusion = *fusion_ptr.get();
517 FusionGuard fg(&fusion);
518
519 auto t0 = makeContigTensor(1);
520 auto t1 = makeContigTensor(2);
521
522 fusion.addInput(t0);
523 fusion.addInput(t1);
524
525 auto t2 = broadcast(t0, {true, false});
526 auto t3 = add(t1, t2);
527 auto t4 = add(t3, t2);
528 auto t5 = sum(t4, {1});
529 auto t6 = broadcast(t5, {false, true});
530 auto t7 = add(t3, t6);
531
532 fusion.addOutput(t7);
533
534 t3->computeAt(t7, -1, ComputeAtMode::MostInlined);
535
536 TORCH_INTERNAL_ASSERT(t3->getComputeAtPosition() == 1);
537}
538
539TEST_F(NVFuserTest, FusionDoubleBuffering1_CUDA) {
540 Fusion fusion;
541 FusionGuard fg(&fusion);
542
543 auto tv0 = makeContigTensor(1);
544 fusion.addInput(tv0);
545
546 auto tv1 = set(tv0);
547 auto tv2 = add(tv1, IrBuilder::create<Double>(1.0));
548 auto tv3 = set(tv2);
549 fusion.addOutput(tv3);
550
551 tv1->setMemoryType(MemoryType::Shared);
552
553 tv3->split(-1, 128);
554 tv3->split(-1, 32);
555 TransformPropagatorWithCheck propagator(tv3);
556 MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator);
557
558 tv0->computeAt(tv3, 1);
559
560 tv3->axis(-2)->parallelize(ParallelType::BIDx);
561 tv3->axis(-1)->parallelize(ParallelType::TIDx);
562 scheduler_utils::parallelizeAllLike(tv3);
563
564 tv1->doubleBuffer();
565
566 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
567 at::manual_seed(0);
568 auto t0 = at::randn({1000}, options);
569
570 FusionExecutor fe;
571 fe.compileFusion(&fusion, {t0});
572 auto cg_outputs = fe.runFusion({t0});
573
574 auto ref = t0 + 1;
575
576 testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
577}
578
579TEST_F(NVFuserTest, FusionDoubleBuffering2_CUDA) {
580 Fusion fusion;
581 FusionGuard fg(&fusion);
582
583 auto tv0 = makeContigTensor(1);
584 fusion.addInput(tv0);
585
586 auto tv1 = set(tv0);
587 auto tv2 = add(tv1, IrBuilder::create<Double>(1.0));
588 auto tv3 = set(tv2);
589 fusion.addOutput(tv3);
590
591 tv3->split(-1, 128);
592 tv3->split(-1, 32);
593 TransformPropagatorWithCheck propagator(tv3);
594 MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator);
595
596 tv0->computeAt(tv3, -1);
597
598 tv3->axis(-2)->parallelize(ParallelType::BIDx);
599 tv3->axis(-1)->parallelize(ParallelType::TIDx);
600 scheduler_utils::parallelizeAllLike(tv3);
601
602 tv1->doubleBuffer();
603
604 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
605 at::manual_seed(0);
606 auto t0 = at::randn({1000}, options);
607
608 FusionExecutor fe;
609 fe.compileFusion(&fusion, {t0});
610 auto cg_outputs = fe.runFusion({t0});
611
612 auto ref = t0 + 1;
613
614 testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
615}
616
617TEST_F(NVFuserTest, FusionDoubleBuffering3_CUDA) {
618 Fusion fusion;
619 FusionGuard fg(&fusion);
620
621 auto tv0 = makeContigTensor(1);
622 fusion.addInput(tv0);
623
624 auto tv1 = add(tv0, IrBuilder::create<Double>(1.0));
625 auto tv2 = set(tv1);
626 auto tv3 = add(tv2, IrBuilder::create<Double>(1.0));
627 fusion.addOutput(tv3);
628
629 tv1->setMemoryType(MemoryType::Shared);
630
631 tv3->split(-1, 128);
632 tv3->split(-1, 32);
633 TransformPropagatorWithCheck propagator(tv3);
634 MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator);
635
636 tv0->computeAt(tv3, 1);
637
638 // tv2 is invalid to double-buffer as its producer, tv1, is
639 // computed inside the double-buffering loop.
640 // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
641 ASSERT_ANY_THROW(tv2->doubleBuffer());
642
643 // Moving tv2 inner makes tv1 large enough to double-buffer tv2
644 tv2->computeAt(tv3, 2);
645
646 tv2->doubleBuffer();
647
648 tv3->axis(-1)->parallelize(ParallelType::TIDx);
649 scheduler_utils::parallelizeAllLike(tv3);
650
651 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
652 at::manual_seed(0);
653 auto t0 = at::randn({1000}, options);
654
655 FusionExecutor fe;
656 fe.compileFusion(&fusion, {t0});
657 auto cg_outputs = fe.runFusion({t0});
658
659 auto ref = t0 + 2;
660
661 testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
662}
663
664// Double buffering smem to local and unswitch
665TEST_F(NVFuserTest, FusionDoubleBuffering4_CUDA) {
666 Fusion fusion;
667 FusionGuard fg(&fusion);
668
669 auto tv0 = makeContigTensor(1);
670 fusion.addInput(tv0);
671
672 auto tv1 = add(tv0, IrBuilder::create<Double>(1.0));
673 auto tv2 = set(tv1);
674 auto tv3 = add(tv2, IrBuilder::create<Double>(1.0));
675 fusion.addOutput(tv3);
676
677 tv1->setMemoryType(MemoryType::Shared);
678
679 tv3->split(-1, 128);
680 tv3->split(-1, 32);
681 tv3->split(-1, 8);
682 TransformPropagatorWithCheck propagator(tv3);
683 MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator);
684
685 tv0->computeAt(tv3, 2);
686 tv2->computeAt(tv3, -1);
687
688 tv3->axis(-1)->parallelize(ParallelType::TIDx);
689 tv3->axis(1)->parallelize(ParallelType::Unswitch);
690 scheduler_utils::parallelizeAllLike(tv3);
691
692 tv2->doubleBuffer();
693
694 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
695 at::manual_seed(0);
696 auto t0 = at::randn({1000}, options);
697
698 FusionExecutor fe;
699 fe.compileFusion(&fusion, {t0});
700 auto cg_outputs = fe.runFusion({t0});
701
702 auto ref = t0 + 2;
703
704 testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
705}
706
707// Double buffering gmem to shared and unswitch
708TEST_F(NVFuserTest, FusionDoubleBuffering5_CUDA) {
709 Fusion fusion;
710 FusionGuard fg(&fusion);
711
712 auto tv0 = makeContigTensor(1);
713 fusion.addInput(tv0);
714
715 auto tv1 = set(tv0);
716 auto tv2 = add(tv1, IrBuilder::create<Double>(1.0));
717 fusion.addOutput(tv2);
718
719 tv1->setMemoryType(MemoryType::Shared);
720
721 tv2->split(-1, 128);
722 tv2->split(-1, 32);
723 tv2->split(-1, 8);
724 TransformPropagatorWithCheck propagator(tv2);
725 MaxRootDomainInfoSpanningTree(tv2).traverse(&propagator);
726
727 tv0->computeAt(tv2, 2);
728 tv1->computeAt(tv2, -1);
729
730 tv2->axis(-1)->parallelize(ParallelType::TIDx);
731 tv2->axis(1)->parallelize(ParallelType::Unswitch);
732 scheduler_utils::parallelizeAllLike(tv2);
733
734 tv1->doubleBuffer();
735
736 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
737 at::manual_seed(0);
738 auto t0 = at::randn({1000}, options);
739
740 FusionExecutor fe;
741 fe.compileFusion(&fusion, {t0});
742 auto cg_outputs = fe.runFusion({t0});
743
744 auto ref = t0 + 1;
745
746 testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
747}
748
749// Double buffering smem to local and unroll
750TEST_F(NVFuserTest, FusionDoubleBuffering6_CUDA) {
751 Fusion fusion;
752 FusionGuard fg(&fusion);
753
754 auto tv0 = makeContigTensor(1);
755 fusion.addInput(tv0);
756
757 auto tv1 = add(tv0, IrBuilder::create<Double>(1.0));
758 auto tv2 = set(tv1);
759 auto tv3 = add(tv2, IrBuilder::create<Double>(1.0));
760 fusion.addOutput(tv3);
761
762 tv1->setMemoryType(MemoryType::Shared);
763
764 tv3->split(-1, 128);
765 tv3->split(-1, 16);
766 tv3->split(-2, 4);
767 tv3->split(-2, 2);
768 TransformPropagatorWithCheck propagator(tv3);
769 MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator);
770
771 tv0->computeAt(tv3, 1);
772 tv2->computeAt(tv3, -1);
773
774 tv3->axis(2)->parallelize(ParallelType::Unroll);
775 tv3->axis(4)->parallelize(ParallelType::TIDx);
776
777 tv2->doubleBuffer();
778
779 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
780 at::manual_seed(0);
781 auto t0 = at::randn({199}, options);
782
783 FusionExecutor fe;
784 fe.compileFusion(&fusion, {t0});
785 auto cg_outputs = fe.runFusion({t0});
786
787 auto ref = t0 + 2;
788
789 testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
790}
791
792// Double buffering and vectorize
793TEST_F(NVFuserTest, FusionDoubleBuffering7_CUDA) {
794 Fusion fusion;
795 FusionGuard fg(&fusion);
796
797 auto tv0 = makeContigTensor(1);
798 fusion.addInput(tv0);
799
800 auto tv1 = set(tv0);
801 auto tv2 = add(tv1, IrBuilder::create<Double>(1.0));
802 fusion.addOutput(tv2);
803
804 tv2->split(-1, 128);
805 tv2->split(-1, 4);
806 TransformPropagatorWithCheck propagator(tv2);
807 MaxRootDomainInfoSpanningTree(tv2).traverse(&propagator);
808
809 tv1->computeAt(tv2, 2);
810
811 tv2->axis(-2)->parallelize(ParallelType::TIDx);
812
813 tv1->axis(-1)->parallelize(ParallelType::Vectorize);
814
815 tv1->doubleBuffer();
816
817 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
818 at::manual_seed(0);
819 auto t0 = at::randn({200}, options);
820
821 FusionExecutor fe;
822 fe.compileFusion(&fusion, {t0});
823 auto cg_outputs = fe.runFusion({t0});
824
825 auto ref = t0 + 1;
826
827 testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
828}
829
830// Multiple tensors to double-buffer
831TEST_F(NVFuserTest, FusionDoubleBuffering8_CUDA) {
832 Fusion fusion;
833 FusionGuard fg(&fusion);
834
835 auto tv0 = makeContigTensor(1);
836 fusion.addInput(tv0);
837 auto tv1 = makeContigTensor(1);
838 fusion.addInput(tv1);
839
840 auto tv2 = set(tv0);
841 auto tv3 = set(tv1);
842 auto tv4 = add(tv2, tv3);
843 fusion.addOutput(tv4);
844
845 tv4->split(0, 32);
846 tv4->split(0, 4);
847 TransformPropagatorWithCheck propagator(tv4);
848 MaxRootDomainInfoSpanningTree(tv4).traverse(&propagator);
849
850 tv0->computeAt(tv4, 1);
851 tv1->computeAt(tv4, 1);
852
853 tv4->axis(-1)->parallelize(ParallelType::TIDx);
854 scheduler_utils::parallelizeAllLike(tv4);
855
856 tv2->doubleBuffer();
857 tv3->doubleBuffer();
858
859 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
860 at::manual_seed(0);
861 auto t0 = at::randn({100}, options);
862 auto t1 = at::randn({100}, options);
863
864 FusionExecutor fe;
865 fe.compileFusion(&fusion, {t0, t1});
866 auto cg_outputs = fe.runFusion({t0, t1});
867
868 auto ref = t0 + t1;
869
870 testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
871}
872
873// Nested double buffering from gmem to smem and smem to register
874TEST_F(NVFuserTest, FusionDoubleBuffering9_CUDA) {
875 Fusion fusion;
876 FusionGuard fg(&fusion);
877
878 auto tv0 = makeContigTensor(1);
879 fusion.addInput(tv0);
880 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
881 auto out = tv1;
882 fusion.addOutput(out);
883
884 auto tv2 = tv0->cacheAfter();
885 auto tv3 = tv2->cacheAfter();
886
887 out->split(0, 32);
888 out->split(0, 4);
889 TransformPropagatorWithCheck propagator(out);
890 MaxRootDomainInfoSpanningTree(out).traverse(&propagator);
891
892 tv2->setMemoryType(MemoryType::Shared);
893
894 tv2->computeAt(out, 1);
895 tv3->computeAt(out, -1);
896
897 out->axis(-1)->parallelize(ParallelType::TIDx);
898 scheduler_utils::parallelizeAllLike(out);
899
900 tv2->doubleBuffer();
901 tv3->doubleBuffer();
902
903 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
904 at::manual_seed(0);
905 auto t0 = at::randn({1001}, options);
906
907 FusionExecutor fe;
908 fe.compileFusion(&fusion, {t0});
909 auto cg_outputs = fe.runFusion({t0});
910
911 auto ref = t0 + 1;
912
913 testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
914}
915
916// FusionSmemBlockGemmCache + double buffering at both smem and local
917TEST_F(NVFuserTest, FusionSmemBlockGemmCacheDoubleBuffer_CUDA) {
918 Fusion fusion;
919 FusionGuard fg(&fusion);
920
921 // Algorithm
922 TensorView* tv0 = makeSymbolicTensor(2); // (M, K)
923 TensorView* tv1 = makeSymbolicTensor(2); // (K, N)
924 TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B)
925 TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N)
926 TensorView* tv4 = mul(tv2, tv3); // M, K, N
927 TensorView* tv5 = sum(tv4, {1}); // M, R, N
928 fusion.addInput(tv0);
929 fusion.addInput(tv1);
930 fusion.addOutput(tv5);
931
932 TensorView* tv6 = tv5->cacheBefore();
933
934 // For smem double buffering
935 auto tv0_cache_local = tv0->cacheAfter();
936 auto tv1_cache_local = tv1->cacheAfter();
937
938 // For register double buffering
939 auto tv0_cache_smem = tv0->cacheAfter();
940 auto tv1_cache_smem = tv1->cacheAfter();
941
942 const int BSX = 32;
943 const int TSX = 8;
944
945 // [M, K, N]
946 tv6->split(-1, BSX);
947 tv6->split(-1, TSX);
948 tv6->split(1, BSX);
949 tv6->split(0, BSX);
950 tv6->split(1, TSX);
951 // [M/BSX, BSX/TSX, TSX, K/BSX, BSX, N/BSX, BSX/TSX, TSX]
952 tv6->reorder(
953 {{4, 7}, {7, 6}, {6, 5}, {2, 4}, {1, 3}, {3, 2}, {5, 1}, {0, 0}});
954 // [M/BSX, N/BSX, K/BSX, BSX/TSX, BSX/TSX, TSX, TSX, BSX]
955
956 auto tv6_rf = tv6->rFactor({-1});
957
958 TransformPropagatorWithCheck propagator(tv6_rf);
959 MaxRootDomainInfoSpanningTree(tv6_rf).traverse(&propagator);
960
961 tv0->computeAt(tv6, 3);
962 tv1->computeAt(tv6, 3);
963
964 tv6_rf->computeAt(tv6, -1);
965 tv0_cache_local->computeAt(tv6_rf, -1);
966 tv1_cache_local->computeAt(tv6_rf, -1);
967
968 tv0_cache_smem->setMemoryType(MemoryType::Shared);
969 tv1_cache_smem->setMemoryType(MemoryType::Shared);
970
971 tv5->axis(0)->parallelize(ParallelType::BIDx);
972 tv5->axis(1)->parallelize(ParallelType::BIDy);
973 tv5->axis(-3)->parallelize(ParallelType::TIDy);
974 tv5->axis(-1)->parallelize(ParallelType::TIDx);
975
976 scheduler_utils::parallelizeAllLike(tv5);
977
978 tv0_cache_local->doubleBuffer();
979 tv1_cache_local->doubleBuffer();
980
981 tv0_cache_smem->doubleBuffer();
982 tv1_cache_smem->doubleBuffer();
983
984 constexpr int M = 154, K = 45, N = 1524;
985
986 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
987 at::Tensor t0 = at::randn({M, K}, options);
988 at::Tensor t1 = at::randn({K, N}, options);
989 at::Tensor aten_output = matmul(t0.to(at::kDouble), t1.to(at::kDouble));
990
991 std::vector<IValue> aten_inputs = {t0, t1};
992
993 FusionExecutor fe;
994 fe.compileFusion(&fusion, aten_inputs);
995 auto cg_outputs = fe.runFusion(aten_inputs);
996
997 testValidate(
998 &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
999 // The smem cache write in this test case is redundant predicated,
1000 // and also double buffered. Currently we are relying on WAR sync
1001 // insertion to ensure ordering of double buffered tensor access.
1002 // The check below makes sure that the sync is inserted so that the
1003 // test isn't running on a race condition.
1004 TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count > 0);
1005}
1006
1007TEST_F(NVFuserTest, FusionIntermediateTensorVectorize_CUDA) {
1008 std::vector<MemoryType> mem_types = {MemoryType::Shared, MemoryType::Local};
1009
1010 for (auto mem_type : mem_types) {
1011 Fusion fusion;
1012 FusionGuard fg(&fusion);
1013
1014 auto tv0 = makeContigTensor(1);
1015 fusion.addInput(tv0);
1016
1017 auto tv1 = set(tv0);
1018 auto tv2 = set(tv1);
1019 auto tv3 = set(tv2);
1020 fusion.addOutput(tv3);
1021
1022 tv1->setMemoryType(mem_type);
1023
1024 tv3->split(-1, 4);
1025 TransformPropagatorWithCheck propagator(tv3);
1026 MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator);
1027
1028 tv1->computeAt(tv3, -2);
1029
1030 tv2->axis(-1)->parallelize(ParallelType::Vectorize);
1031
1032 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1033 at::manual_seed(0);
1034 auto t0 = at::randn({15}, options);
1035 FusionExecutor fe;
1036 fe.compileFusion(&fusion);
1037
1038 // This should throw an exception as the extent of t0 is not
1039 // divisible by the vector width
1040 // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
1041 ASSERT_ANY_THROW(fe.runFusion({t0}));
1042
1043 auto t1 = at::randn({16}, options);
1044 auto cg_outputs = fe.runFusion({t1});
1045
1046 auto ref = t1;
1047
1048 testValidate(&fusion, cg_outputs, {t1}, {ref}, __LINE__, __FILE__);
1049 }
1050}
1051
1052TEST_F(NVFuserTest, FusionBroadcastConcretization1_CUDA) {
1053 Fusion fusion;
1054 FusionGuard fg(&fusion);
1055
1056 auto tv0 = makeConcreteTensor({10, 1});
1057 fusion.addInput(tv0);
1058 auto tv1 = makeConcreteTensor({10, 20});
1059 fusion.addInput(tv1);
1060 auto tv2 = makeConcreteTensor({10, 10});
1061 fusion.addInput(tv2);
1062
1063 // Not concretized
1064 auto tv3 = sum(tv2, {1});
1065 auto tv4 = broadcast(tv3, {false, true});
1066 auto tv5 = add(tv0, tv4);
1067 fusion.addOutput(tv5);
1068
1069 // Concretized
1070 auto tv6 = sum(tv2, {1});
1071 auto tv7 = broadcast(tv6, {false, true});
1072 auto tv8 = add(tv1, tv7);
1073 fusion.addOutput(tv8);
1074
1075 for (auto tv : {tv3, tv4, tv5, tv6, tv7, tv8}) {
1076 tv->axis(1)->parallelize(ParallelType::TIDx);
1077 }
1078
1079 GpuLower gpulw(&fusion);
1080 TORCH_CHECK(!gpulw.concretizedBroadcastDomains()->isConcretized(
1081 loweredTv(tv4, gpulw)->axis(1)));
1082 TORCH_CHECK(gpulw.concretizedBroadcastDomains()->isConcretized(
1083 loweredTv(tv7, gpulw)->axis(1)));
1084
1085 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1086 at::manual_seed(0);
1087 auto t0 = at::randn({10, 1}, options);
1088 auto t1 = at::randn({10, 20}, options);
1089 auto t2 = at::randn({10, 10}, options);
1090 std::vector<IValue> aten_inputs = {t0, t1, t2};
1091
1092 FusionExecutor fe;
1093 fe.compileFusion(&fusion, aten_inputs);
1094 auto outputs = fe.runFusion(aten_inputs);
1095
1096 auto t5 = t0 + t2.sum({1}).unsqueeze(-1);
1097 auto t8 = t1 + t2.sum({1}).unsqueeze(-1);
1098
1099 testValidate(&fusion, outputs, aten_inputs, {t5, t8}, __LINE__, __FILE__);
1100}
1101
1102TEST_F(NVFuserTest, FusionBroadcastConcretization2_CUDA) {
1103 Fusion fusion;
1104 FusionGuard fg(&fusion);
1105
1106 auto tv0 = makeSymbolicTensor(2);
1107 fusion.addInput(tv0);
1108
1109 auto tv1 = sum(tv0, {0, 1});
1110 auto tv2 = broadcast(tv1, {true});
1111 auto tv3 = broadcast(tv2, {false, true});
1112 fusion.addOutput(tv3);
1113
1114 // tv1 is thread-predicated with TIDx and TIDy
1115 tv1->axis(0)->parallelize(ParallelType::TIDx);
1116 tv1->axis(1)->parallelize(ParallelType::TIDy);
1117 // tv2 broadcasts along TIDx
1118 tv2->axis(0)->parallelize(ParallelType::TIDx);
1119 // tv3 broadcasts along TIDy
1120 tv3->axis(0)->parallelize(ParallelType::TIDx);
1121 tv3->axis(1)->parallelize(ParallelType::TIDy);
1122
1123 // Both tv2 and tv3 broadcast along predicated TID dimensions, but
1124 // since the broadcast domains are not concretized, there should be
1125 // no actual parallel broadcast
1126
1127 GpuLower gpulw(&fusion);
1128 TORCH_CHECK(
1129 !gpulw.kernel()->summary().has_block_broadcasts &&
1130 !gpulw.kernel()->summary().has_grid_broadcasts,
1131 "There must be no parallel broadcast in this fusion");
1132
1133 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1134 at::manual_seed(0);
1135 auto t0 = at::randn({10, 11}, options);
1136 std::vector<IValue> aten_inputs = {t0};
1137
1138 FusionExecutor fe;
1139 fe.compileFusion(&fusion, aten_inputs);
1140 auto outputs = fe.runFusion(aten_inputs);
1141
1142 auto t3 = t0.sum().unsqueeze(-1).unsqueeze(-1);
1143
1144 testValidate(&fusion, outputs, aten_inputs, {t3}, __LINE__, __FILE__);
1145}
1146
1147TEST_F(NVFuserTest, FusionBroadcastConcretization3_CUDA) {
1148 Fusion fusion;
1149 FusionGuard fg(&fusion);
1150
1151 std::vector<int64_t> input_shape({10, 4, 8});
1152 std::vector<int64_t> output_shape({8, 4, 1});
1153
1154 auto tv0 = makeConcreteTensor(input_shape);
1155 fusion.addInput(tv0);
1156
1157 auto tv2 = sum(tv0, {0});
1158 auto tv3 = set(tv2);
1159 auto tv4 =
1160 view(tv3, {input_shape.begin() + 1, input_shape.end()}, output_shape);
1161 auto tv5 = add(tv4, IrBuilder::create<Double>(1));
1162 fusion.addOutput(tv5);
1163
1164 tv2->axis(0)->parallelize(ParallelType::TIDx);
1165 tv4->axis(-1)->parallelize(ParallelType::TIDx);
1166 tv5->axis(-1)->parallelize(ParallelType::TIDx);
1167
1168 // The view op adds a broadcast domain in tv4, which is
1169 // parallelized. Howver, it is never materialized, so there should
1170 // be no parallel broadcast.
1171
1172 GpuLower gpulw(&fusion);
1173 TORCH_CHECK(
1174 !gpulw.kernel()->summary().has_block_broadcasts &&
1175 !gpulw.kernel()->summary().has_grid_broadcasts,
1176 "There must be no parallel broadcast in this fusion");
1177
1178 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1179 at::manual_seed(0);
1180 auto t0 = at::randn(input_shape, options);
1181 std::vector<IValue> aten_inputs = {t0};
1182
1183 FusionExecutor fe;
1184 fe.compileFusion(&fusion, aten_inputs);
1185 auto outputs = fe.runFusion(aten_inputs);
1186
1187 auto t5 = at::native::view(t0.sum(0), output_shape) + 1;
1188
1189 testValidate(&fusion, outputs, aten_inputs, {t5}, __LINE__, __FILE__);
1190}
1191
1192// Merging non-broadcast and broadcast domains
1193// TODO: Fix use case see issue https://github.com/csarofeen/pytorch/issues/1418
1194// validateParallelize does not pass. Even if it's skipped,
1195// generated code is invalid as blockBroadcast is not used.
1196#if 0
1197TEST_F(NVFuserTest, FusionBroadcastConcretization4_CUDA) {
1198 Fusion fusion;
1199 FusionGuard fg(&fusion);
1200
1201 auto tv0 = makeSymbolicTensor(2);
1202 fusion.addInput(tv0);
1203
1204 auto tv1 = sum(tv0, {1});
1205 auto tv2 = broadcast(tv1, {false, true});
1206 auto tv3 = add(tv2, tv0);
1207 fusion.addOutput(tv3);
1208
1209 tv1->axis(1)->parallelize(ParallelType::TIDx);
1210
1211 tv2->merge(0, 1);
1212 tv2->axis(0)->parallelize(ParallelType::TIDx);
1213 // TODO: When set to shared memory, this kernel should be correct, but fails
1214 // validation and when skipped produces incorrect code
1215 tv2->setMemoryType(MemoryType::Shared);
1216
1217 tv3->merge(0, 1);
1218 tv3->axis(0)->parallelize(ParallelType::TIDx);
1219
1220 fusion.printMath();
1221 fusion.printKernel();
1222}
1223#endif
1224
1225TEST_F(NVFuserTest, FusionBroadcastConcretization5_CUDA) {
1226 Fusion fusion;
1227 FusionGuard fg(&fusion);
1228
1229 auto tv0 = makeSymbolicTensor(1);
1230 fusion.addInput(tv0);
1231 auto tv1 = makeSymbolicTensor(1);
1232 fusion.addInput(tv1);
1233 auto tv2 = makeSymbolicTensor(1);
1234 fusion.addInput(tv2);
1235 auto tv3 = makeSymbolicTensor(1);
1236 fusion.addInput(tv3);
1237
1238 // Assert tv2 and tv3 have the same shape
1239 auto tv4 = add(tv2, tv3);
1240 fusion.addOutput(tv4);
1241
1242 // Concretize a broadcast domain to multiple non-concrete domains
1243 // through a multi-output expression. It should be considered to be
1244 // non-uniquely concretized.
1245 auto tv5 = broadcast(tv0, {false, true});
1246 // Reduce only the non-broadcast domain.
1247 auto tvs = Welford(tv5, {0});
1248 auto tv9 = add(tvs.avg, tv1);
1249 auto tv10 = add(tvs.var_sum, tv2);
1250 fusion.addOutput(tv9);
1251 fusion.addOutput(tv10);
1252
1253 // Same pattern as the above, but concretize the broadcast domain
1254 // with tv2 and tv3, which have the exactly same shape, so the
1255 // broadcast should be considered uniquely concretized.
1256 auto tv11 = broadcast(tv0, {false, true});
1257 // Reduce only the non-broadcast domain.
1258 auto tvs2 = Welford(tv11, {0});
1259 auto tv15 = add(tvs2.avg, tv2);
1260 auto tv16 = add(tvs2.var_sum, tv3);
1261 fusion.addOutput(tv15);
1262 fusion.addOutput(tv16);
1263
1264 // Reduce only the broadcast domain. Since it's reduced, it should
1265 // not be considered to be concretized.
1266 auto tv17 = broadcast(tv0, {false, true});
1267 auto tvs3 = Welford(tv17, {1});
1268 fusion.addOutput(tvs3.avg);
1269
1270 ConcretizedBroadcastDomains bcast_concretization_info(&fusion);
1271
1272 TORCH_CHECK(
1273 bcast_concretization_info.maybeNonUniquelyConcretized(tv5->axis(1)),
1274 "Failed to detect non-unique concretization of ",
1275 tv5->toString());
1276
1277 TORCH_CHECK(
1278 bcast_concretization_info.isUniquelyConcretized(tv11->axis(1)),
1279 "Failed to detect unique concretization of ",
1280 tv11->toString());
1281
1282 TORCH_CHECK(
1283 !bcast_concretization_info.isConcretized(tv17->axis(1)),
1284 "Failed to detect non-concretization of ",
1285 tv17->toString());
1286}
1287
1288TEST_F(NVFuserTest, FusionIssue1430_CUDA) {
1289 // Derived from an expression sorting issue when using loop map, now expr
1290 // sorting uses parallel map.
1291 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
1292 Fusion& fusion = *fusion_ptr.get();
1293 FusionGuard fg(&fusion);
1294
1295 int V = 2, W = 3, X = 4, Y = 5, Z = 6;
1296
1297 // setup fusion
1298 auto tv0 = TensorViewBuilder()
1299 .ndims(5)
1300 .dtype(DataType::Half)
1301 .contiguity(std::vector<bool>(5, true))
1302 .shape({V, W, X, Y, Z})
1303 .build();
1304
1305 fusion.addInput(tv0);
1306 auto tv1 = set(tv0);
1307 auto tv2 = castOp(DataType::Float, tv1);
1308
1309 auto tvs = Welford(tv2, {1, 2, 3, 4});
1310 auto tv3 = tvs.avg;
1311 auto tv4 = tvs.var_sum;
1312 auto tv5 = tvs.n;
1313
1314 // avg
1315 auto tv6 = broadcast(tvs.avg, {false, true, true, true, true});
1316
1317 // var
1318 auto tv7 = mul(tv4, IrBuilder::create<Double>(1. / (W * X * Y * Z)));
1319 auto tv8 = add(tv7, IrBuilder::create<Double>(1.e-6));
1320 auto tv9 = broadcast(tv8, {false, true, true, true, true});
1321 auto tv10 = rsqrt(tv9);
1322
1323 auto tv11 = castOp(DataType::Float, tv1);
1324 auto tv12 = sub(tv11, tv6);
1325 auto tv13 = mul(tv12, tv10);
1326
1327 auto tv14 = set(tv13);
1328 fusion.addOutput(tv14);
1329
1330 tv3->axis(0)->parallelize(ParallelType::BIDy);
1331 tv3->axis(2)->parallelize(ParallelType::BIDx);
1332 tv3->axis(3)->parallelize(ParallelType::TIDx);
1333 tv3->axis(4)->parallelize(ParallelType::Vectorize);
1334
1335 // tv3->reorder({{1, -2}});
1336
1337 auto rfactor = ir_utils::rfactorHelper(tv3, {1, 4});
1338
1339 scheduler_utils::parallelizeAllLike(rfactor);
1340
1341 for (auto tv : ir_utils::allTvs(&fusion)) {
1342 if (tv != tv1 || tv != tv3) {
1343 for (auto i : c10::irange(tv->nDims())) {
1344 if (isParallelTypeVectorize(tv->axis(i)->getParallelType())) {
1345 tv->axis(i)->parallelize(ParallelType::Serial);
1346 }
1347 }
1348 }
1349 }
1350
1351 tv0->computeAt(tv14, 1);
1352 tv13->computeAt(tv14, -2);
1353 tv2->computeAt(tv14, -1, ComputeAtMode::MostInlined);
1354 tv11->computeAt(tv14, -1, ComputeAtMode::MostInlined);
1355
1356 auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
1357 at::Tensor t0 = at::randn({V, W, X, Y, Z}, options);
1358
1359 FusionExecutor fe;
1360 fe.compileFusion(&fusion);
1361 auto cg_outputs = fe.runFusion({t0}, LaunchParams(X, V, -1, Y, -1, -1));
1362
1363 auto t0_double = t0.to(at::kDouble);
1364
1365 auto at_mu = at::mean(t0_double, {1, 2, 3, 4})
1366 .unsqueeze(-1)
1367 .unsqueeze(-1)
1368 .unsqueeze(-1)
1369 .unsqueeze(-1);
1370 auto at_var = at::var(t0_double, {1, 2, 3, 4}, false)
1371 .unsqueeze(-1)
1372 .unsqueeze(-1)
1373 .unsqueeze(-1)
1374 .unsqueeze(-1);
1375
1376 auto at_out = t0_double.sub(at_mu).div(at_var.add(1.e-6).sqrt());
1377
1378 testValidate(
1379 &fusion,
1380 cg_outputs,
1381 {t0},
1382 {at_out},
1383 __LINE__,
1384 __FILE__,
1385 "",
1386 LaunchParams(X, V, -1, Y, -1, -1));
1387}
1388
1389// Test code generation of allocated scalars
1390TEST_F(NVFuserTest, FusionCodegenAllocatedScalars_CUDA) {
1391 Fusion fusion;
1392 FusionGuard fg(&fusion);
1393
1394 // Fusion is just a dummy container in this test, just used for
1395 // getting a Kernel container
1396 auto tv0 = makeSymbolicTensor(0);
1397 fusion.addInput(tv0);
1398 auto tv1 = set(tv0);
1399 fusion.addOutput(tv1);
1400
1401 GpuLower gpulw(&fusion);
1402 auto kernel = gpulw.kernel();
1403
1404 // Set the kernel as the current fusion
1405 FusionGuard kg(kernel);
1406
1407 // Create alocated scalars
1408 auto ks0 = add(kernel->zeroVal(), kernel->oneVal());
1409 auto ks0_alloc = IrBuilder::create<kir::Allocate>(
1410 ks0, MemoryType::Local, kernel->oneVal());
1411
1412 auto ks1 = add(ks0, kernel->oneVal());
1413 auto ks1_alloc = IrBuilder::create<kir::Allocate>(
1414 ks1, MemoryType::Local, kernel->oneVal());
1415
1416 auto tk0 = kernel->inputs()[0]->as<TensorView>();
1417 auto tki0 = IrBuilder::create<kir::TensorIndex>(tk0, std::vector<Val*>{ks0});
1418 auto tki1 = IrBuilder::create<kir::TensorIndex>(tk0, std::vector<Val*>{ks1});
1419 auto tk0_expr = IrBuilder::create<UnaryOp>(UnaryOpType::Set, tki0, tki1);
1420
1421 // Insert the scalar expression and the allocation of the
1422 // output directly to the kernel
1423 auto proxy = kir::KernelInternalProxy(kernel);
1424
1425 const auto indent = " ";
1426 const auto ks0_name = "i" + std::to_string(ks0->name());
1427 const auto ks1_name = "i" + std::to_string(ks1->name());
1428 const auto tk0_name = "T" + std::to_string(tk0->name());
1429
1430 auto& exprs = proxy.topLevelExprs();
1431 exprs.push_back(tk0_expr);
1432
1433 // Invalid code gen
1434 const auto no_alloc_code = codegen::generateCudaKernel(kernel);
1435
1436 // Without alloc, Int vals are just inlined, resulting in:
1437 // t0[(0 + 1)] = t0[((0 + 1) + 1)]
1438 std::stringstream no_alloc_ref;
1439 no_alloc_ref << "\n"
1440 << indent << tk0_name << "[(0 + 1)]\n"
1441 << indent << indent << " = " << tk0_name << "[((0 + 1) + 1)];\n";
1442
1443 TORCH_CHECK(
1444 no_alloc_code.find(no_alloc_ref.str()) != std::string::npos,
1445 "Invalid code generation. Expected:",
1446 no_alloc_ref.str(),
1447 "Actual:\n",
1448 no_alloc_code);
1449
1450 // Insert proper allocations and definitions
1451 exprs.insert(std::find(exprs.begin(), exprs.end(), tk0_expr), ks0_alloc);
1452 exprs.insert(
1453 std::find(exprs.begin(), exprs.end(), tk0_expr), ks0->definition());
1454 exprs.insert(std::find(exprs.begin(), exprs.end(), tk0_expr), ks1_alloc);
1455 exprs.insert(
1456 std::find(exprs.begin(), exprs.end(), tk0_expr), ks1->definition());
1457
1458 const auto valid_code = codegen::generateCudaKernel(kernel);
1459
1460 std::stringstream valid_ref;
1461 valid_ref << "\n"
1462 << indent << tk0_name << "[" << ks0_name << "]\n"
1463 << indent << indent << " = " << tk0_name << "[" << ks1_name
1464 << "];\n";
1465
1466 TORCH_CHECK(
1467 valid_code.find(valid_ref.str()) != std::string::npos,
1468 "Invalid code generation. Expected:",
1469 valid_ref.str(),
1470 "Actual:\n",
1471 valid_code);
1472}
1473
1474TEST_F(NVFuserTest, FusionIndexHoist1_CUDA) {
1475 if (isOptionDisabled(DisableOption::IndexHoist)) {
1476 GTEST_SKIP() << "Index hoisting disabled";
1477 }
1478
1479 Fusion fusion;
1480 FusionGuard fg(&fusion);
1481
1482 auto tv0 = makeSymbolicTensor(2);
1483 fusion.addInput(tv0);
1484
1485 auto tv1 = set(tv0);
1486 auto tv2 = set(tv1);
1487 auto tv3 = set(tv2);
1488 auto tv4 = set(tv3);
1489 auto tv5 = set(tv4);
1490 fusion.addOutput(tv5);
1491
1492 tv1->split(-1, 4);
1493 tv2->split(-1, 4);
1494 tv3->merge(0, 1);
1495 tv3->split(0, 8);
1496 tv5->merge(0, 1);
1497 tv5->split(0, 8);
1498 tv4->computeAt(tv5, -1);
1499
1500 tv1->setMemoryType(MemoryType::Global);
1501 tv2->setMemoryType(MemoryType::Global);
1502 tv3->setMemoryType(MemoryType::Global);
1503
1504 // Use Int32 as the index type to verify Int32 is used as the type
1505 // of hoisted indices
1506 GpuLower gpulw(&fusion, DataType::Int32);
1507 auto kernel = gpulw.kernel();
1508
1509 auto is_index_times_ns = [](Val* val, Val* index, std::string name) -> bool {
1510 auto def = dynamic_cast<BinaryOp*>(val->definition());
1511 if (def == nullptr) {
1512 return false;
1513 }
1514 return def->getBinaryOpType() == BinaryOpType::Mul &&
1515 def->rhs()->isA<NamedScalar>() &&
1516 def->rhs()->as<NamedScalar>()->name() == name && def->lhs() == index;
1517 };
1518
1519 // Validate indices in the kernel are hoisted as
1520 // intended. Validation could be also done by just string comparison
1521 // as the parser test, but updating such tests would be tedious.
1522 for (auto top_level_loop :
1523 ir_utils::filterByType<kir::ForLoop>(kernel->topLevelExprs())) {
1524 auto innermost_loop = top_level_loop;
1525 while (auto first_expr_loop = dynamic_cast<kir::ForLoop*>(
1526 innermost_loop->body().exprs().at(0))) {
1527 innermost_loop = first_expr_loop;
1528 }
1529 const auto& exprs = innermost_loop->body().exprs();
1530 TORCH_CHECK(!exprs.empty(), "No expression found");
1531 TORCH_CHECK(
1532 exprs.at(0)->isA<kir::Allocate>(),
1533 "Invalid expression: ",
1534 exprs.at(0)->toString());
1535 auto hoisted_index = exprs.at(0)->as<kir::Allocate>()->buffer();
1536 TORCH_CHECK(
1537 hoisted_index->dtype() == DataType::Int32,
1538 "Invalid data type of hoisted indices. Should be Int32 but: ",
1539 hoisted_index->dtype());
1540 kir::Predicate* pred = nullptr;
1541 for (auto expr : exprs) {
1542 if (expr->isA<kir::IfThenElse>()) {
1543 pred = expr->as<kir::IfThenElse>()->predicate();
1544 auto arith_expr = expr->as<kir::IfThenElse>()->thenBody().exprs().at(0);
1545 auto out_ti = arith_expr->outputs()[0]->as<kir::TensorIndex>();
1546 if (out_ti->view()->name() == 1) {
1547 // Ref: T1[*, hoisted_index] = T0[*, hoisted_index * T0.stride];
1548 auto t1_index = out_ti->index(1);
1549 TORCH_CHECK(
1550 t1_index == hoisted_index,
1551 "Invalid index: ",
1552 t1_index->toInlineString());
1553 // Pred: hoisted_index < T0.size[1]
1554 TORCH_CHECK(
1555 pred->value()->definition()->as<BinaryOp>()->lhs() ==
1556 hoisted_index,
1557 "Invalid predicate: ",
1558 pred->value()->toInlineString(),
1559 ", ",
1560 expr->toString());
1561 TORCH_CHECK(arith_expr->inputs().size() == 1);
1562 auto in0 = arith_expr->inputs().front()->as<kir::TensorIndex>();
1563 TORCH_CHECK(in0->view()->name() == 0);
1564 // hoisted_index * T0.stride[1]
1565 auto t0_index = in0->index(1);
1566 TORCH_CHECK(
1567 is_index_times_ns(t0_index, hoisted_index, "T0.stride[1]"),
1568 "Invalid index: ",
1569 t0_index->toInlineString(),
1570 ", ",
1571 expr->toString());
1572 } else if (out_ti->view()->name() == 2) {
1573 // Ref: T3[*, hoisted_index] = T2[*, hoisted_index];
1574 auto out_index = out_ti->index(1);
1575 TORCH_CHECK(
1576 out_index == hoisted_index,
1577 "Invalid index: ",
1578 out_index->toInlineString(),
1579 ", ",
1580 expr->toString());
1581 TORCH_CHECK(
1582 pred->value()->definition()->as<BinaryOp>()->lhs() ==
1583 hoisted_index,
1584 "Invalid predicate: ",
1585 pred->value()->toInlineString(),
1586 ", ",
1587 expr->toString());
1588 TORCH_CHECK(arith_expr->inputs().size() == 1);
1589 auto in0 = arith_expr->inputs().front()->as<kir::TensorIndex>();
1590 TORCH_CHECK(in0->view()->name() == 1);
1591 auto in0_index = in0->index(1);
1592 TORCH_CHECK(
1593 in0_index == hoisted_index,
1594 "Invalid index: ",
1595 in0_index->toInlineString(),
1596 ", ",
1597 expr->toString());
1598 } else if (out_ti->view()->name() == 3) {
1599 // Ref: T3[hoisted_index] = T2[hoisted_index];
1600 auto out_index = out_ti->index(0);
1601 TORCH_CHECK(
1602 out_index == hoisted_index,
1603 "Invalid index: ",
1604 out_index->toInlineString(),
1605 ", ",
1606 expr->toString());
1607 TORCH_CHECK(
1608 pred->value()->definition()->as<BinaryOp>()->lhs() ==
1609 hoisted_index,
1610 "Invalid predicate: ",
1611 pred->value()->toInlineString(),
1612 ", ",
1613 expr->toString());
1614 TORCH_CHECK(arith_expr->inputs().size() == 1);
1615 auto in0 = arith_expr->inputs().front()->as<kir::TensorIndex>();
1616 TORCH_CHECK(in0->view()->name() == 2);
1617 auto in0_index = in0->index(0);
1618 TORCH_CHECK(
1619 in0_index == hoisted_index,
1620 "Invalid index: ",
1621 in0_index->toInlineString(),
1622 ", ",
1623 expr->toString());
1624 } else if (out_ti->view()->name() == 4) {
1625 // Ref: T4[0] = T3[hoisted_index];
1626 TORCH_CHECK(
1627 pred->value()->definition()->as<BinaryOp>()->lhs() ==
1628 hoisted_index,
1629 "Invalid predicate: ",
1630 pred->value()->toInlineString(),
1631 ", ",
1632 expr->toString());
1633 TORCH_CHECK(arith_expr->inputs().size() == 1);
1634 auto in0 = arith_expr->inputs().front()->as<kir::TensorIndex>();
1635 TORCH_CHECK(in0->view()->name() == 3);
1636 auto in0_index = in0->index(0);
1637 TORCH_CHECK(
1638 in0_index == hoisted_index,
1639 "Invalid index: ",
1640 in0_index->toInlineString(),
1641 ", ",
1642 expr->toString());
1643 } else if (out_ti->view()->name() == 5) {
1644 // Ref: T5[hoisted_index] = T4[0]
1645 auto out_index = out_ti->index(0);
1646 TORCH_CHECK(
1647 out_index == hoisted_index,
1648 "Invalid index: ",
1649 out_index->toInlineString(),
1650 ", ",
1651 expr->toString());
1652 TORCH_CHECK(
1653 pred->value()->definition()->as<BinaryOp>()->lhs() ==
1654 hoisted_index,
1655 "Invalid predicate: ",
1656 pred->value()->toInlineString(),
1657 ", ",
1658 expr->toString());
1659 }
1660 }
1661 }
1662 }
1663
1664 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1665 at::manual_seed(0);
1666 auto t0 = at::randn({15, 17}, options);
1667
1668 FusionExecutor fe;
1669 fe.compileFusion(&fusion, {t0});
1670 auto cg_outputs = fe.runFusion({t0});
1671
1672 auto ref = t0;
1673
1674 testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
1675}
1676
1677// Hoist indices for vectorized tensors
1678TEST_F(NVFuserTest, FusionIndexHoist2_CUDA) {
1679 if (isOptionDisabled(DisableOption::IndexHoist)) {
1680 GTEST_SKIP() << "Index hoisting disabled";
1681 }
1682
1683 Fusion fusion;
1684 FusionGuard fg(&fusion);
1685
1686 auto tv0 = makeContigTensor(1);
1687 fusion.addInput(tv0);
1688 auto tv1 = makeContigTensor(1);
1689 fusion.addInput(tv1);
1690
1691 auto tv2 = set(tv0);
1692 auto tv3 = set(tv1);
1693 auto tv4 = add(tv2, tv3);
1694 auto tv5 = set(tv4);
1695 fusion.addOutput(tv5);
1696
1697 tv5->split(-1, 4);
1698 TransformPropagatorWithCheck propagator(tv5);
1699 MaxRootDomainInfoSpanningTree(tv5).traverse(&propagator);
1700
1701 tv4->split(-1, 3);
1702
1703 tv0->computeAt(tv5, 1);
1704 tv1->computeAt(tv5, 1);
1705
1706 tv2->axis(-1)->parallelize(ParallelType::Vectorize);
1707 tv3->axis(-1)->parallelize(ParallelType::Vectorize);
1708 tv5->axis(-1)->parallelize(ParallelType::Vectorize);
1709
1710 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1711 at::manual_seed(0);
1712 auto t0 = at::randn({16}, options);
1713 auto t1 = at::randn({16}, options);
1714
1715 FusionExecutor fe;
1716 fe.compileFusion(&fusion, {t0, t1});
1717 auto cg_outputs = fe.runFusion({t0, t1});
1718
1719 auto ref = t0 + t1;
1720
1721 testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
1722}
1723
1724TEST_F(NVFuserTest, FusionTestGridComm_CUDA) {
1725 Fusion fusion;
1726 FusionGuard fg(&fusion);
1727 int X = 3, Y = 4, Z = 2;
1728 auto tv0 = makeConcreteTensor({X, Y, Z});
1729 fusion.addInput(tv0);
1730 auto tv1 = makeConcreteTensor({X, Y, Z});
1731 fusion.addInput(tv1);
1732
1733 auto tv2 = set(tv0);
1734 auto tv3 = add(tv2, tv1);
1735 auto tv4 = set(tv3);
1736 auto tv5 = set(tv4);
1737 fusion.addOutput(tv5);
1738
1739 tv2->setMemoryType(MemoryType::Global);
1740 tv3->setMemoryType(MemoryType::Global);
1741 tv4->setMemoryType(MemoryType::Global);
1742
1743 tv2->axis(0)->parallelize(ParallelType::BIDy);
1744 tv2->axis(1)->parallelize(ParallelType::BIDx);
1745 tv2->axis(2)->parallelize(ParallelType::Vectorize);
1746
1747 tv3->axis(0)->parallelize(ParallelType::BIDx);
1748 tv3->axis(1)->parallelize(ParallelType::BIDy);
1749
1750 tv4->axis(0)->parallelize(ParallelType::BIDy);
1751 tv4->axis(1)->parallelize(ParallelType::BIDx);
1752
1753 tv5->axis(0)->parallelize(ParallelType::BIDy);
1754 tv5->axis(1)->parallelize(ParallelType::BIDx);
1755 tv5->axis(2)->parallelize(ParallelType::Vectorize);
1756
1757 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1758 at::manual_seed(0);
1759 auto t0 = at::randn({X, Y, Z}, options);
1760 auto t1 = at::randn({X, Y, Z}, options);
1761
1762 FusionExecutor fe;
1763 fe.compileFusion(&fusion, {t0, t1});
1764 auto cg_outputs = fe.runFusion({t0, t1});
1765
1766 auto ref = t0 + t1;
1767
1768 testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
1769}
1770
1771// See issue https://github.com/csarofeen/pytorch/issues/1497
1772TEST_F(NVFuserTest, FusionTestGridComm2_CUDA) {
1773 Fusion fusion;
1774 FusionGuard fg(&fusion);
1775
1776 int64_t W = 3, X = 4;
1777
1778 auto tv0 = makeConcreteTensor({X});
1779 auto tv1 = makeConcreteTensor({W, X});
1780 fusion.addInput(tv0);
1781 fusion.addInput(tv1);
1782
1783 auto tv2 = add(tv0, IrBuilder::create<Double>(1));
1784 auto tv3 = broadcast(tv2, {true, false});
1785 auto tv4 = add(tv3, tv1);
1786 fusion.addOutput(tv4);
1787
1788 tv4->merge(0);
1789 tv4->split(0, 2);
1790
1791 TransformPropagatorWithCheck propagator(tv4);
1792 MaxRootDomainInfoSpanningTree(tv4).traverse(&propagator);
1793
1794 tv3->computeAt(tv4, 1);
1795
1796 tv4->axis(0)->parallelize(ParallelType::BIDx);
1797 tv4->axis(-1)->parallelize(ParallelType::TIDx);
1798 tv2->axis(0)->parallelize(ParallelType::BIDx);
1799 tv2->axis(-1)->parallelize(ParallelType::TIDx);
1800 tv3->axis(-1)->parallelize(ParallelType::TIDx);
1801
1802 tv2->setMemoryType(MemoryType::Global);
1803
1804 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1805 at::manual_seed(0);
1806 auto t0 = at::randn({X}, options);
1807 auto t1 = at::randn({W, X}, options);
1808
1809 FusionExecutor fe;
1810 fe.compileFusion(&fusion, {t0, t1});
1811 auto cg_outputs = fe.runFusion({t0, t1});
1812
1813 auto ref = t0 + t1 + 1;
1814
1815 testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
1816}
1817
1818// Vectorized reset test for double buffered registers
1819TEST_F(NVFuserTest, FusionDoubleBufferVector_CUDA) {
1820 Fusion fusion;
1821 FusionGuard fg(&fusion);
1822
1823 auto tv0 = makeContigTensor(1);
1824 fusion.addInput(tv0);
1825
1826 auto tv1 = add(tv0, IrBuilder::create<Double>(1.0));
1827 auto tv2 = sum(tv1, {0});
1828 auto tv2c = tv2->cacheBefore();
1829
1830 fusion.addOutput(tv2);
1831
1832 auto tv1cw = tv1->cacheAfter();
1833 auto tv1cr = tv1cw->cacheAfter();
1834
1835 tv1cw->split(-1, 32);
1836 tv1cr->split(-1, 32);
1837 tv1cr->split(-1, 4);
1838 tv1cr->axis(-1)->parallelize(ParallelType::Vectorize);
1839
1840 tv1cw->computeAt(tv1cr, 1);
1841 tv0->computeAt(tv1cw, -1);
1842 tv2c->split(-1, 32);
1843 tv2c->split(-1, 4);
1844 tv1cr->computeAt(tv2c, 2);
1845
1846 tv1cw->setMemoryType(MemoryType::Shared);
1847 tv1cr->doubleBuffer();
1848
1849 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1850
1851 at::manual_seed(0);
1852 auto t0 = at::randn({200}, options);
1853 FusionExecutor fe;
1854 fe.compileFusion(&fusion, {t0});
1855 auto cg_outputs = fe.runFusion({t0});
1856 auto ref = (t0 + 1).sum({0});
1857
1858 testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
1859}
1860
1861// Request 48KB of data in shared mem,
1862// should be large enough not to fit in
1863// static allocations, but small enough
1864// to fit in supported devices (sm70+).
1865TEST_F(NVFuserTest, FusionLargeSmem_CUDA) {
1866 Fusion fusion;
1867 FusionGuard fg(&fusion);
1868
1869 auto tv0 = makeContigTensor(1);
1870 fusion.addInput(tv0);
1871 auto tv1 = add(tv0, IrBuilder::create<Double>(1.0));
1872 auto tv2 = add(tv1, IrBuilder::create<Double>(2.0));
1873 fusion.addOutput(tv2);
1874
1875 tv2->split(0, 12288);
1876 tv2->split(1, 128);
1877 tv1->computeAt(tv2, 1);
1878 tv1->split(1, 128);
1879 tv0->computeAt(tv1, -1);
1880 tv1->setMemoryType(MemoryType::Shared);
1881 tv1->axis(-1)->parallelize(ParallelType::TIDx);
1882 tv2->axis(-1)->parallelize(ParallelType::TIDx);
1883
1884 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1885
1886 at::manual_seed(0);
1887 auto t0 = at::randn({12288 * 4}, options);
1888 FusionExecutor fe;
1889 fe.compileFusion(&fusion, {t0});
1890 auto cg_outputs = fe.runFusion({t0});
1891 auto ref = t0 + 1 + 2;
1892
1893 testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
1894}
1895
1896// Request a smem allocation that is equal to the device limit
1897TEST_F(NVFuserTest, FusionTooLargeSmem_CUDA) {
1898 Fusion fusion;
1899 FusionGuard fg(&fusion);
1900
1901 auto properties = at::cuda::getDeviceProperties(
1902 c10::Device(c10::DeviceType::CUDA, 0).index());
1903 int device_limit = properties->sharedMemPerBlockOptin;
1904
1905 auto tv0 = makeContigTensor(1);
1906 fusion.addInput(tv0);
1907 auto tv1 = add(tv0, IrBuilder::create<Double>(1.0));
1908 auto tv2 = add(tv1, IrBuilder::create<Double>(2.0));
1909 fusion.addOutput(tv2);
1910
1911 // 4 byte per float
1912 tv2->split(0, device_limit / 4);
1913 tv2->split(1, 128);
1914 tv1->computeAt(tv2, 1);
1915 tv1->split(1, 128);
1916 tv0->computeAt(tv1, -1);
1917 tv1->setMemoryType(MemoryType::Shared);
1918 tv1->axis(-1)->parallelize(ParallelType::TIDx);
1919 tv2->axis(-1)->parallelize(ParallelType::TIDx);
1920
1921 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1922
1923 at::manual_seed(0);
1924 auto t0 = at::randn({12288 * 4}, options);
1925 FusionExecutor fe;
1926
1927 // First compile gets a compiled kernel
1928 fe.compileFusion(&fusion, {t0});
1929
1930 // Should be throwing because the kernel
1931 // requested absolute device limit
1932 // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
1933 ASSERT_ANY_THROW(fe.runFusion({t0}));
1934}
1935
1936// Try to test alignment when multiple tensors are
1937// in shared mem.
1938TEST_F(NVFuserTest, FusionSmemAlignment_CUDA) {
1939 Fusion fusion;
1940 FusionGuard fg(&fusion);
1941
1942 auto tv0 = makeConcreteTensor({3, 4, 7, 2, 5});
1943 fusion.addInput(tv0);
1944 auto tv1 = sum(tv0, {4});
1945 auto tv2 = sum(tv1, {3});
1946 auto tv3 = sum(tv2, {2});
1947 auto tv4 = sum(tv3, {1});
1948 fusion.addOutput(tv4);
1949
1950 auto tv0c = tv0->cacheAfter();
1951 auto tv1bc = tv1->cacheBefore();
1952 auto tv2bc = tv2->cacheBefore();
1953 auto tv3bc = tv3->cacheBefore();
1954 auto tv4bc = tv4->cacheBefore();
1955
1956 tv0c->setMemoryType(MemoryType::Shared);
1957 tv1bc->setMemoryType(MemoryType::Shared);
1958 tv2bc->setMemoryType(MemoryType::Shared);
1959 tv3bc->setMemoryType(MemoryType::Shared);
1960 tv4bc->setMemoryType(MemoryType::Shared);
1961
1962 tv1->axis(-1)->parallelize(ParallelType::Vectorize);
1963 tv3->axis(-1)->parallelize(ParallelType::Vectorize);
1964 tv0->computeAt(tv4, 0);
1965 tv0->computeAt(tv2, 2);
1966
1967 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1968
1969 at::manual_seed(0);
1970 auto t0 = at::randn({3, 4, 7, 2, 5}, options);
1971 FusionExecutor fe;
1972
1973 fe.compileFusion(&fusion, {t0});
1974 auto cg_outputs = fe.runFusion({t0});
1975 auto tref = t0.sum({1, 2, 3, 4});
1976
1977 testValidate(&fusion, cg_outputs, {t0}, {tref}, __LINE__, __FILE__);
1978}
1979
1980// Repro of #1521
1981TEST_F(NVFuserTest, FusionImmediateValueAsInput_CUDA) {
1982 Fusion fusion;
1983 FusionGuard fg(&fusion);
1984
1985 auto tv0 = makeSymbolicTensor(1);
1986 fusion.addInput(tv0);
1987
1988 auto immediate_scalr = IrBuilder::create<Double>(0.1);
1989 // Adding an immediate scalar value as an input is not allowed
1990 // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
1991 ASSERT_ANY_THROW(fusion.addInput(immediate_scalr));
1992
1993 // Instead, use a symbolic value
1994 auto symbolic_scalar = IrBuilder::create<Double>();
1995 fusion.addInput(symbolic_scalar);
1996
1997 auto tv1 = add(tv0, symbolic_scalar);
1998 fusion.addOutput(tv1);
1999
2000 // Make sure the kernel is compiled.
2001 FusionExecutor fe;
2002 fe.compileFusion(&fusion);
2003}
2004
2005// Repro of #1506
2006TEST_F(NVFuserTest, FusionVectorizeContigIndex_CUDA) {
2007 std::vector<int64_t> shape{14, 14};
2008
2009 Fusion fusion;
2010 FusionGuard fg(&fusion);
2011
2012 auto tv0 = makeContigTensor(2);
2013 fusion.addInput(tv0);
2014 auto tv1 = set(tv0);
2015 auto tv2 = set(tv1);
2016 fusion.addOutput(tv2);
2017
2018 tv2->merge(0);
2019
2020 // Vectorize by 4 should be allowed
2021 tv2->split(0, 4);
2022
2023 tv2->axis(0)->parallelize(ParallelType::TIDx);
2024 tv0->computeAt(tv2, 1);
2025
2026 tv1->axis(1)->parallelize(ParallelType::Vectorize);
2027 tv2->axis(1)->parallelize(ParallelType::Vectorize);
2028
2029 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2030 auto t0 = at::randn(shape, options);
2031
2032 FusionExecutor fe;
2033 fe.compileFusion(&fusion, {t0});
2034 auto cg_outputs = fe.runFusion({t0});
2035
2036 TORCH_CHECK(t0.equal(cg_outputs[0]));
2037}
2038
2039// Make sure the same fusion as FusionVectorizeContigIndex fails if
2040// not contig.
2041TEST_F(NVFuserTest, FusionVectorizeContigIndexFail_CUDA) {
2042 std::vector<int64_t> shape{14, 14};
2043
2044 Fusion fusion;
2045 FusionGuard fg(&fusion);
2046
2047 auto tv0 = makeSymbolicTensor(2);
2048 fusion.addInput(tv0);
2049 auto tv1 = set(tv0);
2050 auto tv2 = set(tv1);
2051 fusion.addOutput(tv2);
2052
2053 tv2->merge(0);
2054
2055 tv2->split(0, 4);
2056
2057 tv2->axis(0)->parallelize(ParallelType::TIDx);
2058 tv0->computeAt(tv2, 1);
2059
2060 tv1->axis(1)->parallelize(ParallelType::Vectorize);
2061 tv2->axis(1)->parallelize(ParallelType::Vectorize);
2062
2063 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2064 auto t0 = at::randn(shape, options);
2065
2066 FusionExecutor fe;
2067 fe.compileFusion(&fusion, {t0});
2068
2069 // This should fail at the launch time as 14 is not divisible by the
2070 // vector word size. The two domains are merged, but they are not
2071 // contiguous, so contig indexing is not involved in this case.
2072 // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
2073 ASSERT_ANY_THROW(fe.runFusion({t0}));
2074}
2075
2076TEST_F(NVFuserTest, FusionVectorizeInputToOutput_CUDA) {
2077 Fusion fusion;
2078 FusionGuard fg(&fusion);
2079
2080 auto tv0 = makeSymbolicTensor(1);
2081 fusion.addInput(tv0);
2082 auto tv1 = set(tv0);
2083 fusion.addOutput(tv1);
2084
2085 tv1->split(0, 4);
2086
2087 tv1->axis(-1)->parallelize(ParallelType::Vectorize);
2088
2089 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2090 at::manual_seed(0);
2091
2092 const int n = 12;
2093 auto t0 = at::randn({n}, options);
2094 // Shift by one to make it non-aligned
2095 auto t0_misaligned = at::randn({n + 1}, options).index({Slice(1)});
2096 auto t1_misaligned = at::empty({n + 1}, options).index({Slice(1)});
2097
2098 FusionExecutor fe;
2099 fe.compileFusion(&fusion, {t0});
2100 auto cg_outputs = fe.runFusion({t0});
2101 TORCH_CHECK(t0.equal(cg_outputs[0]));
2102
2103 // Pass misaligned input. This must fail.
2104 // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
2105 ASSERT_ANY_THROW(fe.runFusion({t0_misaligned}));
2106
2107 // Pass misaligned output. This must fail too.
2108 // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
2109 ASSERT_ANY_THROW(fe.runFusion({t0}, {t1_misaligned}));
2110}
2111
2112// Repro of issue #1530
2113TEST_F(NVFuserTest, FusionVectorizeContigIndexValidationFail_CUDA) {
2114 std::vector<int64_t> shape{1, 2, 1};
2115
2116 Fusion fusion;
2117 FusionGuard fg(&fusion);
2118
2119 auto tv0 = makeContigTensor(shape.size());
2120 fusion.addInput(tv0);
2121 auto tv1 = set(tv0);
2122 fusion.addOutput(tv1);
2123
2124 tv1->merge(1);
2125 tv1->merge(0);
2126
2127 auto invalid_vec_size = shape[0] * shape[1] * shape[2];
2128 invalid_vec_size *= invalid_vec_size;
2129
2130 tv1->split(0, invalid_vec_size);
2131
2132 tv1->axis(1)->parallelize(ParallelType::Vectorize);
2133
2134 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2135 auto t0 = at::randn(shape, options);
2136
2137 FusionExecutor fe;
2138 fe.compileFusion(&fusion, {t0});
2139
2140 // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
2141 ASSERT_ANY_THROW(fe.runFusion({t0}));
2142}
2143
2144TEST_F(NVFuserTest, FusionContigIndexingWithBroadcast_CUDA) {
2145 Fusion fusion;
2146 FusionGuard fg(&fusion);
2147
2148 auto tv0 = makeConcreteTensor({4});
2149 fusion.addInput(tv0);
2150 auto tv1 = makeConcreteTensor({3, 4});
2151 fusion.addInput(tv1);
2152
2153 auto tv2 = broadcast(tv0, {true, false});
2154 auto tv3 = add(tv2, tv1);
2155 fusion.addOutput(tv3);
2156
2157 tv3->merge(0);
2158 TransformPropagatorWithCheck propagator(tv3);
2159 MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator);
2160
2161 tv2->setMemoryType(MemoryType::Local);
2162
2163 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2164 auto t0 = at::randn({4}, options);
2165 auto t1 = at::randn({3, 4}, options);
2166
2167 auto t3 = t0.unsqueeze(0).add(t1);
2168 {
2169 FusionExecutor fe;
2170 fe.compileFusion(&fusion, {t0, t1});
2171 auto cg_outputs = fe.runFusion({t0, t1});
2172
2173 testValidate(&fusion, cg_outputs, {t0, t1}, {t3}, __LINE__, __FILE__);
2174 }
2175
2176 // Make sure tv2 indexing also works when it's stored in global memory
2177 tv2->setMemoryType(MemoryType::Global);
2178 {
2179 FusionExecutor fe;
2180 fe.compileFusion(&fusion, {t0, t1});
2181 auto cg_outputs = fe.runFusion({t0, t1});
2182
2183 testValidate(&fusion, cg_outputs, {t0, t1}, {t3}, __LINE__, __FILE__);
2184 }
2185}
2186
2187// Repro of #1534. Validation should detect invalid vectorization.
2188TEST_F(NVFuserTest, FusionVectorizeContigIndexValidationFail2_CUDA) {
2189 std::vector<int64_t> shape1{2, 3, 2};
2190 std::vector<int64_t> shape2{2, 2};
2191
2192 Fusion fusion;
2193 FusionGuard fg(&fusion);
2194
2195 auto tv0 = makeContigConcreteTensor(shape1);
2196 fusion.addInput(tv0);
2197 auto tv1 = makeContigConcreteTensor(shape2);
2198 fusion.addInput(tv1);
2199
2200 auto tv2 = set(tv1);
2201 auto tv3 = broadcast(tv2, {false, true, false});
2202 auto tv4 = add(tv0, tv3);
2203 fusion.addOutput(tv4);
2204
2205 tv4->merge(1, 2);
2206 tv4->merge(0, 1);
2207 tv4->split(0, 4);
2208 TransformPropagatorWithCheck propagator(tv4);
2209 MaxRootDomainInfoSpanningTree(tv4).traverse(&propagator);
2210
2211 tv0->computeAt(tv4, -2);
2212 tv1->computeAt(tv4, -2);
2213
2214 tv2->axis(-1)->parallelize(ParallelType::Vectorize);
2215
2216 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2217 auto t0 = at::randn(shape1, options);
2218 auto t1 = at::randn(shape2, options);
2219
2220 FusionExecutor fe;
2221 fe.compileFusion(&fusion, {t0, t1});
2222
2223 // Vectorization of tv2 should be detected as invalid.
2224 // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
2225 ASSERT_ANY_THROW(fe.runFusion({t0, t1}));
2226}
2227
2228TEST_F(NVFuserTest, FusionVectorizeContigIndexWithBroadcast_CUDA) {
2229 std::vector<int64_t> shape1{2, 2, 2};
2230 std::vector<int64_t> shape2{1, 2, 2};
2231
2232 Fusion fusion;
2233 FusionGuard fg(&fusion);
2234
2235 // [I0, I1, I2]
2236 auto tv0 = makeContigTensor(shape1.size());
2237 fusion.addInput(tv0);
2238
2239 // [B3, I1, I2]
2240 auto tv1 = makeContigConcreteTensor(shape2);
2241 fusion.addInput(tv1);
2242
2243 auto tv2 = set(tv1);
2244 auto tv3 = add(tv0, tv2);
2245 fusion.addOutput(tv3);
2246
2247 tv3->merge(1, 2);
2248 tv3->merge(0, 1);
2249 tv3->split(0, 4);
2250
2251 // Don't modify tv1 so that it's replayed as tv2 with actual
2252 // transformations. It would create temporary IterDomains, and the
2253 // validation should still be able to detect vectorization by 4 is valid.
2254 // TransformPropagatorWithCheck propagator(tv3);
2255 // MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator);
2256
2257 tv2->merge(1, 2);
2258 tv2->merge(0, 1);
2259 tv2->split(0, 4);
2260
2261 tv2->computeAt(tv3, -2);
2262
2263 tv2->axis(-1)->parallelize(ParallelType::Vectorize);
2264
2265 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2266 auto t0 = at::randn(shape1, options);
2267 auto t1 = at::randn(shape2, options);
2268
2269 FusionExecutor fe;
2270 fe.compileFusion(&fusion, {t0, t1});
2271 auto cg_outputs = fe.runFusion({t0, t1});
2272
2273 auto ref = t0 + t1;
2274
2275 testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
2276}
2277
2278TEST_F(NVFuserTest, FusionVectorizeContigIndexPointwiseSchedule_CUDA) {
2279 std::vector<int64_t> shape0{100, 14, 2, 14};
2280 std::vector<int64_t> shape1{100, 2, 14};
2281
2282 Fusion fusion;
2283 FusionGuard fg(&fusion);
2284
2285 auto tv0 = makeContigTensor(shape0.size());
2286 fusion.addInput(tv0);
2287 auto tv1 = makeContigTensor(shape1.size());
2288 fusion.addInput(tv1);
2289
2290 auto tv2 = broadcast(tv1, {false, true, false, false});
2291 auto tv3 = add(tv0, tv2);
2292 fusion.addOutput(tv3);
2293
2294 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2295 auto t0 = at::randn(shape0, options);
2296 auto t1 = at::randn(shape1, options);
2297
2298 auto lparams = schedulePointwise(&fusion, {t0, t1});
2299
2300 GpuLower gpulw(&fusion);
2301 auto kernel = gpulw.kernel();
2302
2303 // The innermost two dimensions are merged and contiguous, so
2304 // vectorization can be done against 2*14=28 rather than 14, so
2305 // vector word size should be 4. Broadcasting of tv1 should not
2306 // matter.
2307 for (const auto& vec_info : kernel->summary().vectorized_set_info) {
2308 TORCH_CHECK(
2309 vec_info.word_size == 4,
2310 "Invalid vector word size: ",
2311 vec_info.word_size);
2312 }
2313
2314 FusionExecutor fe;
2315 fe.compileFusion(&fusion, {t0, t1}, lparams);
2316 auto cg_outputs = fe.runFusion({t0, t1});
2317
2318 auto ref = t0 + t1.unsqueeze(-3);
2319
2320 testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
2321}
2322
2323// Repro of issue #1539.
2324TEST_F(NVFuserTest, FusionTrivialReductionForwarding1_CUDA) {
2325 Fusion fusion;
2326 FusionGuard fg(&fusion);
2327
2328 auto tv0 = makeSymbolicTensor(1);
2329 fusion.addInput(tv0);
2330
2331 auto tv1 = broadcast(tv0, {true, false});
2332 auto tv2 = sum(tv1, {0});
2333 auto tv3 = set(tv2);
2334 fusion.addOutput(tv3);
2335
2336 tv2->merge(0);
2337 tv2->split(0, 4);
2338
2339 TransformPropagatorWithCheck propagator(tv2);
2340 MaxRootDomainInfoSpanningTree(tv2).traverse(&propagator);
2341
2342 // All tensors must be transformed to a 2D tensor with each axis
2343 // mapped with each other in the LOOP map.
2344 ComputeAtMap ca_map(&fusion);
2345 for (auto tv : ir_utils::allTvs(&fusion)) {
2346 TORCH_CHECK(
2347 tv->nDims() == 2, "Expected to be a 2D tensor but: ", tv->toString());
2348 for (const auto i : c10::irange(2)) {
2349 TORCH_CHECK(ca_map.areMapped(
2350 tv->axis(i), tv3->axis(i), IdMappingMode::PERMISSIVE));
2351 }
2352 }
2353}
2354
2355TEST_F(NVFuserTest, FusionTrivialReductionForwarding2_CUDA) {
2356 Fusion fusion;
2357 FusionGuard fg(&fusion);
2358
2359 auto tv0 = makeSymbolicTensor(1);
2360 fusion.addInput(tv0);
2361
2362 auto tv1 = broadcast(tv0, {true, false});
2363 auto tv2 = sum(tv1, {0});
2364 auto tv3 = add(tv2, IrBuilder::create<Double>(1));
2365
2366 fusion.addOutput(tv3);
2367
2368 // Merging a trivial reduction with a non-reduction domain
2369 tv2->merge(0, 1);
2370 tv2->split(0, 4);
2371
2372 tv3->split(0, 4);
2373
2374 // tv2 and tv3 are different as tv3 lacks the trivial reduction, but
2375 // they are mapped with each other by BestEffortReplay as the merge
2376 // of trivial reduciton dim is forwarded.
2377
2378 PairwiseRootDomainMap root_map(tv2, tv3);
2379
2380 auto p2c = BestEffortReplay::replayCasP(tv3, tv2, 2, root_map).getReplay();
2381 for (const auto i : c10::irange(tv2->nDims())) {
2382 auto tv2_id = tv2->axis(i);
2383 auto it = p2c.find(tv2_id);
2384 TORCH_CHECK(
2385 it != p2c.end(),
2386 "Expected mapped consumer ID but not found: ",
2387 tv2_id->toString());
2388 auto tv3_mapped_id = it->second;
2389 TORCH_CHECK(
2390 tv3_mapped_id == tv3->axis(i),
2391 "Unexpected mapped consumer ID: ",
2392 tv3_mapped_id->toString());
2393 }
2394
2395 auto c2p = BestEffortReplay::replayPasC(tv2, tv3, 2, root_map).getReplay();
2396 for (const auto i : c10::irange(tv3->nDims())) {
2397 auto tv3_id = tv3->axis(i);
2398 auto it = c2p.find(tv3_id);
2399 TORCH_CHECK(
2400 it != c2p.end(),
2401 "Expected mapped producer ID but not found: ",
2402 tv3_id->toString());
2403 auto tv2_mapped_id = it->second;
2404 TORCH_CHECK(
2405 tv2_mapped_id == tv2->axis(i),
2406 "Unexpected mapped consumer ID: ",
2407 tv2_mapped_id->toString());
2408 }
2409}
2410
2411TEST_F(NVFuserTest, FusionTrivialReductionForwarding3_CUDA) {
2412 Fusion fusion;
2413 FusionGuard fg(&fusion);
2414
2415 auto tv0 = makeSymbolicTensor(2);
2416 fusion.addInput(tv0);
2417
2418 auto tv1 = sum(tv0, {1});
2419 auto tv2 = add(tv1, IrBuilder::create<Double>(1));
2420 fusion.addOutput(tv2);
2421
2422 // Similar pattern as FusionTrivialReductionForwarding2 but trivial
2423 // reduciton at non-root domain
2424
2425 // Create a trivial reduction by splitting with a factor of 1
2426 tv1->split(1, 1, false);
2427 // Merging with a trivial reduction
2428 tv1->merge(0, 1);
2429 auto tv1_merge_out_id = tv1->axis(0);
2430 tv1->split(0, 5);
2431
2432 tv2->split(0, 5);
2433
2434 // The merge of tv1 is done with a non-root trivial
2435 // reduciton. BestEffortReplay should forward the merge.
2436
2437 PairwiseRootDomainMap root_map(tv1, tv2);
2438 auto p2c = BestEffortReplay::replayCasP(tv2, tv1, 2, root_map).getReplay();
2439
2440 // The two tensors should look like:
2441 // tv1: [I1*1//5, 5, I2//1]
2442 // tv2: [I1//5, 5]
2443 //
2444 // BestEffortRepaly should forward the merge of (I1 * 1) and create
2445 // mappings of:
2446 // I1*1//5 -> I1//5
2447 // 5 -> 5
2448 // I1*1 -> I1
2449
2450 TORCH_CHECK(p2c.size() == 3, "Unexpected number of mappings");
2451 TORCH_CHECK(p2c.count(tv1->axis(0)) && p2c[tv1->axis(0)] == tv2->axis(0));
2452 TORCH_CHECK(p2c.count(tv1->axis(1)) && p2c[tv1->axis(1)] == tv2->axis(1));
2453 TORCH_CHECK(
2454 p2c.count(tv1_merge_out_id) &&
2455 p2c[tv1_merge_out_id] == tv2->getRootDomain()[0]);
2456}
2457
2458TEST_F(NVFuserTest, FusionTrivialReductionForwarding4_CUDA) {
2459 Fusion fusion;
2460 FusionGuard fg(&fusion);
2461
2462 auto tv0 = makeSymbolicTensor(1);
2463 fusion.addInput(tv0);
2464
2465 auto tv1 = makeSymbolicTensor(2);
2466 fusion.addInput(tv1);
2467
2468 auto tv2 = broadcast(tv0, {true, false});
2469 auto tv3 = add(tv1, tv2);
2470 fusion.addOutput(tv3);
2471
2472 // tv4 has a trivial reduction axis
2473 auto tv4 = sum(tv2, {0});
2474 auto tv5 = add(tv4, IrBuilder::create<Double>(1));
2475 fusion.addOutput(tv5);
2476
2477 tv3->merge(0, 1);
2478 tv3->split(0, 32);
2479
2480 // This causes the trivial reduction of tv4 to be merged with
2481 // another axis of tv4, and then forward computeAt is done from tv4
2482 // to tv5. The split of the merged id of tv4 should be done on tv5
2483 // by forwarding the merge of the trivial reduction.
2484 tv0->computeAt(tv3, -1);
2485
2486 tv3->axis(0)->parallelize(ParallelType::BIDx);
2487 tv3->axis(1)->parallelize(ParallelType::TIDx);
2488
2489 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2490 auto t0 = at::randn({111}, options);
2491 auto t1 = at::randn({123, 111}, options);
2492
2493 FusionExecutor fe;
2494 fe.compileFusion(&fusion, {t0, t1});
2495 auto cg_outputs = fe.runFusion({t0, t1});
2496
2497 auto t2 = t0.unsqueeze(0);
2498 auto t3 = t1 + t2;
2499 auto t5 = sum(t2, {0}) + 1;
2500
2501 testValidate(&fusion, cg_outputs, {t0, t1}, {t3, t5}, __LINE__, __FILE__);
2502}
2503
2504// See issue #1598
2505TEST_F(NVFuserTest, FusionRAWSyncInsertionPlace1_CUDA) {
2506 Fusion fusion;
2507 FusionGuard fg(&fusion);
2508
2509 auto tv0 = makeSymbolicTensor(2);
2510 auto tv1 = makeSymbolicTensor(2);
2511 fusion.addInput(tv0);
2512 fusion.addInput(tv1);
2513
2514 auto tv2 = set(tv0);
2515 auto tv3 = set(tv1);
2516 auto tv4 = add(tv2, tv3);
2517 fusion.addOutput(tv4);
2518
2519 // Place tv2 on shared memory
2520 tv2->split(0, 2);
2521 tv2->split(-1, 4);
2522 tv2->setMemoryType(MemoryType::Shared);
2523 tv2->axis(-2)->parallelize(ParallelType::TIDy);
2524 tv2->axis(-1)->parallelize(ParallelType::TIDx);
2525
2526 tv3->split(0, 2);
2527 tv3->split(-1, 4);
2528 // swap tidx and tidy
2529 tv3->axis(-2)->parallelize(ParallelType::TIDx);
2530 tv3->axis(-1)->parallelize(ParallelType::TIDy);
2531
2532 tv4->split(0, 2);
2533 tv4->split(-1, 4);
2534 tv4->axis(-2)->parallelize(ParallelType::TIDx);
2535 tv4->axis(-1)->parallelize(ParallelType::TIDy);
2536
2537 tv0->computeAt(tv4, 1);
2538 tv3->computeAt(tv4, -1);
2539
2540 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2541 auto t0 = at::randn({10, 64}, options);
2542 auto t1 = at::randn({10, 64}, options);
2543
2544 FusionExecutor fe;
2545 fe.compileFusion(&fusion, {t0, t1});
2546 auto cg_outputs = fe.runFusion({t0, t1});
2547
2548 auto ref = t0 + t1;
2549
2550 testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
2551}
2552
2553// See issue #1598
2554TEST_F(NVFuserTest, FusionRAWSyncInsertionPlace2_CUDA) {
2555 Fusion fusion;
2556 FusionGuard fg(&fusion);
2557
2558 auto tv0 = makeSymbolicTensor(2);
2559 auto tv1 = makeSymbolicTensor(2);
2560 fusion.addInput(tv0);
2561 fusion.addInput(tv1);
2562
2563 auto tv2 = set(tv0);
2564 auto tv3 = set(tv1);
2565 auto tv4 = add(tv2, tv3);
2566 fusion.addOutput(tv4);
2567
2568 tv2->split(0, 2);
2569 tv2->split(-1, 4);
2570 tv2->setMemoryType(MemoryType::Shared);
2571
2572 tv2->axis(-2)->parallelize(ParallelType::TIDy);
2573 tv2->axis(-1)->parallelize(ParallelType::TIDx);
2574
2575 tv4->split(0, 2);
2576 tv4->split(-1, 4);
2577 // Also do unroll for tv3 and tv4
2578 tv4->split(-2, 8, false);
2579 tv4->axis(-3)->parallelize(ParallelType::Unroll);
2580 // swap tidx and tidy
2581 tv4->axis(-2)->parallelize(ParallelType::TIDx);
2582 tv4->axis(-1)->parallelize(ParallelType::TIDy);
2583
2584 tv0->computeAt(tv4, 1);
2585 tv3->computeAt(tv4, -1);
2586
2587 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2588 auto t0 = at::randn({10, 64}, options);
2589 auto t1 = at::randn({10, 64}, options);
2590
2591 FusionExecutor fe;
2592 fe.compileFusion(&fusion, {t0, t1});
2593 auto cg_outputs = fe.runFusion({t0, t1});
2594
2595 auto ref = t0 + t1;
2596
2597 testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
2598}
2599
2600// See issue #1599
2601TEST_F(NVFuserTest, FusionRAWSyncInsertionPlace3_CUDA) {
2602 Fusion fusion;
2603 FusionGuard fg(&fusion);
2604
2605 auto tv0 = makeSymbolicTensor(2);
2606 auto tv1 = makeSymbolicTensor(2);
2607 fusion.addInput(tv0);
2608 fusion.addInput(tv1);
2609
2610 auto tv2 = set(tv0);
2611 auto tv3 = set(tv1);
2612 auto tv4 = add(tv2, tv3);
2613 fusion.addOutput(tv4);
2614
2615 // Use unroll where a RAW-sync tensor is stored
2616
2617 tv4->split(0, 2);
2618 tv4->split(0, 3);
2619 tv4->split(-1, 4);
2620 tv4->axis(1)->parallelize(ParallelType::Unroll);
2621 tv4->axis(-2)->parallelize(ParallelType::TIDx);
2622 tv4->axis(-1)->parallelize(ParallelType::TIDy);
2623
2624 tv0->computeAt(tv4, 3);
2625 tv3->computeAt(tv4, -1);
2626
2627 tv2->split(-1, 4);
2628 tv2->axis(-2)->parallelize(ParallelType::TIDy);
2629 tv2->axis(-1)->parallelize(ParallelType::TIDx);
2630 tv2->setMemoryType(MemoryType::Shared);
2631
2632 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2633 auto t0 = at::randn({50, 64}, options);
2634 auto t1 = at::randn({50, 64}, options);
2635
2636 FusionExecutor fe;
2637 fe.compileFusion(&fusion, {t0, t1});
2638 auto cg_outputs = fe.runFusion({t0, t1});
2639
2640 auto ref = t0 + t1;
2641
2642 testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
2643}
2644
2645// See #1618
2646TEST_F(NVFuserTest, FusionRAWSyncInsertionPlace4_CUDA) {
2647 Fusion fusion;
2648 FusionGuard fg(&fusion);
2649
2650 auto tv0 = makeConcreteTensor({16, 128});
2651 auto tv1 = makeConcreteTensor({16, 128});
2652 fusion.addInput(tv0);
2653 fusion.addInput(tv1);
2654
2655 auto tv2 = set(tv0);
2656 auto tv3 = set(tv1);
2657 auto tv4 = set(tv2);
2658 auto tv5 = set(tv3);
2659 auto tv6 = add(tv4, tv5);
2660 fusion.addOutput(tv6);
2661
2662 tv2->setMemoryType(MemoryType::Shared);
2663 tv3->setMemoryType(MemoryType::Shared);
2664
2665 tv2->computeAt(tv6, 0);
2666 tv3->computeAt(tv6, 1);
2667 tv4->computeAt(tv6, 1);
2668 tv5->computeAt(tv6, -1);
2669 tv2->split(1, 64);
2670 tv3->split(1, 64);
2671 tv2->axis(-1)->parallelize(ParallelType::TIDx);
2672 tv3->axis(-1)->parallelize(ParallelType::TIDx);
2673 tv6->axis(-1)->parallelize(ParallelType::TIDx);
2674
2675 // Check the block sync is inserted at the correct location.
2676 // There is exactly one block sync needed in this test case
2677 // and the sync needs to be after the 2 expressions
2678 // that modify shared memory.
2679 class SyncInsertionPointChecker : public kir::IrVisitor {
2680 public:
2681 using kir::IrVisitor::handle;
2682
2683 private:
2684 void handle(UnaryOp* uop) final {
2685 // Record number of unary ops that modifies shared memory.
2686 if (uop->out()->isA<kir::TensorIndex>() &&
2687 uop->out()->as<kir::TensorIndex>()->view()->getMemoryType() ==
2688 MemoryType::Shared &&
2689 // Filter out initialization expressions
2690 uop->in()->isA<kir::TensorIndex>()) {
2691 number_of_writes_++;
2692 }
2693 }
2694 void handle(kir::BlockSync* bsync) final {
2695 // Make sure both shared memory modifying expressions
2696 // have been observed at the sync insertion point.
2697 TORCH_INTERNAL_ASSERT(
2698 number_of_writes_ == 2,
2699 "FusionRAWSyncInsertionPlace4 test fail:",
2700 "only 1 sync after the 2 shared mem writes is needed in this test,"
2701 "either a redundant sync has been inserted or the block sync is not inserted at the right place");
2702 }
2703
2704 private:
2705 int number_of_writes_ = 0;
2706 } sync_insertion_checker;
2707 GpuLower gpulw(&fusion);
2708 sync_insertion_checker.handle(gpulw.kernel()->topLevelExprs());
2709}
2710
2711// Test serial write and parallel read of shared mem: mapped case
2712TEST_F(NVFuserTest, FusionSerialSmemWriteParallelRead1_CUDA) {
2713 Fusion fusion;
2714 FusionGuard fg(&fusion);
2715
2716 TensorView* tv0 = makeConcreteTensor({128, 6});
2717 TensorView* tv1 = makeConcreteTensor({128, 6});
2718 TensorView* tv2 = makeConcreteTensor({128, 6});
2719 fusion.addInput(tv0);
2720 fusion.addInput(tv1);
2721 fusion.addInput(tv2);
2722
2723 TensorView* tv3 = add(tv0, tv1);
2724 TensorView* tv4 = add(tv3, tv2);
2725
2726 fusion.addOutput(tv4);
2727
2728 // Use shared memory
2729 tv3->setMemoryType(MemoryType::Shared);
2730
2731 // Parallelize t4, in this case dim 0 on tv3 will
2732 // not be parallelized but dim0 of t4 will be.
2733 // We will need to make sure a sync is inserted
2734 // even if these dimensions are mapped.
2735 tv4->axis(0)->parallelize(ParallelType::TIDx);
2736
2737 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2738
2739 at::Tensor t0 = at::randn({128, 6}, options);
2740 at::Tensor t1 = at::randn({128, 6}, options);
2741 at::Tensor t2 = at::randn({128, 6}, options);
2742
2743 FusionExecutor fe;
2744 fe.compileFusion(&fusion, {t0, t1, t2});
2745 auto cg_outputs = fe.runFusion({t0, t1, t2});
2746
2747 auto ref = t0 + t1 + t2;
2748
2749 testValidate(&fusion, cg_outputs, {t0, t1, t2}, {ref}, __LINE__, __FILE__);
2750}
2751
2752// Test serial write and parallel read of shared mem: un-mapped case
2753TEST_F(NVFuserTest, FusionSerialSmemWriteParallelRead2_CUDA) {
2754 Fusion fusion;
2755 FusionGuard fg(&fusion);
2756
2757 TensorView* tv0 = makeConcreteTensor({128, 6});
2758 TensorView* tv1 = makeConcreteTensor({128, 6});
2759 TensorView* tv2 = makeConcreteTensor({128, 6});
2760 fusion.addInput(tv0);
2761 fusion.addInput(tv1);
2762 fusion.addInput(tv2);
2763
2764 TensorView* tv3 = add(tv0, tv1);
2765 TensorView* tv4 = add(tv3, tv2);
2766
2767 fusion.addOutput(tv4);
2768
2769 // Use shared memory
2770 tv3->setMemoryType(MemoryType::Shared);
2771
2772 // Split and parallelize t4,
2773 // the parallelized dimension in t4 will not
2774 // map across to the shared mem tensor, t3. So
2775 // there will need to be a sync before use of t3.
2776 tv4->split(0, 2);
2777 tv4->axis(0)->parallelize(ParallelType::TIDx);
2778
2779 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2780
2781 at::Tensor t0 = at::randn({128, 6}, options);
2782 at::Tensor t1 = at::randn({128, 6}, options);
2783 at::Tensor t2 = at::randn({128, 6}, options);
2784
2785 FusionExecutor fe;
2786 fe.compileFusion(&fusion, {t0, t1, t2});
2787 auto cg_outputs = fe.runFusion({t0, t1, t2});
2788
2789 auto ref = t0 + t1 + t2;
2790
2791 testValidate(&fusion, cg_outputs, {t0, t1, t2}, {ref}, __LINE__, __FILE__);
2792}
2793
2794// Simple test of async copy primitive
2795TEST_F(NVFuserTest, FusionSimpleCpAsync_CUDA) {
2796 Fusion fusion;
2797 FusionGuard fg(&fusion);
2798
2799 int m = 33, n = 31;
2800
2801 TensorView* tv0 = makeConcreteTensor({m, n});
2802 TensorView* tv1 = makeConcreteTensor({m, n});
2803
2804 fusion.addInput(tv0);
2805 fusion.addInput(tv1);
2806
2807 TensorView* tv2 = add(tv0, tv1);
2808
2809 fusion.addOutput(tv2);
2810
2811 auto tv0_shared = tv0->cacheAfter(LoadStoreOpType::CpAsync);
2812 tv0_shared->setMemoryType(MemoryType::Shared);
2813
2814 tv0->computeAt(tv2, 1);
2815 tv0_shared->axis(1)->parallelize(ParallelType::TIDx);
2816 tv2->axis(1)->parallelize(ParallelType::TIDx);
2817
2818 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2819 at::Tensor t0 = at::randn({m, n}, options);
2820 at::Tensor t1 = at::randn({m, n}, options);
2821
2822 FusionExecutor fe;
2823
2824 // requires ampere+ GPU
2825 if (!deviceMajorMinorCheck(8)) {
2826 ASSERT_ANY_THROW(fe.compileFusion(&fusion, {t0, t1}));
2827 GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs";
2828 }
2829 fe.compileFusion(&fusion, {t0, t1});
2830 auto cg_outputs = fe.runFusion({t0, t1});
2831
2832 auto ref = t0 + t1;
2833
2834 testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
2835}
2836
2837// Simple test of async copy primitive: double buffered
2838// Double buffer case 1, both block sync and async wait
2839// are needed.
2840TEST_F(NVFuserTest, FusionDoubleBufferCpAsync1_CUDA) {
2841 Fusion fusion;
2842 FusionGuard fg(&fusion);
2843
2844 // Using vectorization so need to keep n multiple of 4.
2845 int m = 33, n = 48;
2846
2847 TensorView* tv0 = makeConcreteTensor({m, n});
2848 TensorView* tv1 = makeConcreteTensor({m, n});
2849
2850 fusion.addInput(tv0);
2851 fusion.addInput(tv1);
2852
2853 TensorView* tv2 = add(tv0, tv1);
2854
2855 fusion.addOutput(tv2);
2856
2857 auto tv0_shared = tv0->cacheAfter(LoadStoreOpType::CpAsync);
2858 tv0_shared->setMemoryType(MemoryType::Shared);
2859 tv0->computeAt(tv2, 1);
2860
2861 // Asynchronously load a tile in one schedule
2862 tv0_shared->split(1, 4);
2863 tv0_shared->axis(-1)->parallelize(ParallelType::Vectorize);
2864 tv0_shared->axis(-2)->parallelize(ParallelType::TIDx);
2865
2866 // Consume the loaded tile in another schedule,
2867 // triggering the need for a sync.
2868 tv2->split(1, 12);
2869 tv2->axis(-1)->parallelize(ParallelType::TIDx);
2870
2871 // Double buffer the shared mem tensor.
2872 tv0_shared->doubleBuffer();
2873
2874 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2875 at::Tensor t0 = at::randn({m, n}, options);
2876 at::Tensor t1 = at::randn({m, n}, options);
2877
2878 FusionExecutor fe;
2879 // requires ampere+ GPU
2880 if (!deviceMajorMinorCheck(8)) {
2881 ASSERT_ANY_THROW(fe.compileFusion(&fusion, {t0, t1}));
2882 GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs";
2883 }
2884 fe.compileFusion(&fusion, {t0, t1});
2885 auto cg_outputs = fe.runFusion({t0, t1});
2886
2887 auto ref = t0 + t1;
2888
2889 testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
2890}
2891
2892// Simple test of async copy primitive: double buffered
2893// Double buffer case 2, only async wait is needed
2894TEST_F(NVFuserTest, FusionDoubleBufferCpAsync2_CUDA) {
2895 Fusion fusion;
2896 FusionGuard fg(&fusion);
2897
2898 // Using vectorization so need to keep n multiple of 4.
2899 int m = 33, n = 48;
2900
2901 TensorView* tv0 = makeConcreteTensor({m, n});
2902 TensorView* tv1 = makeConcreteTensor({m, n});
2903
2904 fusion.addInput(tv0);
2905 fusion.addInput(tv1);
2906
2907 TensorView* tv2 = add(tv0, tv1);
2908
2909 fusion.addOutput(tv2);
2910
2911 auto tv0_shared = tv0->cacheAfter(LoadStoreOpType::CpAsync);
2912 tv0_shared->setMemoryType(MemoryType::Shared);
2913 tv0->computeAt(tv2, 1);
2914
2915 // Asynchronously load a tile in one schedule
2916 tv0_shared->split(1, 4);
2917 tv0_shared->axis(-2)->parallelize(ParallelType::TIDx);
2918
2919 // Consume the loaded tile in another schedule,
2920 // triggering the need for a sync.
2921 tv2->split(1, 4);
2922 tv2->axis(-2)->parallelize(ParallelType::TIDx);
2923
2924 // Double buffer the shared mem tensor.
2925 tv0_shared->doubleBuffer();
2926
2927 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2928 at::Tensor t0 = at::randn({m, n}, options);
2929 at::Tensor t1 = at::randn({m, n}, options);
2930
2931 FusionExecutor fe;
2932 // requires ampere+ GPU
2933 if (!deviceMajorMinorCheck(8)) {
2934 ASSERT_ANY_THROW(fe.compileFusion(&fusion, {t0, t1}));
2935 GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs";
2936 }
2937 fe.compileFusion(&fusion, {t0, t1});
2938 auto cg_outputs = fe.runFusion({t0, t1});
2939
2940 auto ref = t0 + t1;
2941
2942 testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
2943}
2944
2945// Simple test for double buffer in shared mem,
2946// where we should not insert redundant syncs when
2947// they are not needed.
2948TEST_F(NVFuserTest, FusionDoubleBufferNoSync_CUDA) {
2949 Fusion fusion;
2950 FusionGuard fg(&fusion);
2951
2952 // Using vectorization so need to keep n multiple of 4.
2953 int m = 33, n = 48;
2954
2955 TensorView* tv0 = makeConcreteTensor({m, n});
2956 TensorView* tv1 = makeConcreteTensor({m, n});
2957
2958 fusion.addInput(tv0);
2959 fusion.addInput(tv1);
2960
2961 TensorView* tv2 = add(tv0, tv1);
2962
2963 fusion.addOutput(tv2);
2964
2965 auto tv0_shared = tv0->cacheAfter();
2966 tv0_shared->setMemoryType(MemoryType::Shared);
2967 tv0->computeAt(tv2, 1);
2968
2969 // Asynchronously load a tile in one schedule
2970 tv0_shared->split(1, 4);
2971 tv0_shared->axis(-2)->parallelize(ParallelType::TIDx);
2972
2973 // Consume the loaded tile in another schedule,
2974 // triggering the need for a sync.
2975 tv2->split(1, 4);
2976 tv2->axis(-2)->parallelize(ParallelType::TIDx);
2977
2978 // Double buffer the shared mem tensor.
2979 tv0_shared->doubleBuffer();
2980
2981 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
2982 at::Tensor t0 = at::randn({m, n}, options);
2983 at::Tensor t1 = at::randn({m, n}, options);
2984
2985 GpuLower gpulw(&fusion);
2986 auto flattened_exprs =
2987 ir_utils::flattenScopedExprs(gpulw.kernel()->topLevelExprs());
2988 bool sync_inserted = std::any_of(
2989 flattened_exprs.begin(), flattened_exprs.end(), [](Expr* expr) {
2990 return expr->isA<kir::BlockSync>();
2991 });
2992 TORCH_INTERNAL_ASSERT(!sync_inserted, "Un-expected block sync inserted");
2993
2994 FusionExecutor fe;
2995 fe.compileFusion(&fusion, {t0, t1});
2996 auto cg_outputs = fe.runFusion({t0, t1});
2997
2998 auto ref = t0 + t1;
2999
3000 testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
3001}
3002
3003// Test predicate inversion for cp.async
3004TEST_F(NVFuserTest, FusionCpAsyncPredicate_CUDA) {
3005 // requires ampere+ GPU
3006
3007 Fusion fusion;
3008 FusionGuard fg(&fusion);
3009
3010 // Using vectorization so need to keep n multiple of 4.
3011 int m = 33, n = 48;
3012
3013 TensorView* tv0 = makeConcreteTensor({m, n});
3014
3015 fusion.addInput(tv0);
3016 auto tv1 = sum(tv0, {1});
3017 fusion.addOutput(tv1);
3018
3019 auto tv0_shared = tv0->cacheAfter(LoadStoreOpType::CpAsync);
3020 auto tv0_reg = tv0_shared->cacheAfter();
3021 tv0_shared->setMemoryType(MemoryType::Shared);
3022 tv0->computeAt(tv1, 1);
3023
3024 tv0_shared->split(-1, 32);
3025 tv0_shared->split(-1, 4);
3026 tv0_shared->axis(-1)->parallelize(ParallelType::Vectorize);
3027
3028 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3029 at::Tensor t0 = at::randn({m, n}, options);
3030
3031 FusionExecutor fe;
3032 if (!deviceMajorMinorCheck(8)) {
3033 ASSERT_ANY_THROW(fe.compileFusion(&fusion, {t0}));
3034 GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs";
3035 }
3036
3037 fe.compileFusion(&fusion, {t0});
3038 auto cg_outputs = fe.runFusion({t0});
3039
3040 auto ref = t0.sum({1});
3041
3042 testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
3043}
3044
3045// Test predicate removal on reg-to-reg expressions
3046TEST_F(NVFuserTest, FusionPredRemovalCheck_CUDA) {
3047 Fusion fusion;
3048 FusionGuard fg(&fusion);
3049
3050 TensorView* tv0 = makeContigTensor(2);
3051 fusion.addInput(tv0);
3052
3053 TensorView* tv1 = set(tv0);
3054 TensorView* tv2 = set(tv1);
3055 TensorView* tv3 = set(tv2);
3056 TensorView* tv4 = set(tv3);
3057
3058 fusion.addOutput(tv4);
3059 tv4->split(1, 4);
3060 tv0->computeAt(tv4, -2);
3061 tv3->axis(-1)->parallelize(ParallelType::Vectorize);
3062
3063 class PredicateRemovalChecker : public kir::IrVisitor {
3064 public:
3065 using kir::IrVisitor::handle;
3066
3067 private:
3068 void handle(UnaryOp* uop) final {
3069 assertOnLocalToLocal(uop);
3070 }
3071
3072 // Utility to assert any local-to-local expr is only trivially predicated.
3073 void assertOnLocalToLocal(Expr* expr) {
3074 bool is_local = true;
3075 for (auto in : ir_utils::filterByType<kir::TensorIndex>(expr->inputs())) {
3076 if (in->view()->getMemoryType() != MemoryType::Local) {
3077 is_local = false;
3078 }
3079 }
3080 for (auto in :
3081 ir_utils::filterByType<kir::TensorIndex>(expr->outputs())) {
3082 if (in->view()->getMemoryType() != MemoryType::Local) {
3083 is_local = false;
3084 }
3085 }
3086
3087 if (is_local) {
3088 if (auto ite = dynamic_cast<kir::IfThenElse*>(scope_exprs_.back())) {
3089 TORCH_INTERNAL_ASSERT(
3090 ite->predicate()->value()->isConst(),
3091 "redundant predicate on: ",
3092 expr);
3093 }
3094 }
3095 }
3096
3097 private:
3098 bool within_ite_ = false;
3099 } pred_checker;
3100
3101 GpuLower gpulw(&fusion);
3102 pred_checker.handle(gpulw.kernel()->topLevelExprs());
3103}
3104
3105TEST_F(NVFuserTest, FusionPropagateParallelTypesToSiblings_CUDA) {
3106 Fusion fusion;
3107 FusionGuard fg(&fusion);
3108
3109 auto tv0 = makeSymbolicTensor(1);
3110 fusion.addInput(tv0);
3111 auto tvs = Welford(tv0, {0});
3112 auto tv_avg = tvs.avg;
3113 fusion.addOutput(tv_avg);
3114
3115 tv_avg->split(0, 128);
3116 TransformPropagatorWithCheck propagator(tv_avg);
3117 MaxRootDomainInfoSpanningTree(tv_avg).traverse(&propagator);
3118
3119 tv_avg->axis(0)->parallelize(ParallelType::BIDx);
3120 tv_avg->axis(1)->parallelize(ParallelType::TIDx);
3121
3122 // Make sure the parallelization of tv_avg is propagated to the var
3123 // and count tensors.
3124 GpuLower gpulw(&fusion);
3125 for (const auto expr : gpulw.kernel()->exprs()) {
3126 auto wop = dynamic_cast<WelfordOp*>(expr);
3127 if (wop == nullptr) {
3128 continue;
3129 }
3130 auto ref = wop->outAvg()->as<TensorView>();
3131 for (auto sibling : ir_utils::filterByType<TensorView>(wop->outputs())) {
3132 if (ref == sibling) {
3133 continue;
3134 }
3135 TORCH_CHECK(
3136 ref->nDims() == sibling->nDims(),
3137 "Invalid sibling: ",
3138 sibling->toString());
3139 for (const auto i : c10::irange(ref->nDims())) {
3140 TORCH_CHECK(
3141 ref->axis(i)->getParallelType() ==
3142 sibling->axis(i)->getParallelType(),
3143 "Mismatched parallel types between siblings. ",
3144 ref->toString(),
3145 ", ",
3146 sibling->toString());
3147 }
3148 }
3149 }
3150
3151 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3152 auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
3153 at::manual_seed(0);
3154 at::Tensor t0 = at::randn({9999}, options);
3155
3156 FusionExecutor fe;
3157 fe.compileFusion(&fusion, {t0});
3158 auto outputs = fe.runFusion({t0});
3159
3160 testValidate(fe.kernel(), outputs, {t0}, {t0.mean({0})}, __LINE__, __FILE__);
3161}
3162
3163// Test ExactRootDomainMap
3164TEST_F(NVFuserTest, FusionExactRootDomainMap_CUDA) {
3165 Fusion fusion;
3166 FusionGuard fg(&fusion);
3167
3168 auto tv0 = makeSymbolicTensor(1);
3169 fusion.addInput(tv0);
3170 auto tv1 = makeSymbolicTensor(2);
3171 fusion.addInput(tv1);
3172
3173 auto tv2 = broadcast(tv0, {false, true});
3174 auto tv3 = transpose(tv2);
3175 auto tv4 = add(tv2, tv1);
3176 auto tv5 = add(tv2, tv3);
3177 auto tv6 = add(tv3, tv1);
3178 fusion.addOutput(tv4);
3179 fusion.addOutput(tv5);
3180 fusion.addOutput(tv6);
3181
3182 const auto exact_map = ExactRootDomainMap(&fusion);
3183
3184 // In the exact mapping, the broadcast domain introduced at tv2 is
3185 // only mapped with the another one in tv3, which is just transposed
3186 // from tv2. Any other domain, including the second domain of tv4,
3187 // must not be mapped.
3188
3189 auto tv2_bc = tv2->axis(1);
3190 auto tv3_bc = tv3->axis(0);
3191
3192 TORCH_CHECK(
3193 exact_map.areMapped(tv2_bc, tv3_bc),
3194 "Invalid exact root domain map: ",
3195 exact_map.toString());
3196
3197 // They must not be mapped with anything else.
3198 for (auto tv : ir_utils::allTvs(&fusion)) {
3199 for (auto root_id : tv->getRootDomain()) {
3200 if (root_id == tv2_bc || root_id == tv3_bc) {
3201 continue;
3202 }
3203 TORCH_CHECK(
3204 !exact_map.areMapped(root_id, tv2_bc),
3205 "Invalid exact root domain map: ",
3206 exact_map.toString());
3207 TORCH_CHECK(
3208 !exact_map.areMapped(root_id, tv3_bc),
3209 "Invalid exact root domain map: ",
3210 exact_map.toString());
3211 }
3212 }
3213}
3214
3215class NVFuserMultithreadedTest : public ::testing::Test {
3216 protected:
3217 bool was_enabled = false;
3218
3219 void SetUp() override {
3220 was_enabled = fuser::cuda::setEnabled(true);
3221 }
3222
3223 void TearDown() override {
3224 fuser::cuda::setEnabled(was_enabled);
3225 }
3226};
3227
3228TEST_F(NVFuserMultithreadedTest, SingleFunction_CUDA) {
3229 std::string ir = R"IR(
3230graph(%x.1 : Tensor,
3231 %y.1 : Tensor):
3232 %12 : NoneType = prim::Constant()
3233 %11 : bool = prim::Constant[value=0]()
3234 %9 : int = prim::Constant[value=1]()
3235 %3 : Tensor = aten::exp(%x.1)
3236 %5 : Tensor = aten::relu(%y.1)
3237 %6 : Tensor = aten::sin(%5)
3238 %8 : Tensor = aten::add(%3, %6, %9)
3239 %10 : int[] = prim::ListConstruct(%9)
3240 %13 : Tensor = aten::sum(%8, %10, %11, %12)
3241 return (%13)
3242)IR";
3243 auto g = std::make_shared<Graph>();
3244 torch::jit::parseIR(ir, g.get());
3245 GraphFunction fn("nvfuser_test", g, nullptr);
3246
3247 auto run_kernel = [&fn]() {
3248 auto x = torch::rand({32, 32}, at::TensorOptions(at::kCUDA));
3249 auto y = torch::rand({32, 32}, at::TensorOptions(at::kCUDA));
3250 std::vector<IValue> results;
3251 for (const auto& _ : c10::irange(10)) {
3252 auto stack = createStack({x.clone(), y.clone()});
3253 fn.run(stack);
3254 results.push_back(stack.back());
3255 }
3256 for (const auto& i : c10::irange(1, 10)) {
3257 auto t0 = results[0].toTensor();
3258 auto ti = results[i].toTensor();
3259 ASSERT_TRUE(at::allclose(t0, ti));
3260 }
3261 };
3262
3263 constexpr size_t kNumThreads = 4;
3264 std::vector<std::thread> threads;
3265 for (size_t id = 0; id < kNumThreads; ++id) {
3266 threads.emplace_back(run_kernel);
3267 }
3268 for (auto& t : threads) {
3269 t.join();
3270 }
3271}
3272
3273TEST_F(NVFuserMultithreadedTest, MultipleFunctions_CUDA) {
3274 auto run_kernel = []() {
3275 const std::string ir = R"IR(
3276 graph(%x.1 : Tensor,
3277 %y.1 : Tensor):
3278 %12 : NoneType = prim::Constant()
3279 %11 : bool = prim::Constant[value=0]()
3280 %9 : int = prim::Constant[value=1]()
3281 %3 : Tensor = aten::exp(%x.1)
3282 %5 : Tensor = aten::relu(%y.1)
3283 %6 : Tensor = aten::sin(%5)
3284 %8 : Tensor = aten::add(%3, %6, %9)
3285 %10 : int[] = prim::ListConstruct(%9)
3286 %13 : Tensor = aten::sum(%8, %10, %11, %12)
3287 return (%13)
3288 )IR";
3289 auto g = std::make_shared<Graph>();
3290 torch::jit::parseIR(ir, g.get());
3291 GraphFunction fn("nvfuser_test", g, nullptr);
3292
3293 auto x = torch::rand({32, 32}, at::TensorOptions(at::kCUDA));
3294 auto y = torch::rand({32, 32}, at::TensorOptions(at::kCUDA));
3295 std::vector<IValue> results;
3296 constexpr size_t numRuns = 10;
3297 for (const auto& _ : c10::irange(numRuns)) {
3298 auto stack = createStack({x.clone(), y.clone()});
3299 fn.run(stack);
3300 results.push_back(stack.back());
3301 }
3302 for (const auto& i : c10::irange(1, numRuns)) {
3303 auto t0 = results[0].toTensor();
3304 auto ti = results[i].toTensor();
3305 ASSERT_TRUE(at::allclose(t0, ti));
3306 }
3307 };
3308
3309 constexpr size_t kNumThreads = 4;
3310 std::vector<std::thread> threads;
3311 for (size_t id = 0; id < kNumThreads; ++id) {
3312 threads.emplace_back(run_kernel);
3313 }
3314 for (auto& t : threads) {
3315 t.join();
3316 }
3317}
3318
3319// Repro of issue #1655
3320TEST_F(NVFuserTest, FusionIncompleteConcreteID_CUDA) {
3321 Fusion fusion;
3322 FusionGuard fg(&fusion);
3323
3324 auto tv0 = makeSymbolicTensor(1);
3325 fusion.addInput(tv0);
3326 auto tv1 = makeSymbolicTensor(2);
3327 fusion.addInput(tv1);
3328 auto tv2 = makeSymbolicTensor(2);
3329 fusion.addInput(tv2);
3330
3331 auto tv3 = broadcast(tv0, {true, true, false});
3332 auto tv4 = broadcast(tv1, {false, true, false});
3333 auto tv5 = broadcast(tv2, {true, false, false});
3334
3335 auto tv6 = add(tv3, tv4);
3336 auto tv7 = add(tv3, tv5);
3337
3338 fusion.addOutput(tv6);
3339 fusion.addOutput(tv7);
3340
3341 tv6->merge(0);
3342 tv6->merge(0);
3343
3344 TransformPropagatorWithCheck propagator(tv6);
3345 MaxRootDomainInfoSpanningTree(tv6).traverse(&propagator);
3346
3347 tv0->computeAt(tv6, -1, ComputeAtMode::MostInlined);
3348 tv1->computeAt(tv6, -1, ComputeAtMode::MostInlined);
3349 tv2->computeAt(tv7, -1, ComputeAtMode::MostInlined);
3350
3351 // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
3352 ASSERT_ANY_THROW(fusion.printKernel());
3353}
3354
3355TEST_F(NVFuserTest, FusionTestReEntrantGridWelford_CUDA) {
3356 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
3357 Fusion& fusion = *fusion_ptr.get();
3358 FusionGuard fg(&fusion);
3359
3360 int X = 256, Y = 7, Z = 2048;
3361
3362 // setup fusion
3363 auto tv0 = makeContigTensor(4, DataType::Half);
3364 fusion.addInput(tv0);
3365 auto tv1 = castOp(DataType::Float, tv0);
3366
3367 auto tvs = Welford(tv1, {0, 1, 2});
3368 auto tv_avg = tvs.avg;
3369 auto tv_M2 = tvs.var_sum;
3370 auto tv_N = tvs.n;
3371 fusion.addOutput(tv_avg);
3372 fusion.addOutput(tv_M2);
3373
3374 auto cached_input = tv0->cacheAfter();
3375 auto cached_avg = tv_avg->cacheBefore();
3376 auto cached_M2 = tv_M2->cacheBefore();
3377
3378 auto reduction_tv = scheduler_utils::getReductionTvs(&fusion)[0];
3379
3380 reduction_tv->merge(0);
3381 reduction_tv->merge(0);
3382
3383 int TIDx = 16;
3384 int vec = 4;
3385
3386 int TIDy = 16;
3387 int outer_tidy_fact = 16;
3388
3389 reduction_tv->split(-1, TIDx * vec);
3390 reduction_tv->split(-1, vec);
3391 reduction_tv->axis(-2)->parallelize(ParallelType::TIDx);
3392 reduction_tv->axis(-1)->parallelize(ParallelType::Vectorize);
3393 reduction_tv->axis(-3)->parallelize(ParallelType::BIDx);
3394
3395 reduction_tv->split(0, TIDy);
3396 reduction_tv->axis(1)->parallelize(ParallelType::TIDy);
3397 reduction_tv->split(0, outer_tidy_fact);
3398 reduction_tv->axis(0)->parallelize(ParallelType::BIDy);
3399
3400 // T2_g[ rblockIdx.y, rS{16}, rthreadIdx.y, iblockIdx.x, ithreadIdx.x24,
3401 // iV25{4} ]
3402 reduction_tv->reorder({{3, 0}, {4, 1}, {0, 2}, {2, 3}, {1, 4}, {5, 5}});
3403 // T2_g[iblockIdx.x, ithreadIdx.x24, rblockIdx.y, rthreadIdx.y, rS{16},
3404 // iV25{4}]
3405
3406 TransformPropagatorWithCheck propagator(reduction_tv);
3407 MaxRootDomainInfoSpanningTree(reduction_tv).traverse(&propagator);
3408 auto rfactor_tv = ir_utils::rfactorHelper(reduction_tv, {4});
3409 scheduler_utils::parallelizeAllLike(rfactor_tv);
3410
3411 tv0->computeAt(tv_avg, 2);
3412 tv0->computeAt(cached_input, -2);
3413
3414 cached_input->computeAt(rfactor_tv, 4, ComputeAtMode::BestEffort);
3415
3416 for (auto tv : ir_utils::allTvs(&fusion)) {
3417 if (tv == cached_input || tv == tv_avg || tv == tv_M2) {
3418 continue;
3419 }
3420 tv->axis(-1)->parallelize(ParallelType::Serial);
3421 }
3422
3423 FusionExecutor fe;
3424 fe.compileFusion(&fusion, {}, LaunchParams());
3425
3426 auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
3427 at::Tensor t0 = at::randn({X, Y, Y, Z}, options);
3428
3429 auto cg_outputs = fe.runFusion({t0}, LaunchParams(-1, -1, -1, -1, -1, -1));
3430
3431 // by default Welford outputs sum of square diff so need to divide to get var
3432 cg_outputs[1] = cg_outputs[1].div((float)(X * Y * Y));
3433
3434 auto at_mu = at::mean(t0.to(at::kDouble), {0, 1, 2});
3435 auto at_var = at::var(t0.to(at::kDouble), {0, 1, 2}, false);
3436
3437 testValidate(
3438 &fusion,
3439 cg_outputs,
3440 {t0},
3441 {at_mu, at_var},
3442 __LINE__,
3443 __FILE__,
3444 "",
3445 LaunchParams(-1, -1, -1, -1, -1, -1));
3446}
3447
3448// Test sync insertion with redundant predicates
3449TEST_F(NVFuserTest, FusionRedundantPredSync_CUDA) {
3450 Fusion fusion;
3451 FusionGuard fg(&fusion);
3452
3453 TensorView* tv0 = makeConcreteTensor({32});
3454 TensorView* tv1 = makeConcreteTensor({32, 32});
3455 fusion.addInput(tv0);
3456 fusion.addInput(tv1);
3457
3458 auto tv2 = broadcast(tv0, {true, false});
3459 auto tv3 = add(tv2, tv1);
3460
3461 fusion.addOutput(tv3);
3462
3463 auto tv0c = tv0->cacheAfter();
3464
3465 // Make a redundant write through smem
3466 tv0c->setMemoryType(MemoryType::Shared);
3467
3468 tv0->computeAt(tv3, 0);
3469 tv1->computeAt(tv3, 0);
3470
3471 tv0c->axis(0)->parallelize(ParallelType::TIDx);
3472 tv2->axis(0)->parallelize(ParallelType::TIDy);
3473 tv2->axis(1)->parallelize(ParallelType::TIDx);
3474
3475 tv3->axis(0)->parallelize(ParallelType::TIDy);
3476 tv3->axis(1)->parallelize(ParallelType::TIDx);
3477
3478 GpuLower gpulw(&fusion);
3479 auto flattened_exprs =
3480 ir_utils::flattenScopedExprs(gpulw.kernel()->topLevelExprs());
3481 bool sync_inserted = std::any_of(
3482 flattened_exprs.begin(), flattened_exprs.end(), [](Expr* expr) {
3483 return expr->isA<kir::BlockSync>();
3484 });
3485 TORCH_INTERNAL_ASSERT(sync_inserted, "Expected block sync not inserted");
3486
3487 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3488
3489 at::Tensor t0 = at::randn({32}, options);
3490 at::Tensor t1 = at::randn({32, 32}, options);
3491
3492 FusionExecutor fe;
3493 fe.compileFusion(&fusion, {t0, t1});
3494 auto cg_outputs = fe.runFusion({t0, t1});
3495
3496 auto ref = t0 + t1;
3497
3498 testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
3499}
3500
3501// Test case for removing syncs on chain of redundant uses.
3502TEST_F(NVFuserTest, FusionRedundantPredSync2_CUDA) {
3503 Fusion fusion;
3504 FusionGuard fg(&fusion);
3505
3506 TensorView* tv0 = makeConcreteTensor({32});
3507 TensorView* tv1 = makeConcreteTensor({32, 32});
3508 fusion.addInput(tv0);
3509 fusion.addInput(tv1);
3510
3511 auto tv2 = broadcast(tv0, {true, false});
3512 auto tv3 = add(tv2, tv1);
3513
3514 fusion.addOutput(tv3);
3515
3516 auto tv0c = tv0->cacheAfter();
3517
3518 // Make a redundant write through smem
3519 tv0c->setMemoryType(MemoryType::Shared);
3520 tv2->setMemoryType(MemoryType::Shared);
3521
3522 tv0->computeAt(tv3, 0);
3523 tv1->computeAt(tv3, 0);
3524
3525 tv0c->axis(0)->parallelize(ParallelType::TIDx);
3526 tv2->axis(0)->parallelize(ParallelType::TIDy);
3527 tv2->axis(1)->parallelize(ParallelType::TIDx);
3528
3529 tv3->axis(0)->parallelize(ParallelType::TIDy);
3530 tv3->axis(1)->parallelize(ParallelType::TIDx);
3531
3532 // Utility class to make sure one block sync
3533 // is inserted by RAW pass.
3534 class SyncChecker : public kir::IrVisitor {
3535 public:
3536 using kir::IrVisitor::handle;
3537 int result() {
3538 return sync_seen_;
3539 }
3540
3541 private:
3542 void handle(kir::BlockSync*) final {
3543 sync_seen_++;
3544 }
3545
3546 private:
3547 int sync_seen_ = 0;
3548 } checker;
3549
3550 GpuLower gpulw(&fusion);
3551 checker.handle(gpulw.kernel()->topLevelExprs());
3552 TORCH_INTERNAL_ASSERT(
3553 checker.result() < 2, "More syncs were inserted than expected");
3554
3555 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3556
3557 at::Tensor t0 = at::randn({32}, options);
3558 at::Tensor t1 = at::randn({32, 32}, options);
3559
3560 FusionExecutor fe;
3561 fe.compileFusion(&fusion, {t0, t1});
3562 auto cg_outputs = fe.runFusion({t0, t1});
3563
3564 auto ref = t0 + t1;
3565
3566 testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
3567}
3568
3569// Test case for sync insertion after redundant predicated smem write
3570// Check that syncs are removed only when all paths are redundant.
3571TEST_F(NVFuserTest, FusionRedundantPredSync3_CUDA) {
3572 Fusion fusion;
3573 FusionGuard fg(&fusion);
3574
3575 TensorView* tv0 = makeConcreteTensor({32});
3576 TensorView* tv1 = makeConcreteTensor({32, 32});
3577 fusion.addInput(tv0);
3578 fusion.addInput(tv1);
3579
3580 auto tv2 = broadcast(tv0, {true, false});
3581 auto tv3 = set(tv2);
3582 auto tv4 = add(tv3, tv1);
3583 auto tv5 = add(tv2, tv1);
3584
3585 fusion.addOutput(tv4);
3586 fusion.addOutput(tv5);
3587
3588 auto tv0c = tv0->cacheAfter();
3589
3590 // In this scheduling config,
3591 // tv0c -> tv2 -> tv3 is a redundant path for tidy
3592 // tv0c -> tv2 -> tv5 is not.
3593 // So we need a RAW sync in tv0c->tv2 to make sure
3594 // tv2 has the correct value to produce tv5.
3595 tv0c->setMemoryType(MemoryType::Shared);
3596 tv3->setMemoryType(MemoryType::Shared);
3597
3598 tv0c->axis(0)->parallelize(ParallelType::TIDx);
3599 tv2->axis(0)->parallelize(ParallelType::TIDy);
3600 tv2->axis(1)->parallelize(ParallelType::TIDx);
3601
3602 tv3->axis(0)->parallelize(ParallelType::TIDy);
3603 tv3->axis(1)->parallelize(ParallelType::TIDx);
3604
3605 tv5->axis(0)->parallelize(ParallelType::TIDy);
3606 tv5->axis(1)->parallelize(ParallelType::TIDx);
3607
3608 // Utility class to make sure one block sync
3609 // is inserted by RAW pass.
3610 class SyncChecker : public kir::IrVisitor {
3611 public:
3612 using kir::IrVisitor::handle;
3613 int result() {
3614 return sync_seen_;
3615 }
3616
3617 private:
3618 void handle(kir::BlockSync* sync) final {
3619 if (!sync->isWarHazardSync()) {
3620 sync_seen_++;
3621 }
3622 }
3623
3624 private:
3625 int sync_seen_ = 0;
3626 } checker;
3627
3628 GpuLower gpulw(&fusion);
3629 checker.handle(gpulw.kernel()->topLevelExprs());
3630
3631 // This is implicit checking. There are exactly 2 places
3632 // where RAW hazards happen: one producing tv2 and the other
3633 // producing tv3. This test case expect syncs in both of
3634 // these places so we check that 2 RAW syncs are inserted.
3635 TORCH_INTERNAL_ASSERT(
3636 checker.result() == 2,
3637 "Exactly 2 RAW sync expected for the two shared memory transfers");
3638
3639 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3640
3641 at::Tensor t0 = at::randn({32}, options);
3642 at::Tensor t1 = at::randn({32, 32}, options);
3643
3644 FusionExecutor fe;
3645 fe.compileFusion(&fusion, {t0, t1});
3646 auto cg_outputs = fe.runFusion({t0, t1});
3647
3648 auto ref = t0 + t1;
3649
3650 testValidate(&fusion, cg_outputs, {t0, t1}, {ref, ref}, __LINE__, __FILE__);
3651}
3652
3653// Unit test case for detecting thread redundant usage of shared tensors.
3654TEST_F(NVFuserTest, FusionRedundantUseCheck_CUDA) {
3655 Fusion fusion;
3656 FusionGuard fg(&fusion);
3657
3658 TensorView* tv0 = makeConcreteTensor({32, 32});
3659 fusion.addInput(tv0);
3660
3661 auto tv1 = set(tv0);
3662 auto tv2 = set(tv1);
3663 auto tv3 = set(tv2);
3664 auto tv4 = set(tv3);
3665
3666 auto tv5 = set(tv4);
3667
3668 auto tv6 = set(tv4);
3669 auto tv7 = set(tv6);
3670
3671 fusion.addOutput(tv5);
3672 fusion.addOutput(tv7);
3673
3674 tv2->setMemoryType(MemoryType::Shared);
3675 tv4->setMemoryType(MemoryType::Shared);
3676
3677 tv7->axis(-1)->parallelize(ParallelType::TIDx);
3678
3679 // Thread pred map cannot be built without an active lower
3680 // object. So would need to lower the whole fusion for
3681 // testing. However, lower also keeps an copy of the fusion
3682 // so the original pointers cannot be used to querry the
3683 // thread pred map. So have to traverse the new expr list
3684 // to find the pointers;
3685 GpuLower gpulw(&fusion);
3686
3687 TensorView *lowered_tv2 = nullptr, *lowered_tv4 = nullptr;
3688 auto used_vals = gpulw.kernel()->usedMathVals();
3689
3690 for (auto tv : ir_utils::filterByType<TensorView>(used_vals)) {
3691 if (tv->name() == 2) {
3692 lowered_tv2 = tv;
3693 }
3694 if (tv->name() == 4) {
3695 lowered_tv4 = tv;
3696 }
3697 }
3698
3699 TORCH_INTERNAL_ASSERT(
3700 lowered_tv2 != nullptr && lowered_tv4 != nullptr,
3701 "tv2 or tv4 not lowered or mangled");
3702
3703 auto tv2_info = gpulw.threadPredMap().getPredicateInfo(lowered_tv2);
3704 auto tv4_info = gpulw.threadPredMap().getPredicateInfo(lowered_tv4);
3705
3706 // tv2 -> tv3 -> tv4 (shared) is the only use chain for tv2,
3707 // and tv4 is redundantly written in tidx so tv2 is redundantly
3708 // consumed in tidx.
3709 TORCH_INTERNAL_ASSERT(
3710 tv2_info.redundant_use_types.get(ParallelType::TIDx),
3711 "TV2 is redundantly used but not detected.");
3712
3713 // tv4->tv5 (global) is a redundant use chain, but
3714 // tv4->tv6->tv7 is not, so tv4 should not be detected as
3715 // a redundant used tensor in tidx.
3716 TORCH_INTERNAL_ASSERT(
3717 !tv4_info.redundant_use_types.get(ParallelType::TIDx),
3718 "TV4 is not redundantly used but not detected.");
3719}
3720
3721// Test a basic swizzle pattern
3722TEST_F(NVFuserTest, FusionSimpleSwizzle0_CUDA) {
3723 Fusion fusion;
3724 FusionGuard fg(&fusion);
3725
3726 auto tv0 = makeConcreteTensor({2, 32});
3727 fusion.addInput(tv0);
3728
3729 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
3730 auto tv2 = add(tv1, IrBuilder::create<Double>(1));
3731
3732 fusion.addOutput(tv2);
3733
3734 // Make a 2x8 Zshape tile
3735 tv1->split(-1, 16);
3736 tv1->split(-1, 8);
3737 // [O, 2, 8]
3738
3739 tv2->split(-1, 16);
3740 tv2->split(-1, 4);
3741 //[O, 4, 4]
3742
3743 tv1->computeAt(tv2, 1);
3744 tv1->swizzle(Swizzle2DType::ZShape, -2, -1);
3745
3746 FusionExecutor fe;
3747 fe.compileFusion(&fusion);
3748
3749 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3750 auto t0 = at::randn({2, 32}, options);
3751 auto t2 = t0 + 2.0;
3752 auto cg_outputs = fe.runFusion({t0});
3753
3754 testValidate(&fusion, cg_outputs, {t0}, {t2}, __LINE__, __FILE__);
3755}
3756
3757// Test swizzle inlining
3758TEST_F(NVFuserTest, FusionSimpleSwizzle1_CUDA) {
3759 Fusion fusion;
3760 FusionGuard fg(&fusion);
3761
3762 auto tv0 = makeConcreteTensor({2, 32});
3763 fusion.addInput(tv0);
3764
3765 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
3766 auto tv2 = add(tv1, IrBuilder::create<Double>(1));
3767 auto tv3 = add(tv2, IrBuilder::create<Double>(1));
3768
3769 fusion.addOutput(tv3);
3770
3771 // Make a 2x8 Zshape tile
3772 tv2->split(-1, 16);
3773 tv2->split(-1, 8);
3774 // [O, 2, 8]
3775
3776 tv3->split(-1, 16);
3777 tv3->split(-1, 4);
3778 //[O, 4, 4]
3779
3780 tv2->computeAt(tv3, 1);
3781 tv2->swizzle(Swizzle2DType::ZShape, -2, -1);
3782
3783 // Inlining a producer into a swizzled consumer is ok
3784 tv1->computeAt(tv2, -1);
3785
3786 FusionExecutor fe;
3787 fe.compileFusion(&fusion);
3788
3789 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3790 auto t0 = at::randn({2, 32}, options);
3791 auto t3 = t0 + 3.0;
3792 auto cg_outputs = fe.runFusion({t0});
3793
3794 testValidate(&fusion, cg_outputs, {t0}, {t3}, __LINE__, __FILE__);
3795}
3796
3797// Test sync insertion and memory check in parallelized swizzles.
3798// In this test, data is parallel written into smem in zcurve
3799// pattern and then read out and output to global mem unswizzled.
3800TEST_F(NVFuserTest, FusionSimpleSwizzle2_CUDA) {
3801 Fusion fusion;
3802 FusionGuard fg(&fusion);
3803
3804 auto tv0 = makeConcreteTensor({32, 32});
3805 fusion.addInput(tv0);
3806
3807 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
3808 auto tv2 = add(tv1, IrBuilder::create<Double>(1));
3809
3810 fusion.addOutput(tv2);
3811
3812 tv1->swizzle(Swizzle2DType::ZShape, -2, -1);
3813
3814 tv1->axis(0)->parallelize(ParallelType::TIDx);
3815 tv1->axis(1)->parallelize(ParallelType::TIDy);
3816
3817 tv2->axis(0)->parallelize(ParallelType::TIDx);
3818 tv2->axis(1)->parallelize(ParallelType::TIDy);
3819
3820 // Validation should fail since TV1 is not in shared
3821 // memory as required by sync info pass.
3822 ASSERT_ANY_THROW(GpuLower gpulw_throw(&fusion));
3823
3824 tv1->setMemoryType(MemoryType::Shared);
3825
3826 // Make sure that a sync is inserted:
3827 bool sync_found = false;
3828 GpuLower gpu_lw(&fusion);
3829 auto flattened_exps =
3830 ir_utils::flattenScopedExprs(gpu_lw.kernel()->topLevelExprs());
3831
3832 for (auto expr : flattened_exps) {
3833 if (expr->isA<kir::BlockSync>()) {
3834 sync_found = true;
3835 }
3836 // Will require a sync thread before any shared memory read.
3837 for (auto inp_tv : ir_utils::filterByType<TensorView>(expr->inputs())) {
3838 if (inp_tv->getMemoryType() == MemoryType::Shared) {
3839 TORCH_INTERNAL_ASSERT(
3840 sync_found, "Block sync required but not inserted");
3841 }
3842 }
3843 }
3844
3845 FusionExecutor fe;
3846 fe.compileFusion(&fusion);
3847
3848 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3849 auto t0 = at::randn({32, 32}, options);
3850 auto t2 = t0 + 2.0;
3851 auto cg_outputs = fe.runFusion({t0});
3852
3853 testValidate(&fusion, cg_outputs, {t0}, {t2}, __LINE__, __FILE__);
3854}
3855
3856// Test BestEffortReplay behavior with swizzle op
3857TEST_F(NVFuserTest, FusionSwizzleMapping_CUDA) {
3858 Fusion fusion;
3859 FusionGuard fg(&fusion);
3860
3861 auto tv0 = makeConcreteTensor({2, 32});
3862 fusion.addInput(tv0);
3863
3864 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
3865 auto tv2 = add(tv1, IrBuilder::create<Double>(1));
3866 auto tv3 = add(tv2, IrBuilder::create<Double>(1));
3867
3868 fusion.addOutput(tv3);
3869
3870 // Make a 2x8 Zshape tile
3871 tv2->split(-1, 16);
3872 tv2->split(-1, 8);
3873 // [O, 2, 8]
3874
3875 tv3->split(-1, 16);
3876 tv3->split(-1, 4);
3877 //[O, 4, 4]
3878
3879 tv2->computeAt(tv3, 1);
3880 tv2->swizzle(Swizzle2DType::ZShape, -2, -1);
3881
3882 // Inlining a producer into a swizzled consumer is ok
3883 tv1->computeAt(tv2, -1);
3884
3885 // Check BestEffortReplay behavior with skip swizzles option on.
3886 PairwiseRootDomainMap root_map(tv1, tv2);
3887
3888 // Check producer to consumer map,
3889 // i.e. unswizzled tensor to swizzled tensor map
3890 //----------------------------------------------------------
3891 auto p2c = BestEffortReplay::replayCasP(tv2, tv1, -1, root_map).getReplay();
3892 auto swizzle_x_it0 = p2c.find(tv1->axis(-2));
3893 auto swizzle_y_it0 = p2c.find(tv1->axis(-1));
3894 // P2C map should exist and both the x and y map should
3895 // map to the output of the swizzle op.
3896 TORCH_INTERNAL_ASSERT(
3897 swizzle_x_it0 != p2c.end() && swizzle_y_it0 != p2c.end());
3898 TORCH_INTERNAL_ASSERT(
3899 swizzle_x_it0->second == tv2->axis(-2) &&
3900 swizzle_y_it0->second == tv2->axis(-1));
3901
3902 // Check consumer to producer map,
3903 // i.e. swizzled tensor to unswizzled tensor map
3904 //----------------------------------------------------------
3905 auto c2p = BestEffortReplay::replayPasC(tv1, tv2, -1, root_map).getReplay();
3906
3907 auto swizzle_op = tv2->axis(-1)->definition()->as<Swizzle2D>();
3908
3909 // Find mapping for swizzle inputs
3910 auto swizzle_x_it1 = c2p.find(swizzle_op->inX());
3911 auto swizzle_y_it1 = c2p.find(swizzle_op->inY());
3912
3913 // Find mapping for swizzle outputs
3914 auto swizzle_x_it2 = c2p.find(swizzle_op->outX());
3915 auto swizzle_y_it2 = c2p.find(swizzle_op->outY());
3916
3917 // Input of swizzle ops will not be mapped to any
3918 // by BestEffortReplay, as BestEffortReplay has to be
3919 // one to one. IdGraph will further map them together.
3920 TORCH_INTERNAL_ASSERT(
3921 swizzle_x_it1 == c2p.end() && swizzle_y_it1 == c2p.end());
3922
3923 // Mapping for swizzle outputs should be mapped and should
3924 // also map to the corresponding axes on the unswizzled tensor.
3925 TORCH_INTERNAL_ASSERT(
3926 swizzle_x_it2 != c2p.end() && swizzle_y_it2 != c2p.end());
3927 TORCH_INTERNAL_ASSERT(
3928 swizzle_x_it2->second == tv1->axis(-2) &&
3929 swizzle_y_it2->second == tv1->axis(-1));
3930
3931 // Check id graph behavior
3932 //----------------------------------------------------------
3933 ComputeAtMap ca_map(&fusion);
3934 // Corresponding inputs and outputs of swizzle ops are
3935 // map through by exact and permissive map.
3936 TORCH_INTERNAL_ASSERT(
3937 ca_map.areMapped(tv1->axis(-2), swizzle_op->inX(), IdMappingMode::EXACT));
3938 TORCH_INTERNAL_ASSERT(
3939 ca_map.areMapped(tv1->axis(-1), swizzle_op->inY(), IdMappingMode::EXACT));
3940 TORCH_INTERNAL_ASSERT(ca_map.areMapped(
3941 tv1->axis(-2), swizzle_op->outX(), IdMappingMode::EXACT));
3942 TORCH_INTERNAL_ASSERT(ca_map.areMapped(
3943 tv1->axis(-1), swizzle_op->outY(), IdMappingMode::EXACT));
3944
3945 TORCH_INTERNAL_ASSERT(ca_map.areMapped(
3946 tv1->axis(-2), swizzle_op->inX(), IdMappingMode::PERMISSIVE));
3947 TORCH_INTERNAL_ASSERT(ca_map.areMapped(
3948 tv1->axis(-1), swizzle_op->inY(), IdMappingMode::PERMISSIVE));
3949 TORCH_INTERNAL_ASSERT(ca_map.areMapped(
3950 tv1->axis(-2), swizzle_op->outX(), IdMappingMode::PERMISSIVE));
3951 TORCH_INTERNAL_ASSERT(ca_map.areMapped(
3952 tv1->axis(-1), swizzle_op->outY(), IdMappingMode::PERMISSIVE));
3953}
3954
3955// Test a basic loop swizzle pattern
3956TEST_F(NVFuserTest, FusionLoopSwizzle0_CUDA) {
3957 Fusion fusion;
3958 FusionGuard fg(&fusion);
3959
3960 auto tv0 = makeConcreteTensor({2, 32});
3961 fusion.addInput(tv0);
3962
3963 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
3964 auto tv2 = add(tv1, IrBuilder::create<Double>(1));
3965
3966 fusion.addOutput(tv2);
3967
3968 tv2->split(-1, 16);
3969 tv2->split(-1, 4);
3970 //[O, 4, 4]
3971
3972 tv2->swizzle(Swizzle2DType::ZShape, -2, -1, SwizzleMode::Loop);
3973
3974 tv0->computeAt(tv2, -1);
3975
3976 FusionExecutor fe;
3977 fe.compileFusion(&fusion);
3978
3979 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
3980 auto t0 = at::randn({2, 32}, options);
3981 auto t2 = t0 + 2.0;
3982 auto cg_outputs = fe.runFusion({t0});
3983
3984 testValidate(&fusion, cg_outputs, {t0}, {t2}, __LINE__, __FILE__);
3985}
3986
3987// Outer block zshape pattern
3988TEST_F(NVFuserTest, FusionLoopSwizzle1_CUDA) {
3989 Fusion fusion;
3990 FusionGuard fg(&fusion);
3991
3992 auto tv0 = makeContigTensor(2);
3993 fusion.addInput(tv0);
3994
3995 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
3996 auto tv2 = add(tv1, IrBuilder::create<Double>(1));
3997
3998 fusion.addOutput(tv2);
3999
4000 tv2->split(-2, 8);
4001 tv2->split(-1, 4);
4002 //[I0o, I0i, I1o, I1i]
4003 tv2->reorder({{1, 2}, {2, 1}});
4004 //[I0o, I1o, I0i, I1i]
4005
4006 tv2->swizzle(Swizzle2DType::ZShape, 0, 1, SwizzleMode::Loop);
4007 tv0->computeAt(tv2, -1);
4008
4009 tv2->axis(0)->parallelize(ParallelType::BIDx);
4010 tv2->axis(1)->parallelize(ParallelType::BIDy);
4011
4012 FusionExecutor fe;
4013 fe.compileFusion(&fusion);
4014
4015 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4016 auto t0 = at::randn({45, 77}, options);
4017 auto t2 = t0 + 2.0;
4018 auto cg_outputs = fe.runFusion({t0});
4019
4020 testValidate(&fusion, cg_outputs, {t0}, {t2}, __LINE__, __FILE__);
4021}
4022
4023// Test assertion in unsupported pattern: non-leaf loop swizzle.
4024TEST_F(NVFuserTest, FusionLoopSwizzleCheck0_CUDA) {
4025 Fusion fusion;
4026 FusionGuard fg(&fusion);
4027
4028 auto tv0 = makeConcreteTensor({2, 32});
4029 fusion.addInput(tv0);
4030
4031 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
4032 auto tv2 = add(tv1, IrBuilder::create<Double>(1));
4033
4034 fusion.addOutput(tv2);
4035
4036 tv2->split(-1, 16);
4037 tv2->split(-1, 4);
4038 //[O, 4, 4]
4039
4040 // Swizzle the inner tile.
4041 tv2->swizzle(Swizzle2DType::ZShape, -2, -1, SwizzleMode::Loop);
4042
4043 // Make swizzle output not a leaf domain.
4044 tv2->merge(-2);
4045
4046 tv0->computeAt(tv2, -1);
4047
4048 FusionExecutor fe;
4049 ASSERT_ANY_THROW(fe.compileFusion(&fusion));
4050}
4051
4052// Test assertion in unsupported pattern: half-inlined loop swizzle.
4053TEST_F(NVFuserTest, FusionLoopSwizzleCheck1_CUDA) {
4054 Fusion fusion;
4055 FusionGuard fg(&fusion);
4056
4057 auto tv0 = makeConcreteTensor({2, 32});
4058 fusion.addInput(tv0);
4059
4060 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
4061 auto tv2 = add(tv1, IrBuilder::create<Double>(1));
4062 auto tv3 = add(tv2, IrBuilder::create<Double>(1));
4063
4064 fusion.addOutput(tv3);
4065
4066 //[O, 4, 4]
4067 tv2->split(-1, 16);
4068 tv2->split(-1, 4);
4069
4070 //[O, 4, 4]
4071 tv3->split(-1, 16);
4072 tv3->split(-1, 4);
4073
4074 // Swizzle inner tile of tv2
4075 tv2->swizzle(Swizzle2DType::ZShape, -2, -1, SwizzleMode::Loop);
4076
4077 // Make tv2 swizzled and partially-inlined (unsupported).
4078 tv0->computeAt(tv3, -2);
4079
4080 FusionExecutor fe;
4081 ASSERT_ANY_THROW(fe.compileFusion(&fusion));
4082}
4083
4084TEST_F(NVFuserTest, FusionUnsqueeze1_CUDA) {
4085 Fusion fusion;
4086 FusionGuard fg(&fusion);
4087
4088 std::vector<int64_t> shape({10, 11});
4089
4090 auto tv0 = makeConcreteTensor(shape);
4091 fusion.addInput(tv0);
4092
4093 // [I, R]
4094 auto tv1 = sum(tv0, {1});
4095 // [I, B]
4096 auto tv2 = unsqueeze(tv1, -1);
4097 fusion.addOutput(tv2);
4098
4099 TORCH_CHECK(
4100 tv2->nDims() == 2, "Unexpected unsqueeze result: ", tv2->toString());
4101 TORCH_CHECK(
4102 tv2->axis(1)->isBroadcast(),
4103 "Unexpected unsqueeze result: ",
4104 tv2->toString());
4105
4106 // tv1 has only one non-reduction axis. An exception should be
4107 // thrown.
4108 // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
4109 ASSERT_ANY_THROW(unsqueeze(tv1, 2));
4110
4111 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4112 at::Tensor t0 = at::randn({10, 11}, options);
4113 std::vector<IValue> aten_inputs = {t0};
4114
4115 FusionExecutor fe;
4116 fe.compileFusion(&fusion, aten_inputs);
4117 auto cg_outputs = fe.runFusion(aten_inputs);
4118
4119 auto ref = t0.sum(1).unsqueeze(-1);
4120
4121 testValidate(&fusion, cg_outputs, aten_inputs, {ref}, __LINE__, __FILE__);
4122}
4123
4124TEST_F(NVFuserTest, FusionSqueeze1_CUDA) {
4125 Fusion fusion;
4126 FusionGuard fg(&fusion);
4127
4128 std::vector<int64_t> shape({10, 11});
4129
4130 auto tv0 = makeConcreteTensor(shape);
4131 fusion.addInput(tv0);
4132
4133 // [I, B]
4134 auto tv1 = sum(tv0, {1}, true);
4135 // [I]
4136 auto tv2 = squeeze(tv1, {shape[0], 1});
4137 fusion.addOutput(tv2);
4138
4139 TORCH_CHECK(
4140 tv2->nDims() == 2, "Unexpected squeeze result: ", tv2->toString());
4141
4142 // [I, R]
4143 auto tv3 = sum(tv0, {1});
4144 // tv3 has only one non-reduction axis. The extent of the first axis
4145 // is not one, so squeeze should fail.
4146 // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
4147 ASSERT_ANY_THROW(squeeze(tv3, {shape[0], 1}));
4148
4149 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4150 at::Tensor t0 = at::randn({10, 11}, options);
4151 std::vector<IValue> aten_inputs = {t0};
4152
4153 FusionExecutor fe;
4154 fe.compileFusion(&fusion, aten_inputs);
4155 auto cg_outputs = fe.runFusion(aten_inputs);
4156
4157 auto ref = t0.sum(1, true).squeeze(-1);
4158
4159 testValidate(&fusion, cg_outputs, aten_inputs, {ref}, __LINE__, __FILE__);
4160}
4161
4162TEST_F(NVFuserTest, FusionContigPredicate_CUDA) {
4163 Fusion fusion;
4164 FusionGuard fg(&fusion);
4165
4166 auto tv0 = makeSymbolicTensor(2);
4167 fusion.addInput(tv0);
4168 auto tv1 = set(tv0);
4169 auto tv2 = broadcast(tv1, {false, true, false});
4170 fusion.addOutput(tv2);
4171
4172 tv2->merge(-2, -1);
4173 tv2->merge(-2, -1);
4174 tv2->split(-1, 100);
4175 tv0->computeAt(tv2, -1);
4176
4177 GpuLower gpulw(&fusion);
4178 TORCH_CHECK(PredicatedChecker::isPredicated(tv1, gpulw));
4179
4180 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4181 at::Tensor t0 = at::randn({3, 4}, options);
4182
4183 FusionExecutor fe;
4184 fe.compileFusion(&fusion, {t0});
4185 auto cg_outputs = fe.runFusion({t0});
4186
4187 auto ref = t0.unsqueeze(1);
4188
4189 testValidate(fe.kernel(), cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
4190}
4191
4192// Repro of https://github.com/csarofeen/pytorch/issues/1777
4193TEST_F(NVFuserTest, FusionDivScalarLhs_CUDA) {
4194 // tv1 = 2.0 / tv0
4195 Fusion fusion;
4196 FusionGuard fg(&fusion);
4197
4198 TensorView* tv0 = makeSymbolicTensor(2);
4199 fusion.addInput(tv0);
4200 TensorView* tv1 = div(IrBuilder::create<Double>(2.0), tv0);
4201 fusion.addOutput(tv1);
4202
4203 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4204 auto t0 = at::randn({3, 3}, options);
4205 // There's no overload div(Scalar, Tensor) in ATen
4206 auto aten_output = at::div(
4207 at::native::wrapped_scalar_tensor(at::Scalar(2.0), options.device()), t0);
4208
4209 FusionExecutor fe;
4210 fe.compileFusion(&fusion, {t0});
4211 auto cg_outputs = fe.runFusion({t0});
4212
4213 testValidate(&fusion, cg_outputs, {t0}, {aten_output}, __LINE__, __FILE__);
4214}
4215
4216// Repro of an issue of the reduction scheduler with a broadcast
4217// domain concretized to multiple domains that are not proven to have
4218// the same extent
4219TEST_F(NVFuserTest, FusionRepro1713_CUDA) {
4220 auto fusion = std::make_unique<Fusion>();
4221 FusionGuard fg(fusion.get());
4222
4223 auto tv0 = makeSymbolicTensor(2);
4224 auto tv1 = makeSymbolicTensor(2);
4225 auto tv2 = makeSymbolicTensor(1);
4226 fusion->addInput(tv0);
4227 fusion->addInput(tv1);
4228 fusion->addInput(tv2);
4229 auto tv3 = broadcast(tv2, {false, true});
4230
4231 auto tv4 = add(tv3, tv0);
4232
4233 auto tv5 = add(tv3, tv1);
4234 auto tv6 = sum(tv5, {0});
4235 fusion->addOutput(tv4);
4236 fusion->addOutput(tv6);
4237
4238 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4239 at::Tensor t0 = at::randn({1024, 204800}, options);
4240 // Original repro had the same shape as t0, but this should work
4241 // with a different extent at the second axis
4242 at::Tensor t1 = at::randn({1024, 123}, options);
4243 at::Tensor t2 = at::randn({1024}, options);
4244 std::vector<IValue> aten_inputs({t0, t1, t2});
4245
4246 FusionExecutorCache executor_cache(std::move(fusion));
4247 auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
4248
4249 auto t3 = t2.unsqueeze(-1);
4250 auto t4 = t3 + t0;
4251 auto t5 = t3 + t1;
4252 auto t6 = sum(t5, {0});
4253
4254 testValidate(
4255 executor_cache.fusion(),
4256 cg_outputs,
4257 {t0, t1, t2},
4258 {t4, t6},
4259 __LINE__,
4260 __FILE__);
4261}
4262
4263TEST_F(NVFuserTest, FusionExpand_CUDA) {
4264 auto fusion = std::make_unique<Fusion>();
4265 FusionGuard fg(fusion.get());
4266
4267 auto w = 2, x = 3, y = 4, z = 5;
4268
4269 // Test
4270 // a simple expand
4271 // Expand that's propagated
4272 // expand_as
4273 // symbolic expand
4274
4275 // x
4276 auto tv0 = makeSymbolicTensor(1);
4277 fusion->addInput(tv0);
4278
4279 auto tv1 = broadcast(tv0, {false, true});
4280 auto tv2 = expand(tv1, {tv0->axis(0)->extent(), IrBuilder::create<Int>(y)});
4281
4282 // x
4283 auto tv3 = makeSymbolicTensor(1);
4284 fusion->addInput(tv3);
4285 auto tv4 = broadcast(tv3, {false, true});
4286 auto tv5 = add(tv4, tv2);
4287 // [x, e_y]
4288
4289 // [x, y, z]
4290 auto tv6 = makeSymbolicTensor(3);
4291 fusion->addInput(tv6);
4292
4293 // Disjoint set op will cause a segmentation for just this op.
4294 auto tmp_7 = set(tv6);
4295 fusion->addOutput(tmp_7);
4296
4297 auto tv7 = broadcast(tv5, {false, false, true});
4298
4299 auto tv8 = expand_as(tv7, tv6);
4300 // [x, e_y, e_z]
4301
4302 auto w_symbolic = IrBuilder::create<Int>();
4303 fusion->addInput(w_symbolic);
4304
4305 auto tv9 = broadcast(tv8, {true, false, false, false});
4306 //[1, x, e_y, e_z]
4307
4308 auto tv10 = expand(
4309 tv9,
4310 {w_symbolic,
4311 tv9->axis(1)->extent(),
4312 tv9->axis(2)->expandedExtent(),
4313 tv9->axis(3)->expandedExtent()});
4314
4315 fusion->addOutput(tv10);
4316
4317 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4318 at::Tensor t0 = at::randn({x}, options);
4319 at::Tensor t3 = at::randn({x}, options);
4320 at::Tensor t6 = at::randn({x, y, z}, options);
4321
4322 FusionExecutorCache executor_cache(std::move(fusion));
4323
4324 auto cg_outputs = executor_cache.runFusionWithInputs({t0, t3, t6, w});
4325 auto cg_out = cg_outputs[1];
4326
4327 TORCH_INTERNAL_ASSERT(cg_out.size(0) == w);
4328 TORCH_INTERNAL_ASSERT(cg_out.size(1) == x);
4329 TORCH_INTERNAL_ASSERT(cg_out.size(2) == y);
4330 TORCH_INTERNAL_ASSERT(cg_out.size(3) == z);
4331 TORCH_INTERNAL_ASSERT(cg_out.stride(0) == 0);
4332 TORCH_INTERNAL_ASSERT(cg_out.stride(1) == 1);
4333 TORCH_INTERNAL_ASSERT(cg_out.stride(2) == 0);
4334 TORCH_INTERNAL_ASSERT(cg_out.stride(3) == 0);
4335
4336 auto t10 = t0.unsqueeze(-1)
4337 .expand({x, y})
4338 .add(t3.unsqueeze(-1))
4339 .unsqueeze(-1)
4340 .expand_as(t6)
4341 .unsqueeze(0)
4342 .expand({w, x, y, z});
4343
4344 testValidate(
4345 executor_cache.fusion(),
4346 cg_outputs,
4347 {t0, t3, t6, w},
4348 {t6, t10},
4349 __LINE__,
4350 __FILE__);
4351}
4352
4353TEST_F(NVFuserTest, FusionExpandIssue1751_CUDA) {
4354 auto fusion = std::make_unique<Fusion>();
4355 FusionGuard fg(fusion.get());
4356
4357 auto x = 3, y = 4, z = 5;
4358
4359 // y, z
4360 auto tv0 = makeSymbolicTensor(2);
4361 fusion->addInput(tv0);
4362
4363 auto tv1 = broadcast(tv0, {true, false, false});
4364
4365 // Two ways to propagate extents as is: use -1 or explicitly pass
4366 // the extent vals.
4367
4368 auto tv2 = expand(
4369 tv1,
4370 {IrBuilder::create<Int>(x),
4371 IrBuilder::create<Int>(-1),
4372 IrBuilder::create<Int>(-1)});
4373
4374 auto tv3 = expand(
4375 tv1,
4376 {IrBuilder::create<Int>(x),
4377 tv0->axis(0)->extent(),
4378 tv0->axis(1)->extent()});
4379
4380 fusion->addOutput(tv2);
4381 fusion->addOutput(tv3);
4382
4383 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4384 at::Tensor t0 = at::randn({y, z}, options);
4385
4386 FusionExecutorCache executor_cache(std::move(fusion));
4387
4388 auto cg_outputs = executor_cache.runFusionWithInputs({t0});
4389
4390 for (const auto& cg_out : cg_outputs) {
4391 TORCH_INTERNAL_ASSERT(cg_out.size(0) == x);
4392 TORCH_INTERNAL_ASSERT(cg_out.size(1) == y);
4393 TORCH_INTERNAL_ASSERT(cg_out.size(2) == z);
4394 }
4395
4396 auto t2 = t0.expand({x, y, z});
4397
4398 testValidate(
4399 executor_cache.fusion(), cg_outputs, {t0}, {t2, t2}, __LINE__, __FILE__);
4400}
4401
4402// TODO: Make sure the kernel uses the expanded concrete size instead
4403// of the symbolic size
4404TEST_F(NVFuserTest, FusionExpandToConcrete_CUDA) {
4405 auto fusion = std::make_unique<Fusion>();
4406 FusionGuard fg(fusion.get());
4407
4408 auto x = 3, y = 4;
4409
4410 auto tv0 = makeSymbolicTensor(1);
4411 fusion->addInput(tv0);
4412
4413 auto tv1 = broadcast(tv0, {true, false});
4414
4415 auto tv2 =
4416 expand(tv1, {IrBuilder::create<Int>(x), IrBuilder::create<Int>(y)});
4417
4418 fusion->addOutput(tv2);
4419
4420 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4421 at::Tensor t0 = at::randn({y}, options);
4422
4423 FusionExecutorCache executor_cache(std::move(fusion));
4424
4425 auto cg_outputs = executor_cache.runFusionWithInputs({t0});
4426
4427 for (const auto& cg_out : cg_outputs) {
4428 TORCH_INTERNAL_ASSERT(cg_out.size(0) == x);
4429 TORCH_INTERNAL_ASSERT(cg_out.size(1) == y);
4430 }
4431
4432 auto t2 = t0.expand({x, y});
4433
4434 testValidate(
4435 executor_cache.fusion(), cg_outputs, {t0}, {t2}, __LINE__, __FILE__);
4436}
4437
4438TEST_F(NVFuserTest, FusionReproNoncontigBroadcast_CUDA) {
4439 auto fusion = std::make_unique<Fusion>();
4440 FusionGuard fg(fusion.get());
4441
4442 auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
4443 at::Tensor t0 = at::randn({4, 32, 16, 112, 112}, options).transpose(-1, -2);
4444 at::Tensor t1 = at::randn({32, 1, 112, 1}, options).transpose(-1, -2);
4445
4446 auto tv0 = TensorViewBuilder()
4447 .ndims(5)
4448 .contiguity({true, true, false, false, false}) // ttfff
4449 .shape({-1, -1, -1, -1, -1})
4450 .dtype(DataType::Half)
4451 .build();
4452 auto tv1 = TensorViewBuilder()
4453 .ndims(4)
4454 .contiguity({true, false, false, true}) // tfft
4455 .shape({-1, 1, 1, -1})
4456 .dtype(DataType::Half)
4457 .build();
4458
4459 fusion->addInput(tv0);
4460 fusion->addInput(tv1);
4461
4462 auto tv2 = add(tv0, tv1);
4463
4464 fusion->addOutput(tv2);
4465
4466 std::vector<IValue> aten_inputs({t0, t1});
4467
4468 FusionExecutorCache executor_cache(std::move(fusion));
4469 auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
4470
4471 auto t2 = t0 + t1;
4472
4473 testValidate(
4474 executor_cache.fusion(), cg_outputs, {t0, t1}, {t2}, __LINE__, __FILE__);
4475}
4476
4477namespace {
4478
4479// check that the resulting sibling are identical
4480void checkSiblingConsistency(TensorView* replay, TensorView* target) {
4481 auto replay_root = replay->getRootDomain();
4482 auto replay_dom = replay->domain()->domain();
4483 auto target_root = target->getRootDomain();
4484 auto target_dom = target->domain()->domain();
4485 std::unordered_map<IterDomain*, IterDomain*> target2replay_map;
4486 TORCH_CHECK(replay_root.size() == target_root.size());
4487 target2replay_map.reserve(replay_root.size());
4488 std::transform(
4489 target_root.begin(),
4490 target_root.end(),
4491 replay_root.begin(),
4492 std::inserter(target2replay_map, target2replay_map.begin()),
4493 [](auto a, auto b) { return std::make_pair(a, b); });
4494 BestEffortReplay replay_(replay_dom, target_dom, target2replay_map);
4495 auto r = replay_.getReplay();
4496 for (int64_t i = 0; i < (int64_t)replay_dom.size(); i++) {
4497 auto target_id = target_dom[i];
4498 auto replay_it = r.find(target_id);
4499 TORCH_CHECK(replay_it != r.end());
4500 TORCH_CHECK(
4501 replay_it->second == replay_dom[i],
4502 "IterDomain mismatch when checking ",
4503 replay,
4504 " and ",
4505 target,
4506 " at ",
4507 i,
4508 ", got ",
4509 replay_it->second,
4510 " and ",
4511 replay_dom[i]);
4512 }
4513};
4514
4515} // namespace
4516
4517TEST_F(NVFuserTest, FusionTransformPropagateSibling_CUDA) {
4518 // https://github.com/csarofeen/pytorch/issues/1760
4519 Fusion fusion;
4520 FusionGuard fg(&fusion);
4521
4522 auto tv0 = makeSymbolicTensor(2);
4523 fusion.addInput(tv0);
4524
4525 auto tvs = Welford(tv0, {1});
4526 fusion.addOutput(tvs.var_sum);
4527
4528 tvs.avg->split(1, 1);
4529 tvs.avg->split(1, 2);
4530 tvs.avg->split(1, 3);
4531 tvs.var_sum->split(1, 1);
4532 tvs.var_sum->split(1, 2);
4533 tvs.var_sum->split(1, 3);
4534 tvs.n->split(1, 1);
4535 tvs.n->split(1, 2);
4536 tvs.n->split(1, 3);
4537
4538 auto var_sum_rf = ir_utils::rfactorHelper(tvs.var_sum, {1, 4});
4539
4540 TransformPropagatorWithCheck propagator(var_sum_rf);
4541 MaxRootDomainInfoSpanningTree(var_sum_rf).traverse(&propagator);
4542
4543 auto rf_tvs = ir_utils::producerTvsOf(tvs.var_sum);
4544
4545 std::vector<TensorView*> siblings[] = {{tvs.avg, tvs.var_sum, tvs.n}, rf_tvs};
4546 for (auto tensors : siblings) {
4547 for (auto t1 : tensors) {
4548 for (auto t2 : tensors) {
4549 TORCH_CHECK(TransformReplay::fullSelfMatching(t1, t2));
4550 }
4551 }
4552 }
4553}
4554
4555TEST_F(NVFuserTest, FusionTransformPropagateSelectorSibling_CUDA) {
4556 Fusion fusion;
4557 FusionGuard fg(&fusion);
4558
4559 auto tv0 = makeSymbolicTensor(2);
4560 fusion.addInput(tv0);
4561
4562 auto tvs = Welford(tv0, {1});
4563 fusion.addOutput(tvs.var_sum);
4564
4565 tvs.avg->split(1, 1);
4566 tvs.avg->split(1, 2);
4567 tvs.avg->split(1, 3);
4568 tvs.var_sum->split(1, 1);
4569 tvs.var_sum->split(1, 2);
4570 tvs.var_sum->split(1, 3);
4571 tvs.n->split(1, 1);
4572 tvs.n->split(1, 2);
4573 tvs.n->split(1, 3);
4574
4575 auto var_sum_rf = ir_utils::rfactorHelper(tvs.var_sum, {1, 4});
4576
4577 struct DisableTv0 : public MaxInfoSpanningTree::Selector {
4578 TensorView* tv0;
4579 virtual bool allowC2P(TensorView* from, TensorView* to) override {
4580 return from != tv0 && to != tv0;
4581 };
4582 virtual bool allowP2C(TensorView* from, TensorView* to) override {
4583 return from != tv0 && to != tv0;
4584 };
4585 virtual bool allowSibling(TensorView* from, TensorView* to) override {
4586 return true;
4587 }
4588 DisableTv0(TensorView* tv0) : tv0(tv0) {}
4589 } selector1(tv0);
4590
4591 struct DisableTv0AndSibling : public DisableTv0 {
4592 virtual bool allowSibling(TensorView* from, TensorView* to) override {
4593 return false;
4594 }
4595 using DisableTv0::DisableTv0;
4596 } selector2(tv0);
4597
4598 TransformPropagatorWithCheck propagator(var_sum_rf);
4599 MaxRootDomainInfoSpanningTree good_path(var_sum_rf, &selector1);
4600 MaxRootDomainInfoSpanningTree bad_path(var_sum_rf, &selector2);
4601
4602 auto rf_tvs = ir_utils::producerTvsOf(tvs.var_sum);
4603
4604 auto check = [&]() {
4605 std::vector<TensorView*> siblings[] = {
4606 {tvs.avg, tvs.var_sum, tvs.n}, rf_tvs};
4607 for (auto tensors : siblings) {
4608 for (auto t1 : tensors) {
4609 for (auto t2 : tensors) {
4610 TORCH_CHECK(TransformReplay::fullSelfMatching(t1, t2));
4611 }
4612 }
4613 }
4614 };
4615
4616 bad_path.traverse(&propagator);
4617 ASSERT_ANY_THROW(check());
4618 good_path.traverse(&propagator);
4619 check();
4620}
4621
4622TEST_F(NVFuserTest, FusionTransformPropagatePosition_CUDA) {
4623 Fusion fusion;
4624 FusionGuard fg(&fusion);
4625
4626 auto tv0 = makeSymbolicTensor(4);
4627 auto tv1 = makeSymbolicTensor(6);
4628 fusion.addInput(tv0);
4629
4630 auto tv2 = broadcast(tv0, {false, false, true, false, false, true});
4631 auto tv3 = add(tv1, tv2);
4632 fusion.addOutput(tv3);
4633
4634 tv0->merge(2);
4635 tv0->merge(0);
4636 TransformPropagatorWithCheck propagator(tv0);
4637 MaxRootDomainInfoSpanningTree(tv0).traverse(&propagator);
4638
4639 TORCH_CHECK(tv1->nDims() == 4);
4640}
4641
4642TEST_F(NVFuserTest, FusionIgnoreZeroDimReduction_CUDA) {
4643 auto fusion = std::make_unique<Fusion>();
4644 FusionGuard fg(fusion.get());
4645
4646 auto tv0 = makeSymbolicTensor(1);
4647 fusion->addInput(tv0);
4648 auto tv1 = sum(tv0, {0});
4649 // tv1 is effectively a zero-dim tensor as it only has a reduction
4650 // axis.
4651 // Reducing it further is converted to just a set op.
4652 auto tv2 = sum(tv1, {0});
4653 fusion->addOutput(tv2);
4654
4655 auto tv2_def = dynamic_cast<UnaryOp*>(tv2->definition());
4656 TORCH_CHECK(
4657 tv2_def != nullptr,
4658 "Expected UnaryOp but found ",
4659 tv2->definition()->toString());
4660
4661 TORCH_CHECK(
4662 tv2_def->getUnaryOpType() == UnaryOpType::Set,
4663 "Expected UnaryOpType::Set but found ",
4664 tv2_def->getUnaryOpType());
4665
4666 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4667 auto t0 = at::randn({12345}, options);
4668 std::vector<IValue> aten_inputs({t0});
4669
4670 FusionExecutorCache executor_cache(std::move(fusion));
4671 auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
4672
4673 auto ref = sum(t0, {0});
4674
4675 testValidate(
4676 executor_cache.fusion(),
4677 cg_outputs,
4678 aten_inputs,
4679 {ref},
4680 __LINE__,
4681 __FILE__);
4682}
4683
4684// Repro of issue #1770
4685TEST_F(NVFuserTest, FusionIssue1770Repro_CUDA) {
4686 auto fusion = std::make_unique<Fusion>();
4687 FusionGuard fg(fusion.get());
4688
4689 auto tv0 = makeSymbolicTensor(1);
4690 fusion->addInput(tv0);
4691 auto tv1 = makeSymbolicTensor(1);
4692 fusion->addInput(tv1);
4693
4694 auto tv2 = ge(tv0, tv1);
4695 auto tv3 =
4696 where(tv2, IrBuilder::create<Double>(1), IrBuilder::create<Double>(2));
4697 fusion->addOutput(tv3);
4698
4699 std::vector<int64_t> shape({999});
4700 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4701 at::Tensor t0 = at::randn(shape, options);
4702 at::Tensor t1 = at::randn(shape, options);
4703 std::vector<IValue> aten_inputs({t0, t1});
4704
4705 FusionExecutorCache executor_cache(std::move(fusion));
4706 auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
4707
4708 auto ref = where(t0 >= t1, 1.0, 2.0);
4709
4710 testValidate(
4711 executor_cache.fusion(),
4712 cg_outputs,
4713 aten_inputs,
4714 {ref},
4715 __LINE__,
4716 __FILE__);
4717}
4718
4719TEST_F(NVFuserTest, FusionTransformPropagatorSelector_CUDA) {
4720 auto fusion = std::make_unique<Fusion>();
4721 FusionGuard fg(fusion.get());
4722
4723 auto tv0 = makeSymbolicTensor(1);
4724 fusion->addInput(tv0);
4725 auto tv1 = makeSymbolicTensor(1);
4726 fusion->addInput(tv1);
4727
4728 auto tv2 = add(tv0, tv1);
4729
4730 auto tv3 = sin(tv2);
4731 auto tv4 = cos(tv2);
4732
4733 fusion->addOutput(tv3);
4734 fusion->addOutput(tv4);
4735
4736 tv2->split(0, 10);
4737
4738 struct Selector : public MaxInfoSpanningTree::Selector {
4739 TensorView* tv0;
4740 TensorView* tv3;
4741 virtual bool allowC2P(TensorView* from, TensorView* to) override {
4742 return to == tv0;
4743 }
4744 virtual bool allowP2C(TensorView* from, TensorView* to) override {
4745 return to == tv3;
4746 }
4747 virtual bool allowSibling(TensorView* from, TensorView* to) override {
4748 return false;
4749 }
4750 Selector(TensorView* tv0, TensorView* tv3) : tv0(tv0), tv3(tv3) {}
4751 } selector(tv0, tv3);
4752
4753 TransformPropagatorWithCheck propagator(tv2);
4754 MaxRootDomainInfoSpanningTree(tv2, &selector).traverse(&propagator);
4755
4756 TORCH_CHECK(tv0->nDims() == 2);
4757 TORCH_CHECK(tv1->nDims() == 1);
4758 TORCH_CHECK(tv2->nDims() == 2);
4759 TORCH_CHECK(tv3->nDims() == 2);
4760 TORCH_CHECK(tv4->nDims() == 1);
4761}
4762
4763TEST_F(NVFuserTest, FusionTransformPropagatorPos_CUDA) {
4764 auto fusion = std::make_unique<Fusion>();
4765 FusionGuard fg(fusion.get());
4766
4767 auto tv0 = makeConcreteTensor({22, 105});
4768 fusion->addInput(tv0);
4769
4770 auto tv1 = sin(tv0);
4771 fusion->addOutput(tv1);
4772
4773 tv1->split(0, 2);
4774 tv1->split(-1, 3);
4775 tv1->split(-1, 5);
4776
4777 TransformPropagatorWithCheck propagator(tv1, 2);
4778 MaxRootDomainInfoSpanningTree(tv1, 2).traverse(&propagator);
4779
4780 auto expect = makeConcreteTensor({22, 105});
4781 expect->split(0, 2);
4782 TORCH_CHECK(TransformReplay::fullSelfMatching(expect, tv0));
4783}
4784
4785TEST_F(NVFuserTest, FusionMaxRootDomainInfoSpanningTreePrintTwice_CUDA) {
4786 auto fusion = std::make_unique<Fusion>();
4787 FusionGuard fg(fusion.get());
4788
4789 auto tv0 = makeSymbolicTensor(3);
4790 fusion->addInput(tv0);
4791
4792 auto tv1 = sum(tv0, {0});
4793 auto tv2 = neg(tv1);
4794
4795 fusion->addOutput(tv2);
4796
4797 tv1->split(0, 10);
4798
4799 struct Printer : public MaxInfoSpanningTree::Propagator {
4800 std::stringstream ss;
4801 virtual void propagateC2P(TensorView* from, TensorView* to) override {
4802 ss << "propagateC2P" << std::endl;
4803 ss << "from: " << from->name() << std::endl;
4804 ss << "to: " << to->name() << std::endl;
4805 }
4806 virtual void propagateP2C(TensorView* from, TensorView* to) override {
4807 ss << "propagateP2C" << std::endl;
4808 ss << "from: " << from->name() << std::endl;
4809 ss << "to: " << to->name() << std::endl;
4810 }
4811 virtual void propagateSibling(TensorView* from, TensorView* to) override {
4812 ss << "propagateSibling" << std::endl;
4813 ss << "from: " << from->name() << std::endl;
4814 ss << "to: " << to->name() << std::endl;
4815 }
4816 } printer1, printer2;
4817 printer1.ss << std::endl;
4818 printer2.ss << std::endl;
4819
4820 MaxRootDomainInfoSpanningTree path(tv1);
4821 path.traverse(&printer1);
4822 path.traverse(&printer2);
4823
4824 auto expect = R"ESCAPE(
4825propagateC2P
4826from: 1
4827to: 0
4828propagateP2C
4829from: 1
4830to: 2
4831)ESCAPE";
4832 TORCH_CHECK(printer1.ss.str() == expect);
4833 TORCH_CHECK(printer2.ss.str() == expect);
4834}
4835
4836TEST_F(NVFuserTest, FusionTransformPropagatorNoOverwrite_CUDA) {
4837 auto fusion = std::make_unique<Fusion>();
4838 FusionGuard fg(fusion.get());
4839
4840 auto tv0 = makeSymbolicTensor(1);
4841 fusion->addInput(tv0);
4842 auto tv1 = broadcast(tv0, {true, false, true});
4843 auto tv2 = sin(tv1);
4844 fusion->addOutput(tv2);
4845
4846 tv0->split(0, 2);
4847 tv2->split(1, 2);
4848 tv2->split(0, 4);
4849
4850 MaxRootDomainInfoSpanningTree path1(tv2);
4851 TransformPropagatorWithCheck propagator1(tv2);
4852 path1.traverse(&propagator1);
4853
4854 MaxRootDomainInfoSpanningTree path2(tv0);
4855 TransformPropagatorWithCheck propagator2(tv0);
4856 path2.traverse(&propagator2);
4857
4858 TORCH_CHECK(tv1->axis(0)->isBroadcast());
4859 TORCH_CHECK(tv1->axis(1)->isBroadcast());
4860 TORCH_CHECK(!tv1->axis(2)->isBroadcast());
4861 TORCH_CHECK(!tv1->axis(3)->isBroadcast());
4862 TORCH_CHECK(tv1->axis(4)->isBroadcast());
4863
4864 auto expect = makeSymbolicTensor(3);
4865 expect->split(1, 2);
4866 expect->split(0, 4);
4867 TORCH_CHECK(TransformReplay::fullSelfMatching(expect, tv1));
4868}
4869
4870TEST_F(NVFuserTest, FusionIssue1785Repro_CUDA) {
4871 Fusion fusion;
4872 FusionGuard fg(&fusion);
4873
4874 // Set up your input tensor views
4875 TensorView* tv0 = makeContigTensor(1);
4876 TensorView* tv1 = makeContigTensor(2);
4877
4878 // Register your inputs
4879 fusion.addInput(tv0);
4880 fusion.addInput(tv1);
4881
4882 auto tv2 = set(tv0);
4883 // [B, I]
4884 auto tv3 = broadcast(tv2, {true, false});
4885 auto tv4 = add(tv3, tv1);
4886 auto tv5 = set(tv4);
4887
4888 // Register your outputs
4889 fusion.addOutput(tv5);
4890
4891 tv5->split(0, 8);
4892 tv5->split(-1, 8);
4893
4894 // [Serial, TIDy, TIDX, Serial]
4895
4896 tv4->computeAt(tv5, -2);
4897 tv3->computeAt(tv4, -1);
4898 tv2->computeAt(tv3, 0);
4899 tv2->split(0, 8);
4900 tv2->axis(0)->parallelize(ParallelType::TIDx);
4901 tv1->computeAt(tv5, -2);
4902
4903 tv5->axis(1)->parallelize(ParallelType::TIDy);
4904 tv5->axis(2)->parallelize(ParallelType::TIDx);
4905
4906 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
4907
4908 at::Tensor in1 = at::randn({16}, options);
4909 at::Tensor in2 = at::randn({12, 16}, options);
4910
4911 FusionExecutor fe;
4912 fe.compileFusion(&fusion, {in1, in2});
4913 auto cg_outputs = fe.runFusion({in1, in2});
4914
4915 auto tv_ref = in1 + in2;
4916
4917 testValidate(&fusion, cg_outputs, {in1, in2}, {tv_ref}, __LINE__, __FILE__);
4918}
4919
4920TEST_F(NVFuserTest, FusionSkipReplay_CUDA) {
4921 {
4922 Fusion fusion;
4923 FusionGuard fg(&fusion);
4924
4925 TensorView* tv0 = makeContigTensor(1);
4926 TensorView* tv1 = makeContigTensor(2);
4927 fusion.addInput(tv0);
4928 fusion.addInput(tv1);
4929
4930 auto tv2 = broadcast(tv0, {false, true});
4931 auto tv3 = add(tv2, tv1);
4932 fusion.addOutput(tv3);
4933
4934 tv3->split(1, 2, false);
4935
4936 TransformPropagatorWithCheck propagator(tv3);
4937 MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator);
4938 }
4939
4940 {
4941 Fusion fusion;
4942 FusionGuard fg(&fusion);
4943
4944 TensorView* tv0 = makeContigTensor(3);
4945 fusion.addInput(tv0);
4946
4947 auto tv1 = sum(tv0, {0, 2});
4948 auto tv2 = sin(tv1);
4949 fusion.addOutput(tv2);
4950
4951 tv0->split(1, 2, false);
4952
4953 TransformPropagatorWithCheck propagator(tv0);
4954 MaxRootDomainInfoSpanningTree(tv0).traverse(&propagator);
4955 }
4956}
4957
4958TEST_F(NVFuserTest, FusionInlineRepro1803_CUDA) {
4959 Fusion fusion;
4960 FusionGuard fg(&fusion);
4961
4962 TensorView* tv0 = makeContigTensor(2);
4963
4964 fusion.addInput(tv0);
4965 auto tv1 = set(tv0);
4966 auto tvs = Welford(tv1, {1});
4967 auto tvo = set(tvs.var_sum);
4968 fusion.addOutput(tvo);
4969
4970 tvo->split(0, 16);
4971 tvo->axis(1)->parallelize(ParallelType::Unroll);
4972
4973 tv0->computeAt(tvo, -1, ComputeAtMode::BestEffort);
4974
4975 TORCH_CHECK(
4976 tvs.var_sum->getComputeAtPosition() == tvs.avg->getComputeAtPosition());
4977 TORCH_CHECK(
4978 tvs.var_sum->getComputeAtPosition() == tvs.n->getComputeAtPosition());
4979 TORCH_CHECK(tvs.var_sum->getComputeAtPosition() == 1);
4980}
4981
4982// Unit test for the transform selection logic
4983TEST_F(NVFuserTest, FusionBoundedDirectionSelection1_CUDA) {
4984 Fusion fusion;
4985 FusionGuard fg(&fusion);
4986
4987 TensorView* tv0 = makeContigTensor(2);
4988
4989 fusion.addInput(tv0);
4990 auto tv1 = set(tv0);
4991 auto tv2 = set(tv1);
4992 auto tv3 = add(tv2, tv1);
4993 fusion.addOutput(tv3);
4994
4995 tv3->split(-1, 5);
4996 tv3->split(-1, 8);
4997
4998 scheduler_utils::BoundedDirectionalTransformPropagator::backward(
4999 tv3, -1, {tv0, tv2});
5000
5001 // Check that the splits are replayed on tv2
5002 TORCH_INTERNAL_ASSERT(
5003 tv2->nDims() == tv3->nDims(),
5004 "Propagator didn't propagate to tv2: ",
5005 tv2->toString());
5006
5007 // Check that the splits are replayed on tv1 as well. Even though
5008 // one of its consumers, tv2, is part of the boundary, another
5009 // consumer is not a boundary, so tv1 should be transformed as well.
5010 TORCH_INTERNAL_ASSERT(
5011 tv1->nDims() == tv3->nDims(),
5012 "Propagator didn't propagate to tv1: ",
5013 tv1->toString());
5014}
5015
5016TEST_F(NVFuserTest, FusionIssueRepro1844_CUDA) {
5017 auto fusion = std::make_unique<Fusion>();
5018 FusionGuard fg(fusion.get());
5019
5020 std::vector<int64_t> shape = {2, 1, 768};
5021 std::vector<int64_t> sum_to_shape = {768};
5022 std::vector<int64_t> sum_to_axes = {0, 1};
5023 double kProb = 0.5;
5024
5025 std::vector<Int*> sum_to_symb;
5026 std::transform(
5027 sum_to_shape.begin(),
5028 sum_to_shape.end(),
5029 std::back_inserter(sum_to_symb),
5030 [](int s) -> Int* { return IrBuilder::create<Int>(s); });
5031
5032 TensorView* tv0 = makeContigConcreteTensor(shape);
5033 TensorView* tv1 = makeContigConcreteTensor(shape);
5034 TensorView* tv2 = makeContigConcreteTensor(shape, DataType::Bool);
5035
5036 fusion->addInput(tv0);
5037 fusion->addInput(tv1);
5038 fusion->addInput(tv2);
5039
5040 Double* prob = IrBuilder::create<Double>(kProb);
5041 auto grad_input = dropout_backward(tv1, tv2, prob);
5042 auto grad_gelu = gelu_backward(grad_input, tv0);
5043 auto grad_bias = sum_to(grad_gelu, sum_to_symb);
5044
5045 fusion->addOutput(grad_gelu);
5046 fusion->addOutput(grad_bias);
5047
5048 const auto options =
5049 at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5050 const auto mask_options =
5051 at::TensorOptions().dtype(at::kBool).device(at::kCUDA, 0);
5052 at::manual_seed(0);
5053
5054 at::Tensor a = at::randn(shape, options);
5055 at::Tensor b = at::randn(shape, options);
5056 at::Tensor c = at::randn(shape, options);
5057 auto mask = at::gt(c, 0.0f);
5058 std::vector<IValue> aten_inputs = {a, b, mask};
5059
5060 FusionExecutorCache executor_cache(std::move(fusion));
5061 auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
5062
5063 auto dinput = at::native_dropout_backward(b, mask, kProb);
5064 auto dgelu = at::gelu_backward(dinput, a, "none");
5065 auto dbias = dgelu.sum(sum_to_axes);
5066
5067 testValidate(
5068 executor_cache.fusion(),
5069 cg_outputs,
5070 aten_inputs,
5071 {dgelu, dbias},
5072 __LINE__,
5073 __FILE__);
5074}
5075
5076TEST_F(NVFuserTest, FusionInsertMagicZero1_CUDA) {
5077 Fusion fusion;
5078 FusionGuard fg(&fusion);
5079
5080 auto tv0 = makeSymbolicTensor(2);
5081 fusion.addInput(tv0);
5082
5083 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
5084 auto tv2 = set(tv1);
5085 fusion.addOutput(tv2);
5086
5087 tv2->split(0, 32);
5088 tv2->split(-1, 2);
5089 tv2->reorder({{1, 2}, {2, 1}});
5090 tv2->merge(0);
5091
5092 TransformPropagatorWithCheck propagator(tv2);
5093 MaxRootDomainInfoSpanningTree(tv2).traverse(&propagator);
5094
5095 tv0->computeAt(tv2, 1);
5096
5097 // The predicate of tv2 should be protected with magic zero
5098 GpuLower gpulw(&fusion);
5099 TORCH_CHECK(
5100 PredicateMagicZeroChecker::isProtected(tv2, gpulw),
5101 "Failed to protect the predicates of ",
5102 tv2->toString());
5103}
5104
5105TEST_F(NVFuserTest, FusionRepro1860_CUDA) {
5106 auto fusion_ptr = std::make_unique<Fusion>();
5107 Fusion& fusion = *fusion_ptr;
5108 FusionGuard fg(&fusion);
5109 std::vector<bool> contiguity{true, false, false};
5110
5111 std::vector<int64_t> shape{1, -1, -1};
5112 TensorView* tv0 = makeContigConcreteTensor(shape);
5113 fusion.addInput(tv0);
5114 TensorView* tv1 = makeContigConcreteTensor(shape);
5115 fusion.addInput(tv1);
5116 TensorView* tv2 = makeContigConcreteTensor(shape);
5117 fusion.addInput(tv2);
5118
5119 std::vector<IterDomain*> domain1(3, nullptr);
5120 for (const auto i : c10::irange(3)) {
5121 if (i == 0) {
5122 domain1[i] =
5123 IterDomainBuilder(
5124 FusionGuard::getCurFusion()->zeroVal(), IrBuilder::create<Int>(1))
5125 .iter_type(IterType::Broadcast)
5126 .build();
5127 } else {
5128 domain1[i] =
5129 IterDomainBuilder(
5130 FusionGuard::getCurFusion()->zeroVal(), IrBuilder::create<Int>(1))
5131 .expanded_extent(IrBuilder::create<Int>(1 + i))
5132 .iter_type(IterType::Broadcast)
5133 .build();
5134 }
5135 }
5136
5137 TensorView* tv22 = IrBuilder::create<TensorView>(
5138 IrBuilder::create<TensorDomain>(domain1, contiguity), DataType::Float);
5139
5140 fusion.addInput(tv22);
5141
5142 auto tv3 = add(tv0, tv1);
5143 auto tv4 = softmax(tv3, 0);
5144 auto tv5 = add(tv4, tv22);
5145 fusion.addOutput(tv5);
5146
5147 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5148
5149 at::Tensor input1 = at::randn({1, 2, 3}, options);
5150 at::Tensor input2 = at::randn({1, 2, 3}, options);
5151 at::Tensor input3 = at::randn({1, 2, 3}, options);
5152 at::Tensor input4 = at::randn({1, 1, 1}, options).expand({1, 2, 3});
5153 std::vector<IValue> aten_inputs = {input1, input2, input3, input4};
5154
5155 FusionExecutorCache executor_cache(std::move(fusion_ptr));
5156 auto outputs = executor_cache.runFusionWithInputs(aten_inputs);
5157}
5158
5159TEST_F(NVFuserTest, FusionExpandReduce_CUDA) {
5160 auto fusion = std::make_unique<Fusion>();
5161 FusionGuard fg(fusion.get());
5162
5163 auto tv0 = makeConcreteTensor({1, 8});
5164 fusion->addInput(tv0);
5165
5166 auto tv1 =
5167 expand(tv0, {IrBuilder::create<Int>(12), IrBuilder::create<Int>(8)});
5168
5169 auto tv2 = sum(tv1, {0});
5170 fusion->addOutput(tv2);
5171
5172 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5173 at::manual_seed(0);
5174 auto t0 = at::randn({1, 8}, options);
5175
5176 FusionExecutorCache executor_cache(std::move(fusion));
5177 auto cg_outputs = executor_cache.runFusionWithInputs({t0});
5178
5179 auto ref = t0.expand({12, 8}).sum({0});
5180
5181 testValidate(
5182 executor_cache.fusion(), cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
5183}
5184
5185// Predicate elimination issue repro:
5186TEST_F(NVFuserTest, FusionExpandReduce2_CUDA) {
5187 auto fusion = std::make_unique<Fusion>();
5188 FusionGuard fg(fusion.get());
5189
5190 auto tv0 = makeConcreteTensor({1, 4});
5191 fusion->addInput(tv0);
5192
5193 auto tv1 =
5194 expand(tv0, {IrBuilder::create<Int>(3), IrBuilder::create<Int>(4)});
5195
5196 auto tv2 = sum(tv1, {0});
5197 fusion->addOutput(tv2);
5198
5199 // tv2[r{3}, i{4}]
5200 tv2->split(0, NamedScalar::getParallelDim(ParallelType::TIDy));
5201 tv2->axis(1)->parallelize(ParallelType::TIDy);
5202 tv2->split(0, NamedScalar::getParallelDim(ParallelType::BIDy), false);
5203 tv2->axis(0)->parallelize(ParallelType::BIDy);
5204 tv2->split(-1, NamedScalar::getParallelDim(ParallelType::TIDx));
5205 tv2->axis(-1)->parallelize(ParallelType::TIDx);
5206 tv2->axis(-2)->parallelize(ParallelType::BIDx);
5207 // [rBIDy, rO, rTIDy, iBIDx, iTIDx]
5208 tv2->reorder({{-2, 0}, {-1, 1}, {2, 2}});
5209 // [iBIDx, iTIDx, rTIDy, rBIDy, rO]
5210 auto tv3 = tv2->rFactor({-1});
5211
5212 TransformPropagatorWithCheck propagator(tv3);
5213 MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator);
5214 scheduler_utils::parallelizeAllLike(tv3);
5215 tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined);
5216
5217 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5218 at::manual_seed(0);
5219 auto t0 = at::randn({1, 4}, options);
5220
5221 FusionExecutor fe;
5222 fe.compileFusion(fusion.get(), {t0}, LaunchParams(-1, 2, -1, 4, 2, 1));
5223 auto cg_outputs = fe.runFusion({t0}, LaunchParams(-1, 2, -1, 4, 2, 1));
5224
5225 auto ref = t0.expand({3, 4}).sum({0});
5226
5227 testValidate(
5228 fusion.get(),
5229 cg_outputs,
5230 {t0},
5231 {ref},
5232 __LINE__,
5233 __FILE__,
5234 "",
5235 LaunchParams(-1, 2, -1, 4, 2, 1));
5236}
5237
5238TEST_F(NVFuserTest, FusionExpandBadShapeTest_CUDA) {
5239 auto fusion_ptr = std::make_unique<Fusion>();
5240 Fusion& fusion = *fusion_ptr;
5241 FusionGuard fg(&fusion);
5242 std::vector<bool> contiguity{false, false};
5243
5244 auto tv0 = makeSymbolicTensor(2);
5245 fusion.addInput(tv0);
5246
5247 std::vector<IterDomain*> domains = {
5248 IterDomainBuilder(
5249 FusionGuard::getCurFusion()->zeroVal(), IrBuilder::create<Int>())
5250 .build(),
5251 IterDomainBuilder(
5252 FusionGuard::getCurFusion()->zeroVal(), IrBuilder::create<Int>(1))
5253 .expanded_extent(IrBuilder::create<Int>(10))
5254 .iter_type(IterType::Broadcast)
5255 .build()};
5256
5257 // expand to 10
5258 TensorView* tv22 = IrBuilder::create<TensorView>(
5259 IrBuilder::create<TensorDomain>(domains, contiguity), DataType::Float);
5260
5261 fusion.addInput(tv22);
5262
5263 auto tv3 = add(tv0, tv22);
5264 fusion.addOutput(tv3);
5265
5266 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5267
5268 // Incompatible shapes
5269 at::Tensor input1 = at::randn({2, 3}, options);
5270 // Passing expand size of 5, not 10. Should cause an error
5271 at::Tensor input4 = at::randn({2, 1}, options).expand({2, 5});
5272
5273 std::vector<IValue> aten_inputs = {input1, input4};
5274
5275 FusionExecutorCache executor_cache(std::move(fusion_ptr));
5276 ASSERT_ANY_THROW(executor_cache.runFusionWithInputs(aten_inputs));
5277}
5278
5279TEST_F(
5280 NVFuserTest,
5281 FusionPointwiseScheduleWithBroadcastAndTrivialReduction_CUDA) {
5282 Fusion fusion;
5283 FusionGuard fg(&fusion);
5284
5285 auto tv0 = makeContigTensor(3);
5286 auto tv1 = makeContigTensor(2);
5287 fusion.addInput(tv0);
5288 fusion.addInput(tv1);
5289 auto tv2 = broadcast(tv0, {false, true, false, true, false, true});
5290 auto tv3 = sin(tv2);
5291 auto tv4 = add(tv3, tv1);
5292 auto tv5 = sum(tv4, {1});
5293 fusion.addOutput(tv5);
5294
5295 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5296 at::Tensor t0 = at::randn({100, 100, 10}, options);
5297 at::Tensor t1 = at::randn({10, 20}, options);
5298
5299 auto aten_output = (t0.view({100, 1, 100, 1, 10, 1}).sin() + t1).squeeze(1);
5300
5301 std::vector<IValue> aten_inputs = {t0, t1};
5302
5303 auto lparams = schedulePointwise(&fusion, aten_inputs);
5304
5305 FusionExecutor fe;
5306 fe.compileFusion(&fusion, aten_inputs, lparams);
5307 auto cg_outputs = fe.runFusion(aten_inputs, lparams);
5308
5309 testValidate(
5310 &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
5311}
5312
5313TEST_F(NVFuserTest, FusionInliningMismatchedDims1_CUDA) {
5314 Fusion fusion;
5315 FusionGuard fg(&fusion);
5316
5317 auto tv0 = makeConcreteTensor({2, 3, 4});
5318 fusion.addInput(tv0);
5319 auto tv1 = sin(tv0);
5320 auto tv2 = cos(tv1);
5321 auto tv3 = transpose(tv2, 1, 2);
5322 auto tv4 = exp(tv3);
5323 auto tv5 = tan(tv4);
5324 fusion.addOutput(tv5);
5325
5326 inlineMost();
5327
5328 TORCH_CHECK(tv5->getComputeAtPosition() == 3);
5329 TORCH_CHECK(tv4->getComputeAtPosition() == 3);
5330 TORCH_CHECK(tv3->getComputeAtPosition() == 3);
5331 TORCH_CHECK(tv2->getComputeAtPosition() == 1);
5332 TORCH_CHECK(tv1->getComputeAtPosition() == 3);
5333
5334 const auto options =
5335 at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5336 at::Tensor input = at::randn({2, 3, 4}, options);
5337 auto output = input.sin().cos().transpose(1, 2).exp().tan();
5338
5339 FusionExecutor fe;
5340 fe.compileFusion(&fusion, {input});
5341 auto cg_outputs = fe.runFusion({input});
5342
5343 testValidate(&fusion, cg_outputs, {input}, {output}, __LINE__, __FILE__);
5344}
5345
5346TEST_F(NVFuserTest, FusionInliningMismatchedDims2_CUDA) {
5347 Fusion fusion;
5348 FusionGuard fg(&fusion);
5349
5350 auto tv0 = makeConcreteTensor({2, 3, 4});
5351 fusion.addInput(tv0);
5352 auto tv1 = sin(tv0);
5353 auto tv2 = cos(tv1);
5354 auto tv3 = transpose(tv2, 1, 2);
5355 auto tv4 = exp(tv3);
5356 auto tv5 = tan(tv4);
5357 fusion.addOutput(tv5);
5358
5359 inlineAllAt(tv5, -1, true);
5360
5361 TORCH_CHECK(tv5->getComputeAtPosition() == 3);
5362 TORCH_CHECK(tv4->getComputeAtPosition() == 3);
5363 TORCH_CHECK(tv3->getComputeAtPosition() == 3);
5364 TORCH_CHECK(tv2->getComputeAtPosition() == 1);
5365 TORCH_CHECK(tv1->getComputeAtPosition() == 1);
5366
5367 const auto options =
5368 at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5369 at::Tensor input = at::randn({2, 3, 4}, options);
5370 auto output = input.sin().cos().transpose(1, 2).exp().tan();
5371
5372 FusionExecutor fe;
5373 fe.compileFusion(&fusion, {input});
5374 auto cg_outputs = fe.runFusion({input});
5375
5376 testValidate(&fusion, cg_outputs, {input}, {output}, __LINE__, __FILE__);
5377}
5378
5379TEST_F(NVFuserTest, FusionInliningMismatchedDims3_CUDA) {
5380 Fusion fusion;
5381 FusionGuard fg(&fusion);
5382
5383 auto tv0 = makeConcreteTensor({2, 3, 4});
5384 fusion.addInput(tv0);
5385 auto tv1 = sin(tv0);
5386 // broadcasting
5387 auto tv2 = broadcast(tv1, {false, true, false, true, false, true});
5388 auto tv3 = relu(tv2);
5389 // trivial reduction
5390 auto tv4 = sum(tv3, {1, 3, 5});
5391 auto tv5 = cos(tv4);
5392 auto tv6 = transpose(tv5, 1, 2);
5393 auto tv7 = exp(tv6);
5394 auto tv8 = tan(tv7);
5395 fusion.addOutput(tv8);
5396
5397 for (auto tv : {tv2, tv3, tv4}) {
5398 tv->merge(0);
5399 tv->merge(1);
5400 tv->merge(2);
5401 }
5402
5403 inlineMost();
5404
5405 TORCH_CHECK(tv8->getComputeAtPosition() == 3);
5406 TORCH_CHECK(tv7->getComputeAtPosition() == 3);
5407 TORCH_CHECK(tv6->getComputeAtPosition() == 3);
5408 TORCH_CHECK(tv5->getComputeAtPosition() == 1);
5409 TORCH_CHECK(tv4->getComputeAtPosition() == 3);
5410 TORCH_CHECK(tv3->getComputeAtPosition() == 3);
5411 TORCH_CHECK(tv2->getComputeAtPosition() == 3);
5412 TORCH_CHECK(tv1->getComputeAtPosition() == 3);
5413
5414 const auto options =
5415 at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5416 at::Tensor input = at::randn({2, 3, 4}, options);
5417 auto output = input.sin().relu().cos().transpose(1, 2).exp().tan();
5418
5419 FusionExecutor fe;
5420 fe.compileFusion(&fusion, {input});
5421 auto cg_outputs = fe.runFusion({input});
5422
5423 testValidate(&fusion, cg_outputs, {input}, {output}, __LINE__, __FILE__);
5424}
5425
5426TEST_F(NVFuserTest, FusionInliningMismatchedDims4_CUDA) {
5427 Fusion fusion;
5428 FusionGuard fg(&fusion);
5429
5430 auto tv0 = makeConcreteTensor({2, 3, 4});
5431 fusion.addInput(tv0);
5432 auto tv1 = sin(tv0);
5433 auto tv2 = exp(tv1);
5434 auto tv3 = relu(tv2);
5435 auto tv4 = cos(tv3);
5436 auto tv5 = tan(tv4);
5437 fusion.addOutput(tv5);
5438
5439 tv3->merge(1);
5440 inlineMost();
5441
5442 TORCH_CHECK(tv5->getComputeAtPosition() == 3);
5443 TORCH_CHECK(tv4->getComputeAtPosition() == 3);
5444 TORCH_CHECK(tv3->getComputeAtPosition() == 1);
5445 TORCH_CHECK(tv2->getComputeAtPosition() == 1);
5446 TORCH_CHECK(tv1->getComputeAtPosition() == 3);
5447
5448 const auto options =
5449 at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5450 at::Tensor input = at::randn({2, 3, 4}, options);
5451 auto output = input.sin().exp().relu().cos().tan();
5452
5453 FusionExecutor fe;
5454 fe.compileFusion(&fusion, {input});
5455 auto cg_outputs = fe.runFusion({input});
5456
5457 testValidate(&fusion, cg_outputs, {input}, {output}, __LINE__, __FILE__);
5458}
5459
5460TEST_F(NVFuserTest, FusionInliningBroadcast_CUDA) {
5461 Fusion fusion;
5462 FusionGuard fg(&fusion);
5463
5464 auto tv0 = makeConcreteTensor({2, 3, 4});
5465 fusion.addInput(tv0);
5466 auto tv1 = sin(tv0);
5467 // broadcasting
5468 auto tv2 = broadcast(tv1, {false, true, false, true, false, true});
5469 auto tv3 = cos(tv2);
5470 auto tv4 = tan(tv3);
5471 fusion.addOutput(tv4);
5472
5473 for (auto tv : {tv2, tv3, tv4}) {
5474 tv->merge(0);
5475 tv->merge(1);
5476 tv->merge(2);
5477 }
5478
5479 inlineMost();
5480
5481 TORCH_CHECK(tv4->getComputeAtPosition() == 3);
5482 TORCH_CHECK(tv3->getComputeAtPosition() == 3);
5483 TORCH_CHECK(tv2->getComputeAtPosition() == 3);
5484 TORCH_CHECK(tv1->getComputeAtPosition() == 3);
5485
5486 const auto options =
5487 at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5488 at::Tensor input = at::randn({2, 3, 4}, options);
5489 auto output = input.sin().view({2, 1, 3, 1, 4, 1}).cos().tan();
5490
5491 FusionExecutor fe;
5492 fe.compileFusion(&fusion, {input});
5493 auto cg_outputs = fe.runFusion({input});
5494
5495 testValidate(&fusion, cg_outputs, {input}, {output}, __LINE__, __FILE__);
5496}
5497
5498TEST_F(NVFuserTest, FusionInliningBroadcastTrivialReduction_CUDA) {
5499 Fusion fusion;
5500 FusionGuard fg(&fusion);
5501
5502 auto tv0 = makeConcreteTensor({2, 3, 4});
5503 fusion.addInput(tv0);
5504 auto tv1 = sin(tv0);
5505 // broadcasting
5506 auto tv2 = broadcast(tv1, {false, true, false, true, false, true});
5507 auto tv3 = tan(tv2);
5508 // trivial reduction
5509 auto tv4 = sum(tv3, {1, 3, 5});
5510 auto tv5 = cos(tv4);
5511 auto tv6 = exp(tv5);
5512 fusion.addOutput(tv6);
5513
5514 for (auto tv : {tv2, tv3, tv4}) {
5515 tv->merge(0);
5516 tv->merge(1);
5517 tv->merge(2);
5518 }
5519
5520 inlineMost();
5521
5522 TORCH_CHECK(tv6->getComputeAtPosition() == 3);
5523 TORCH_CHECK(tv5->getComputeAtPosition() == 3);
5524 TORCH_CHECK(tv4->getComputeAtPosition() == 3);
5525 TORCH_CHECK(tv3->getComputeAtPosition() == 3);
5526 TORCH_CHECK(tv2->getComputeAtPosition() == 3);
5527 TORCH_CHECK(tv1->getComputeAtPosition() == 3);
5528
5529 const auto options =
5530 at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5531 at::Tensor input = at::randn({2, 3, 4}, options);
5532 auto output = input.sin().tan().cos().exp();
5533
5534 FusionExecutor fe;
5535 fe.compileFusion(&fusion, {input});
5536 auto cg_outputs = fe.runFusion({input});
5537
5538 testValidate(&fusion, cg_outputs, {input}, {output}, __LINE__, __FILE__);
5539}
5540
5541TEST_F(NVFuserTest, FusionMatchedLeafPosWithoutReplayTrivialReduction_CUDA) {
5542 Fusion fusion;
5543 FusionGuard fg(&fusion);
5544
5545 auto tv0 = makeConcreteTensor({2, 1, 3, 1, 4, 1});
5546 fusion.addInput(tv0);
5547 auto tv1 = sum(tv0, {1, 3, 5});
5548 auto tv2 = sin(tv1);
5549 fusion.addOutput(tv1);
5550
5551 for (auto tv : {tv0, tv1}) {
5552 tv->merge(0);
5553 tv->merge(1);
5554 tv->merge(2);
5555 }
5556
5557 TORCH_CHECK(
5558 TransformReplay::getMatchedLeafPosWithoutReplayPasC(tv0, tv1, 3) == 3);
5559 TORCH_CHECK(
5560 TransformReplay::getMatchedLeafPosWithoutReplayCasP(tv1, tv0, 3) == 3);
5561 TORCH_CHECK(
5562 TransformReplay::getMatchedLeafPosWithoutReplayPasC(tv1, tv2, 3) == 3);
5563 TORCH_CHECK(
5564 TransformReplay::getMatchedLeafPosWithoutReplayCasP(tv2, tv1, 3) == 3);
5565}
5566
5567TEST_F(NVFuserTest, FusionMatchedLeafPosWithoutReplayBroadcast_CUDA) {
5568 Fusion fusion;
5569 FusionGuard fg(&fusion);
5570
5571 auto tv0 = makeConcreteTensor({2, 3, 4});
5572 fusion.addInput(tv0);
5573 auto tv1 = broadcast(tv0, {false, true, false, true, false, true});
5574 auto tv2 = sin(tv1);
5575 fusion.addOutput(tv2);
5576
5577 for (auto tv : {tv1, tv2}) {
5578 tv->merge(0);
5579 tv->merge(1);
5580 tv->merge(2);
5581 }
5582
5583 TORCH_CHECK(
5584 TransformReplay::getMatchedLeafPosWithoutReplayPasC(tv0, tv1, 3) == 3);
5585 TORCH_CHECK(
5586 TransformReplay::getMatchedLeafPosWithoutReplayCasP(tv1, tv0, 3) == 3);
5587 TORCH_CHECK(
5588 TransformReplay::getMatchedLeafPosWithoutReplayPasC(tv1, tv2, 3) == 3);
5589 TORCH_CHECK(
5590 TransformReplay::getMatchedLeafPosWithoutReplayCasP(tv2, tv1, 3) == 3);
5591}
5592
5593TEST_F(NVFuserTest, FusionIdGraphTrivialReduction_CUDA) {
5594 Fusion fusion;
5595 FusionGuard fg(&fusion);
5596
5597 auto tv0 = makeConcreteTensor({2, 3, 4});
5598 fusion.addInput(tv0);
5599 auto tv1 = broadcast(tv0, {false, true, false, true, false, true});
5600 auto tv2 = sum(tv1, {1, 3, 5});
5601 auto tv3 = sin(tv2);
5602 fusion.addOutput(tv3);
5603
5604 for (auto tv : {tv1, tv2}) {
5605 tv->merge(0);
5606 tv->merge(1);
5607 tv->merge(2);
5608 }
5609
5610 inlineMost();
5611
5612 ComputeAtMap ca_map(&fusion);
5613
5614 auto all_tvs = ir_utils::allTvs(&fusion);
5615 for (auto tv1 : all_tvs) {
5616 for (auto tv2 : all_tvs) {
5617 if (tv1->isFusionInput() || tv2->isFusionInput()) {
5618 continue;
5619 }
5620 for (int i : c10::irange(3)) {
5621 auto id1 = tv1->axis(i);
5622 auto id2 = tv2->axis(i);
5623 TORCH_CHECK(ca_map.areMapped(id1, id2, IdMappingMode::LOOP));
5624 TORCH_CHECK(ca_map.areMapped(id1, id2, IdMappingMode::PERMISSIVE));
5625 }
5626 }
5627 }
5628}
5629
5630TEST_F(NVFuserTest, FusionPrint_CUDA) {
5631 auto dtypes = {
5632 at::kFloat,
5633 at::kDouble,
5634 at::kHalf,
5635 at::kBFloat16,
5636 at::kInt,
5637 at::kLong,
5638 at::kBool};
5639 for (auto dtype : dtypes) {
5640 auto fusion = std::make_unique<Fusion>();
5641 FusionGuard fg(fusion.get());
5642
5643 auto tv0 = makeSymbolicTensor(1, aten_to_data_type(dtype));
5644 fusion->addInput(tv0);
5645 auto tv1 = print(tv0);
5646 auto tv2 = sin(tv1);
5647 fusion->addOutput(tv2);
5648
5649 // There is no way to check if anything is printed to the console, but we
5650 // can validate that when print exist, compilation and computation are not
5651 // broken.
5652 auto options = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
5653 at::Tensor t0 = at::arange(2, options).to(dtype);
5654
5655 FusionExecutorCache executor_cache(std::move(fusion));
5656 auto cg_outputs = executor_cache.runFusionWithInputs({t0});
5657
5658 testValidate(
5659 executor_cache.fusion(),
5660 cg_outputs,
5661 {t0},
5662 {t0.sin()},
5663 __LINE__,
5664 __FILE__);
5665 }
5666}
5667
5668TEST_F(NVFuserTest, FusionCheckedSymbolicShape_CUDA) {
5669 const auto options =
5670 at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5671
5672 at::Tensor a = at::randn({123, 456}, options);
5673 at::Tensor b = at::randn({123, 456}, options);
5674 at::Tensor c = at::randn({321, 654}, options);
5675
5676 using return_t =
5677 std::pair<std::unique_ptr<FusionExecutorCache>, std::vector<at::Tensor>>;
5678 auto matched_add = [](at::Tensor a, at::Tensor b) -> return_t {
5679 auto fusion = std::make_unique<Fusion>();
5680 FusionGuard fg(fusion.get());
5681
5682 Val* s1 = IrBuilder::create<Int>();
5683 Val* s2 = IrBuilder::create<Int>();
5684 auto builder = TensorViewBuilder().shape(std::vector<Val*>{s1, s2});
5685 TensorView* tv0 = builder.build();
5686 TensorView* tv1 = builder.build();
5687
5688 fusion->addInput(tv0);
5689 fusion->addInput(tv1);
5690
5691 auto tv2 = add(tv0, tv1);
5692
5693 fusion->addOutput(tv2);
5694
5695 auto executor_cache =
5696 std::make_unique<FusionExecutorCache>(std::move(fusion));
5697 auto cg_outputs = executor_cache->runFusionWithInputs({a, b});
5698 return {std::move(executor_cache), std::move(cg_outputs)};
5699 };
5700
5701 {
5702 auto ret1 = matched_add(a, b);
5703 testValidate(
5704 ret1.first->fusion(), ret1.second, {a, b}, {a + b}, __LINE__, __FILE__);
5705 }
5706
5707 {
5708 EXPECT_THAT(
5709 [&]() { matched_add(a, c); },
5710 ::testing::ThrowsMessage<c10::Error>(
5711 ::testing::HasSubstr("Attempting to bind")));
5712 }
5713}
5714
5715TEST_F(NVFuserTest, FusionSizeDependentData_CUDA) {
5716 auto fusion = std::make_unique<Fusion>();
5717 FusionGuard fg(fusion.get());
5718
5719 Val* s1 = IrBuilder::create<Int>();
5720 auto builder = TensorViewBuilder().shape(std::vector<Val*>{s1});
5721 TensorView* tv0 = builder.build();
5722
5723 fusion->addInput(tv0);
5724
5725 auto tv1 = add(tv0, s1);
5726
5727 fusion->addOutput(tv1);
5728
5729 const auto options =
5730 at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5731
5732 at::Tensor a = at::zeros({123}, options);
5733
5734 FusionExecutorCache executor_cache(std::move(fusion));
5735 auto cg_outputs = executor_cache.runFusionWithInputs({a});
5736
5737 testValidate(
5738 executor_cache.fusion(), cg_outputs, {a}, {a + 123}, __LINE__, __FILE__);
5739}
5740
5741TEST_F(NVFuserTest, FusionDependencyCheck_CUDA) {
5742 Fusion fusion;
5743 FusionGuard fg(&fusion);
5744
5745 TensorView* tv0 = makeSymbolicTensor(1);
5746 TensorView* tv1 = makeSymbolicTensor(1);
5747 TensorView* tv2 = makeSymbolicTensor(1);
5748 TensorView* tv3 = makeSymbolicTensor(1);
5749
5750 auto tv4 = add(tv0, tv1);
5751 auto tv5 = add(tv0, tv2);
5752 auto tv6 = add(tv0, tv3);
5753
5754 auto tv7 = add(tv1, tv2);
5755 auto tv8 = add(tv1, tv3);
5756
5757 auto tv9 = add(tv2, tv3);
5758
5759 {
5760 auto all_vals = DependencyCheck::getAllValsBetween(
5761 {tv0, tv1}, {tv4, tv5, tv6, tv7, tv8, tv9});
5762 std::unordered_set<Val*> all_vals_set(all_vals.begin(), all_vals.end());
5763 std::vector<Val*> results({tv0, tv1, tv4, tv5, tv6, tv7, tv8});
5764 for (auto result : results) {
5765 TORCH_CHECK(all_vals_set.count(result) > 0);
5766 all_vals_set.erase(result);
5767 }
5768 TORCH_CHECK(all_vals_set.empty());
5769 }
5770
5771 auto tv10 = add(tv6, tv7);
5772 {
5773 auto all_vals = DependencyCheck::getAllValsBetween({tv0, tv1}, {tv10});
5774 std::unordered_set<Val*> all_vals_set(all_vals.begin(), all_vals.end());
5775 std::vector<Val*> results({tv0, tv1, tv6, tv7, tv10});
5776 for (auto result : results) {
5777 TORCH_CHECK(all_vals_set.count(result) > 0);
5778 all_vals_set.erase(result);
5779 }
5780 TORCH_CHECK(all_vals_set.empty());
5781 }
5782}
5783
5784// Repro for issue #1925
5785TEST_F(NVFuserTest, FusionScheduleTransposeRepro1_CUDA) {
5786 Fusion fusion;
5787 FusionGuard fg(&fusion);
5788
5789 auto tv0 = makeSymbolicTensor(4);
5790 auto tv1 = makeConcreteTensor({-1, -1, -1, 1});
5791 fusion.addInput(tv0);
5792 fusion.addInput(tv1);
5793 auto tv2 = add(tv0, tv1);
5794 fusion.addOutput(tv2);
5795
5796 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5797 at::Tensor input0 = at::randn({1, 1, 333, 1}, options);
5798 at::Tensor input1 = at::randn({1, 1, 333, 1}, options);
5799
5800 auto lparams = scheduleTranspose(&fusion, {input0, input1});
5801
5802 FusionExecutor fe;
5803 fe.compileFusion(&fusion, {input0, input1}, lparams);
5804 auto outputs = fe.runFusion({input0, input1}, lparams);
5805
5806 auto tv_ref = input0 + input1;
5807
5808 testValidate(
5809 &fusion, outputs, {input0, input1}, {tv_ref}, __LINE__, __FILE__);
5810}
5811
5812// Repro for issue #1873
5813TEST_F(NVFuserTest, FusionInlineBroadcastIndexing0_CUDA) {
5814 Fusion fusion;
5815 FusionGuard fg(&fusion);
5816
5817 auto tv0 = makeContigTensor(1);
5818 auto tv1 = makeContigTensor(2);
5819 fusion.addInput(tv0);
5820 fusion.addInput(tv1);
5821 auto tv2 = set(tv0);
5822 auto tv3 = broadcast(tv2, {true, false});
5823 auto tv4 = add(tv3, tv1);
5824 fusion.addOutput(tv4);
5825
5826 tv4->merge(0);
5827 tv4->split(0, 32);
5828
5829 tv0->computeAt(tv4, 1);
5830
5831 tv2->split(-1, 8);
5832
5833 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5834 at::Tensor t0 = at::randn({123}, options);
5835 at::Tensor t1 = at::randn({3, 123}, options);
5836
5837 FusionExecutor fe;
5838 fe.compileFusion(&fusion, {t0, t1});
5839
5840 auto outputs = fe.runFusion({t0, t1});
5841
5842 auto tv_ref = t0 + t1;
5843
5844 testValidate(&fusion, outputs, {t0, t1}, {tv_ref}, __LINE__, __FILE__);
5845}
5846
5847TEST_F(NVFuserTest, FusionPredicateUnshare_CUDA) {
5848 // https://github.com/csarofeen/pytorch/issues/1926
5849 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
5850 auto fusion = fusion_ptr.get();
5851 FusionGuard fg(fusion);
5852
5853 TensorView* tv0 = makeSymbolicTensor(2);
5854 fusion->addInput(tv0);
5855 auto tv1 = set(tv0);
5856 auto tv2 = set(tv1);
5857 fusion->addOutput(tv2);
5858
5859 tv1->setMemoryType(MemoryType::Shared);
5860 for (auto tv : {tv1, tv2}) {
5861 tv->split(0, 4);
5862 tv->reorder({{1, -1}});
5863 tv->split(1, 8);
5864 tv->merge(0);
5865 tv->split(0, 1);
5866 tv->axis(0)->parallelize(ParallelType::BIDx);
5867 tv->axis(1)->parallelize(ParallelType::Unswitch);
5868 }
5869 tv1->merge(2);
5870 tv2->reorder({{2, 3}});
5871 tv2->merge(2);
5872 for (auto tv : {tv1, tv2}) {
5873 tv->axis(-1)->parallelize(ParallelType::TIDx);
5874 }
5875
5876 inlineMost();
5877
5878 auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0);
5879 at::Tensor t0 = at::randn({5, 5}, options);
5880
5881 FusionExecutor fe;
5882 fe.compileFusion(fusion, {t0});
5883 auto cg_outputs = fe.runFusion({t0});
5884 auto out = cg_outputs[0];
5885
5886 testValidate(fusion, {out}, {t0}, {t0}, __LINE__, __FILE__);
5887}
5888
5889TEST_F(NVFuserTest, AsyncCompilation_CUDA) {
5890 auto fusion = std::make_unique<Fusion>();
5891 FusionGuard fg(fusion.get());
5892
5893 TensorView* tv0 = makeSymbolicTensor(2);
5894 TensorView* tv1 = makeSymbolicTensor(1);
5895 TensorView* tv2 = makeSymbolicTensor(2);
5896
5897 fusion->addInput(tv0);
5898 fusion->addInput(tv1);
5899 fusion->addInput(tv2);
5900
5901 TensorView* tv3 = add(tv0, IrBuilder::create<Double>(1)); // Group 0
5902 TensorView* tv4 =
5903 max(tv3, {0}); // Group 0 (use max instead to avoid numerical issues)
5904 TensorView* tv5 = add(tv4, tv1); // Group 0 (Non Broadcast after reduce,
5905 // keeps normalization scheduler away)
5906 TensorView* tv6 = add(tv5, tv2); // Group 1 (Broadcast after reduce)
5907
5908 fusion->addOutput(tv6);
5909
5910 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5911
5912 at::Tensor t0 = at::randn({8, 5}, options);
5913 at::Tensor t1 = at::randn({5}, options);
5914 at::Tensor t2 = at::randn({8, 5}, options);
5915
5916 auto t3 = t0.add(1.0);
5917 auto t4 = std::get<0>(at::max(t3, 0));
5918 auto t5 = t4.add(t1);
5919 auto t6 = t5.add(t2);
5920
5921 FusionExecutorCache executor_cache(std::move(fusion));
5922
5923 std::vector<IValue> aten_inputs = {t0, t1, t2};
5924
5925 executor_cache.compileFusionAsync(aten_inputs);
5926
5927 while (!executor_cache.isCompiled(aten_inputs)) {
5928 std::this_thread::sleep_for(std::chrono::milliseconds(20));
5929 printf(".");
5930 }
5931
5932 auto outputs = executor_cache.runFusionWithInputs(aten_inputs);
5933
5934 TORCH_CHECK(
5935 executor_cache.getMostRecentKernelRuntime()->isSegmented(),
5936 "segmentation didn't happen");
5937 TORCH_CHECK(
5938 executor_cache.getMostRecentKernelRuntime()
5939 ->fusionSegments()
5940 ->groups()
5941 .size() == 2,
5942 "segmentation didn't happen as expected");
5943
5944 testValidate(
5945 executor_cache.fusion(), outputs, aten_inputs, {t6}, __LINE__, __FILE__);
5946}
5947
5948TEST_F(NVFuserTest, FusionMergeBroadcastingTrivialReduction1_CUDA) {
5949 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
5950 auto fusion = fusion_ptr.get();
5951 FusionGuard fg(fusion);
5952
5953 TensorView* tv0 = makeConcreteTensor({1, 1});
5954 TensorView* tv1 = makeConcreteTensor({-1});
5955 fusion->addInput(tv0);
5956 fusion->addInput(tv1);
5957 auto tv2 = sum(tv0, {1});
5958 auto tv3 = add(tv2, tv1);
5959 fusion->addOutput(tv3);
5960
5961 tv0->merge(0);
5962
5963 MaxRootDomainInfoSpanningTree tree(tv0);
5964 TransformPropagatorWithCheck tp(tv0);
5965 tree.traverse(&tp);
5966
5967 inlineMost();
5968
5969 auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0);
5970 at::Tensor t0 = at::randn({1, 1}, options);
5971 at::Tensor t1 = at::randn({10}, options);
5972
5973 FusionExecutor fe;
5974 fe.compileFusion(fusion, {t0, t1});
5975 auto cg_outputs = fe.runFusion({t0, t1});
5976 auto out = cg_outputs[0];
5977
5978 testValidate(
5979 fusion, {out}, {t0, t1}, {t1 + t0.flatten()}, __LINE__, __FILE__);
5980}
5981
5982TEST_F(NVFuserTest, FusionMergeBroadcastingTrivialReduction2_CUDA) {
5983 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
5984 auto fusion = fusion_ptr.get();
5985 FusionGuard fg(fusion);
5986
5987 TensorView* tv0 = makeConcreteTensor({-1, 1, 1});
5988 TensorView* tv1 = makeConcreteTensor({-1, -1});
5989 fusion->addInput(tv0);
5990 fusion->addInput(tv1);
5991 auto tv2 = sum(tv0, {1});
5992 auto tv3 = add(tv2, tv1);
5993 fusion->addOutput(tv3);
5994
5995 tv2->merge(1);
5996 tv2->merge(0);
5997
5998 MaxRootDomainInfoSpanningTree tree(tv0);
5999 TransformPropagatorWithCheck tp(tv0);
6000 tree.traverse(&tp);
6001
6002 inlineMost();
6003
6004 auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0);
6005 at::Tensor t0 = at::randn({10, 1, 1}, options);
6006 at::Tensor t1 = at::randn({10, 10}, options);
6007
6008 FusionExecutor fe;
6009 fe.compileFusion(fusion, {t0, t1});
6010 auto cg_outputs = fe.runFusion({t0, t1});
6011 auto out = cg_outputs[0];
6012
6013 testValidate(
6014 fusion, {out}, {t0, t1}, {t1 + t0.squeeze(-1)}, __LINE__, __FILE__);
6015}
6016
6017// Simple test case exercising the null scheduler path.
6018TEST_F(NVFuserTest, FusionNullScheduler_CUDA) {
6019 auto fusion = std::make_unique<Fusion>();
6020 FusionGuard fg(fusion.get());
6021
6022 auto tv0 = makeConcreteTensor({1, 1, 1});
6023 fusion->addInput(tv0);
6024
6025 auto tv1 = sum(tv0, {0, 1, 2});
6026
6027 fusion->addOutput(tv1);
6028
6029 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6030 at::Tensor t0 = at::randn({1, 1, 1}, options);
6031
6032 std::vector<IValue> aten_inputs({t0});
6033
6034 FusionExecutorCache executor_cache(std::move(fusion));
6035 auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
6036
6037 auto t1 = t0.sum({0, 1, 2});
6038
6039 testValidate(
6040 executor_cache.fusion(), cg_outputs, {t0}, {t1}, __LINE__, __FILE__);
6041
6042 auto groups =
6043 executor_cache.getMostRecentKernelRuntime()->fusionSegments()->groups();
6044
6045 // Check that all groups on the resulting runtime are null.
6046 for (auto group : groups) {
6047 TORCH_INTERNAL_ASSERT(group->heuristic() == ScheduleHeuristic::NoOp);
6048 }
6049}
6050
6051// Simple test case exercising the null scheduler path.
6052TEST_F(NVFuserTest, FusionNullScheduler2_CUDA) {
6053 auto fusion = std::make_unique<Fusion>();
6054 FusionGuard fg(fusion.get());
6055
6056 auto tv0 = makeConcreteTensor({0, 1, 9223372036854775807L});
6057 fusion->addInput(tv0);
6058
6059 auto tv1 = sum(tv0, {0, 1, 2});
6060
6061 fusion->addOutput(tv1);
6062
6063 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6064 at::Tensor t0 = at::randn({0, 1, 9223372036854775807L}, options);
6065
6066 std::vector<IValue> aten_inputs({t0});
6067
6068 FusionExecutorCache executor_cache(std::move(fusion));
6069 auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
6070
6071 auto t1 = t0.sum({0, 1, 2});
6072
6073 testValidate(
6074 executor_cache.fusion(), cg_outputs, {t0}, {t1}, __LINE__, __FILE__);
6075
6076 auto groups =
6077 executor_cache.getMostRecentKernelRuntime()->fusionSegments()->groups();
6078
6079 // Check that all groups on the resulting runtime are null.
6080 for (auto group : groups) {
6081 TORCH_INTERNAL_ASSERT(group->heuristic() == ScheduleHeuristic::NoOp);
6082 }
6083}
6084
6085// Simple test case exercising the null scheduler path.
6086TEST_F(NVFuserTest, FusionNullScheduler3_CUDA) {
6087 auto fusion = std::make_unique<Fusion>();
6088 FusionGuard fg(fusion.get());
6089
6090 auto tv0 = TensorViewBuilder().ndims(0).build();
6091 auto tv1 = TensorViewBuilder().ndims(0).build();
6092 fusion->addInput(tv0);
6093 fusion->addInput(tv1);
6094 auto tv2 = add(tv0, tv1);
6095 fusion->addOutput(tv2);
6096
6097 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6098 at::Tensor t0 = at::randn({}, options);
6099 at::Tensor t1 = at::randn({}, options);
6100
6101 std::vector<IValue> aten_inputs({t0, t1});
6102
6103 FusionExecutorCache executor_cache(std::move(fusion));
6104 auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
6105
6106 testValidate(
6107 executor_cache.fusion(),
6108 cg_outputs,
6109 {t0, t1},
6110 {t0 + t1},
6111 __LINE__,
6112 __FILE__);
6113
6114 auto groups =
6115 executor_cache.getMostRecentKernelRuntime()->fusionSegments()->groups();
6116
6117 // Check that all groups on the resulting runtime are null.
6118 for (auto group : groups) {
6119 TORCH_INTERNAL_ASSERT(group->heuristic() == ScheduleHeuristic::NoOp);
6120 }
6121}
6122
6123TEST_F(NVFuserTest, FusionEmpty_CUDA) {
6124 auto fusion = std::make_unique<Fusion>();
6125 FusionGuard fg(fusion.get());
6126
6127 auto tv0 = makeConcreteTensor({10, 10, 10});
6128 auto tv1 = makeConcreteTensor({10, 10, 10});
6129 fusion->addInput(tv0);
6130 fusion->addInput(tv1);
6131 fusion->addOutput(tv0);
6132 fusion->addOutput(tv1);
6133
6134 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6135 at::Tensor t0 = at::randn({10, 10, 10}, options);
6136 at::Tensor t1 = at::randn({10, 10, 10}, options);
6137
6138 std::vector<IValue> aten_inputs({t0, t1});
6139
6140 FusionExecutorCache executor_cache(std::move(fusion));
6141 auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
6142
6143 testValidate(
6144 executor_cache.fusion(),
6145 cg_outputs,
6146 {t0, t1},
6147 {t0, t1},
6148 __LINE__,
6149 __FILE__);
6150
6151 auto groups =
6152 executor_cache.getMostRecentKernelRuntime()->fusionSegments()->groups();
6153
6154 // Check that all groups on the resulting runtime are null.
6155 for (auto group : groups) {
6156 TORCH_INTERNAL_ASSERT(group->heuristic() == ScheduleHeuristic::NoOp);
6157 }
6158}
6159
6160TEST_F(NVFuserTest, FusionMappingRelation_CUDA) {
6161 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
6162 auto fusion = fusion_ptr.get();
6163 FusionGuard fg(fusion);
6164
6165 TensorView* tv0 = makeConcreteTensor({1, 1});
6166 TensorView* tv1 = makeConcreteTensor({-1, 1, 1});
6167 fusion->addInput(tv0);
6168 fusion->addInput(tv1);
6169 auto tv2 = set(tv0);
6170 auto tv3 = broadcast(tv2, {true, false, false});
6171 auto tv4 = add(tv3, tv1);
6172
6173 fusion->addOutput(tv4);
6174
6175 tv4->merge(-2);
6176 tv4->merge(-1);
6177
6178 tv0->computeAt(tv4, -1);
6179 tv1->computeAt(tv4, -1);
6180
6181 ComputeAtMap ca_map(fusion);
6182
6183 // FIXME: This is the concerning part that would motivate some
6184 // more formalization on concrete/permissive mapping:
6185 // exact mapping should ideally imply permissive mapping.
6186 auto tv4_inner_node = tv4->axis(0)->definition()->input(1)->as<IterDomain>();
6187 TORCH_CHECK(
6188 ca_map.areMapped(tv2->axis(0), tv4_inner_node, IdMappingMode::EXACT));
6189 TORCH_CHECK(!ca_map.areMapped(
6190 tv2->axis(0), tv4_inner_node, IdMappingMode::PERMISSIVE));
6191
6192 auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0);
6193 at::Tensor t0 = at::randn({1, 1}, options);
6194 at::Tensor t1 = at::randn({2, 1, 1}, options);
6195
6196 FusionExecutor fe;
6197 fe.compileFusion(fusion, {t0, t1});
6198 auto cg_outputs = fe.runFusion({t0, t1});
6199 auto out = cg_outputs[0];
6200
6201 testValidate(
6202 fusion, {out}, {t0, t1}, {t1 + t0.squeeze(0)}, __LINE__, __FILE__);
6203}
6204
6205TEST_F(NVFuserTest, FusionInlineAt_CUDA) {
6206 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
6207 auto fusion = fusion_ptr.get();
6208 FusionGuard fg(fusion);
6209
6210 TensorView* tv0 = makeSymbolicTensor(2);
6211 fusion->addInput(tv0);
6212 auto tv1 = sin(tv0);
6213 auto tv2 = cos(tv1);
6214 fusion->addOutput(tv2);
6215
6216 tv1->inlineAt(-1);
6217
6218 auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0);
6219 at::Tensor t0 = at::randn({100, 2}, options);
6220
6221 FusionExecutor fe;
6222 fe.compileFusion(fusion, {t0});
6223 auto cg_outputs = fe.runFusion({t0});
6224 auto out = cg_outputs[0];
6225
6226 testValidate(fusion, {out}, {t0}, {t0.sin().cos()}, __LINE__, __FILE__);
6227}
6228
6229TEST_F(NVFuserTest, FusionTrivialInputForwarding_CUDA) {
6230 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
6231 auto fusion = fusion_ptr.get();
6232 FusionGuard fg(fusion);
6233
6234 TensorView* tv0 = makeConcreteTensor({-1, -1});
6235 TensorView* tv1 = makeConcreteTensor({-1, -1});
6236 fusion->addInput(tv0);
6237 fusion->addInput(tv1);
6238 // Note: tv2 is not needed. Kept it here since previously there was an
6239 // assertion from sorting in codegen.
6240 auto tv2 = add(tv1, IrBuilder::create<Double>(3.141));
6241 fusion->addOutput(tv0);
6242
6243 auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0);
6244 at::Tensor t0 = at::randn({10, 4}, options);
6245 at::Tensor t1 = at::randn({10, 4}, options);
6246
6247 FusionExecutorCache fec(std::move(fusion_ptr));
6248 auto cg_outputs = fec.runFusionWithInputs({t0, t1});
6249
6250 testValidate(fusion, cg_outputs, {t0, t1}, {t0}, __LINE__, __FILE__);
6251
6252 // Second run to ensure cache hit handles trivial forwarding properly
6253 TORCH_CHECK(fec.isCompiled({t0, t1}));
6254 auto cg_outputs2 = fec.runFusionWithInputs({t0, t1});
6255 testValidate(fusion, cg_outputs2, {t0, t1}, {t0}, __LINE__, __FILE__);
6256}
6257
6258TEST_F(NVFuserTest, FusionTrivialInputForwarding2_CUDA) {
6259 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
6260 auto fusion = fusion_ptr.get();
6261 FusionGuard fg(fusion);
6262
6263 TensorView* tv0 = makeSymbolicTensor(0);
6264 fusion->addInput(tv0);
6265 fusion->addOutput(tv0);
6266
6267 auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0);
6268 at::Tensor t0 = at::randn({}, options);
6269
6270 FusionExecutorCache fec(std::move(fusion_ptr));
6271 auto cg_outputs = fec.runFusionWithInputs({t0});
6272
6273 testValidate(fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
6274
6275 // Second run to ensure cache hit handles trivial forwarding properly
6276 TORCH_CHECK(fec.isCompiled({t0}));
6277 auto cg_outputs2 = fec.runFusionWithInputs({t0});
6278 testValidate(fusion, cg_outputs2, {t0}, {t0}, __LINE__, __FILE__);
6279}
6280
6281// Simplified repro of issue #2008
6282TEST_F(NVFuserTest, FusionReplayTrivialReductionAndBroadcast2_CUDA) {
6283 auto fusion_ptr = std::make_unique<Fusion>();
6284 Fusion& fusion = *fusion_ptr;
6285 FusionGuard fg(fusion_ptr.get());
6286
6287 std::vector<int64_t> shape({10, 1, 1});
6288
6289 auto tv0 = makeConcreteTensor(shape);
6290 fusion.addInput(tv0);
6291
6292 auto tv1 = add(tv0, IrBuilder::create<Double>(1));
6293 auto tv2 = sum(tv1, {1, 2});
6294 auto tv3 = broadcast(tv2, {false, true, true});
6295 fusion.addOutput(tv3);
6296
6297 tv0->merge(-2, -1)->merge(-2, -1)->split(0, 4);
6298
6299 MaxRootDomainInfoSpanningTree tree(tv0);
6300 TransformPropagator tp(tv0);
6301 tree.traverse(&tp);
6302
6303 auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0);
6304 at::Tensor t0 = at::randn(shape, options);
6305 std::vector<IValue> aten_inputs({t0});
6306
6307 FusionExecutor fe;
6308 fe.compileFusion(fusion_ptr.get(), aten_inputs);
6309 auto outputs = fe.runFusion(aten_inputs);
6310
6311 testValidate(&fusion, outputs, aten_inputs, {t0 + 1}, __LINE__, __FILE__);
6312}
6313
6314namespace {
6315
6316size_t getVecSizeForPointwise(FusionExecutorCache& fec) {
6317 auto most_recent_params =
6318 fec.getMostRecentKernelRuntime()->getMostRecentExecutorLog().params;
6319 auto params = dynamic_cast<PointwiseParams*>(most_recent_params.get());
6320 if (params->vectorize) {
6321 return params->unroll_factor;
6322 }
6323 return 1;
6324}
6325
6326} // namespace
6327
6328TEST_F(NVFuserTest, FusionVectorizeStrideContiguity2D_CUDA) {
6329 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
6330 auto fusion = fusion_ptr.get();
6331 FusionGuard fg(fusion);
6332
6333 TensorView* tv0 =
6334 TensorViewBuilder().ndims(2).contiguity({false, true}).build();
6335 fusion->addInput(tv0);
6336 auto tv1 = set(tv0);
6337 fusion->addOutput(tv1);
6338
6339 FusionExecutorCache fec(std::move(fusion_ptr));
6340 fec.profile(true);
6341
6342 std::vector<std::pair<int, int>> size_and_vec{{17, 1}, {18, 2}, {32, 4}};
6343
6344 for (auto pair : size_and_vec) {
6345 auto size = pair.first;
6346 auto vec = pair.second;
6347 auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0);
6348 at::Tensor t0 = at::randn({1000000, size}, options).narrow(1, 0, 16);
6349 auto cg_outputs = fec.runFusionWithInputs({t0});
6350
6351 TORCH_CHECK(getVecSizeForPointwise(fec) == (size_t)vec);
6352
6353 testValidate(fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
6354 }
6355}
6356
6357TEST_F(NVFuserTest, FusionVectorizeStrideContiguity3D_CUDA) {
6358 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
6359 auto fusion = fusion_ptr.get();
6360 FusionGuard fg(fusion);
6361
6362 TensorView* tv0 =
6363 TensorViewBuilder().ndims(3).contiguity({false, true, true}).build();
6364 fusion->addInput(tv0);
6365 auto tv1 = set(tv0);
6366 fusion->addOutput(tv1);
6367
6368 FusionExecutorCache fec(std::move(fusion_ptr));
6369 fec.profile(true);
6370
6371 std::vector<std::pair<int, int>> size_and_vec{{17, 1}, {10, 2}, {16, 4}};
6372
6373 for (auto pair : size_and_vec) {
6374 auto size = pair.first;
6375 auto vec = pair.second;
6376 auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0);
6377 at::Tensor t0 = at::randn({1000000, size, 3}, options).narrow(1, 0, 8);
6378 auto cg_outputs = fec.runFusionWithInputs({t0});
6379
6380 TORCH_CHECK(getVecSizeForPointwise(fec) == (size_t)vec);
6381
6382 testValidate(fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
6383 }
6384}
6385
6386TEST_F(NVFuserTest, FusionVectorizeStrideContiguity5D_CUDA) {
6387 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
6388 auto fusion = fusion_ptr.get();
6389 FusionGuard fg(fusion);
6390
6391 TensorView* tv0 = TensorViewBuilder()
6392 .ndims(5)
6393 .contiguity({false, true, false, true, true})
6394 .build();
6395 fusion->addInput(tv0);
6396 auto tv1 = set(tv0);
6397 fusion->addOutput(tv1);
6398
6399 FusionExecutorCache fec(std::move(fusion_ptr));
6400 fec.profile(true);
6401
6402 auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0);
6403
6404 std::vector<std::tuple<int, int, int>> sizes_and_vec{
6405 {9, 17, 1}, {9, 10, 2}, {9, 16, 4}};
6406
6407 for (auto tup : sizes_and_vec) {
6408 auto size1 = std::get<0>(tup);
6409 auto size2 = std::get<1>(tup);
6410 auto vec = std::get<2>(tup);
6411 at::Tensor t0 = at::randn({4, size1, 12345, size2, 3}, options)
6412 .narrow(1, 0, 8)
6413 .narrow(3, 0, 4);
6414 auto cg_outputs = fec.runFusionWithInputs({t0});
6415
6416 TORCH_CHECK(getVecSizeForPointwise(fec) == (size_t)vec);
6417
6418 testValidate(fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
6419 }
6420}
6421
6422TEST_F(NVFuserTest, FusionVectorizeStrideContiguitySelfOverlapping_CUDA) {
6423 std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
6424 auto fusion = fusion_ptr.get();
6425 FusionGuard fg(fusion);
6426
6427 TensorView* tv0 = TensorViewBuilder()
6428 .ndims(5)
6429 .contiguity({false, true, false, true, true})
6430 .build();
6431 fusion->addInput(tv0);
6432 auto tv1 = set(tv0);
6433 fusion->addOutput(tv1);
6434
6435 FusionExecutorCache fec(std::move(fusion_ptr));
6436 fec.profile(true);
6437
6438 auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0);
6439
6440 std::vector<std::tuple<int, int, int, int>> sizes_strides_and_vec{
6441 {4, 4, 4, 4},
6442 {4, 4, 2, 2},
6443 {4, 2, 4, 2},
6444 {2, 4, 4, 2},
6445 {4, 4, 1, 1},
6446 {4, 1, 4, 1},
6447 {1, 4, 4, 1},
6448 {2, 2, 2, 2},
6449 {2, 2, 1, 1},
6450 {2, 1, 2, 1},
6451 {1, 2, 2, 1}};
6452
6453 for (auto tup : sizes_strides_and_vec) {
6454 auto size = std::get<0>(tup);
6455 auto stride1 = std::get<1>(tup);
6456 auto stride2 = std::get<2>(tup);
6457 auto vec = std::get<3>(tup);
6458 std::vector<int64_t> shape = {4, 4, 12345, size, 3};
6459 std::vector<int64_t> stride = {stride1, stride2 * 12345, stride2, 3, 1};
6460 at::Tensor t0 = at::empty_strided(shape, stride, options);
6461 t0.random_();
6462 auto cg_outputs = fec.runFusionWithInputs({t0});
6463 TORCH_CHECK(getVecSizeForPointwise(fec) == (size_t)vec);
6464 testValidate(fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
6465 }
6466}
6467
6468TEST_F(NVFuserTest, FusionSimpleAmperePipeline_CUDA) {
6469 Fusion fusion;
6470 FusionGuard fg(&fusion);
6471
6472 // requires ampere+ GPU
6473 if (!deviceMajorMinorCheck(8)) {
6474 GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs";
6475 return;
6476 }
6477
6478 auto tv0 = makeContigTensor(1);
6479
6480 fusion.addInput(tv0);
6481
6482 auto tv1 = set(tv0);
6483
6484 fusion.addOutput(tv1);
6485
6486 auto tv_cache = tv0->cacheAfter(LoadStoreOpType::CpAsync);
6487 tv_cache->setMemoryType(MemoryType::Shared);
6488
6489 tv1->split(0, 16);
6490 tv0->computeAt(tv1, 1);
6491
6492 tv_cache->circularBuffer(10);
6493
6494 auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
6495 at::Tensor input1 = at::randn({255}, options);
6496
6497 // Add check that the cp async op has an inlined predicate.
6498 class InlinedCpAsyncPredChecker : public kir::IrVisitor {
6499 public:
6500 using kir::IrVisitor::handle;
6501
6502 private:
6503 void handle(kir::IfThenElse* ite) final {
6504 auto prev_within_ite = within_ite_;
6505 within_ite_ = true;
6506 kir::IrVisitor::handle(ite);
6507 within_ite_ = prev_within_ite;
6508 }
6509
6510 void handle(LoadStoreOp* ldst) final {
6511 if (ldst->opType() == LoadStoreOpType::CpAsync) {
6512 TORCH_INTERNAL_ASSERT(!within_ite_, "CPASYNC predicate not inlined");
6513 TORCH_INTERNAL_ASSERT(
6514 ldst->predicate()->hasValue() &&
6515 !ldst->predicate()->value()->isConst(),
6516 "CPASYNC predicate is not generated");
6517 }
6518 }
6519
6520 private:
6521 bool within_ite_ = false;
6522 } pred_checker;
6523
6524 // Check that cp async is inlined:
6525 GpuLower gpulw(&fusion);
6526 pred_checker.handle(gpulw.kernel()->topLevelExprs());
6527
6528 FusionExecutor fe;
6529 fe.compileFusion(&fusion, {input1});
6530 auto cg_outputs = fe.runFusion({input1});
6531
6532 testValidate(&fusion, cg_outputs, {input1}, {input1}, __LINE__, __FILE__);
6533}
6534
6535// Test file size should be up to 10K LoC. Create a new file for more tests.
6536
6537} // namespace jit
6538} // namespace torch
6539#endif // #if defined(USE_CUDA)
6540